diff --git a/bench/ctable/bench_append_regression.py b/bench/ctable/bench_append_regression.py
new file mode 100644
index 00000000..03f875db
--- /dev/null
+++ b/bench/ctable/bench_append_regression.py
@@ -0,0 +1,117 @@
+#######################################################################
+# Copyright (c) 2019-present, Blosc Development Team
+# All rights reserved.
+#
+# SPDX-License-Identifier: BSD-3-Clause
+#######################################################################
+
+# Benchmark: append() overhead introduced by the new schema pipeline
+#
+# The new append() path routes every row through:
+#   _normalize_row_input → validate_row (Pydantic) → _coerce_row_to_storage
+#
+# This benchmark times each step in isolation (single run each) and the
+# full append() path (best of RUNS), all relative to raw NumPy element writes.
+
+from dataclasses import dataclass
+from time import perf_counter
+
+import numpy as np
+
+import blosc2
+from blosc2.schema_compiler import compile_schema
+from blosc2.schema_validation import build_validator_model, validate_row
+
+
+@dataclass
+class Row:
+    id: int = blosc2.field(blosc2.int64(ge=0))
+    score: float = blosc2.field(blosc2.float64(ge=0, le=100), default=0.0)
+    active: bool = blosc2.field(blosc2.bool(), default=True)
+
+
+N = 5_000  # rows per measurement
+rng = np.random.default_rng(42)  # fixed seed: every run times the same rows
+data = [
+    (int(i), float(rng.uniform(0, 100)), bool(i % 2))
+    for i in range(N)
+]
+schema = compile_schema(Row)
+# Warm up the Pydantic model cache (presumably so model construction is not billed to section 3)
+build_validator_model(schema)
+
+print(f"append() pipeline cost breakdown | N = {N:,} rows")
+print("=" * 60)
+
+# ── 1. Raw NumPy element writes (no CTable overhead at all) — the baseline ───
+ids = np.zeros(N, dtype=np.int64)
+scores = np.zeros(N, dtype=np.float64)
+flags = np.zeros(N, dtype=np.bool_)
+mask = np.zeros(N, dtype=np.bool_)
+
+t0 = perf_counter()
+for i, (id_, score, active) in enumerate(data):
+    ids[i] = id_
+    scores[i] = score
+    flags[i] = active
+    mask[i] = True
+t_raw = perf_counter() - t0  # every later figure is reported as a multiple of this
+print(f"{'Raw NumPy writes (baseline)':<40} {t_raw:.4f} s")
+
+# ── 2. _normalize_row_input only (single run) ────────────────────────────────
+t_obj = blosc2.CTable(Row, expected_size=N, validate=False)
+t0 = perf_counter()
+for row in data:
+    _ = t_obj._normalize_row_input(row)  # result discarded; only the call is timed
+t_normalize = perf_counter() - t0
+print(f"{'_normalize_row_input only':<40} {t_normalize:.4f} s ({t_normalize/t_raw:.1f}x baseline)")
+
+# ── 3. Pydantic validate_row only (single run) ───────────────────────────────
+row_dicts = [t_obj._normalize_row_input(row) for row in data]  # normalised outside the timed region
+t0 = perf_counter()
+for rd in row_dicts:
+    _ = validate_row(schema, rd)
+t_validate = perf_counter() - t0
+print(f"{'validate_row (Pydantic) only':<40} {t_validate:.4f} s ({t_validate/t_raw:.1f}x baseline)")
+
+# ── 4. _coerce_row_to_storage only (single run, reuses row_dicts) ────────────
+t0 = perf_counter()
+for rd in row_dicts:
+    _ = t_obj._coerce_row_to_storage(rd)
+t_coerce = perf_counter() - t0
+print(f"{'_coerce_row_to_storage only':<40} {t_coerce:.4f} s ({t_coerce/t_raw:.1f}x baseline)")
+
+# ── 5. Full append(), validate=False (3 runs, take minimum) ─────────────────
+RUNS = 3  # repetitions for sections 5 and 6; the fastest run is reported
+best_off = float("inf")
+for _ in range(RUNS):
+    t_obj2 = blosc2.CTable(Row, expected_size=N, validate=False)  # fresh table per run, built outside the timer
+    t0 = perf_counter()
+    for row in data:
+        t_obj2.append(row)
+    best_off = min(best_off, perf_counter() - t0)
+t_append_off = best_off
+print(f"{'Full append(), validate=False':<40} {t_append_off:.4f} s ({t_append_off/t_raw:.1f}x baseline)")
+
+# ── 6. Full append(), validate=True (3 runs, take minimum) ──────────────────
+best_on = float("inf")
+for _ in range(RUNS):
+    t_obj3 = blosc2.CTable(Row, expected_size=N, validate=True)  # fresh table per run, built outside the timer
+    t0 = perf_counter()
+    for row in data:
+        t_obj3.append(row)
+    best_on = min(best_on, perf_counter() - t0)
+t_append_on = best_on
+print(f"{'Full append(), validate=True':<40} {t_append_on:.4f} s ({t_append_on/t_raw:.1f}x baseline)")
+
+print()
+print("=" * 60)
+pydantic_cost = max(t_append_on - t_append_off, 0.0)  # clamp: timing noise can make the difference negative
+print(f"{'Pydantic overhead in append()':<40} {pydantic_cost:.4f} s")
+if t_append_on > 0:
+    print(f"{'Validation fraction of total':<40} {pydantic_cost/t_append_on*100:.1f}%")
+print(f"{'Per-row Pydantic cost (isolated)':<40} {(t_validate/N)*1e6:.2f} µs/row")
+print()
+print(f"Note: append() is dominated by blosc2 I/O ({t_append_off/t_raw:.0f}x raw numpy),")
+print(" not by the validation pipeline.")
+print(" The main bottleneck is the last_true_pos backward scan per row.")
diff --git a/bench/ctable/bench_pandas_roundtrip.py b/bench/ctable/bench_pandas_roundtrip.py
new file mode 100644
index 00000000..03a09ffa
--- /dev/null
+++ b/bench/ctable/bench_pandas_roundtrip.py
@@ -0,0 +1,209 @@
+#######################################################################
+# Copyright (c) 2019-present, Blosc Development Team
+# All rights reserved.
+#
+# SPDX-License-Identifier: BSD-3-Clause
+#######################################################################
+
+# Benchmark: pandas ↔ CTable round-trip (with on-disk persistence)
+#
+# Pipeline measured in four isolated steps:
+#
+#   1. pandas → CTable : pa.Table.from_pandas() + CTable.from_arrow()
+#   2. CTable.save()   : write in-memory CTable to disk
+#   3. CTable.load()   : read disk table back into RAM
+#   4. CTable → pandas : CTable.to_arrow().to_pandas()
+#
+# Plus the combined full round-trip (steps 1-4) is shown at the end.
+#
+# Each measurement is the minimum of NRUNS repetitions to reduce noise.
+# Schema: id (int64), score (float64), active (bool), label (string ≤16).
+
+import os
+import shutil
+from time import perf_counter
+
+import numpy as np
+import pandas as pd
+import pyarrow as pa
+
+from blosc2 import CTable
+
+NRUNS = 3  # repetitions per measurement; tmin() reports the fastest
+TABLE_DIR = "saved_ctable/bench_pandas"
+SIZES = [1_000, 10_000, 100_000, 1_000_000]
+
+
+# ---------------------------------------------------------------------------
+# Helpers
+# ---------------------------------------------------------------------------
+
+
+def sep(title: str) -> None:  # print a three-line section banner
+    print(f"\n{'─' * 60}")
+    print(f" {title}")
+    print(f"{'─' * 60}")
+
+
+def tmin(fn, n: int = NRUNS) -> float:
+    """Minimum elapsed time (s) over *n* calls of *fn*."""
+    best = float("inf")
+    for _ in range(n):
+        t0 = perf_counter()
+        fn()
+        best = min(best, perf_counter() - t0)
+    return best
+
+
+def clean(path: str = TABLE_DIR) -> None:  # reset *path* to an empty directory
+    if os.path.exists(path):
+        shutil.rmtree(path)
+    os.makedirs(path, exist_ok=True)
+
+
+def make_dataframe(n: int) -> pd.DataFrame:  # deterministic n-row frame (fixed seed)
+    rng = np.random.default_rng(42)
+    return pd.DataFrame({
+        "id": np.arange(n, dtype=np.int64),
+        "score": rng.uniform(0, 100, n).astype(np.float64),
+        "active": rng.integers(0, 2, n, dtype=bool),
+        "label": [f"r{i % 10000:05d}" for i in range(n)],
+    })
+
+
+# ---------------------------------------------------------------------------
+# Section 1: pandas → CTable (in-memory)
+# ---------------------------------------------------------------------------
+
+sep("1. pandas → CTable (from_arrow, in-memory)")
+print(f"{'rows':>12} {'pandas→arrow (s)':>18} {'arrow→ctable (s)':>18} {'total (s)':>12}")
+print(f"{'----':>12} {'----------------':>18} {'----------------':>18} {'---------':>12}")
+
+ctables: dict[int, CTable] = {}  # keep for steps 2 & 4
+
+for N in SIZES:
+    df = make_dataframe(N)
+
+    def bench_to_arrow(df=df):  # default arg binds this iteration's frame
+        return pa.Table.from_pandas(df, preserve_index=False)
+
+    def bench_from_arrow(df=df):  # times pandas→arrow AND arrow→ctable together
+        at = pa.Table.from_pandas(df, preserve_index=False)
+        return CTable.from_arrow(at)
+
+    t_pa = tmin(bench_to_arrow)
+    t_ct = max(tmin(bench_from_arrow) - t_pa, 0.0)  # from_arrow only; two independent minima can differ by noise, so clamp at 0
+    t_tot = t_pa + t_ct
+
+    # Keep one CTable for later steps
+    at = pa.Table.from_pandas(df, preserve_index=False)
+    ctables[N] = CTable.from_arrow(at)
+
+    print(f"{N:>12,} {t_pa:>18.4f} {t_ct:>18.4f} {t_tot:>12.4f}")
+
+
+# ---------------------------------------------------------------------------
+# Section 2: CTable.save() (in-memory → disk)
+# ---------------------------------------------------------------------------
+
+sep("2. CTable.save() (in-memory → disk)")
+print(f"{'rows':>12} {'save (s)':>14} {'compressed':>12} {'ratio':>8}")
+print(f"{'----':>12} {'--------':>14} {'----------':>12} {'-----':>8}")
+
+for N in SIZES:
+    t = ctables[N]
+    path = os.path.join(TABLE_DIR, f"ct_{N}")
+
+    def bench_save(t=t, path=path):  # NOTE: removal of a previous copy is inside the timed call
+        if os.path.exists(path):
+            shutil.rmtree(path)
+        t.save(path, overwrite=True)
+
+    elapsed = tmin(bench_save)
+    # Final state for size info (this copy is also what section 3 loads)
+    t.save(path, overwrite=True)
+    cbytes = t.cbytes
+    nbytes = t.nbytes
+    ratio = nbytes / cbytes if cbytes > 0 else float("nan")
+
+    def _fmt(n):  # byte count → "x.x KB" below 1 MiB, else "x.x MB"
+        if n < 1024**2:
+            return f"{n / 1024:.1f} KB"
+        return f"{n / 1024**2:.1f} MB"
+
+    print(f"{N:>12,} {elapsed:>14.4f} {_fmt(cbytes):>12} {ratio:>7.2f}x")
+
+
+# ---------------------------------------------------------------------------
+# Section 3: CTable.load() (disk → in-memory)
+# ---------------------------------------------------------------------------
+
+sep("3. CTable.load() (disk → in-memory)")
+print(f"{'rows':>12} {'load (s)':>14}")
+print(f"{'----':>12} {'--------':>14}")
+
+for N in SIZES:
+    path = os.path.join(TABLE_DIR, f"ct_{N}")  # written by section 2
+
+    def bench_load(path=path):
+        return CTable.load(path)
+
+    elapsed = tmin(bench_load)
+    print(f"{N:>12,} {elapsed:>14.4f}")
+
+
+# ---------------------------------------------------------------------------
+# Section 4: CTable → pandas (to_arrow → to_pandas)
+# ---------------------------------------------------------------------------
+
+sep("4. CTable → pandas (to_arrow + to_pandas)")
+print(f"{'rows':>12} {'ctable→arrow (s)':>18} {'arrow→pandas (s)':>18} {'total (s)':>12}")
+print(f"{'----':>12} {'----------------':>18} {'----------------':>18} {'---------':>12}")
+
+for N in SIZES:
+    t = ctables[N]
+    at_cache = t.to_arrow()  # pre-convert once so we can time each step cleanly
+
+    def bench_to_arrow_ct(t=t):
+        return t.to_arrow()
+
+    def bench_to_pandas(at=at_cache):
+        return at.to_pandas()
+
+    t_arr = tmin(bench_to_arrow_ct)
+    t_pd = tmin(bench_to_pandas)
+    t_tot = t_arr + t_pd
+
+    print(f"{N:>12,} {t_arr:>18.4f} {t_pd:>18.4f} {t_tot:>12.4f}")
+
+
+# ---------------------------------------------------------------------------
+# Section 5: Full round-trip (pandas → CTable → disk → load → pandas)
+# ---------------------------------------------------------------------------
+
+sep("5. Full round-trip (pandas → CTable → save → load → pandas)")
+print(f"{'rows':>12} {'round-trip (s)':>16}")
+print(f"{'----':>12} {'---------------':>16}")
+
+for N in SIZES:
+    df = make_dataframe(N)
+    path = os.path.join(TABLE_DIR, f"rt_{N}")
+
+    def bench_roundtrip(df=df, path=path):
+        # pandas → CTable
+        at = pa.Table.from_pandas(df, preserve_index=False)
+        t = CTable.from_arrow(at)
+        # save to disk
+        t.save(path, overwrite=True)
+        # load back
+        t2 = CTable.load(path)
+        # CTable → pandas
+        return t2.to_arrow().to_pandas()
+
+    elapsed = tmin(bench_roundtrip)
+    print(f"{N:>12,} {elapsed:>16.4f}")
+
+
+# Cleanup (clean() recreates TABLE_DIR, so an empty directory is left behind)
+clean()
+print()
diff --git a/bench/ctable/bench_persistency.py b/bench/ctable/bench_persistency.py
new file mode 100644
index 00000000..71d26bee
--- /dev/null
+++ b/bench/ctable/bench_persistency.py
@@ -0,0 +1,197 @@
+#######################################################################
+# Copyright (c) 2019-present, Blosc Development Team
+# All rights reserved.
+#
+# SPDX-License-Identifier: BSD-3-Clause
+#######################################################################
+
+# Benchmark: persistent vs in-memory CTable
+#
+# Sections:
+#   1. extend() — bulk creation: in-memory vs file-backed
+#   2. open() — time to reopen an existing persistent table
+#   3. append() — single-row append: in-memory vs file-backed (after reopen)
+#   4. column read — materialising a full column: in-memory vs file-backed
+#
+# Each measurement is the minimum of NRUNS repetitions to reduce noise.
+
+import os
+import shutil
+from dataclasses import dataclass
+from time import perf_counter
+
+import blosc2
+
+NRUNS = 3  # repetitions per measurement; tmin() reports the fastest
+TABLE_DIR = "saved_ctable/bench"
+
+
+@dataclass
+class Row:
+    id: int = blosc2.field(blosc2.int64(ge=0))
+    score: float = blosc2.field(blosc2.float64(ge=0, le=100), default=0.0)
+    active: bool = blosc2.field(blosc2.bool(), default=True)
+
+
+def sep(title: str) -> None:  # print a three-line section banner
+    print(f"\n{'─' * 60}")
+    print(f" {title}")
+    print(f"{'─' * 60}")
+
+
+def tmin(fn, n: int = NRUNS) -> float:
+    """Return the minimum elapsed time (seconds) over *n* calls of *fn*."""
+    best = float("inf")
+    for _ in range(n):
+        t0 = perf_counter()
+        fn()
+        best = min(best, perf_counter() - t0)
+    return best
+
+
+def clean() -> None:  # reset TABLE_DIR to an empty directory
+    if os.path.exists(TABLE_DIR):
+        shutil.rmtree(TABLE_DIR)
+    os.makedirs(TABLE_DIR, exist_ok=True)
+
+
+# ---------------------------------------------------------------------------
+# Section 1: bulk creation — extend()
+# ---------------------------------------------------------------------------
+
+sep("1. extend() — bulk insert: in-memory vs TreeStore-backed")
+
+SIZES = [1_000, 10_000, 100_000, 1_000_000]
+
+print(f"{'rows':>12} {'in-memory (s)':>16} {'store-backed (s)':>16} {'overhead':>10}")
+print(f"{'----':>12} {'-------------':>16} {'---------------':>16} {'--------':>10}")
+
+for N in SIZES:
+    data = [(i, float(i % 100), i % 2 == 0) for i in range(N)]
+
+    def bench_mem(N=N, data=data):  # default args bind this iteration's values
+        t = blosc2.CTable(Row, expected_size=N)
+        t.extend(data, validate=False)
+
+    def bench_file(N=N, data=data):  # NOTE: clean() and close() are inside the timed call
+        clean()
+        t = blosc2.CTable(Row, urlpath=TABLE_DIR + "/ext", mode="w", expected_size=N)
+        t.extend(data, validate=False)
+        t.close()
+
+    t_mem = tmin(bench_mem)
+    t_file = tmin(bench_file)
+    overhead = t_file / t_mem if t_mem > 0 else float("nan")
+    print(f"{N:>12,} {t_mem:>16.4f} {t_file:>16.4f} {overhead:>9.2f}x")
+
+# ---------------------------------------------------------------------------
+# Section 2: open() — reopen an existing table
+# ---------------------------------------------------------------------------
+
+sep("2. open() — time to reopen a persistent table")
+
+print(f"{'rows':>12} {'blosc2.open() (s)':>18} {'CTable.open() (s)':>20} {'CTable(..., mode=a) (s)':>24}")
+print(f"{'----':>12} {'----------------':>18} {'------------------':>20} {'------------------------':>24}")
+
+for N in SIZES:
+    data = [(i, float(i % 100), i % 2 == 0) for i in range(N)]
+    clean()
+    path = TABLE_DIR + "/reopen"
+    t = blosc2.CTable(Row, urlpath=path, mode="w", expected_size=N)
+    t.extend(data, validate=False)
+    t.close()
+
+    def bench_blosc2_open(path=path):
+        t2 = blosc2.open(path, mode="r")
+        _ = len(t2)  # touch the table so the open cannot be entirely deferred
+
+    def bench_open(path=path):
+        t2 = blosc2.CTable.open(path, mode="r")
+        _ = len(t2)
+
+    def bench_ctor(path=path):
+        t2 = blosc2.CTable(Row, urlpath=path, mode="a")
+        _ = len(t2)
+
+    t_b2_open = tmin(bench_blosc2_open)
+    t_open = tmin(bench_open)
+    t_ctor = tmin(bench_ctor)
+    print(f"{N:>12,} {t_b2_open:>18.4f} {t_open:>20.4f} {t_ctor:>24.4f}")
+
+# ---------------------------------------------------------------------------
+# Section 3: append() — single-row inserts after reopen
+# ---------------------------------------------------------------------------
+
+sep("3. append() — 1 000 single-row inserts: in-memory vs TreeStore-backed")
+
+APPEND_N = 1_000
+PREALLOCATE = 10_000  # avoid resize noise (NRUNS * APPEND_N rows still fit)
+
+print(f"{'backend':>14} {'total (s)':>12} {'µs / row':>12}")
+print(f"{'-------':>14} {'---------':>12} {'--------':>12}")
+
+
+def bench_append_mem():
+    t = blosc2.CTable(Row, expected_size=PREALLOCATE, validate=False)
+    for i in range(APPEND_N):
+        t.append((i, float(i % 100), True))
+
+
+clean()
+path = TABLE_DIR + "/apath"  # the table itself is created by the timing loop below
+
+
+def bench_append_file():
+    t = blosc2.CTable(Row, urlpath=path, mode="a", validate=False)
+    for i in range(APPEND_N):
+        t.append((i, float(i % 100), True))
+    t.close()  # release the store each run (as bench_file does); close cost is part of the figure
+
+
+for label, fn in [("in-memory", bench_append_mem), ("file-backed", bench_append_file)]:
+    # Reset the file table once per backend; the NRUNS repetitions then append to the same table
+    if label == "file-backed":
+        clean()
+        t = blosc2.CTable(Row, urlpath=path, mode="w", expected_size=PREALLOCATE)
+        t.close()
+    elapsed = tmin(fn)
+    us_per_row = elapsed / APPEND_N * 1e6  # seconds per run → microseconds per appended row
+    print(f"{label:>14} {elapsed:>12.4f} {us_per_row:>12.1f}")
+
+# ---------------------------------------------------------------------------
+# Section 4: column read — to_numpy() after reopen
+# ---------------------------------------------------------------------------
+
+sep("4. column read — to_numpy() on 'id': in-memory vs TreeStore-backed")
+
+print(f"{'rows':>12} {'in-memory (s)':>16} {'store-backed (s)':>16} {'ratio':>8}")
+print(f"{'----':>12} {'-------------':>16} {'---------------':>16} {'-----':>8}")
+
+for N in SIZES:
+    data = [(i, float(i % 100), i % 2 == 0) for i in range(N)]
+
+    t_mem_table = blosc2.CTable(Row, expected_size=N, validate=False)
+    t_mem_table.extend(data, validate=False)
+
+    clean()
+    path = TABLE_DIR + "/read"
+    t_file_table = blosc2.CTable(Row, urlpath=path, mode="w", expected_size=N)
+    t_file_table.extend(data, validate=False)
+    t_file_table.close()
+    # Reopen read-only (simulates a real read workload)
+    t_ro = blosc2.CTable.open(path, mode="r")
+
+    def bench_read_mem(t=t_mem_table):
+        _ = t["id"].to_numpy()
+
+    def bench_read_file(t=t_ro):
+        _ = t["id"].to_numpy()
+
+    t_m = tmin(bench_read_mem)
+    t_f = tmin(bench_read_file)
+    ratio = t_f / t_m if t_m > 0 else float("nan")
+    print(f"{N:>12,} {t_m:>16.4f} {t_f:>16.4f} {ratio:>7.2f}x")
+
+# Cleanup (clean() recreates TABLE_DIR, so an empty directory is left behind)
+clean()
+print()
diff --git a/bench/ctable/bench_validation.py b/bench/ctable/bench_validation.py
new file mode 100644
index 00000000..7329a3ce
--- /dev/null
+++ b/bench/ctable/bench_validation.py
@@ -0,0 +1,129 @@
+#######################################################################
+# Copyright (c) 2019-present, Blosc Development Team
+# All rights reserved.
+#
+# SPDX-License-Identifier: BSD-3-Clause
+#######################################################################
+
+# Benchmark: cost of constraint validation
+#
+# Measures the overhead of validate=True vs validate=False for:
+#   1. append() — row-by-row, Pydantic path
+#   2. extend() — bulk insert, vectorized NumPy path (list of rows, then a structured array)
+#
+# at increasing batch sizes to show how validation cost scales.
+
+from dataclasses import dataclass
+from time import perf_counter
+
+import numpy as np
+
+import blosc2
+
+
+@dataclass
+class Row:
+    id: int = blosc2.field(blosc2.int64(ge=0))
+    score: float = blosc2.field(blosc2.float64(ge=0, le=100), default=0.0)
+    active: bool = blosc2.field(blosc2.bool(), default=True)
+
+
+def make_data(n: int):  # returns n (id, score, active) tuples of plain Python scalars
+    rng = np.random.default_rng(42)  # fixed seed: same rows on every call
+    ids = np.arange(n, dtype=np.int64)
+    scores = rng.uniform(0, 100, n)
+    flags = rng.integers(0, 2, n, dtype=np.bool_)
+    return list(zip(ids.tolist(), scores.tolist(), flags.tolist(), strict=False))
+
+
+SIZES = [100, 1_000, 10_000, 100_000, 1_000_000]
+APPEND_SIZES = [100, 1_000]  # append row-by-row is slow at large N
+
+# ─────────────────────────────────────────────────────────────────────────────
+# 1. append() — validate=True vs validate=False (one run each, True measured first)
+# ─────────────────────────────────────────────────────────────────────────────
+print("=" * 65)
+print("1. append() — row-by-row (Pydantic validation per row)")
+print("=" * 65)
+print(f"{'N':>10} {'validate=True':>14} {'validate=False':>15} {'overhead':>10}")
+print("-" * 65)
+
+for n in APPEND_SIZES:
+    data = make_data(n)
+
+    t = blosc2.CTable(Row, expected_size=n, validate=True)  # fresh table, built outside the timer
+    t0 = perf_counter()
+    for row in data:
+        t.append(row)
+    t_on = perf_counter() - t0
+
+    t = blosc2.CTable(Row, expected_size=n, validate=False)  # fresh table, built outside the timer
+    t0 = perf_counter()
+    for row in data:
+        t.append(row)
+    t_off = perf_counter() - t0
+
+    overhead = (t_on / t_off) if t_off > 0 else float("inf")  # >1.00 means validation costs time
+    print(f"{n:>10,} {t_on:>13.4f}s {t_off:>14.4f}s {overhead:>9.2f}x")
+
+# ─────────────────────────────────────────────────────────────────────────────
+# 2. extend() — validate=True vs validate=False, input is a list of tuples
+# ─────────────────────────────────────────────────────────────────────────────
+print()
+print("=" * 65)
+print("2. extend() — bulk insert (vectorized NumPy validation)")
+print("=" * 65)
+print(f"{'N':>10} {'validate=True':>14} {'validate=False':>15} {'overhead':>10}")
+print("-" * 65)
+
+for n in SIZES:
+    data = make_data(n)
+
+    t = blosc2.CTable(Row, expected_size=n, validate=True)
+    t0 = perf_counter()
+    t.extend(data)  # validation setting comes from the constructor, not from extend()
+    t_on = perf_counter() - t0
+
+    t = blosc2.CTable(Row, expected_size=n, validate=False)
+    t0 = perf_counter()
+    t.extend(data)
+    t_off = perf_counter() - t0
+
+    overhead = (t_on / t_off) if t_off > 0 else float("inf")
+    print(f"{n:>10,} {t_on:>13.4f}s {t_off:>14.4f}s {overhead:>9.2f}x")
+
+# ─────────────────────────────────────────────────────────────────────────────
+# 3. extend() — validate=True vs validate=False with structured NumPy array
+# ─────────────────────────────────────────────────────────────────────────────
+print()
+print("=" * 65)
+print("3. extend() with structured NumPy array")
+print("=" * 65)
+print(f"{'N':>10} {'validate=True':>14} {'validate=False':>15} {'overhead':>10}")
+print("-" * 65)
+
+np_dtype = np.dtype([("id", np.int64), ("score", np.float64), ("active", np.bool_)])  # field names match Row
+
+for n in SIZES:
+    rng = np.random.default_rng(42)  # re-seeded per size, mirroring make_data()
+    arr = np.empty(n, dtype=np_dtype)
+    arr["id"] = np.arange(n, dtype=np.int64)
+    arr["score"] = rng.uniform(0, 100, n)
+    arr["active"] = rng.integers(0, 2, n, dtype=np.bool_)
+
+    t = blosc2.CTable(Row, expected_size=n, validate=True)
+    t0 = perf_counter()
+    t.extend(arr)
+    t_on = perf_counter() - t0
+
+    t = blosc2.CTable(Row, expected_size=n, validate=False)
+    t0 = perf_counter()
+    t.extend(arr)
+    t_off = perf_counter() - t0
+
+    overhead = (t_on / t_off) if t_off > 0 else float("inf")
+    print(f"{n:>10,} {t_on:>13.4f}s {t_off:>14.4f}s {overhead:>9.2f}x")
+
+print()
+print("Note: 'overhead' = validate=True time / validate=False time.")
+print(" 1.00x means validation is free; 2.00x means it doubles the time.")
diff --git a/bench/ctable/compact.py b/bench/ctable/compact.py
new file mode 100644
index
00000000..a4817b0a
--- /dev/null
+++ b/bench/ctable/compact.py
@@ -0,0 +1,75 @@
+#######################################################################
+# Copyright (c) 2019-present, Blosc Development Team
+# All rights reserved.
+#
+# SPDX-License-Identifier: BSD-3-Clause
+#######################################################################
+
+# Benchmark for measuring compact() time and the compressed size before and
+# after it, following deletion of varying fractions of the table.
+
+from dataclasses import dataclass
+from time import perf_counter
+
+import numpy as np
+
+import blosc2
+
+
+@dataclass
+class Row:
+    id: int = blosc2.field(blosc2.int64(ge=0))
+    c_val: complex = blosc2.field(blosc2.complex128(), default=0j)
+    score: float = blosc2.field(blosc2.float64(ge=0, le=100), default=0.0)
+    active: bool = blosc2.field(blosc2.bool(), default=True)
+
+
+N = 1_000_000
+
+print(f"compact() benchmark | N = {N:,}\n")
+
+# Build base data once (outside every timed region)
+np_dtype = np.dtype([
+    ("id", np.int64),
+    ("c_val", np.complex128),
+    ("score", np.float64),
+    ("active", np.bool_),
+])
+DATA = np.array(
+    [
+        (i, complex(i * 0.1, i * 0.01), 10.0 + (i % 100) * 0.4, i % 3 == 0)
+        for i in range(N)
+    ],
+    dtype=np_dtype,
+)
+
+delete_fractions = [0.1, 0.25, 0.5, 0.75, 0.9]
+
+print("=" * 75)
+print(f"{'DELETED':>10} {'ROWS LEFT':>10} {'TIME (s)':>12} {'CBYTES BEFORE':>15} {'CBYTES AFTER':>14}")
+print("-" * 75)
+
+for frac in delete_fractions:
+    ct = blosc2.CTable(Row, expected_size=N)  # fresh full table for each fraction
+    ct.extend(DATA)
+
+    n_delete = int(N * frac)
+    ct.delete(list(range(n_delete)))  # always deletes the leading n_delete rows
+
+    cbytes_before = sum(col.cbytes for col in ct._cols.values()) + ct._valid_rows.cbytes
+
+    t0 = perf_counter()  # monotonic, high-resolution clock; only compact() is timed
+    ct.compact()
+    t_compact = perf_counter() - t0
+
+    cbytes_after = sum(col.cbytes for col in ct._cols.values()) + ct._valid_rows.cbytes
+
+    print(
+        f"{frac*100:>9.0f}%"
+        f" {N - n_delete:>10,}"
+        f" {t_compact:>12.4f}"
+        f" {cbytes_before / 1024**2:>13.2f} MB"
+        f" {cbytes_after / 1024**2:>12.2f} MB"
+    )
+
+print("-" * 75)
diff --git
a/bench/ctable/ctable_v_pandas.py b/bench/ctable/ctable_v_pandas.py
new file mode 100644
index 00000000..3b7a6d52
--- /dev/null
+++ b/bench/ctable/ctable_v_pandas.py
@@ -0,0 +1,121 @@
+#######################################################################
+# Copyright (c) 2019-present, Blosc Development Team
+# All rights reserved.
+#
+# SPDX-License-Identifier: BSD-3-Clause
+#######################################################################
+
+# Benchmark comparing CTable vs pandas DataFrame for:
+# 1. Creation from a NumPy structured array
+# 2. Column access (full column, with and without conversion to NumPy)
+# 3. Filtering (where/query)
+# 4. Iteration over the values of one column
+
+from dataclasses import dataclass
+from time import perf_counter
+
+import numpy as np
+import pandas as pd
+
+import blosc2
+
+
+@dataclass
+class Row:
+    id: int = blosc2.field(blosc2.int64(ge=0))
+    c_val: complex = blosc2.field(blosc2.complex128(), default=0j)
+    score: float = blosc2.field(blosc2.float64(ge=0, le=100), default=0.0)
+    active: bool = blosc2.field(blosc2.bool(), default=True)
+
+
+N = 1_000_000
+rng = np.random.default_rng(42)  # fixed seed for reproducible data
+
+print(f"CTable vs pandas benchmark | N = {N:,}\n")
+
+# Build base data once (outside every timed region)
+np_dtype = np.dtype([
+    ("id", np.int64),
+    ("c_val", np.complex128),
+    ("score", np.float64),
+    ("active", np.bool_),
+])
+DATA = np.empty(N, dtype=np_dtype)
+DATA["id"] = np.arange(N, dtype=np.int64)
+DATA["c_val"] = rng.standard_normal(N) + 1j * rng.standard_normal(N)
+DATA["score"] = rng.uniform(0, 100, N)
+DATA["active"] = rng.integers(0, 2, N, dtype=np.bool_)
+
+print("=" * 65)
+print(f"{'OPERATION':<30} {'CTable':>12} {'pandas':>12} {'SPEEDUP':>10}")
+print("-" * 65)
+
+# 1. Creation (SPEEDUP column is pandas time / CTable time throughout)
+t0 = perf_counter()
+ct = blosc2.CTable(Row, expected_size=N)
+ct.extend(DATA)
+t_ct_create = perf_counter() - t0
+
+t0 = perf_counter()
+df = pd.DataFrame(DATA)
+t_pd_create = perf_counter() - t0
+
+print(f"{'Creation':<30} {t_ct_create:>12.4f} {t_pd_create:>12.4f} {t_pd_create/t_ct_create:>9.2f}x")
+
+# 2. Column access (full column); perf_counter keeps these very short timings non-zero
+t0 = perf_counter()
+arr = ct["score"]
+t_ct_col = perf_counter() - t0
+
+t0 = perf_counter()
+arr = df["score"]
+t_pd_col = perf_counter() - t0
+
+print(f"{'Column access (full)':<30} {t_ct_col:>12.4f} {t_pd_col:>12.4f} {t_pd_col/t_ct_col:>9.2f}x")
+
+# 2.5 Column access materialised to a NumPy array (full column)
+t0 = perf_counter()
+arr = ct["score"].to_numpy()
+t_ct_col = perf_counter() - t0
+
+t0 = perf_counter()
+arr = df["score"].to_numpy()
+t_pd_col = perf_counter() - t0
+
+print(f"{'Column access to numpy (full)':<30} {t_ct_col:>12.4f} {t_pd_col:>12.4f} {t_pd_col/t_ct_col:>9.3f}x")
+
+# 3. Filtering
+t0 = perf_counter()
+result_ct = ct.where((ct["id"] > 250_000) & (ct["id"] < 750_000))
+t_ct_filter = perf_counter() - t0
+
+t0 = perf_counter()
+result_pd = df.query("250000 < id < 750000")
+t_pd_filter = perf_counter() - t0
+
+print(f"{'Filter (id 250k-750k)':<30} {t_ct_filter:>12.4f} {t_pd_filter:>12.4f} {t_pd_filter/t_ct_filter:>9.2f}x")
+
+# 4. Iteration (walks the values of the "score" column, not whole rows)
+t0 = perf_counter()
+for _val in ct["score"]:
+    pass
+t_ct_iter = perf_counter() - t0
+
+t0 = perf_counter()
+for _val in df["score"]:
+    pass
+t_pd_iter = perf_counter() - t0
+
+print(f"{'Row iteration':<30} {t_ct_iter:>12.4f} {t_pd_iter:>12.4f} {t_pd_iter/t_ct_iter:>9.2f}x")
+
+print("-" * 65)
+
+# Memory (CTable totals = all columns plus the _valid_rows array)
+ct_cbytes = sum(col.cbytes for col in ct._cols.values()) + ct._valid_rows.cbytes
+ct_nbytes = sum(col.nbytes for col in ct._cols.values()) + ct._valid_rows.nbytes
+pd_nbytes = df.memory_usage(deep=True).sum()
+
+print(f"\nMemory — CTable compressed: {ct_cbytes / 1024**2:.2f} MB")
+print(f"Memory — CTable uncompressed: {ct_nbytes / 1024**2:.2f} MB")
+print(f"Memory — pandas: {pd_nbytes / 1024**2:.2f} MB")
+print(f"Compression ratio CTable: {ct_nbytes / ct_cbytes:.2f}x")
diff --git a/bench/ctable/delete.py b/bench/ctable/delete.py
new file mode 100644
index 00000000..79f59580
--- /dev/null
+++ b/bench/ctable/delete.py
@@ -0,0 +1,76 @@
+#######################################################################
+# Copyright (c) 2019-present, Blosc Development Team
+# All rights reserved.
+#
+# SPDX-License-Identifier: BSD-3-Clause
+#######################################################################
+
+# Benchmark for measuring delete() performance with different index types:
+# int, slice, and list — with varying sizes.
+
+from dataclasses import dataclass
+from time import perf_counter
+
+import numpy as np
+
+import blosc2
+
+
+@dataclass
+class Row:
+    id: int = blosc2.field(blosc2.int64(ge=0))
+    c_val: complex = blosc2.field(blosc2.complex128(), default=0j)
+    score: float = blosc2.field(blosc2.float64(ge=0, le=100), default=0.0)
+    active: bool = blosc2.field(blosc2.bool(), default=True)
+
+
+N = 1_000_000
+
+print(f"delete() benchmark | N = {N:,}\n")
+
+# Build base data once (outside every timed region)
+np_dtype = np.dtype([
+    ("id", np.int64),
+    ("c_val", np.complex128),
+    ("score", np.float64),
+    ("active", np.bool_),
+])
+DATA = np.array(
+    [
+        (i, complex(i * 0.1, i * 0.01), 10.0 + (i % 100) * 0.4, i % 3 == 0)
+        for i in range(N)
+    ],
+    dtype=np_dtype,
+)
+
+delete_cases = [
+    ("int", 0),
+    ("slice small", slice(0, 100)),
+    ("slice large", slice(0, 100_000)),
+    ("slice full", slice(0, N)),
+    ("list small", list(range(100))),
+    ("list large", list(range(100_000))),
+    ("list full", list(range(N))),
+]
+
+print("=" * 60)
+print(f"{'CASE':<20} {'ROWS DELETED':>14} {'TIME (s)':>12}")
+print("-" * 60)
+
+for label, key in delete_cases:
+    ct = blosc2.CTable(Row, expected_size=N)  # fresh full table for each case
+    ct.extend(DATA)
+
+    if isinstance(key, int):  # row count is derived from the key, not read back from the table
+        n_deleted = 1
+    elif isinstance(key, slice):
+        n_deleted = len(range(*key.indices(N)))
+    else:
+        n_deleted = len(key)
+
+    t0 = perf_counter()  # monotonic, high-resolution clock; only delete() is timed
+    ct.delete(key)
+    t_delete = perf_counter() - t0
+    print(f"{label:<20} {n_deleted:>14,} {t_delete:>12.6f}")
+
+print("-" * 60)
diff --git a/bench/ctable/expected_size.py b/bench/ctable/expected_size.py
new file mode 100644
index 00000000..e199d589
--- /dev/null
+++ b/bench/ctable/expected_size.py
@@ -0,0 +1,69 @@
+#######################################################################
+# Copyright (c) 2019-present, Blosc
Development Team
+# All rights reserved.
+#
+# SPDX-License-Identifier: BSD-3-Clause
+#######################################################################
+
+# Benchmark for measuring the overhead of resize() when expected_size
+# is too small (M rows) vs correctly sized (N rows) during extend().
+
+from dataclasses import dataclass
+from time import perf_counter
+
+import numpy as np
+
+import blosc2
+
+
+@dataclass
+class Row:
+    id: int = blosc2.field(blosc2.int64(ge=0))
+    c_val: complex = blosc2.field(blosc2.complex128(), default=0j)
+    score: float = blosc2.field(blosc2.float64(ge=0, le=100), default=0.0)
+    active: bool = blosc2.field(blosc2.bool(), default=True)
+
+
+
+M = 779  # deliberately undersized expected_size
+N = 62_500  # first row count; doubled each pass up to MAX_N
+MAX_N = 1_000_000
+print(f"expected_size benchmark | wrong expected_size = {M}")
+
+# Pre-generate full dataset once; each pass slices the first N rows
+np_dtype = np.dtype([
+    ("id", np.int64),
+    ("c_val", np.complex128),
+    ("score", np.float64),
+    ("active", np.bool_),
+])
+DATA = np.array(
+    [
+        (i, complex(i * 0.1, i * 0.01), 10.0 + (i % 100) * 0.4, i % 3 == 0)
+        for i in range(MAX_N)
+    ],
+    dtype=np_dtype,
+)
+
+while N <= MAX_N:
+    print("-" * 80)
+    print(f"N = {N:,} rows")
+
+    # 1. extend() with correct expected_size = N
+    ct_correct = blosc2.CTable(Row, expected_size=N)
+    t0 = perf_counter()  # monotonic, high-resolution clock; only extend() is timed
+    ct_correct.extend(DATA[:N])
+    t_correct = perf_counter() - t0
+    print(f"extend() expected_size=N ({N:>8,}): {t_correct:.4f} s rows: {len(ct_correct):,}")
+
+    # 2. extend() with wrong expected_size = M (forces resize)
+    ct_wrong = blosc2.CTable(Row, expected_size=M)
+    t0 = perf_counter()
+    ct_wrong.extend(DATA[:N])
+    t_wrong = perf_counter() - t0
+    print(f"extend() expected_size=M ({M:>8,}): {t_wrong:.4f} s rows: {len(ct_wrong):,}")
+
+    # Summary
+    print(f" Slowdown from wrong expected_size: {t_wrong / t_correct:.2f}x")
+
+    N *= 2
diff --git a/bench/ctable/extend.py b/bench/ctable/extend.py
new file mode 100644
index 00000000..5e1090ba
--- /dev/null
+++ b/bench/ctable/extend.py
@@ -0,0 +1,105 @@
+#######################################################################
+# Copyright (c) 2019-present, Blosc Development Team
+# All rights reserved.
+#
+# SPDX-License-Identifier: BSD-3-Clause
+#######################################################################
+
+# Benchmark for measuring CTable creation time from three different sources:
+# 1. Python list of lists (1M rows)
+# 2. NumPy structured array (1M rows), built from a list of tuples
+# 3. An existing CTable (previously created from Python lists, 1M rows)
+
+from dataclasses import dataclass
+from time import time
+
+import numpy as np
+
+import blosc2
+
+
+@dataclass
+class Row:
+    id: int = blosc2.field(blosc2.int64(ge=0))
+    c_val: complex = blosc2.field(blosc2.complex128(), default=0j)
+    score: float = blosc2.field(blosc2.float64(ge=0, le=100), default=0.0)
+    active: bool = blosc2.field(blosc2.bool(), default=True)
+
+
+N = 1_000_000
+print(f"CTable creation benchmark with {N:,} rows\n")
+
+# ---------------------------------------------------------------------------
+# Base data generation (not part of the benchmark timing)
+# ---------------------------------------------------------------------------
+print("Generating base data...")
+
+t0 = time()
+data_list = [
+    [i, complex(i * 0.1, i * 0.01), 10.0 + (i % 100) * 0.4, i % 3 == 0]
+    for i in range(N)
+]
+t_gen_list = time() - t0
+print(f" Python list generated in: {t_gen_list:.4f} s")
+
+t0 = time()
+np_dtype = np.dtype([
+    ("id",
np.int64), + ("c_val", np.complex128), + ("score", np.float64), + ("active", np.bool_), +]) +data_np = np.array( + [ + (i, complex(i * 0.1, i * 0.01), 10.0 + (i % 100) * 0.4, i % 3 == 0) + for i in range(N) + ], + dtype=np_dtype, +) +t_gen_np = time() - t0 +print(f" NumPy structured array generated: {t_gen_np:.4f} s\n") + +# --------------------------------------------------------------------------- +# 1. Creation from a Python list of lists +# --------------------------------------------------------------------------- +print("CTable from Python list of lists") +t0 = time() +ct_from_list = blosc2.CTable(Row, expected_size=N) +ct_from_list.extend(data_list) +t_from_list = time() - t0 +print(f" extend() time (Python list): {t_from_list:.4f} s") +print(f" Rows: {len(ct_from_list):,}") + +# --------------------------------------------------------------------------- +# 2. Creation from a NumPy structured array (list of named tuples) +# --------------------------------------------------------------------------- +print("CTable from NumPy structured array") +t0 = time() +ct_from_np = blosc2.CTable(Row, expected_size=N) +ct_from_np.extend(data_np) +t_from_np = time() - t0 +print(f" extend() time (NumPy struct): {t_from_np:.4f} s") +print(f" Rows: {len(ct_from_np):,}") + + +# --------------------------------------------------------------------------- +# 3. 
Creation from an existing CTable (ct_from_list, already built above) +# --------------------------------------------------------------------------- +print("CTable from an existing CTable") +t0 = time() +ct_from_ctable = blosc2.CTable(Row, expected_size=N) +ct_from_ctable.extend(ct_from_list) +t_from_ctable = time() - t0 +print(f" extend() time (CTable): {t_from_ctable:.4f} s") +print(f" Rows: {len(ct_from_ctable):,}") + +# --------------------------------------------------------------------------- +# Summary +# --------------------------------------------------------------------------- +print("\n") +print("=" * 60) +print(f"{'SOURCE':<30} {'TIME (s)':>12} {'SPEEDUP vs list':>18}") +print("-" * 60) +print(f"{'Python list of lists':<30} {t_from_list:>12.4f} {'1.00x':>18}") +print(f"{'NumPy structured array':<30} {t_from_np:>12.4f} {t_from_list / t_from_np:>17.2f}x") +print(f"{'Existing CTable':<30} {t_from_ctable:>12.4f} {t_from_list / t_from_ctable:>17.2f}x") diff --git a/bench/ctable/extend_vs_apend.py b/bench/ctable/extend_vs_apend.py new file mode 100644 index 00000000..db63206b --- /dev/null +++ b/bench/ctable/extend_vs_apend.py @@ -0,0 +1,76 @@ +####################################################################### +# Copyright (c) 2019-present, Blosc Development Team +# All rights reserved. +# +# SPDX-License-Identifier: BSD-3-Clause +####################################################################### + +# Benchmark for comparing append() (row by row) vs extend() (bulk), +# to find the crossover point where extend() becomes worth it. 
+ +from dataclasses import dataclass +from time import time + +import blosc2 + + +@dataclass +class Row: + id: int = blosc2.field(blosc2.int64(ge=0)) + c_val: complex = blosc2.field(blosc2.complex128(), default=0j) + score: float = blosc2.field(blosc2.float64(ge=0, le=100), default=0.0) + active: bool = blosc2.field(blosc2.bool(), default=True) + + +# Parameter — change N to test different crossover points +N = 2 +print("append() vs extend() benchmark") +for i in range(6): + print("\n") + print("%" * 100) + + + # Base data generation + data_list = [ + [i, complex(i * 0.1, i * 0.01), 10.0 + (i % 100) * 0.4, i % 3 == 0] for i in range(N) + ] + + # 1. N individual append() calls + print(f"{N} individual append() calls") + ct_append = blosc2.CTable(Row, expected_size=N) + t0 = time() + for row in data_list: + ct_append.append(row) + t_append = time() - t0 + print(f" Time: {t_append:.6f} s") + print(f" Rows: {len(ct_append):,}") + + # 2. N individual extend() calls (one row at a time) + print(f"{N} individual extend() calls (one row at a time)") + ct_extend_one = blosc2.CTable(Row, expected_size=N) + t0 = time() + for row in data_list: + ct_extend_one.extend([row]) + t_extend_one = time() - t0 + print(f" Time: {t_extend_one:.6f} s") + print(f" Rows: {len(ct_extend_one):,}") + + # 3. 
Single extend() call with all N rows at once + print(f"Single extend() call with all {N} rows at once") + ct_extend_bulk = blosc2.CTable(Row, expected_size=N) + t0 = time() + ct_extend_bulk.extend(data_list) + t_extend_bulk = time() - t0 + print(f" Time: {t_extend_bulk:.6f} s") + print(f" Rows: {len(ct_extend_bulk):,}") + + # Summary + print("=" * 70) + print(f"{'METHOD':<35} {'TIME (s)':>12} {'SPEEDUP vs append':>20}") + print("-" * 70) + print(f"{'append() x N':<35} {t_append:>12.6f} {'1.00x':>20}") + print(f"{'extend() x N (one row each)':<35} {t_extend_one:>12.6f} {t_append / t_extend_one:>19.2f}x") + print(f"{'extend() x 1 (all at once)':<35} {t_extend_bulk:>12.6f} {t_append / t_extend_bulk:>19.2f}x") + print("-" * 70) + + N=N*2 diff --git a/bench/ctable/index.py b/bench/ctable/index.py new file mode 100644 index 00000000..3298bde2 --- /dev/null +++ b/bench/ctable/index.py @@ -0,0 +1,63 @@ +####################################################################### +# Copyright (c) 2019-present, Blosc Development Team +# All rights reserved. +# +# SPDX-License-Identifier: BSD-3-Clause +####################################################################### + +# Benchmark for measuring Column[int] access (single row by logical index), +# which exercises _find_physical_index() traversal over chunk metadata. 
+ +from dataclasses import dataclass +from time import time + +import numpy as np + +import blosc2 + + +@dataclass +class Row: + id: int = blosc2.field(blosc2.int64(ge=0)) + c_val: complex = blosc2.field(blosc2.complex128(), default=0j) + score: float = blosc2.field(blosc2.float64(ge=0, le=100), default=0.0) + active: bool = blosc2.field(blosc2.bool(), default=True) + + +N = 1_000_000 +indices = [0, N // 4, N // 2, (3 * N) // 4, N - 1] + +print(f"Column[int] access benchmark | N = {N:,}\n") + +# Build CTable once +np_dtype = np.dtype([ + ("id", np.int64), + ("c_val", np.complex128), + ("score", np.float64), + ("active", np.bool_), +]) +DATA = np.array( + [ + (i, complex(i * 0.1, i * 0.01), 10.0 + (i % 100) * 0.4, i % 3 == 0) + for i in range(N) + ], + dtype=np_dtype, +) + +ct = blosc2.CTable(Row, expected_size=N) +ct.extend(DATA) + +print(f"CTable built with {len(ct):,} rows\n") +print("=" * 60) +print(f"{'INDEX':<15} {'POSITION':>12} {'TIME (s)':>12}") +print("-" * 60) + +col = ct["score"] +for idx in indices: + t0 = time() + val = col[idx] + t_access = time() - t0 + position = f"{idx / N * 100:.0f}% into array" + print(f"{idx:<15,} {position:>12} {t_access:.6f}") + +print("-" * 60) diff --git a/bench/ctable/iter_rows.py b/bench/ctable/iter_rows.py new file mode 100644 index 00000000..51203ba4 --- /dev/null +++ b/bench/ctable/iter_rows.py @@ -0,0 +1,97 @@ +####################################################################### +# Copyright (c) 2019-present, Blosc Development Team +# All rights reserved. 
+# +# SPDX-License-Identifier: BSD-3-Clause +####################################################################### + +from dataclasses import dataclass +from time import time + +import blosc2 +from blosc2 import CTable + + +@dataclass +class Row: + id: int = blosc2.field(blosc2.int64(ge=0)) + score: float = blosc2.field(blosc2.float64(ge=0, le=100)) + active: bool = blosc2.field(blosc2.bool(), default=True) + + +N = 1_000 # start small, increase when confident + +data = [(i, float(i % 100), i % 2 == 0) for i in range(N)] +tabla = CTable(Row, new_data=data) + +print(f"Table created with {len(tabla)} rows\n") + +# ------------------------------------------------------------------- +# Test 1: iterate without accessing any column (minimum cost) +# ------------------------------------------------------------------- +t0 = time() +for _row in tabla: + pass +t1 = time() +print(f"[Test 1] Iter without accessing columns: {(t1 - t0)*1000:.3f} ms") + +# ------------------------------------------------------------------- +# Test 2: iterate accessing a single column (real_pos cached once) +# ------------------------------------------------------------------- +t0 = time() +for row in tabla: + _ = row["id"] +t1 = time() +print(f"[Test 2] Iter accessing 'id': {(t1 - t0)*1000:.3f} ms") + +# ------------------------------------------------------------------- +# Test 3: iterate accessing all columns (real_pos cached once per row) +# ------------------------------------------------------------------- +t0 = time() +for row in tabla: + _ = row["id"] + _ = row["score"] + _ = row["active"] +t1 = time() +print(f"[Test 3] Iter accessing 3 columns: {(t1 - t0)*1000:.3f} ms") + +# ------------------------------------------------------------------- +# Test 4: correctness — values match expected +# ------------------------------------------------------------------- +errors = 0 +for row in tabla: + if row["id"] != row._nrow: + errors += 1 + if row["score"] != float(row._nrow % 100): + errors += 
1 + if row["active"] != (row._nrow % 2 == 0): + errors += 1 + +print(f"\n[Test 4] Correctness errors: {errors} (expected: 0)") + +# ------------------------------------------------------------------- +# Test 5: with holes (deleted rows) +# ------------------------------------------------------------------- +tabla2 = CTable(Row, new_data=data) +tabla2.delete(list(range(0, N, 2))) # delete even rows, keep odd ones + +print(f"\nTable with holes: {len(tabla2)} rows (expected: {N // 2})") + +t0 = time() +ids = [] +for row in tabla2: + ids.append(row["id"]) +t1 = time() + +expected_ids = [i for i in range(N) if i % 2 != 0] +ok = ids == expected_ids +print(f"[Test 5] Iter with holes ({N//2} rows): {(t1 - t0)*1000:.3f} ms | correctness: {ok}") + +# ------------------------------------------------------------------- +# Test 6: real_pos is cached correctly (not recomputed) +# ------------------------------------------------------------------- +row0 = next(iter(tabla)) +assert row0._real_pos is None, "real_pos should be None before first access" +_ = row0["id"] +assert row0._real_pos is not None, "real_pos should be cached after first access" +print(f"\n[Test 6] real_pos caching: OK (real_pos={row0._real_pos})") diff --git a/bench/ctable/iteration_column.py b/bench/ctable/iteration_column.py new file mode 100644 index 00000000..b1ac3703 --- /dev/null +++ b/bench/ctable/iteration_column.py @@ -0,0 +1,79 @@ +####################################################################### +# Copyright (c) 2019-present, Blosc Development Team +# All rights reserved. +# +# SPDX-License-Identifier: BSD-3-Clause +####################################################################### + +# Benchmark for comparing full column iteration strategies: +# 1. for val in ct["score"] — Python iterator via __iter__ +# 2. np.array(list(ct["score"])) — materialize via list then convert +# 3. 
ct["score"][0:N].to_numpy() — slice view + to_numpy() + +from dataclasses import dataclass +from time import time + +import numpy as np + +import blosc2 + + +@dataclass +class Row: + id: int = blosc2.field(blosc2.int64(ge=0)) + c_val: complex = blosc2.field(blosc2.complex128(), default=0j) + score: float = blosc2.field(blosc2.float64(ge=0, le=100), default=0.0) + active: bool = blosc2.field(blosc2.bool(), default=True) + + +N = 1_000_000 + +print(f"Column iteration benchmark | N = {N:,}\n") + +# Build CTable once +np_dtype = np.dtype([ + ("id", np.int64), + ("c_val", np.complex128), + ("score", np.float64), + ("active", np.bool_), +]) +DATA = np.array( + [ + (i, complex(i * 0.1, i * 0.01), 10.0 + (i % 100) * 0.4, i % 3 == 0) + for i in range(N) + ], + dtype=np_dtype, +) + +ct = blosc2.CTable(Row, expected_size=N) +ct.extend(DATA) + +print(f"CTable built with {len(ct):,} rows\n") +print("=" * 60) + +col = ct["score"] + +# 1. Python iterator +t0 = time() +for _val in col: + pass +t_iter = time() - t0 +print(f"for val in col: {t_iter:.4f} s") + +# 2. list() + np.array() +t0 = time() +arr = np.array(list(col)) +t_list = time() - t0 +print(f"np.array(list(col)): {t_list:.4f} s") + +# 3. slice view + to_numpy(), then iterate the resulting NumPy array +t0 = time() +arr = col[0:N].to_numpy() +for _val in arr: + pass +t_toarray = time() - t0 +print(f"col[0:N].to_numpy(): {t_toarray:.4f} s") + +print("=" * 60) +print(f"Speedup to_numpy vs iter: {t_iter / t_toarray:.2f}x") +print(f"Speedup to_numpy vs list: {t_list / t_toarray:.2f}x") diff --git a/bench/ctable/print.py b/bench/ctable/print.py new file mode 100644 index 00000000..6efb80bf --- /dev/null +++ b/bench/ctable/print.py @@ -0,0 +1,108 @@ +####################################################################### +# Copyright (c) 2019-present, Blosc Development Team +# All rights reserved. 
+# +# SPDX-License-Identifier: BSD-3-Clause +####################################################################### + +# Benchmark: iterative ingestion comparison — Pandas vs CTable +# Data source: randomly generated numpy structured array + +import time +from dataclasses import dataclass + +import numpy as np +import pandas as pd + +import blosc2 + + +@dataclass +class Row: + id: int = blosc2.field(blosc2.int64()) + name: str = blosc2.field(blosc2.string(max_length=9), default="") + score: float = blosc2.field(blosc2.float64(ge=0), default=0.0) + + +NAMES = ["benchmark", "alpha", "beta", "gamma", "delta", + "epsilon", "zeta", "eta", "theta", "iota"] + +N = 100_000 +rng = np.random.default_rng(42) + +np_dtype = np.dtype([("id", np.int64), ("name", " np.ndarray: + arr = np.empty(n, dtype=np_dtype) + arr["id"] = np.arange(n, dtype=np.int64) + arr["name"] = np.array([rng.choice(NAMES) for _ in range(n)], dtype=" DataFrame) ---") +data = make_data(N) + +t0 = time.perf_counter() +df = pd.DataFrame(data) +t_pandas = time.perf_counter() - t0 + +mem_pandas = df.memory_usage(deep=True).sum() / (1024 ** 2) +print(f"Total time: {t_pandas:.4f} s") +print(f"Memory (RAM): {mem_pandas:.2f} MB") + +print("\n--- PANDAS: First 10 rows ---") +t0_print = time.perf_counter() +print(df.head(10).to_string()) +t_print_pandas = time.perf_counter() - t0_print +print(f"\nPrint time: {t_print_pandas:.6f} s") + +# ───────────────────────────────────────────────────────────── +# 2. BLOSC2 CTable +# ───────────────────────────────────────────────────────────── +print("\n" + "=" * 60) +print("--- 2. 
BLOSC2 CTable (structured array -> extend) ---") +data = make_data(N) + +t0 = time.perf_counter() +ct = blosc2.CTable(Row, expected_size=N) +ct.extend(data) +t_blosc = time.perf_counter() - t0 + +fields = ct.col_names +mem_blosc_c = (sum(col.cbytes for col in ct._cols.values()) + ct._valid_rows.cbytes) / (1024 ** 2) +mem_blosc_uc = (sum(col.nbytes for col in ct._cols.values()) + ct._valid_rows.nbytes) / (1024 ** 2) + +print(f"Total time: {t_blosc:.4f} s") +print(f"Memory (uncompressed): {mem_blosc_uc:.2f} MB") +print(f"Memory (compressed): {mem_blosc_c:.2f} MB") + +print("\n--- BLOSC2: First 10 rows ---") +t0_print = time.perf_counter() +print(ct.head(10)) +t_print_blosc = time.perf_counter() - t0_print +print(f"\nPrint time: {t_print_blosc:.6f} s") + +# ───────────────────────────────────────────────────────────── +# SUMMARY +# ───────────────────────────────────────────────────────────── +print("\n" + "=" * 60) +print("--- SUMMARY ---") +speedup = t_pandas / t_blosc +direction = "faster" if t_blosc < t_pandas else "slower" + +print(f"{'METRIC':<30} {'Pandas':>12} {'Blosc2':>12}") +print("-" * 55) +print(f"{'Ingestion time (s)':<30} {t_pandas:>12.4f} {t_blosc:>12.4f}") +print(f"{'Memory (MB)':<30} {mem_pandas:>12.2f} {mem_blosc_c:>12.2f}") +print(f"{'Print time (s)':<30} {t_print_pandas:>12.6f} {t_print_blosc:>12.6f}") +print("-" * 55) +print(f"\nSpeedup: {speedup:.2f}x {direction}") +print(f"Compression ratio: {mem_blosc_uc / mem_blosc_c:.2f}x") +print(f"Blosc2 vs Pandas size: {mem_blosc_c / mem_pandas * 100:.1f}%") diff --git a/bench/ctable/row_acces.py b/bench/ctable/row_acces.py new file mode 100644 index 00000000..050d0309 --- /dev/null +++ b/bench/ctable/row_acces.py @@ -0,0 +1,62 @@ +####################################################################### +# Copyright (c) 2019-present, Blosc Development Team +# All rights reserved. 
+# +# SPDX-License-Identifier: BSD-3-Clause +####################################################################### + +# Benchmark for measuring row[int] access (full row via _RowIndexer), +# testing access at different positions across the array. + +from dataclasses import dataclass +from time import time + +import numpy as np + +import blosc2 + + +@dataclass +class Row: + id: int = blosc2.field(blosc2.int64(ge=0)) + c_val: complex = blosc2.field(blosc2.complex128(), default=0j) + score: float = blosc2.field(blosc2.float64(ge=0, le=100), default=0.0) + active: bool = blosc2.field(blosc2.bool(), default=True) + + +N = 1_000_000 +indices = [0, N // 4, N // 2, (3 * N) // 4, N - 1] + +print(f"row[int] access benchmark | N = {N:,}\n") + +# Build CTable once +np_dtype = np.dtype([ + ("id", np.int64), + ("c_val", np.complex128), + ("score", np.float64), + ("active", np.bool_), +]) +DATA = np.array( + [ + (i, complex(i * 0.1, i * 0.01), 10.0 + (i % 100) * 0.4, i % 3 == 0) + for i in range(N) + ], + dtype=np_dtype, +) + +ct = blosc2.CTable(Row, expected_size=N) +ct.extend(DATA) + +print(f"CTable built with {len(ct):,} rows\n") +print("=" * 60) +print(f"{'INDEX':<15} {'POSITION':>12} {'TIME (s)':>12}") +print("-" * 60) + +for idx in indices: + t0 = time() + row = ct.row[idx] + t_access = time() - t0 + position = f"{idx / N * 100:.0f}% into array" + print(f"{idx:<15,} {position:>12} {t_access:.6f}") + +print("-" * 60) diff --git a/bench/ctable/slice.py b/bench/ctable/slice.py new file mode 100644 index 00000000..a41c50a6 --- /dev/null +++ b/bench/ctable/slice.py @@ -0,0 +1,71 @@ +####################################################################### +# Copyright (c) 2019-present, Blosc Development Team +# All rights reserved. 
+# +# SPDX-License-Identifier: BSD-3-Clause +####################################################################### + +# Benchmark for measuring Column[slice] access with slices of different +# sizes and positions: small, large, and middle of the array. + +from dataclasses import dataclass +from time import time + +import numpy as np + +import blosc2 + + +@dataclass +class Row: + id: int = blosc2.field(blosc2.int64(ge=0)) + c_val: complex = blosc2.field(blosc2.complex128(), default=0j) + score: float = blosc2.field(blosc2.float64(ge=0, le=100), default=0.0) + active: bool = blosc2.field(blosc2.bool(), default=True) + + +N = 1_000_000 +slices = [ + ("small — start", slice(0, 100)), + ("small — middle", slice(N // 2, N // 2 + 100)), + ("small — end", slice(N - 100, N)), + ("large — start", slice(0, 100_000)), + ("large — middle", slice(N // 2 - 50_000, N // 2 + 50_000)), + ("large — end", slice(N - 100_000, N)), + ("full — all", slice(0, N)), +] + +print(f"Column[slice] access benchmark | N = {N:,}\n") + +# Build CTable once +np_dtype = np.dtype([ + ("id", np.int64), + ("c_val", np.complex128), + ("score", np.float64), + ("active", np.bool_), +]) +DATA = np.array( + [ + (i, complex(i * 0.1, i * 0.01), 10.0 + (i % 100) * 0.4, i % 3 == 0) + for i in range(N) + ], + dtype=np_dtype, +) + +ct = blosc2.CTable(Row, expected_size=N) +ct.extend(DATA) + +print(f"CTable built with {len(ct):,} rows\n") +print("=" * 65) +print(f"{'SLICE':<25} {'ROWS':>8} {'TIME (s)':>12}") +print("-" * 65) + +col = ct["score"] +for label, s in slices: + t0 = time() + val = col[s] + t_access = time() - t0 + n_rows = s.stop - s.start + print(f"{label:<25} {n_rows:>8,} {t_access:>12.6f}") + +print("-" * 65) diff --git a/bench/ctable/slice_steps.py b/bench/ctable/slice_steps.py new file mode 100644 index 00000000..0a3fb358 --- /dev/null +++ b/bench/ctable/slice_steps.py @@ -0,0 +1,61 @@ +####################################################################### +# Copyright (c) 2019-present, Blosc 
Development Team +# All rights reserved. +# +# SPDX-License-Identifier: BSD-3-Clause +####################################################################### + +# Benchmark for measuring Column[::step].to_array() with varying step sizes. + +from dataclasses import dataclass +from time import time + +import numpy as np + +import blosc2 + + +@dataclass +class Row: + id: int = blosc2.field(blosc2.int64(ge=0)) + c_val: complex = blosc2.field(blosc2.complex128(), default=0j) + score: float = blosc2.field(blosc2.float64(ge=0, le=100), default=0.0) + active: bool = blosc2.field(blosc2.bool(), default=True) + + +N = 1_000_000 +steps = [1, 2, 4, 8, 16, 100, 1000] + +print(f"Column[::step].to_array() benchmark | N = {N:,}\n") + +# Build CTable once +np_dtype = np.dtype([ + ("id", np.int64), + ("c_val", np.complex128), + ("score", np.float64), + ("active", np.bool_), +]) +DATA = np.array( + [ + (i, complex(i * 0.1, i * 0.01), 10.0 + (i % 100) * 0.4, i % 3 == 0) + for i in range(N) + ], + dtype=np_dtype, +) + +ct = blosc2.CTable(Row, expected_size=N) +ct.extend(DATA) + +print(f"CTable built with {len(ct):,} rows\n") +print("=" * 60) +print(f"{'STEP':<10} {'ROWS RETURNED':>15} {'TIME (s)':>12}") +print("-" * 60) + +col = ct["score"] +for step in steps: + t0 = time() + arr = col[::step].to_numpy() + t_total = time() - t0 + print(f"::{ step:<8} {len(arr):>15,} {t_total:>12.6f}") + +print("-" * 60) diff --git a/bench/ctable/slice_to_array.py b/bench/ctable/slice_to_array.py new file mode 100644 index 00000000..7c58080e --- /dev/null +++ b/bench/ctable/slice_to_array.py @@ -0,0 +1,71 @@ +####################################################################### +# Copyright (c) 2019-present, Blosc Development Team +# All rights reserved. 
+# +# SPDX-License-Identifier: BSD-3-Clause +####################################################################### + +# Benchmark for measuring Column[slice] + to_array() with slices of +# different sizes and positions: small, large, and middle of the array. + +from dataclasses import dataclass +from time import time + +import numpy as np + +import blosc2 + + +@dataclass +class Row: + id: int = blosc2.field(blosc2.int64(ge=0)) + c_val: complex = blosc2.field(blosc2.complex128(), default=0j) + score: float = blosc2.field(blosc2.float64(ge=0, le=100), default=0.0) + active: bool = blosc2.field(blosc2.bool(), default=True) + + +N = 1_000_000 +slices = [ + ("small — start", slice(0, 100)), + ("small — middle", slice(N // 2, N // 2 + 100)), + ("small — end", slice(N - 100, N)), + ("large — start", slice(0, 100_000)), + ("large — middle", slice(N // 2 - 50_000, N // 2 + 50_000)), + ("large — end", slice(N - 100_000, N)), + ("full — all", slice(0, N)), +] + +print(f"Column[slice].to_array() benchmark | N = {N:,}\n") + +# Build CTable once +np_dtype = np.dtype([ + ("id", np.int64), + ("c_val", np.complex128), + ("score", np.float64), + ("active", np.bool_), +]) +DATA = np.array( + [ + (i, complex(i * 0.1, i * 0.01), 10.0 + (i % 100) * 0.4, i % 3 == 0) + for i in range(N) + ], + dtype=np_dtype, +) + +ct = blosc2.CTable(Row, expected_size=N) +ct.extend(DATA) + +print(f"CTable built with {len(ct):,} rows\n") +print("=" * 65) +print(f"{'SLICE':<25} {'ROWS':>8} {'TIME (s)':>12}") +print("-" * 65) + +col = ct["score"] +for label, s in slices: + t0 = time() + arr = col[s].to_numpy() + t_total = time() - t0 + n_rows = s.stop - s.start + print(f"{label:<25} {n_rows:>8,} {t_total:>12.6f}") + +print("-" * 65) diff --git a/bench/ctable/speed_iter.py b/bench/ctable/speed_iter.py new file mode 100644 index 00000000..10afdc36 --- /dev/null +++ b/bench/ctable/speed_iter.py @@ -0,0 +1,40 @@ +####################################################################### +# Copyright (c) 
2019-present, Blosc Development Team +# All rights reserved. +# +# SPDX-License-Identifier: BSD-3-Clause +####################################################################### + +from dataclasses import dataclass +from time import time + +import blosc2 +from blosc2 import CTable + + +@dataclass +class Row: + id: int = blosc2.field(blosc2.int64(ge=0)) + score: float = blosc2.field(blosc2.float64(ge=0, le=100)) + active: bool = blosc2.field(blosc2.bool(), default=True) + + +N = 1_000_000 # full-size run; lower N for a quick smoke test + +data = [(i, float(i % 100), i % 2 == 0) for i in range(N)] +tabla = CTable(Row, new_data=data) + +print(f"Table created with {len(tabla)} rows\n") + +# ------------------------------------------------------------------- +# Test 1: iterate all rows, reading 'score' only once every 10,000 rows +# ------------------------------------------------------------------- +i = 0 +t0 = time() +for row in tabla: + i = (i + 1) % 10000 + if i == 0: + _ = row["score"] + +t1 = time() +print(f"[Test 1] Iter reading 'score' every 10,000 rows: {(t1 - t0):.3f} s") diff --git a/bench/ctable/where_chain.py b/bench/ctable/where_chain.py new file mode 100644 index 00000000..d2a6092d --- /dev/null +++ b/bench/ctable/where_chain.py @@ -0,0 +1,73 @@ +####################################################################### +# Copyright (c) 2019-present, Blosc Development Team +# All rights reserved. +# +# SPDX-License-Identifier: BSD-3-Clause +####################################################################### + +# Benchmark for comparing chained where() calls vs a single combined filter. 
+# Filters: 250k < id < 750k, active == False, 25.0 < score < 75.0 + +from dataclasses import dataclass +from time import time + +import numpy as np + +import blosc2 + + +@dataclass +class Row: + id: int = blosc2.field(blosc2.int64(ge=0)) + c_val: complex = blosc2.field(blosc2.complex128(), default=0j) + score: float = blosc2.field(blosc2.float64(ge=0, le=100), default=0.0) + active: bool = blosc2.field(blosc2.bool(), default=True) + + +N = 1_000_000 + +print(f"where() chained vs combined benchmark | N = {N:,}") + +# Build CTable once +np_dtype = np.dtype([ + ("id", np.int64), + ("c_val", np.complex128), + ("score", np.float64), + ("active", np.bool_), +]) +DATA = np.array( + [ + (i, complex(i * 0.1, i * 0.01), 10.0 + (i % 100) * 0.4, i % 3 == 0) + for i in range(N) + ], + dtype=np_dtype, +) + +ct = blosc2.CTable(Row, expected_size=N) +ct.extend(DATA) + +print(f"CTable built with {len(ct):,} rows\n") +print("=" * 70) + +# 1. Five chained where() calls (one per condition) +t0 = time() +r1 = ct.where(ct["id"] > 250_000) +r2 = r1.where(ct["id"] < 750_000) +r3 = r2.where(ct["score"] > 25.0) +r4 = r3.where(ct["score"] < 75.0) +r5 = r4.where(ct["active"] == False) # noqa: E712 - element-wise test; `not col` would collapse the column to one bool +t_chained = time() - t0 +print(f"Chained where() (5 calls): {t_chained:.6f} s rows: {len(r5):,}") + +# 2. 
Single combined where() call +t0 = time() +result = ct.where( + (ct["id"] > 250_000) & (ct["id"] < 750_000) & + (ct["active"] == False) & # noqa: E712 - element-wise test, see above + (ct["score"] > 25.0) & (ct["score"] < 75.0) +) +t_combined = time() - t0 +print(f"Combined where() (1 call): {t_combined:.6f} s rows: {len(result):,}") + +print("=" * 70) +print(f"Speedup combined vs chained: {t_chained / t_combined:.2f}x") diff --git a/bench/ctable/where_selective.py b/bench/ctable/where_selective.py new file mode 100644 index 00000000..c0ba6f78 --- /dev/null +++ b/bench/ctable/where_selective.py @@ -0,0 +1,62 @@ +####################################################################### +# Copyright (c) 2019-present, Blosc Development Team +# All rights reserved. 
+# +# SPDX-License-Identifier: BSD-3-Clause +####################################################################### + +# Benchmark for measuring where() performance with varying selectivity. +# Filter: id < threshold; thresholds span ~0.001%, 1%, 10%, 25%, 50%, 75%, 90%, ~100% and 100% of the N rows + +from dataclasses import dataclass +from time import time + +import numpy as np + +import blosc2 + + +@dataclass +class Row: + id: int = blosc2.field(blosc2.int64(ge=0)) + c_val: complex = blosc2.field(blosc2.complex128(), default=0j) + score: float = blosc2.field(blosc2.float64(ge=0, le=100), default=0.0) + active: bool = blosc2.field(blosc2.bool(), default=True) + + +N = 1_000_000 +thresholds = [10,10_000, 100_000,250_000, 500_000,750_000 ,900_000, 999_990, 1_000_000] + +print(f"where() selectivity benchmark | N = {N:,}") + +# Build CTable once +np_dtype = np.dtype([ + ("id", np.int64), + ("c_val", np.complex128), + ("score", np.float64), + ("active", np.bool_), +]) +DATA = np.array( + [ + (i, complex(i * 0.1, i * 0.01), 10.0 + (i % 100) * 0.4, i % 3 == 0) + for i in range(N) + ], + dtype=np_dtype, +) + +ct = blosc2.CTable(Row, expected_size=N) +ct.extend(DATA) + +print(f"CTable built with {len(ct):,} rows\n") +print("=" * 70) +print(f"{'THRESHOLD':<15} {'ROWS RETURNED':>15} {'SELECTIVITY':>13} {'TIME (s)':>12}") +print("-" * 70) + +for threshold in thresholds: + t0 = time() + result = ct.where(ct["id"] < threshold) + t_where = time() - t0 + selectivity = threshold / N * 100 + print(f"id < {threshold:<10,} {len(result):>15,} {selectivity:>12.1f}% {t_where:>12.6f}") + +print("-" * 70) diff --git a/doc/getting_started/tutorials.rst b/doc/getting_started/tutorials.rst index d2786d12..c446589b 100644 --- a/doc/getting_started/tutorials.rst +++ b/doc/getting_started/tutorials.rst @@ -20,3 +20,4 @@ Tutorials tutorials/12.batcharray tutorials/13.containers tutorials/14.indexing-arrays + tutorials/15.indexing-ctables diff --git 
a/doc/getting_started/tutorials/15.indexing-ctables.ipynb 
b/doc/getting_started/tutorials/15.indexing-ctables.ipynb new file mode 100644 index 00000000..6c8ea4da --- /dev/null +++ b/doc/getting_started/tutorials/15.indexing-ctables.ipynb @@ -0,0 +1,472 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "44fdf4b9", + "metadata": {}, + "source": [ + "# Indexing CTables\n", + "\n", + "CTable supports **persistent, table-owned indexes** that speed up `where()` queries on numeric columns. \n", + "An index maps sorted-value ranges to the chunk positions that contain matching rows, allowing Blosc2 to skip large parts of the table without reading every row.\n", + "\n", + "This tutorial covers:\n", + "\n", + "1. Creating an index on a CTable column\n", + "2. Querying with an index (automatic)\n", + "3. Stale detection and automatic scan fallback\n", + "4. Rebuilding and dropping indexes\n", + "5. Persistent tables: indexes survive close/reopen\n", + "6. Views and indexes\n" + ] + }, + { + "cell_type": "markdown", + "id": "da26cc61", + "metadata": {}, + "source": [ + "## Setup\n", + "\n", + "We will use a simple measurement table with three numeric columns.\n" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "id": "b23746ca", + "metadata": { + "ExecuteTime": { + "end_time": "2026-04-15T12:29:02.229246Z", + "start_time": "2026-04-15T12:29:00.966071Z" + } + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Table: 500 rows\n" + ] + } + ], + "source": [ + "import dataclasses\n", + "\n", + "import numpy as np\n", + "\n", + "import blosc2\n", + "\n", + "\n", + "@dataclasses.dataclass\n", + "class Measurement:\n", + " sensor_id: int = blosc2.field(blosc2.int32())\n", + " temperature: float = blosc2.field(blosc2.float64())\n", + " region: int = blosc2.field(blosc2.int32())\n", + "\n", + "\n", + "N = 500\n", + "t = blosc2.CTable(Measurement)\n", + "rng = np.random.default_rng(42)\n", + "for i in range(N):\n", + " t.append([i, 15.0 + rng.random() * 25, int(rng.integers(0, 4))])\n", + 
"\n", + "print(f\"Table: {N} rows\")" + ] + }, + { + "cell_type": "markdown", + "id": "2be47ee8", + "metadata": {}, + "source": [ + "## Creating an index\n", + "\n", + "Call `create_index(col_name)` to build a bucket index on a column. \n", + "The returned `CTableIndex` handle shows the column name, kind, and whether the index is stale.\n" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "2ac1f281", + "metadata": { + "ExecuteTime": { + "end_time": "2026-04-15T12:29:12.081628Z", + "start_time": "2026-04-15T12:29:12.033154Z" + } + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "stale? False\n", + "all indexes: []\n" + ] + } + ], + "source": [ + "idx = t.create_index(\"sensor_id\")\n", + "print(idx)\n", + "print(\"stale?\", idx.stale)\n", + "print(\"all indexes:\", t.indexes)" + ] + }, + { + "cell_type": "markdown", + "id": "792416cc", + "metadata": {}, + "source": [ + "## Querying with an index\n", + "\n", + "`where()` automatically uses an available (non-stale) index when the filter expression matches the indexed column. 
\n", + "The result is identical to a full scan.\n" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "dcc2dc87", + "metadata": { + "ExecuteTime": { + "end_time": "2026-04-15T12:29:18.333378Z", + "start_time": "2026-04-15T12:29:18.283229Z" + } + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Rows sensor_id > 450: 49\n", + "sensor_ids: [451, 452, 453, 454, 455, 456, 457, 458, 459, 460, 461, 462, 463, 464, 465, 466, 467, 468, 469, 470, 471, 472, 473, 474, 475, 476, 477, 478, 479, 480, 481, 482, 483, 484, 485, 486, 487, 488, 489, 490, 491, 492, 493, 494, 495, 496, 497, 498, 499]\n" + ] + } + ], + "source": [ + "result = t.where(t[\"sensor_id\"] > 450)\n", + "print(\"Rows sensor_id > 450:\", len(result))\n", + "print(\"sensor_ids:\", sorted(int(v) for v in result[\"sensor_id\"].to_numpy()))" + ] + }, + { + "cell_type": "markdown", + "id": "8b3b9725", + "metadata": {}, + "source": [ + "## Stale detection\n", + "\n", + "Any mutation — `append`, `extend`, `Column.__setitem__`, `Column.assign`, `sort_by`, `compact` —\n", + "marks all indexes **stale**. \n", + "When an index is stale, `where()` falls back to a full scan automatically so results are always correct.\n" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "b0132381", + "metadata": { + "ExecuteTime": { + "end_time": "2026-04-15T12:29:24.936335Z", + "start_time": "2026-04-15T12:29:24.884590Z" + } + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "stale after append? 
True\n", + "Found row: 1\n" + ] + } + ], + "source": [ + "t.append([9999, 30.0, 1]) # any mutation marks indexes stale\n", + "\n", + "idx = t.index(\"sensor_id\")\n", + "print(\"stale after append?\", idx.stale)\n", + "\n", + "# Query still works — scan fallback\n", + "result_stale = t.where(t[\"sensor_id\"] == 9999)\n", + "print(\"Found row:\", len(result_stale))" + ] + }, + { + "cell_type": "markdown", + "id": "110f792f", + "metadata": {}, + "source": [ + "Note: `delete()` only bumps the *visibility epoch* (it does not change column values) so it does **not** mark indexes stale.\n", + "\n", + "## Rebuilding an index\n", + "\n", + "`rebuild_index(col_name)` drops the old index and builds a fresh one from the current table state.\n" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "dc4d2897", + "metadata": { + "ExecuteTime": { + "end_time": "2026-04-15T12:29:31.023914Z", + "start_time": "2026-04-15T12:29:30.970979Z" + } + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "stale after rebuild? 
False\n", + "Found row via rebuilt index: 1\n" + ] + } + ], + "source": [ + "idx = t.rebuild_index(\"sensor_id\")\n", + "print(\"stale after rebuild?\", idx.stale)\n", + "\n", + "result_rebuilt = t.where(t[\"sensor_id\"] == 9999)\n", + "print(\"Found row via rebuilt index:\", len(result_rebuilt))" + ] + }, + { + "cell_type": "markdown", + "id": "38363aa3", + "metadata": {}, + "source": [ + "## Dropping an index\n", + "\n", + "`drop_index(col_name)` removes the index from the catalog and deletes any sidecar files (for persistent tables).\n" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "e1583b4f", + "metadata": { + "ExecuteTime": { + "end_time": "2026-04-15T12:29:34.666155Z", + "start_time": "2026-04-15T12:29:34.628535Z" + } + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Indexes after drop: []\n" + ] + } + ], + "source": [ + "t.drop_index(\"sensor_id\")\n", + "print(\"Indexes after drop:\", t.indexes)" + ] + }, + { + "cell_type": "markdown", + "id": "aab1e6ec", + "metadata": {}, + "source": [ + "## Persistent tables\n", + "\n", + "Indexes on persistent tables (tables with a `urlpath`) survive close and reopen because the catalog is stored inside the table's own `/_meta` sidecar and the index data lives under `/_indexes/<col_name>/`.\n" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "id": "85d42133", + "metadata": { + "ExecuteTime": { + "end_time": "2026-04-15T12:29:41.237153Z", + "start_time": "2026-04-15T12:29:39.916230Z" + } + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Created: \n", + "Sidecar files: 7\n", + "Rows > 280 (before close): 19\n" + ] + } + ], + "source": [ + "import shutil\n", + "import tempfile\n", + "from pathlib import Path\n", + "\n", + "tmpdir = Path(tempfile.mkdtemp())\n", + "path = str(tmpdir / \"sensors.b2d\")\n", + "\n", + "# Create a persistent table and build an index\n", + "pt = blosc2.CTable(Measurement, urlpath=path, mode=\"w\")\n", 
+ "rng2 = np.random.default_rng(0)\n", + "for i in range(300):\n", + " pt.append([i, 15.0 + rng2.random() * 25, int(rng2.integers(0, 4))])\n", + "\n", + "pidx = pt.create_index(\"sensor_id\")\n", + "print(\"Created:\", pidx)\n", + "\n", + "# Sidecar files\n", + "index_dir = Path(path) / \"_indexes\" / \"sensor_id\"\n", + "print(\"Sidecar files:\", len(list(index_dir.glob(\"**/*.b2nd\"))))\n", + "\n", + "# Query before close\n", + "r1 = pt.where(pt[\"sensor_id\"] > 280)\n", + "print(\"Rows > 280 (before close):\", len(r1))" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "id": "149ddba5", + "metadata": { + "ExecuteTime": { + "end_time": "2026-04-15T12:29:45.139325Z", + "start_time": "2026-04-15T12:29:45.095849Z" + } + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Indexes after reopen: []\n", + "Rows > 280 (after reopen): 19\n", + "Results match ✓\n" + ] + } + ], + "source": [ + "# Close and reopen — catalog is preserved\n", + "del pt\n", + "pt2 = blosc2.open(path)\n", + "\n", + "print(\"Indexes after reopen:\", pt2.indexes)\n", + "\n", + "r2 = pt2.where(pt2[\"sensor_id\"] > 280)\n", + "print(\"Rows > 280 (after reopen):\", len(r2))\n", + "\n", + "ids1 = sorted(int(v) for v in r1[\"sensor_id\"].to_numpy())\n", + "ids2 = sorted(int(v) for v in r2[\"sensor_id\"].to_numpy())\n", + "assert ids1 == ids2, \"Results differ!\"\n", + "print(\"Results match ✓\")\n", + "\n", + "shutil.rmtree(tmpdir, ignore_errors=True)" + ] + }, + { + "cell_type": "markdown", + "id": "2743e784", + "metadata": {}, + "source": [ + "## Views and indexes\n", + "\n", + "A *view* (the result of `where()`) is a filtered window into the underlying table. 
\n", + "Index management methods (`create_index`, `drop_index`, `rebuild_index`, `compact_index`) are **not** available on views — they raise `ValueError`.\n" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "id": "83db418b", + "metadata": { + "ExecuteTime": { + "end_time": "2026-04-15T12:29:51.038611Z", + "start_time": "2026-04-15T12:29:50.906410Z" + } + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "View type: CTable\n", + "create_index on view: Cannot create an index on a view.\n", + "drop_index on view: Cannot drop an index from a view.\n" + ] + } + ], + "source": [ + "t2 = blosc2.CTable(Measurement)\n", + "for i in range(50):\n", + " t2.append([i, 20.0, i % 3])\n", + "t2.create_index(\"sensor_id\")\n", + "\n", + "view = t2.where(t2[\"sensor_id\"] > 10)\n", + "print(\"View type:\", type(view).__name__)\n", + "\n", + "try:\n", + " view.create_index(\"sensor_id\")\n", + "except ValueError as e:\n", + " print(\"create_index on view:\", e)\n", + "\n", + "try:\n", + " view.drop_index(\"sensor_id\")\n", + "except ValueError as e:\n", + " print(\"drop_index on view:\", e)" + ] + }, + { + "cell_type": "markdown", + "id": "f5e87579", + "metadata": {}, + "source": [ + "## Summary\n", + "\n", + "| Operation | Method |\n", + "|---|---|\n", + "| Build index | `t.create_index(col)` |\n", + "| Query (auto) | `t.where(expr)` — uses index when fresh |\n", + "| Check if stale | `t.index(col).stale` |\n", + "| Rebuild | `t.rebuild_index(col)` |\n", + "| Drop | `t.drop_index(col)` |\n", + "| Compact (full indexes) | `t.compact_index(col)` |\n", + "| List all | `t.indexes` |\n", + "\n", + "Key behaviours:\n", + "\n", + "- **Mutations** (`append`, `extend`, `setitem`, `assign`, `sort_by`, `compact`) mark indexes stale.\n", + "- **Stale indexes** trigger automatic scan fallback — no user intervention needed.\n", + "- **Persistent indexes** survive table close and reopen.\n", + "- **Views** cannot own indexes; only root tables can.\n" 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "363827fec805190a", + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.13.5" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/doc/reference/classes.rst b/doc/reference/classes.rst index f33ba4e2..39e22b6a 100644 --- a/doc/reference/classes.rst +++ b/doc/reference/classes.rst @@ -7,6 +7,8 @@ Main Classes ------------ .. autosummary:: + CTable + Column NDArray NDField LazyArray @@ -28,6 +30,7 @@ Main Classes .. toctree:: :maxdepth: 1 + ctable ndarray index_class lazyarray diff --git a/doc/reference/ctable.rst b/doc/reference/ctable.rst new file mode 100644 index 00000000..2d6b3273 --- /dev/null +++ b/doc/reference/ctable.rst @@ -0,0 +1,339 @@ +.. _CTable: + +CTable +====== + +A columnar compressed table backed by one :class:`~blosc2.NDArray` per column. +Each column is stored, compressed, and queried independently; rows are never +materialised in their entirety unless you explicitly call :meth:`~blosc2.CTable.to_arrow` +or iterate with :meth:`~blosc2.CTable.__iter__`. + +.. currentmodule:: blosc2 + +.. autoclass:: CTable + :members: + :member-order: groupwise + + .. rubric:: Special methods + + .. autosummary:: + + CTable.__len__ + CTable.__iter__ + CTable.__getitem__ + CTable.__repr__ + CTable.__str__ + + .. automethod:: __len__ + .. automethod:: __iter__ + .. automethod:: __getitem__ + .. automethod:: __repr__ + .. automethod:: __str__ + + +Construction +------------ + +.. autosummary:: + + CTable.__init__ + CTable.open + CTable.load + CTable.from_arrow + CTable.from_csv + +.. automethod:: CTable.__init__ +.. 
automethod:: CTable.open +.. automethod:: CTable.load +.. automethod:: CTable.from_arrow +.. automethod:: CTable.from_csv + + +Attributes +---------- + +.. autosummary:: + + CTable.col_names + CTable.nrows + CTable.ncols + CTable.cbytes + CTable.nbytes + CTable.schema + CTable.base + +.. autoproperty:: CTable.nrows +.. autoproperty:: CTable.ncols +.. autoproperty:: CTable.cbytes +.. autoproperty:: CTable.nbytes +.. autoproperty:: CTable.schema + + +Inserting data +-------------- + +.. autosummary:: + + CTable.append + CTable.extend + +.. automethod:: CTable.append +.. automethod:: CTable.extend + + +Querying +-------- + +.. autosummary:: + + CTable.where + CTable.select + CTable.head + CTable.tail + CTable.sample + CTable.sort_by + +.. automethod:: CTable.where +.. automethod:: CTable.select +.. automethod:: CTable.head +.. automethod:: CTable.tail +.. automethod:: CTable.sample +.. automethod:: CTable.sort_by + + +Aggregates & statistics +----------------------- + +.. autosummary:: + + CTable.describe + CTable.cov + +.. automethod:: CTable.describe +.. automethod:: CTable.cov + + +Mutations +--------- + +.. autosummary:: + + CTable.delete + CTable.compact + CTable.add_column + CTable.drop_column + CTable.rename_column + +.. automethod:: CTable.delete +.. automethod:: CTable.compact +.. automethod:: CTable.add_column +.. automethod:: CTable.drop_column +.. automethod:: CTable.rename_column + + +Persistence +----------- + +.. autosummary:: + + CTable.save + CTable.to_csv + CTable.to_arrow + +.. automethod:: CTable.save +.. automethod:: CTable.to_csv +.. automethod:: CTable.to_arrow + + +Inspection +---------- + +.. autosummary:: + + CTable.info + CTable.schema_dict + CTable.column_schema + +.. automethod:: CTable.info +.. automethod:: CTable.schema_dict +.. automethod:: CTable.column_schema + + +---- + +.. _Column: + +Column +====== + +A lazy column accessor returned by ``table["col_name"]`` or ``table.col_name``. 
+All index operations and aggregates apply the table's tombstone mask +(``_valid_rows``) so deleted rows are silently excluded. + +.. autoclass:: Column + :members: + :member-order: groupwise + + .. rubric:: Special methods + + .. autosummary:: + + Column.__len__ + Column.__iter__ + Column.__getitem__ + Column.__setitem__ + + .. automethod:: __len__ + .. automethod:: __iter__ + .. automethod:: __getitem__ + .. automethod:: __setitem__ + + +Attributes +---------- + +.. autosummary:: + + Column.dtype + Column.null_value + +.. autoproperty:: Column.dtype +.. autoproperty:: Column.null_value + + +Data access +----------- + +.. autosummary:: + + Column.to_numpy + Column.iter_chunks + Column.assign + +.. automethod:: Column.to_numpy +.. automethod:: Column.iter_chunks +.. automethod:: Column.assign + + +Nullable helpers +---------------- + +.. autosummary:: + + Column.is_null + Column.notnull + Column.null_count + +.. automethod:: Column.is_null +.. automethod:: Column.notnull +.. automethod:: Column.null_count + + +Unique values +------------- + +.. autosummary:: + + Column.unique + Column.value_counts + +.. automethod:: Column.unique +.. automethod:: Column.value_counts + + +Aggregates +---------- + +Null sentinel values are automatically excluded from all aggregates. + +.. autosummary:: + + Column.sum + Column.min + Column.max + Column.mean + Column.std + Column.any + Column.all + +.. automethod:: Column.sum +.. automethod:: Column.min +.. automethod:: Column.max +.. automethod:: Column.mean +.. automethod:: Column.std +.. automethod:: Column.any +.. automethod:: Column.all + + +---- + +.. _SchemaSpecs: + +Schema Specs +============ + +Schema specs are passed to :func:`field` to declare a column's type, +storage constraints, and optional null sentinel. They are also +available directly in the ``blosc2`` namespace (e.g. ``blosc2.int64``). + +.. currentmodule:: blosc2 + +.. autofunction:: field + +Numeric +------- + +.. 
autosummary:: + + int8 + int16 + int32 + int64 + uint8 + uint16 + uint32 + uint64 + float32 + float64 + +.. autoclass:: int8 +.. autoclass:: int16 +.. autoclass:: int32 +.. autoclass:: int64 +.. autoclass:: uint8 +.. autoclass:: uint16 +.. autoclass:: uint32 +.. autoclass:: uint64 +.. autoclass:: float32 +.. autoclass:: float64 + +Complex +------- + +.. autosummary:: + + complex64 + complex128 + +.. autoclass:: complex64 +.. autoclass:: complex128 + +Boolean +------- + +.. autosummary:: + + bool + +.. autoclass:: bool + +Text & binary +------------- + +.. autosummary:: + + string + bytes + +.. autoclass:: string +.. autoclass:: bytes diff --git a/examples/ctable/aggregates.py b/examples/ctable/aggregates.py new file mode 100644 index 00000000..9ab7debb --- /dev/null +++ b/examples/ctable/aggregates.py @@ -0,0 +1,74 @@ +####################################################################### +# Copyright (c) 2019-present, Blosc Development Team +# All rights reserved. +# +# SPDX-License-Identifier: BSD-3-Clause +####################################################################### + +# Column aggregates: sum, min, max, mean, std, unique, value_counts, +# describe, covariance matrix, and null-aware aggregation. 
+ +from dataclasses import dataclass + +import numpy as np + +import blosc2 + + +@dataclass +class Reading: + sensor_id: int = blosc2.field(blosc2.int32(ge=0, le=9)) + # null_value=-999 means "sensor offline" — excluded from aggregates + temperature: float = blosc2.field(blosc2.float64(ge=-50.0, le=60.0, null_value=-999.0), default=-999.0) + humidity: float = blosc2.field(blosc2.float64(ge=0.0, le=100.0), default=50.0) + alert: bool = blosc2.field(blosc2.bool(), default=False) + + +rng = np.random.default_rng(42) +N = 500 + +station_ids = rng.integers(0, 10, size=N).astype(np.int32) +temperatures = rng.normal(20.0, 8.0, size=N).clip(-50, 60).astype(np.float64) +humidities = rng.uniform(30.0, 90.0, size=N).astype(np.float64) +alerts = rng.random(N) < 0.05 + +# Simulate ~5 % of sensors being offline (temperature = null sentinel) +offline = rng.random(N) < 0.05 +temperatures[offline] = -999.0 + +data = list( + zip(station_ids.tolist(), temperatures.tolist(), humidities.tolist(), alerts.tolist(), strict=False) +) + +t = blosc2.CTable(Reading, new_data=data) +print(f"Table: {len(t)} rows ({t['temperature'].null_count()} offline sensors)\n") + +# -- per-column aggregates (null sentinels are skipped automatically) -------- +temp = t["temperature"] +print(f"temperature null : {temp.null_count()} offline readings") +print(f"temperature sum : {temp.sum():.2f}") +print(f"temperature mean : {temp.mean():.2f}") +print(f"temperature std : {temp.std():.2f}") +print(f"temperature min : {temp.min():.2f}") +print(f"temperature max : {temp.max():.2f}") + +print(f"\nalert any : {t['alert'].any()}") +print(f"alert all : {t['alert'].all()}") + +# -- unique / value_counts -------------------------------------------------- +print(f"\nsensor_id unique values : {t['sensor_id'].unique()}") +print(f"sensor_id value_counts : {t['sensor_id'].value_counts()}") + +# -- describe(): per-column summary printed to stdout ----------------------- +print() +t.describe() + +# -- cov(): covariance matrix 
of numeric columns ---------------------------- +numeric = t.select(["sensor_id", "temperature", "humidity"]) +cov = numeric.cov() +labels = ["sensor_id", "temperature", "humidity"] +col_w = 14 +print("\nCovariance matrix:") +print(" " * 14 + "".join(f"{lbl:>{col_w}}" for lbl in labels)) +for i, row_label in enumerate(labels): + print(f"{row_label:<14}" + "".join(f"{cov[i, j]:>{col_w}.4f}" for j in range(3))) diff --git a/examples/ctable/arrow_interop.py b/examples/ctable/arrow_interop.py new file mode 100644 index 00000000..0139e0ef --- /dev/null +++ b/examples/ctable/arrow_interop.py @@ -0,0 +1,82 @@ +####################################################################### +# Copyright (c) 2019-present, Blosc Development Team +# All rights reserved. +# +# SPDX-License-Identifier: BSD-3-Clause +####################################################################### + +# Arrow interop: CTable ↔ pyarrow.Table, and pandas round-trip via Arrow. +# +# Requires: pip install pyarrow pandas + +from dataclasses import dataclass + +import pyarrow as pa + +import blosc2 + + +@dataclass +class Stock: + ticker: str = blosc2.field(blosc2.string(max_length=8), default="") + open: float = blosc2.field(blosc2.float64(ge=0), default=0.0) + close: float = blosc2.field(blosc2.float64(ge=0), default=0.0) + volume: int = blosc2.field(blosc2.int64(ge=0), default=0) + + +data = [ + ("AAPL", 182.5, 184.2, 58_000_000), + ("GOOG", 141.3, 140.8, 21_000_000), + ("MSFT", 378.9, 380.1, 19_000_000), + ("AMZN", 185.6, 187.3, 35_000_000), + ("NVDA", 875.4, 902.1, 42_000_000), +] + +t = blosc2.CTable(Stock, new_data=data) +print("CTable:") +print(t) + +# -- to_arrow() ------------------------------------------------------------- +at = t.to_arrow() +print(f"Arrow table: {len(at)} rows, schema={at.schema}\n") + +# -- from_arrow(): schema is inferred from Arrow types --------------------- +at2 = pa.table( + { + "x": pa.array([1.0, 2.0, 3.0], type=pa.float32()), + "y": pa.array([10, 20, 30], 
type=pa.int32()), + "label": pa.array(["a", "bb", "ccc"], type=pa.string()), + } +) +t2 = blosc2.CTable.from_arrow(at2) +print("CTable from Arrow (inferred schema):") +print(t2) +print(f" label dtype: {t2['label'].dtype} (max_length inferred from data)") + +# -- pandas round-trip ------------------------------------------------------ +try: + import pandas as pd + + df_original = pd.DataFrame( + { + "ticker": ["TSLA", "META", "AMD"], + "open": [245.1, 502.3, 168.7], + "close": [248.5, 498.1, 171.2], + "volume": [80_000_000, 15_000_000, 28_000_000], + } + ) + print("\nOriginal DataFrame:") + print(df_original) + + # pandas → Arrow → CTable + t_from_pd = blosc2.CTable.from_arrow(pa.Table.from_pandas(df_original, preserve_index=False)) + print("\nCTable from pandas:") + print(t_from_pd) + + # CTable → Arrow → pandas + df_back = t_from_pd.to_arrow().to_pandas() + print("\nDataFrame round-tripped through CTable:") + print(df_back) + +except ImportError: + print("pandas not installed — skipping pandas round-trip demo.") diff --git a/examples/ctable/basics.py b/examples/ctable/basics.py new file mode 100644 index 00000000..0c402cb1 --- /dev/null +++ b/examples/ctable/basics.py @@ -0,0 +1,61 @@ +####################################################################### +# Copyright (c) 2019-present, Blosc Development Team +# All rights reserved. +# +# SPDX-License-Identifier: BSD-3-Clause +####################################################################### + +# CTable basics: creation, append, extend, head/tail, len. 
+ +from dataclasses import dataclass + +import numpy as np + +import blosc2 + + +@dataclass +class Row: + id: int = blosc2.field(blosc2.int64(ge=0)) + price: float = blosc2.field(blosc2.float64(ge=0), default=0.0) + active: bool = blosc2.field(blosc2.bool(), default=True) + + +# -- Create an empty table -------------------------------------------------- +t = blosc2.CTable(Row) +print(f"Empty table: {len(t)} rows") + +# -- append(): one row at a time -------------------------------------------- +t.append(Row(id=0, price=1.5, active=True)) +t.append(Row(id=1, price=2.3, active=False)) +print(f"After 2 appends: {len(t)} rows") + +# -- extend(): bulk load from a list of tuples ------------------------------ +bulk = [(i, float(i) * 0.5, i % 2 == 0) for i in range(2, 10)] +t.extend(bulk) +print(f"After extend: {len(t)} rows") + +# -- extend() from a structured numpy array --------------------------------- +arr = np.zeros(5, dtype=[("id", np.int64), ("price", np.float64), ("active", np.bool_)]) +arr["id"] = np.arange(10, 15) +arr["price"] = np.linspace(10.0, 14.0, 5) +arr["active"] = [True, False, True, False, True] +t.extend(arr) +print(f"After numpy extend: {len(t)} rows\n") + +# -- display: head / tail / full table -------------------------------------- +print("head(3):") +print(t.head(3)) + +print("tail(3):") +print(t.tail(3)) + +print("Full table:") +print(t) + +# -- basic properties ------------------------------------------------------- +print(f"nrows : {t.nrows}") +print(f"ncols : {t.ncols}") +print(f"columns: {t.col_names}") +print(f"cbytes : {t.cbytes:,} B (compressed)") +print(f"nbytes : {t.nbytes:,} B (uncompressed)") diff --git a/examples/ctable/csv_interop.py b/examples/ctable/csv_interop.py new file mode 100644 index 00000000..41a76fd9 --- /dev/null +++ b/examples/ctable/csv_interop.py @@ -0,0 +1,82 @@ +####################################################################### +# Copyright (c) 2019-present, Blosc Development Team +# All rights reserved. 
+# +# SPDX-License-Identifier: BSD-3-Clause +####################################################################### + +# CSV interop: generate a weather CSV, load it into a CTable, write it back. + +import csv +import shutil +import tempfile +from dataclasses import dataclass +from pathlib import Path + +import numpy as np + +import blosc2 + + +@dataclass +class WeatherReading: + station_id: int = blosc2.field(blosc2.int32(ge=0, le=9999)) + temperature: float = blosc2.field(blosc2.float32(ge=-80.0, le=60.0), default=20.0) + humidity: float = blosc2.field(blosc2.float32(ge=0.0, le=100.0), default=50.0) + wind_speed: float = blosc2.field(blosc2.float32(ge=0.0, le=200.0), default=0.0) + pressure: float = blosc2.field(blosc2.float32(ge=800.0, le=1100.0), default=1013.0) + day_of_year: int = blosc2.field(blosc2.int16(ge=1, le=365), default=1) + + +# -- Generate a weather CSV ------------------------------------------------- +rng = np.random.default_rng(42) +N = 1_000 + +station_ids = rng.integers(0, 100, size=N).tolist() +temperatures = [round(v, 2) for v in rng.normal(15.0, 12.0, N).clip(-80, 60).tolist()] +humidities = [round(v, 2) for v in rng.uniform(20.0, 95.0, N).tolist()] +wind_speeds = [round(v, 2) for v in rng.exponential(10.0, N).clip(0, 200).tolist()] +pressures = [round(v, 2) for v in rng.normal(1013.0, 8.0, N).clip(800, 1100).tolist()] +days = rng.integers(1, 366, size=N).tolist() + +rows = list(zip(station_ids, temperatures, humidities, wind_speeds, pressures, days, strict=False)) + +tmpdir = Path(tempfile.mkdtemp(prefix="blosc2_csv_")) +csv_in = tmpdir / "weather.csv" +csv_out = tmpdir / "weather_out.csv" + +# Write the CSV manually so the example is self-contained +with open(csv_in, "w", newline="") as f: + writer = csv.writer(f) + writer.writerow(["station_id", "temperature", "humidity", "wind_speed", "pressure", "day_of_year"]) + writer.writerows(rows) + +print(f"Generated {N} rows → {csv_in}") + +# -- from_csv(): load into CTable 
------------------------------------------- +t = blosc2.CTable.from_csv(str(csv_in), WeatherReading) +print(f"Loaded into CTable: {len(t)} rows") +print(t.head()) + +# -- apply a filter before exporting ---------------------------------------- +cold_days = t.where(t["temperature"] < 0) +print(f"\nCold days (temp < 0°C): {len(cold_days)} rows") +print(cold_days.head()) + +# -- to_csv(): write back to CSV -------------------------------------------- +t.to_csv(str(csv_out)) +print(f"\nFull table written to {csv_out}") + +# Verify round-trip row count +with open(csv_out) as f: + lines = f.readlines() +assert len(lines) == N + 1 # header + data rows +print(f"Round-trip verified: {len(lines) - 1} data rows in output CSV.") + +# -- TSV variant ------------------------------------------------------------ +tsv_out = tmpdir / "weather.tsv" +t.to_csv(str(tsv_out), sep="\t") +print(f"TSV variant written to {tsv_out}") + +shutil.rmtree(tmpdir) +print("Temporary files removed.") diff --git a/examples/ctable/ctable_tutorial.ipynb b/examples/ctable/ctable_tutorial.ipynb new file mode 100644 index 00000000..27e98b45 --- /dev/null +++ b/examples/ctable/ctable_tutorial.ipynb @@ -0,0 +1,2018 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "a8bf6f00", + "metadata": {}, + "source": [ + "# CTable Tutorial\n", + "\n", + "**CTable** is a columnar compressed table built on top of `blosc2.NDArray`. \n", + "It stores each column independently as a compressed array, giving you:\n", + "\n", + "- **Compression** — data lives compressed in RAM and on disk.\n", + "- **Schema** — every column has a declared type and optional constraints.\n", + "- **Speed** — bulk operations stay in NumPy; no row-by-row Python overhead.\n", + "- **Persistence** — tables can be saved to and loaded from disk transparently.\n", + "\n", + "This notebook walks through the full API, starting from the very basics and finishing with a real-world analysis of climate data across ten world cities." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 36, + "id": "a4073a3e", + "metadata": { + "ExecuteTime": { + "end_time": "2026-04-14T12:39:24.795843222Z", + "start_time": "2026-04-14T12:39:24.555798389Z" + }, + "execution": { + "iopub.execute_input": "2026-04-07T12:06:13.708034Z", + "iopub.status.busy": "2026-04-07T12:06:13.707898Z", + "iopub.status.idle": "2026-04-07T12:06:14.162620Z", + "shell.execute_reply": "2026-04-07T12:06:14.161981Z" + } + }, + "outputs": [], + "source": [ + "from dataclasses import dataclass\n", + "\n", + "import matplotlib.pyplot as plt\n", + "import numpy as np\n", + "\n", + "import blosc2\n", + "from blosc2 import CTable" + ] + }, + { + "cell_type": "markdown", + "id": "1637a7b2", + "metadata": {}, + "source": [ + "---\n", + "## Part 1 — The Basics\n", + "\n", + "### 1.1 Defining a schema\n", + "\n", + "Every CTable is typed. You define the schema with a plain Python `@dataclass`.\n", + "Each field gets a **spec** — a blosc2 type that carries the NumPy dtype and optional constraints." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 37, + "id": "c97f9123", + "metadata": { + "ExecuteTime": { + "end_time": "2026-04-14T12:39:25.040587457Z", + "start_time": "2026-04-14T12:39:24.896936343Z" + }, + "execution": { + "iopub.execute_input": "2026-04-07T12:06:14.164585Z", + "iopub.status.busy": "2026-04-07T12:06:14.164404Z", + "iopub.status.idle": "2026-04-07T12:06:14.168886Z", + "shell.execute_reply": "2026-04-07T12:06:14.168381Z" + } + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Empty table: 0 rows, columns: ['id', 'location', 'temperature', 'active']\n" + ] + } + ], + "source": [ + "@dataclass\n", + "class Sensor:\n", + " id: int = blosc2.field(blosc2.int32(ge=0))\n", + " location: str = blosc2.field(blosc2.string(max_length=16), default=\"\")\n", + " temperature: float = blosc2.field(blosc2.float64(ge=-80, le=60), default=20.0)\n", + " active: bool = blosc2.field(blosc2.bool(), default=True)\n", + "\n", + "\n", + "# Create an empty in-memory table\n", + "t = CTable(Sensor, expected_size=50)\n", + "print(f\"Empty table: {len(t)} rows, columns: {t.col_names}\")" + ] + }, + { + "cell_type": "markdown", + "id": "c27913d6", + "metadata": {}, + "source": [ + "### 1.2 Appending rows\n", + "\n", + "`append()` adds one row at a time. The row is validated against the schema before writing." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 38, + "id": "fdc64a5b", + "metadata": { + "ExecuteTime": { + "end_time": "2026-04-14T12:39:25.223196169Z", + "start_time": "2026-04-14T12:39:25.094103593Z" + }, + "execution": { + "iopub.execute_input": "2026-04-07T12:06:14.170432Z", + "iopub.status.busy": "2026-04-07T12:06:14.170315Z", + "iopub.status.idle": "2026-04-07T12:06:14.231985Z", + "shell.execute_reply": "2026-04-07T12:06:14.231362Z" + } + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + " id location temperature active \n", + " int32 35)\n", + "print(f\"Days above 35 °C: {len(very_hot)} ({len(very_hot) / len(climate) * 100:.1f}% of all readings)\")\n", + "print(very_hot.head(8))" + ] + }, + { + "cell_type": "code", + "execution_count": 46, + "id": "ba2d719b", + "metadata": { + "ExecuteTime": { + "end_time": "2026-04-14T12:39:26.233532577Z", + "start_time": "2026-04-14T12:39:26.164808834Z" + }, + "execution": { + "iopub.execute_input": "2026-04-07T12:06:14.342416Z", + "iopub.status.busy": "2026-04-07T12:06:14.342298Z", + "iopub.status.idle": "2026-04-07T12:06:14.358545Z", + "shell.execute_reply": "2026-04-07T12:06:14.357991Z" + } + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Moscow below freezing: 148 days out of 365\n", + " city day temperature humidity wind_speed pressure \n", + " 10} {'Min':>7} {'Max':>7} {'Std':>7}\")\n", + "print(\"-\" * 50)\n", + "for city in CITY_PROFILES:\n", + " v = climate.where(climate[\"city\"] == city)\n", + " col = v[\"temperature\"]\n", + " print(f\"{city:<12} {col.mean():>9.1f}° {col.min():>6.1f}° {col.max():>6.1f}° {col.std():>6.1f}°\")" + ] + }, + { + "cell_type": "markdown", + "id": "49dcbad7", + "metadata": {}, + "source": [ + "### 4.2 `describe()` — full summary in one call" + ] + }, + { + "cell_type": "code", + "execution_count": 51, + "id": "7254f3b1", + "metadata": { + "ExecuteTime": { + "end_time": "2026-04-14T12:39:26.710706134Z", + 
"start_time": "2026-04-14T12:39:26.628790601Z" + }, + "execution": { + "iopub.execute_input": "2026-04-07T12:06:14.520839Z", + "iopub.status.busy": "2026-04-07T12:06:14.520722Z", + "iopub.status.idle": "2026-04-07T12:06:14.542317Z", + "shell.execute_reply": "2026-04-07T12:06:14.541649Z" + } + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "CTable 3,650 rows × 4 cols\n", + "\n", + " temperature [float32]\n", + " count : 3,650\n", + " mean : 16.04\n", + " std : 10.72\n", + " min : -17.54\n", + " max : 39.75\n", + "\n", + " humidity [float32]\n", + " count : 3,650\n", + " mean : 63.48\n", + " std : 16.02\n", + " min : 8.894\n", + " max : 99.81\n", + "\n", + " wind_speed [float32]\n", + " count : 3,650\n", + " mean : 15.63\n", + " std : 4.874\n", + " min : 8.005\n", + " max : 47.48\n", + "\n", + " pressure [float32]\n", + " count : 3,650\n", + " mean : 1013\n", + " std : 5.328\n", + " min : 991.1\n", + " max : 1036\n", + "\n" + ] + } + ], + "source": [ + "# describe() on a select() view — only numeric columns\n", + "climate.select([\"temperature\", \"humidity\", \"wind_speed\", \"pressure\"]).describe()" + ] + }, + { + "cell_type": "markdown", + "id": "817dbc1f", + "metadata": {}, + "source": [ + "### 4.3 Covariance matrix\n", + "\n", + "`cov()` requires all columns to be numeric (int, float, or bool). \n", + "It returns a standard `numpy.ndarray`." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 52, + "id": "6d0dd2c1", + "metadata": { + "ExecuteTime": { + "end_time": "2026-04-14T12:39:26.797016145Z", + "start_time": "2026-04-14T12:39:26.714612755Z" + }, + "execution": { + "iopub.execute_input": "2026-04-07T12:06:14.543869Z", + "iopub.status.busy": "2026-04-07T12:06:14.543748Z", + "iopub.status.idle": "2026-04-07T12:06:14.559277Z", + "shell.execute_reply": "2026-04-07T12:06:14.558718Z" + } + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Covariance matrix (all cities, full year):\n", + " temp humidity wind pressure\n", + "temp 114.963 0.018 -3.523 -0.207\n", + "humidity 0.018 256.861 10.773 6.652\n", + "wind -3.523 10.773 23.760 -2.650\n", + "pressure -0.207 6.652 -2.650 28.394\n", + "\n", + "Correlation matrix:\n", + " temp humidity wind pressure\n", + "temp 1.000 0.000 -0.067 -0.004\n", + "humidity 0.000 1.000 0.138 0.078\n", + "wind -0.067 0.138 1.000 -0.102\n", + "pressure -0.004 0.078 -0.102 1.000\n" + ] + } + ], + "source": [ + "numeric = climate.select([\"temperature\", \"humidity\", \"wind_speed\", \"pressure\"])\n", + "cov = numeric.cov()\n", + "\n", + "labels = [\"temp\", \"humidity\", \"wind\", \"pressure\"]\n", + "col_w = 12\n", + "print(\"Covariance matrix (all cities, full year):\")\n", + "print(\" \" * 10 + \"\".join(f\"{lbl:>{col_w}}\" for lbl in labels))\n", + "for i, lbl in enumerate(labels):\n", + " print(f\"{lbl:<10}\" + \"\".join(f\"{cov[i, j]:>{col_w}.3f}\" for j in range(4)))\n", + "\n", + "# And the correlation matrix for easier interpretation\n", + "corr = np.corrcoef(\n", + " np.stack([numeric[c].to_numpy() for c in [\"temperature\", \"humidity\", \"wind_speed\", \"pressure\"]])\n", + ")\n", + "print(\"\\nCorrelation matrix:\")\n", + "print(\" \" * 10 + \"\".join(f\"{lbl:>{col_w}}\" for lbl in labels))\n", + "for i, lbl in enumerate(labels):\n", + " print(f\"{lbl:<10}\" + \"\".join(f\"{corr[i, j]:>{col_w}.3f}\" for j in range(4)))" + ] + 
}, + { + "cell_type": "markdown", + "id": "c10a694e", + "metadata": {}, + "source": [ + "---\n", + "## Part 5 — Analysis: Summer in Madrid\n", + "\n", + "Summer in the northern hemisphere runs roughly from the **summer solstice (day 172, June 21)** \n", + "to the **autumnal equinox (day 264, September 21)**.\n", + "\n", + "Let's zoom in on Madrid during those months and compare it with a few other cities." + ] + }, + { + "cell_type": "code", + "execution_count": 53, + "id": "89e89177", + "metadata": { + "ExecuteTime": { + "end_time": "2026-04-14T12:39:26.878728686Z", + "start_time": "2026-04-14T12:39:26.800640315Z" + }, + "execution": { + "iopub.execute_input": "2026-04-07T12:06:14.560797Z", + "iopub.status.busy": "2026-04-07T12:06:14.560666Z", + "iopub.status.idle": "2026-04-07T12:06:14.576880Z", + "shell.execute_reply": "2026-04-07T12:06:14.576245Z" + } + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Madrid summer readings : 93 days\n", + " mean temperature : 25.8 °C\n", + " max temperature : 31.4 °C\n", + " mean humidity : 43.8 %\n", + " mean wind speed : 15.8 km/h\n" + ] + } + ], + "source": [ + "SUMMER_START = 172 # June 21\n", + "SUMMER_END = 264 # September 21\n", + "\n", + "madrid = climate.where(climate[\"city\"] == \"Madrid\")\n", + "madrid_summer = madrid.where((madrid[\"day\"] >= SUMMER_START) & (madrid[\"day\"] <= SUMMER_END))\n", + "\n", + "print(f\"Madrid summer readings : {len(madrid_summer)} days\")\n", + "print(f\" mean temperature : {madrid_summer['temperature'].mean():.1f} °C\")\n", + "print(f\" max temperature : {madrid_summer['temperature'].max():.1f} °C\")\n", + "print(f\" mean humidity : {madrid_summer['humidity'].mean():.1f} %\")\n", + "print(f\" mean wind speed : {madrid_summer['wind_speed'].mean():.1f} km/h\")" + ] + }, + { + "cell_type": "code", + "execution_count": 54, + "id": "a439fecd", + "metadata": { + "ExecuteTime": { + "end_time": "2026-04-14T12:39:27.020397605Z", + "start_time": 
"2026-04-14T12:39:26.880962534Z" + }, + "execution": { + "iopub.execute_input": "2026-04-07T12:06:14.578621Z", + "iopub.status.busy": "2026-04-07T12:06:14.578475Z", + "iopub.status.idle": "2026-04-07T12:06:14.693971Z", + "shell.execute_reply": "2026-04-07T12:06:14.693318Z" + } + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "City Summer mean Summer max Summer humidity\n", + "----------------------------------------------------------\n", + "Madrid 25.8°C 31.4°C 43.8% \n", + "London 16.5°C 22.7°C 74.6% \n", + "Cairo 33.5°C 39.7°C 34.4% \n", + "Moscow 20.1°C 26.3°C 69.3% \n", + "Tokyo 25.1°C 31.0°C 73.0% \n", + "Sydney 24.6°C 30.9°C 63.8% (S. summer)\n" + ] + } + ], + "source": [ + "# Compare summer stats across several cities\n", + "compare_cities = [\"Madrid\", \"London\", \"Cairo\", \"Moscow\", \"Tokyo\", \"Sydney\"]\n", + "\n", + "print(f\"{'City':<12} {'Summer mean':>12} {'Summer max':>11} {'Summer humidity':>16}\")\n", + "print(\"-\" * 58)\n", + "for city in compare_cities:\n", + " v = climate.where(climate[\"city\"] == city)\n", + " # For Sydney (S. hemisphere) 'summer' is late Dec-Mar, i.e. days 355-365 or 1-80\n", + " if city == \"Sydney\":\n", + " s = v.where((v[\"day\"] <= 80) | (v[\"day\"] >= 355))\n", + " label = \"(S. 
summer)\"\n", + " else:\n", + " s = v.where((v[\"day\"] >= SUMMER_START) & (v[\"day\"] <= SUMMER_END))\n", + " label = \"\"\n", + " mean_t = s[\"temperature\"].mean()\n", + " max_t = s[\"temperature\"].max()\n", + " mean_h = s[\"humidity\"].mean()\n", + " print(f\"{city:<12} {mean_t:>10.1f}°C {max_t:>9.1f}°C {mean_h:>14.1f}% {label}\")" + ] + }, + { + "cell_type": "code", + "execution_count": 55, + "id": "4e2161ee", + "metadata": { + "ExecuteTime": { + "end_time": "2026-04-14T12:39:27.105416029Z", + "start_time": "2026-04-14T12:39:27.022601226Z" + }, + "execution": { + "iopub.execute_input": "2026-04-07T12:06:14.696603Z", + "iopub.status.busy": "2026-04-07T12:06:14.695965Z", + "iopub.status.idle": "2026-04-07T12:06:14.752771Z", + "shell.execute_reply": "2026-04-07T12:06:14.751433Z" + } + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "10 hottest days in Madrid:\n", + " city day temperature humidity \n", + " " + ] + }, + "jetTransient": { + "display_id": null + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "plot_cities = {\n", + " \"Madrid\": \"#e63946\",\n", + " \"London\": \"#457b9d\",\n", + " \"Moscow\": \"#2d6a4f\",\n", + " \"Cairo\": \"#f4a261\",\n", + " \"Sydney\": \"#a8dadc\",\n", + "}\n", + "\n", + "fig, ax = plt.subplots(figsize=(12, 5))\n", + "\n", + "for city, color in plot_cities.items():\n", + " v = climate.where(climate[\"city\"] == city)\n", + " d = v[\"day\"].to_numpy().astype(int)\n", + " t = v[\"temperature\"].to_numpy()\n", + " order = np.argsort(d)\n", + " ax.plot(d[order], t[order], label=city, color=color, linewidth=1.5, alpha=0.85)\n", + "\n", + "ax.axvspan(SUMMER_START, SUMMER_END, alpha=0.10, color=\"gold\", label=\"N. 
summer\")\n", + "ax.set_xlabel(\"Day of year\")\n", + "ax.set_ylabel(\"Temperature (°C)\")\n", + "ax.set_title(\"Daily temperature — selected cities\")\n", + "ax.legend(loc=\"upper left\")\n", + "ax.grid(True, linestyle=\"--\", alpha=0.4)\n", + "plt.tight_layout()\n", + "plt.show()" + ] + }, + { + "cell_type": "markdown", + "id": "4fbb0c53", + "metadata": {}, + "source": [ + "### 5.2 Summer temperature distribution — Madrid vs London\n", + "\n", + "A simple histogram comparison of how often each city exceeds different temperature thresholds." + ] + }, + { + "cell_type": "code", + "execution_count": 57, + "id": "94e141a4", + "metadata": { + "ExecuteTime": { + "end_time": "2026-04-14T12:39:27.585476061Z", + "start_time": "2026-04-14T12:39:27.354159759Z" + }, + "execution": { + "iopub.execute_input": "2026-04-07T12:06:15.038502Z", + "iopub.status.busy": "2026-04-07T12:06:15.038229Z", + "iopub.status.idle": "2026-04-07T12:06:15.564804Z", + "shell.execute_reply": "2026-04-07T12:06:15.563854Z" + } + }, + "outputs": [ + { + "data": { + "image/png": 
"iVBORw0KGgoAAAANSUhEUgAAA3kAAAGGCAYAAADGq0gwAAAAOnRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjEwLjgsIGh0dHBzOi8vbWF0cGxvdGxpYi5vcmcvwVt1zgAAAAlwSFlzAAAPYQAAD2EBqD+naQAAlWVJREFUeJzs3Xd4FOXexvHvbnovJCSkkEKN7ViPir2CCnbA3js2wIIVC2AHPa967IoCKtiOvSEoqKh0gRDSSYH03pPd94+FlWU31MBMwv25rlzszs7O/vbh3pl9dmaesfSJS7AjIiIiIiIiPYLV6AJERERERESk66iTJyIiIiIi0oOokyciIiIiItKDqJMnIiIiIiLSg6iTJyIiIiIi0oOokyciIiIiItKDqJMnIiIiIiLSg6iTJyIiIiIi0oOokyciIiIiItKDqJMnIiLd2h+LfmPatKnO+0cffRTFRQUcffRRe/y1x48bS3FRgcu04qICJk96fI+/NsCoUSMpLiogISFhr7xeTzZt2lT+WPTbDs2bkJBAcVEBo0aN7NLl7iuUW5E9T508kR5i8ODBvPbaK/z5x+/kZGeyZPFffPD+TK65+iqjSzOdk08+ifHjxhpdhildeeUVO/TFtSe67bZbGTZ0qNFleGTm2rpacVEBxUUFPPvM0x4fv/fee5zzREZE7OXquq8/Fv3G9OlvG12GiOwl6uSJ9ACHH34Y33z9Jfvttx8zZ73Pgw8+xPvvv4/NZufaa681ujzTOeXkkxk/fpzRZZjSlVdcwaiR3buTt2jRH6Sk9mfRoj926nm333YrQ4ftXEfq+Rf+Q0pq/516zq7orLaPPvqYlNT+FBYW7vEa9qampmbOPPMMfHx83B4795yzaWpqNqCqfxQWFpKS2p+PPvrY0DpERDrjbXQBIrL7br/9Nurq6jjzzOHU1ta6PNarVy+DqjJWgL8/Tc3GfhE0A39/f5pN0A57sw673U5LS8sefY2AgACampro6Oigo6Njj77Wtthstj3+Xo0wf/58Tj/9NE4+6SS++/575/TDDz+MpKQkvvzqK4afddZer8vLywur1UpbW1uPbHcR6Tm0J0+kB0hOSiJj3Tq3Dh5ARUWF8/a2ziMpLipwOYRx87lGqakp/N9/XmBt+mr+Xrmcu+++C4C4uD68/dabZKxdw/JlS7jxxhtclrf5vKgRI4YzbuydLFn8F+sy0nnttVcICQnB19eXRx+dyMoVy8hct5ZpU5/D19fXra7zzz+Pb7/5iuysTFav+pv/vvwScXF9XOb5aM5sfpr7IwceeCCffPwR2VnrmHDfvR7batq0qVy96RDWzYd8bXlOlcVi4brrrmXeTz+Sk53JiuVLeeqpJwgLC3NZzuZDn44++ii++dpR39wff3CeB3bGGcOY++MP5GRn8u03X3HA/vu71ZG5bi19+/Zl1swZZGVmsHTJYsbeeYdbzTtb0wknnMA3X39FTnYml112KQCjR41i9uwPWLliGbk5WcyfN5crrrjc7fmDBw9iyJCjne3y0ZzZgOdzz8DzuTXbqiM0NJRHH53I4r/+IDcni18XLmDMLTdjsVg8/n9t7Y47bmfx4j/JzlrHnDkfMnDgQLd5PJ2Tl5KSzOuvvcryZUvIyc5k8eI/+e/LLxESEgI4shAUFMToTe+nuKjAeZ7f5vc+YMAAXnrx/1iz+m/+99kn22wXgPPOO5cFv8x3ZuDII490ebyzc7W2Xua2auvs3KYrr7yCeT/9SG5OFkuXLGbK5EmEhoa6zLP5czNgwADmzPmQ7Kx1LFn8F7fcfJPnxt+LNm7cyKI//uC88851mX7+eeexZk06GWsz3J7z73//m1df/S9//bmI3JwsFv/1B488MhF/f3+3eYcNHcpPcx2fp5/m/siwYcPc5tm8vrzpxhu57rpr+e3XheTlZjNw4IBO16U7slxPpk9/m99/W+jxsc8//4xvvv7Kef/4447js08/Jn3NKjLXrWXBL/OZMMH
z+m5neXl5ceedd/DbrwvJzcnij0W/MWHCvW7r5s2f8X8fcQRfffkFOdmZ/P7bQi688AK3ZQ4cOJDZsz8gO8vxubvjjtuxWjx//ezuuRUxE+3JE+kBCguLOOywQxk0aBAZGe5ffnbHK/99mczMLKY88SSnnHIyY++8g+rqai6/7FIW/vobk6c8wfnnncvEhx9i+fIV/PGH6yFyt906hubmZl566SWSk5O55pqraW9rx2azERYWxnNTp3HooYcwevQo1q9fz7TnX3A+9/bbb+Oeu+/iiy++ZNb7H9ArMpJrrrmaTz7+iNOHnuHSqY2ICGfmjHf53/8+5+NPPqG8rNzj+5kxYwaxMTGccMLx3Hrb7W6PP/3Uk4waNZIPP5zNm2+9Td/ERK6++ioO2P8Azjn3PNrb253zpiQn89KLLzJjxgw+/uQTbrrpRqa/8zb3TriP+ybcy/Tp7wJw661jeOXV/3LccSdgt9udz7davZg58z2WLl3KpElTOOmkE7j77rvw9vbmmWef26Wa+vXrx8svOWqaOWsW2dnZAFxxxeWsW7eO77//gY72dk477TSefGIKVouVd6ZPB2DixEeZNOkxGhoaeOE//wfQaTtuj6c6Avz9+fjjOfSJjeW9GTMpKiri8MMP4777JtA7pjcTJz66zWXeffddjL3zDn6cO5ef5s7jwAMP4P1ZM/H1dT+kb0s+Pj7MmjkDX18/3nr7HcpKS4mNjeXUU08lNDSUuro6br3tdp595mmWL1/BjJkzAcjPz3dZzmuv/pfc3DyefOrp7XZKjzrqKM4+ewRvvvU2rS0tXHnlFcya+R5nnjVipz+jO1LblsaPG8v48eP45ZdfePfd9+jXrx9XXHE5//rXv9zyEhYWxqyZ7/H1N9/wxRdfctZZZ/Lggw+QvnYt8+bN36k6u9qnn37G4489SmBgII2NjXh5eTF8+Fm89trr+Pn5uc0/YvhZBAQEMP3d96iqquKQgw/mmquvok+fWG688WbnfCccfzyvv/4q69Zl8sSTTxEREcG0qc+yYcNGj3WMHj0SPz9/Zs6cSUtrK9VV1Vis7p2UnV3ulj7//Av+7z8v8K9//YsVK1Y4p8fHx3P4YYfx2GOTAEeHafr0t0lPX8uzzz5HS2srKcnJHHH44dt9jR3x7LPPMHrUSL748ktefe01DjnkYG6/7VYG9O/Ptddd7zJvSnIyr732Cu9/8CFz5nzERReN4vlpU1m58m/WrVsHQHR0NB/N+RAvL29eeuklGhubuPSySzzu1e8puRUxC3XyRHqAV155lRkz3uWH779l+fLl/PHHnyxc+Cu//vaby4ZxVyxbvpx7770PgBkzZvLnH78z8eGHeOKJJ3np5f8C8Nln/2PZ0sVcdNFot06el5c3518w0llHr169OOecs5k3bz6XX3ElANOnv0tKcjIXXTTa2cmLj4/nrvHjeOrpZ/i//3vRubyvv/mW77/7hiuvvMJlekxMDPfcO4EZM2Zu8/0sWbKUnJwcTjjheD755FOXx/59xBFceukljBlzG59+9plz+q+//c77s2YwYvhwl+n9+/dnxNnnsGTJUgAy12Xy/vszefaZpzn++BMpKi4GoLqmhmeefoqjjjqS339f5Hx+QIA/8+fN56GHJwLwzvTpTJ/+NrfccjNvvvkWlVVVO11TakoKF19yGT///LPLe7vgwpEuX6zefmc6M2e8xw03XO/s5H373Xfcc8/dVFZWurXNzvJUxx133E5yUhKnDx1Gbm4e4MhUycYSbr75Jl599TWKizd4XF5kZCS33HwTP/z4I1deebVz+r333sMdt9+2zVoGDhxAUlIS199wI1999bVz+pY/KHzyyac89eQT5K9f3+l7X7MmnTG3bvu1NktLG8zQYWfy999/A/C//33OL7/8zN13jee662/YzrNd7Uhtm0VGRnLrrWOYP/9nLr3scuePCllZ2UyZMokLzj+fD2fPds7fp08st91+Bx9/7Ngz+f77H/D
nH79z8cUXGf5l+auvvmbypMcZNmwon3zyKSeccDyRkZF8+tn/uGj0KLf5J095wiXjM2fOIi8vjwkT7iU+Ls75eXzggfsoKyvn3PPOp66uDoBFvy/igw9mUVDgvle2T58+DDnmOCorK53TPI0KubPL3dJ3331Pc3Mz55w9wqWTd/aI4dhsNr744gsAjj/+OPz8/LjsssuprKra5jJ31n77pTF61EhmzpzF3fc49gxOn/4uFeUV3HzzTQwZcjS//fa7c/7+/ftz7nkX8OeffwLw+RdfsPivP7ho9Cgee9zRKR0z5haioqI486wRLF++HIDZc+bw68JfXF67J+VWxCx0uKZID/DLggWMOPtcvv/+B/bbbz/GjLmF99+fydIlf3H6aaft1rJnzfrAedtms7FixUqsVivvv//P9NraWrKzs0nq29ft+R999JFLR3PpsmVYrVY++PBDl/mWLltOXFwcXl5eAJx55hlYrVa++OJLIiMinH9lpaXk5uZyzJCjXZ7f3NzMhx/OZncMH34WNTU1/PzLLy6v+ffKldTX1zNkq9fMyMhwdvA2vzeAhb/+6vxCCbBs0/Skvklur/n2O++43n97On5+fhx33HG7VFN+fr5bBw9w+fIbEhJCZEQEvy9aRHJykvOQxa7kqY7hw8/ijz/+pKa6xuW9LFi4EG9vb7fDGbd0/HGOL7dvvfWOy/TXX39ju7XU1jq+cJ94wgkEeDh0b0e9+96MHZ538eLFzg4eQFFxMd9//z0nnngCVg97gbrK5nZ6/Y03XPYaz5w1i9raWk459WSX+evr651flAHa2tpYvnyFx8/y3lZTU8P8+T9z7rnnAHDeueeyePFiioqKPM6/ZcYDAgKIjIjgr8VLsFqtHHDAAQD07t2bAw44gDlz5jg7YuBYh3a2h/Xrr79x6eB5sivL3VJ9fT3z5s1nxIjhLtPPPvtsli5d6lyfbD56YejQ03f4EOcddfLJjmy8+trrLtNfefU1AE495RSX6RkZGc4OHkBlZSXZOTn03SI7p5x8EouXLHF28DbP9+mnn7ksqyflVsQstCdPpIdYsWIF111/Az4+Puy3336cccYwrr/uOl577RVOO30YmZmZu7Tcrb9Q1dbV0dTU7PYrcm1tHREehjPfsrMDOL8AFW89vbYWLy8vQkNDqKqqJiUlBavVym+/LvBYV9tWeyg3biyhra1tx95UJ1JSUggLC2PV3ys8Ph4VFeVyv6ios/fmujdqcycjLNz1HLqOjg7y89e7TMvJyQEgMTFhl2pa38kegyMOP5y77hrHYYcdRmBgoMtjoSEhLl9Mu4KnOlJTUth/v/1YtWqlx+ds/V62lJAQD0Bubq7L9MrKSqqqqrdZS0FBAa+8+ho33XgD559/Hn/88Sfff/8DH3/yyU6974KC9dufaZOcreoEx/9tYGAgvXr1oqysbIeXtTM2t1N2do7L9La2NtavX09CvOseqA0b3PecVtfUkJY2eJuvEx4e7nHkyx1RXV29w5/VTz/7jP+88DzxcXEMGzaUSZOndDpvfFwcd919F6efdhoREeEuj4WEOn7I+CdHeW7Pz87O4cADD3Cb3tlnaku7stytff75F5xxxjAOP/wwFi9eQlJSEv/610HOPf2b57nk4ot47rlnuf/++1i48Fe+/uYbvvzyK5fO0a5ISIino6ODvDzX91BWVkZ1dTXxm97jZluv/wBqqmtc1nPx8fHOH7+2tPkw8i1f2zF9z+ZWZF+iTp5ID9PW1saKFStYsWIFOTk5PD9tKiOGn8XUac93+iVgW3sWbB5GDrTZPI8m6OmX5c5GHuzosHleBpZNNVmw2WxcetkVHl+voaHB5X5XjNxotVopKyvzeK4euA5iA9DRSTt4ajP4573tyZo8tUNSUhIffvg+2dnZPPLoYxQXF9PW1sbJJ5/MjTdc7/H8oq11lh2vTp7rqQ6LxcLPP//Cy//9r8fn5Gz1Ba8rPfbY48yePYehQ0/nhOOP5/H
HH+XW28YwYsTZO3TOFEBzVw/b39nncdPe7L2h08/hdvYSvfH6a257kXfUBReOdDlseVu+//4HWltbef6Fafj6+vLF5194nM9qtfLBB7MIDw/n5ZdfJisrm8amRmJjY3nh+Wm7tfe0y//fO/H9Dz/Q2NjIiBHDWbx4CSNGDKejo4Mvv/xn0JXm5mbOO/9CjjlmCKeccgonnXgC55xzNgsWLuTiiy/FZvP8/7kzdrSz2Nn6b1fWcztrV3Mrsi9RJ0+kB1uxwrHHpHdMb8Bx+BPgNlqZp/NLjJafl4/VaqWgYD05Oe57RXZHZ19i8vPzOe64Y/nrr8V7Zbh/Ly8vkpL6ury/1NRUAAoKCrusptNOOxV/f3+uuuoalz2rQ4YMcZu3s7bZMjtbDnizM9nJz88nKCiQBQs8jyK4LYWFjj3KKSkprF//zx61yMhIt702nVm7di1r167lhRf+w+GHH8bn//uMyy+/nKeffgbY8S+3OyI1JcV9WmoqjY2Nzo55dU2N22cR/tmrsaUdrW1zO/Xrl+rSTj4+PiQmJrJg4c63vSePPvY44Vvtmd5Ra9ak7/C8zc3NfPvdd1x4wQXMnftTp+ehpaUNpl+/ftx+x50u1647ftNhz5v9k6Nkt2X065e6w3VtrSuW29TUxI8/zmX4WcN55JHHOOfsEfzxx5+UlJS4zGe321m48FcWLvyVRx+F2267lfsm3MsxxwzZpc/Wlu/By8uLlJQUsrKynNOjoqIIDw+nqNDzYbLbUlRURIqHz0K/fv3cXtsxfc/mVmRfonPyRHqAzn5RP+Xkk4B/DoGpr6+noqKCo7Y69+mqK6/YswXugq+/+Zb29nbGjR3r8fEd/WLvSWNjI+De2f38iy/x9vbmTg+XMXAcSur+hXx3XX3VVa73r76S1tZW55earqjJuWdxi1+5Q0JCGD3KffCKxqZGwsLcl5m3aTTHLbMTEBDAyJEXbvf1N/viiy85/PDDOeGEE9weCw0NdZ6P6ckvCxbQ2trKNddc5TL9+uuv2+7rBgcHuy07PX0tHR0d+G0xNHxjYyNhXfR/fPjhh3PgAf8cohcX14fTTz+dn3/+xbm3JT8vn7CwMJdDzHr37s0ZHobd39HaflmwgJaWFq695hqX6RdffBFhYWHM/fGnXX1LLv7++28WLFi4S3+bfzDYUa+88irPPTeV5194odN5Nu/Z2XpPzrXXubZDaWkpq1atYuTIkS7noh5/3HEMGjRop+raE8v93+ef06dPLJdccjH7778/n3/huucyPDzc7TmrV68G8HgJmp3x00+ObFx//bUu02+8wTGq5o9z5+70Muf+NI/DDzuMgw8+2DktMjLS7dIYeyu3IvsS7ckT6QEmPf44AQH+fPPtd2RlZeHr48vhhx/G2WePYP369S4Dksya9T633XYrzz7zNCtWruSoI4907j0yk/z8fJ5++hnuv/8+EhMT+Pbb76hvaKBvYiLDzhjGzBmzeOXVV3dp2Ss3DYjx+OOPMn/+z9g6bPzv889ZtGgR7773Hrffdiv777cfP//yC+1t7aSkJjP8rOE8PHGiy+iMu6upqZkTTzqR55+fyrJlyzn5pBM57dRTeeE//+cc6KEravr5l19oaWlh+jtvM2PGTIKCArnkkkuoqCgnNjbGZd6/V/7NFVdczh133E5ebh7lFeX8+utv/PzzLxQWFvLcc8/w3/++QofNxkWjR1FRUbnDe/P++99XOP3003h3+tvMnj2HlX//TWBgIIMHD2b4WWdy5JFHd7qnprKykldefY3bb7uVd999h5/mzuOAA/bnpJNOcjtkdWvHHnMMkyY/zpdffkVOTg7eXl5ccMEFdHR08NXX/7Tdyr//5rjjjuWGG66nZGMJ6wvWs2zZ8h16b1tLT1/LrFkzXC6hAPDsc/9cGuN/n/+PBx64jzffeIM333qLgIAArrzicnJycjjooINclrejtVVWVvLiiy8xfvw4Zs2cwfff/0C/fqlceeUVLFu
2nI8/+cTtOWa3Zk36dvf+ZWVlkZubx8MPPUhsbCz1dXWcedaZhIe5722c8sRTvPfuO3z26Sd88OGHhIeHc83VV7F2bQZBQYEelr5jumK5P/00j7q6Oh5+6EHa29vdPttjx97BUUceyY9zf6KosJBeUVFceeUVFBcX8+eff213+SnJydxxh/uh36tWrWLu3J/4cPYcLr/sMsJCw/h90SIOPvhgRo8ayTfffOsysuaOevnl/3LhBeczc8Z7vPnmm85LKBQWFblc57Mn5lbEaOrkifQAjz0+iRHDz+KUk0/isksvwcfHh6LiYqZPf5fnX/iPy+F1055/gV69enHWWWcyYsRw5s2bz6WXXd7pwB5GevGll8nOyeGG669n3KYLtRcXF/PLz7/w/Q/f7/Jyv/76G9588y3OOedsLjj/fKxWK//7/HMAJky4n5Ur/+byyy7jvgn30t7eTkFBIZ988gl//bW4S97XZjZbB5deejlPPjGFhx58gPr6ep57bipTpz3vMt/u1pSdncMNN97EPffczUMPPUhZWSnvvvseFRWVTJv2nMu8U6c9T3xCPLfcfBMhISH89tvv/Pqr41Ic1157PVOemMzdd99FWVkZr7/xJjU1NTy/6cLc29PU3Mz5F4zk9ttvY/jws7jwwguor68nJyeHZ5+bSu12BkF56qmnaWlu5vLLL+eYIUNYumwZF19yKe+9+842n7d6zRp+nv8zp516KrGxsTQ1N7FmzRouu/wKli79Z1CIRx99jKefeop777mbgIAAPpw9Z5c7eYsWLWLxkiWMGzeW+Lg4MjMzuXPsONLT1zrnqaqq5tprr2fixId58IH7KSgoYMoTT5KakuLWyduZ2p6bOo2KikquvvpKHnnkYaqrq5kxcxZPPvnUbl9Sxaza29u58qqrmfT4o9x26xhaWlr45ptvefudd5j74w8u886fP58bbryJe++5m/sm3Et+fj5jx93F0KGnM+Too3a5hq5YbktLC99//wMXXHA+v/zyi9sPGN9//wOJCYlcNHo0kZERVFZWsWjRIp597rkdGkSof//+3HvP3W7TZ816n7lzf+Kuu+5mfX4+o0aNZNiwoZSVlfGf/3uRqVOn7VgjbKW0tJQLR45m0uOPMWbMGKqqq3jvvRmUbCxh6tRnXebdF3MrsidZ+sQldN1JCCIiskOmTZvK8LPOZMBAjQYnIiIiXUvn5ImIiIiIiPQg6uSJiIiIiIj0IOrkiYiIiIiI9CA6J09ERERERKQH0Z48ERERERGRHkSdPBERERERkR5kn7hOXkxMDA0NDUaXISIiIiIisluCgoIoKSnZ5jw9vpMXExPDsqVdewFjERERERERoxxy6OHb7Oj1+E7e5j14hxx6uKn25qWmppCTk2t0GSJulM3uweLtQ9pFdwOQ/sEz2NvbDK5oz1IujeFvsfB9wiAATi/MoNmusdq2pFyKGSmXPVtQUBDLli7ebr+mx3fyNmtoaKC+vt7oMpza2tpNVY/IZspm92Dx9qGxxdGxq6+v7/GdPOXSGO0WC/bGRsCRM3XyXCmXYkbKpYAGXjFMfX2d0SWIeKRsihkpl2JGyqWYkXIpoE6eYcrKyo0uQcQjZVPMSLkUM1IuxYyUSwF18gyTkpJidAkiHimbYkbKpZiRcilmpFwK7EPn5ImI9Ch2aK2rct4W2VOK21uNLkH2IQEB/kRERGK1WIwupduKjo6mrrbW6DJkF9jsdqqqKmlqat7tZamTZ5Di4mKjSxDxSNnsHuwdbaz76Hmjy9hrlEtjNNvtjCjKMroM01Iuu47FYmHkhRdw1FFHGV1Kt2e1WrHZbEaXIbth0aJFzPnoY+y7MdiVOnkG8fX1NboEEY+UTTEj5VLMSLnsOo4O3pF88eWX5OTk0tHebnRJ3ZaXtxcd7R1GlyG7wMvbm9TUFEYMPwuA2XM+2uVlqZNnkKioKMrKyowuQ8SNsilmpFyKGSmXXSMgIICjjjqKL778knnz5htdTrf
n7+9Pc/PuH+4nxsjPzwdgxPDhfPHll7t86KahA6/ceusYvv7qS9ZlpLNyxTLeevMN+vVLdZnnozmzKS4qcPl78skpBlUsImIOFi9vUoffQOrwG7B46fc62TP8LBbejU3h3dgU/HSOlOwhERERALqAt8gmmz8LERGRu7wMQ78ZHH3UUbwzfTrLl6/A29uLCRPu5f1ZMznhxJNpampyzjdjxkyeefY55/0tH+uuMjIyjC5BxCNls5uwWAiMjnfe7umUS2NYgP39Apy3xZVy2TU2D7KiQzS7hvbidX+bPwu7MwCRoZ28Sy+73OX+nXeOY9XfKzjooIP4448/nNObmpt63OEQKSnJZGfnGF2GiBtlU8xIuRQzUi7FjPz8fGlp0ai4+zpTXScvNDQUgOrqapfp5593Hqv+XsFPc3/kvgn3EuDvb0B1XcvX18/oEkQ8UjbFjJRLMSPlUva0hIQEiosK2H///bY53/hxY/nh+28BsFg8f72fNm0qb735RpfXKOZkmhM5LBYLjz46kT///NPl8IdPP/uMwsIiSkpKSEsbzAMP3E+/fv247vobPC7H19fXZbSroKCgPV77rmhoaDC6BBGPlE0xI+VSzEi53POmRSfu1dcbW1awU/NPmzaV0aNG8u577zFhwv0uj02ZPImrrrqSD2fPYezYcV1Zppv/vvIqb739NoAunyCAiTp5U6ZMZvCgQZx73vku02fOnOW8vXbtWkpLS5kz+0OSkpKco89s6bZbxzB+vPsHadCgQTQ2NpKRkUFycjJ+fn40NjZSXFxM//79ASgp2YjFYqV3794AZGZmkpiQgH9AAM3NzRQUFDBgwAAAykpL6bDZiI2NBSA7O5vY2FiCgoJobW0hNzePQYMGAVBeXk5raytxcXEA5Obm4u3tRVpaGm1trWRlZZOWlgZAZWUlTU2NxMcnAJCfl0dEZCShoaHYOjrIWLeOtLTBgIXq6irq6upJTHSsANevX09YaChh4eHYbTbWZmQwaOBArF5e1NbWUFVVTVJSEgCFhYUEBQU6T+hMT09nwIABeHt7U1dXS3l5BSkpKYDjOkB+fn706tXL+f+QmpqCr68fDQ0NlJRsJDW1HwAbN27Ay8ub6OhoANatW0dSUl/8/PxpamqkqGjL9i4BICYmBoCsrCzi4+MICAikpaWZ/Pz1DBw40NHeZWV0dLQTG9sHgJycbGJi/mnvnJxcBg8eDEBFRQUtLS0u7R0V1YuQkFDa29vJzMx0tndVVSUNDY0kJGxq7/x8IiLCCQ0Nc7b34EGDsFit1FRXU1NbS9++fQEoKCggJCSY8PAIwE56+tot2ruWqspKkpKTASgqKiQgIJDIyH/au3//fvj4+FJfX0dZWblLe/v6+hIVFQU4zvlISUl2tvfGjRvp129ze2/Ey2olesvMJiY6RtZqaqKgsNCZ2dLSUux2GzExsc72jouLIzAwkJaWFvLyHJn18fGmqSmKtrZ2+vTZ3N459O4dTXBwCG2trWTn5Djbu7KigqbmZuLjHeeH5eXl0atX5Fbt7chsVVUVDfX1JGyZ2bAwwsLCnJl1tndNDTU1Nc72LiwoICg4eNMJ+o723jKzFRWVJDvbu4gAf38it8hsv9RUfHwd7V1aWkZqqmOQpw0bNuDj401UVLSzvc2wjoiOjnK0d2friETHZxkgLi6O0KDATtcRh519hSMvVfUE+fsSEuCL3W4nr7SGpOhQrFYr9U2t1DW10icy2PFeqxsI8PUmNNCxpyK3pJq+UaF4eVlpaG6lpqGFuF4hjvda04Cvtxct6353tndXryN8fLzZsMFH6wj27jqi76Y2Ase1t9I2tXd5eZnWEX5+tLW14ePjY851RDf6HuHlZcXHxwdfPz+8vLywWCx4e//zFdVqtTpOCrWDzW5z3AfsNsc1xCxWx3lLNpsNq2WLeW02rF47N6+/vz/t7e3Y7XZ8fHwAaGlpwdvbGy8vL+x2Oy0tLfhvOqrMarFQVFTEueecwxN
PPEVdXR1eXl4EBARw3nnnUlhYiJfVsdyOjg46OjqcOyNaW1vx8rLitWnwrObmZvz8/LBYLI5529vx83Osgy0WK97e3s522XJecIxXYbPZnPW7z+uLl5cXVqsVi8XiXG5bWxsWwHvTe21ubsbX19d5rb3W1lbne21rawNwaRcfHx+P8+5MG7a3t2Oz2ZztsjPzOtrQCy8vL7DbaW5pwd/PDza34VbtbbVaPbZhR0cH7Vu0987M29bW5pLZrduwra3NZd4t27Cz9nZ8FqyEhoWStulIx83riM3bz+2x9IlL2PWr7HWRyZMeZ+jQ0znv/AspKNj2LygBAQFkZ63j4ksu4+eff3Z73NOevGVLFzNwUBr19fVdXvuuSktLIz093egyRNwom92DxduH/S9/EIDV703C3t7W6bx9T7lkr9S0fu6s7c+0i5RLY/hbLPza19F5OGZ9Os27cWHenki57BoJ8fGMGzeWqVOnUVhU5PJYd9iTFxYaSlJSEi++9BKffvoZAOedey63jLmZgvUF1NTWMnbsOE488UTuvOM2Bg0aRIfNxpIlS3j44UdcdlocfPDBPP3UE/Tv35+MjHW88J//4603X+e004eyevUajj76KD7+aA6XXnY5995zN4MHD+biSy5lyNFHM2zYUE47fRj+/v60trby0EMPctHoUXTYbHzw/gdERUcRGhLKNdde15VNJnvAtj4TwcHBrMtI327fxvBz8iZPepxhw4YxctTo7XbwAA7Yf38ASktLPD7e2tpKfX2980+HUohIT9Xe3EB7s9ZxsmdVdbRT1aFRD0W25YMPP+Si0aOc9y+6aBQffjjbZZ7AwABefe11zjhzOKNHX4TdZufNN1537o0LDAzk3elvs25dJsPOOIvnpk7l4Yce9Ph6999/H1OmPMkJJ55Mevpat8dvuvEGRo0cybjxd3HuuecTHh7OGcOGdeE7FrMz9HDNKVMmc96553D1NddRX9/g3P1YV1dHc3MzSUlJnHfeucyd+xNVVVXsl5bGI49M5PffF3kMdHeyceNGo0sQ8UjZ7B7s7W2sff9po8vYa5RLYzTb7ZxauM7oMkxLuZTNPv74E+6bcK/zsOTDDz+Cm28ew5Cjj3bO8/XX37g8Z9y48axatZKBAweSkZHBeeedi9VqZfxdd9PS0sK6devo06cPTz35hNvrPfvMc/yyYIHHWtra2rjuuut48cUX+eYbx2As9064jxNPPKGr3q50A4Z28q660nGeyCcfz3GZfufYccyePYe2tlaOO/ZYrrvuWgIDAijesIGvv/6a51/4jxHldikvq+E7UUU8UjbFjJRLMSPlUjarrKxk7tyfGD1qJBaLhbk/zaWyqsplnpSUZO6+6y4OOeRgIiMjnecWxsfHkZGRwYABA1iTnk5LS4vzOUuWLPH4eitWruy0ltCQEGJjY1i6bLlzWkdHBytWrHTuNZSez9BOXlz8to+zLi7ewAUXjtxL1exd0b17U15RYXQZIm6UTTEj5VLMSLmULX3w4YdMnvQ4APc/4H6Y5fR33qawsIi777mXjRtLsFqtzJ83F18fX7d5t6exsbHTx7YctEb2XfoJSkSkG7J4eZMy7CpShl2FxUsbdNkz/CwWXo1J4tWYJPy0B0Bkm+bNm4+Pjy/ePj7Mn+86OGBERDj9+/fn+Rf+w8KFv5KVlUV4WJjLPJmZmeyXluYciRHg0EMP3ek66urr2bixhEMPOdg5zcvLi4MOOnCnlyXdl74ZGCQzM9PoEkQ8Uja7CYuFoD4pzts9nXJpDAtwuH+Q87a4Ui5lSzabjRNOPMl5e0vV1TVUVlZy2WWXUFpaSnx8HPffd5/LPJ9++hkT7r2HZ555iv/7v5dITEzgpptu3Ok6mpubefPNNxlz6xhyc/PIysrihhuuJ3TTUPyyb9CePINsviaNiNkom2JGyqWYkXIpW9s8uvvW7HY7N98yhoMOPJCf5v7AI49M5PFJk13maWxs5MqrriZt8GC+/+4
bJtx7D5MnT9npGnx9fXnl1df4+OOPef75qXz++WfUNzTwzbff7vL7ku5He/IMsvmijiJmo2yKGSmXYkbK5Z63s9et29vGjh23zce3vCbdggULOfGkU1we33p8iqVLl3Ha6cM6nef33xd5HNPiuanTeG7qNMBx8fjW1lYmTnyUiRMf3bE3Ij2O9uQZpLmpyegSRDxSNsWMlEsxI+VSzGjrQ0Vl36ROnkEKCguNLkHEI2VTzEi5FDNSLsWMWltbjS5BTECdPIMMGDDA6BJEPFI2xYyUSzEj5VLMSIcRC+icPBGRbsvWpl9rZc9r0qFfIiLdjjp5BiktLTW6BBGPlM3uwd7expoZk7c/Yw+hXBqj2W7n2IK1RpdhWsqlmFFbW5vRJYgJ6HBNg9jt+mVUzEnZFDNSLsWMlEsRMSt18gwSExNrdAkiHimbYkbKpZiRcilm5OPjY3QJYgI6XFNEpBuyeHnT96TRAKyf9yH2jnaDK5KeyBcLz0QnAHB3WSGt2A2uSEREdoQ6eQbJysoyugQRj5TNbsJiISRxoPN2T6dcGsNqgWMDQ5y31cdzpVyKGbW0tBhdgpiADtc0SFxcnNEliHikbIoZKZdiRsqlGKG4qIBhQ4d2+rgO1xTQnjzDBAYGGl2CiEfKppiRcilmpFzueX1PuWSvvt76ubN2av5p06YSFhrKNddet4cq2nlWq/bhiPbkGUa70sWslE0xI+VSzEi5FDOy6dqWgjp5hsnLyzO6BBGPlE0xI+VSzEi5lG056qij+OrLL8jNyWLZ0sXcf98EvLy8nI9/NGc2jz/2KA8+cD+rV/3N8mVLGD9urMsyUlKS+eTjj8jJzmT+vLkcf9xxbq8zePBgZs/+gOysTFatWsmkxx9z2cs8bdpU3nrzDW668UaWLV3MqlUrmTJ5Et7eOqCvJ1MnzyCDBg0yugQRj5RNMSPlUsxIuZTOxMbGMuO96axYsYLTThvKffc9wMUXX8Sdd9zuMt/IkRfS2NjI8BEjmDR5CmPH3unsyFksFt54/XXa2loZPuJs7p1wPw88cJ/L8wMCApg1cwY11TWcedZwbrzxJo4//ngmT57kMt+QIUeTlJzEyJGjufPOsYwaNZJRo0bu2UYQQ6mTJyIiIiLSha688gqKi4u5/4EHycrO5tvvvuPZ56Zy4403YNliROT09LVMnfY8ubl5fPTRx6xYsZJjjz0GgOOPO47+/ftx+x1jWbMmnT/++IMnnnza5XXOO+9c/Pz8uP2OO8nIyODXX3/jkUcf5cILzicqKso5X01NDQ9squXHH+fy49y5HHfssXunMcQQ2k9rkPLyMqNLEPFI2ewe7O1trHp7otFl7DXKpTGa7XYOy19jdBmmpVxKZwb078+SJUtdpv31118EBwcT16cPRcXFAKSnp7vMU1pa6uyc9R/Qn+LiYkpKSpyPL1myxPV1BgxgTfoampqanNP++ONPvLy86NevH+Xl5QBkrFvncq5eaUkpg9MGd8E7FbPSnjyDtLXpwsViTsqmmJFyKWakXMruamtvc7lvt9ux7ObomHa7+wUt27fKqh07Vou6AT2Z/ncN0qdPH6NLEPFI2RQzUi7FjJRL6UxmVhaHHXaoy7QjjjiCuro6ijds2KFlZGVmERcXR+/evZ3TDj3UdZmZmZnsl7YfAQEBzmlHHXUkHR0dZGdn78Y7kO5OnTwRkW7I4uVN4omjSDxxFBYvHXkve4YvFp6KSuCpqAR8sWz/CSL7oJDQEPbffz+XvxkzZhIXF8fkSY/Tv18/hp5+OneNH8drr73ucU+bJ78sWEBOTg4vPD+N/fZL49///jcT7r3HZZ5PP/mUlpYWXnhhGoMGDWLIkKOZ+PDDfPTxJ85DNWXfpG8GBsnJyTG6BBGPlM1uwmIhLGV/AAoXfmpwMXuecmkMqwVODQoFYGJFEezYd9N9hnIpAMcMGcIP33/nMm3WrPe57PIreejBB/jhh++orq7m/fc/4PkX/rPDy7Xb7Vx
73fU89+yzfPXlFxQWFvLgQxN5f9YM5zxNzc1ccullPPbYI3z91Zc0NTfx9dff8Mgjj3bRu5PuSp08g/TuHU1BQaHRZYi4UTbFjJRLMSPlcs9bP3eW0SVs09ix4xg7dlynj581fESnj104cpTbtGuuvc7lfk5OLuedf4HLtLj4RJf7a9euZdSoi5z3fXx8aGv751w/T/VNnKhOYE+nwzUNEhwcYnQJIh4pm2JGyqWYkXIpZrTlBddl36VOnkHaWluNLkHEI2VTzEi5FDNSLsWMdvScP+nZ1MkzSLaO4xeTUjbFjJRLMSPlUsyopaXF6BLEBNTJM8jgwboApZiTsilmpFyKGSmXYkb+/v5GlyAmoIFXREREpEtMi07c/kxdYGxZwV55HRGR7kqdPINUVlQYXYKIR8pm92Bvb2P1e5Oct3s65dIYzXY7x6xPd94WV8qlmFF7e7vRJYgJqJNnkKbmZqNLEPFI2ew+9oXO3WbKpXHUueuccilmZLPZjC5BTEDn5BkkPj7e6BJEPFI2xYyUSzEj5VLMyNfX1+gSxAS0J09EpBuyWL2IG+K4yG7xb19gt3UYXJH0RD5YeKBXHwAmV2ygDe3VExHpDrQnzyB5eXlGlyDikbLZTVitRAw4hIgBh4C156/KlUtjeFlgRHA4I4LD8bIYXY35KJeyNyQkJFBcVMD++++3zfnGjxvLD99/u81LKEybNpW33nyjq0sUE+r53wxMqlevSKNLEPFI2RQzUi7FjJRLmTZtKsVFBTz55BS3x6ZMnkRxUQHTpk3dK7X895VXGTX6Iry9950D9S655GI+/eRj1qz+mzWr/+bDD2Zx8MEHu8yz+f9oy7+ZM97b5nKtVit3330Xi37/leysTH77dSF33nmHyzzR0dHMeO9dli5ZzORJj2OxuP4SlpyczLSpz7F48Z/k5mSx6PdfefmlFznooIO65L1vjzp5BgkJCTW6BBGPlE0xI+VSzEi5FICioiLOOftsl+vT+fn5ce6551BYWLhXavDy8qKxsZGqqmq8vLz2ymuawZCjj+az//2PkaNGc/bZ51JcvIH3Z80gNjbWZb6ffprHvw4+1Pl3y5hbt7ncMWNu4corLueBBx/ihBNPYvKUKdxy801ce83VznnuufsuVqxcyWWXX07fvn0595xznI8ddNBBfPvNV6SmpnDvvRM48aRTuPa668nKymLiww91bSN0Qp08g2h4WzErZVPMSLkUM1IuBeDvv1dRXLyBM84Y5px25hlnUFRczKpVq13mPfHEE/ns049JX7OKVatWMn362yQlJbnMc/DBB/P9d9+Qk53JN19/xQEHHODy+NFHH0VxUQEnnXQi337zFXm52fz730c4D9e0bxoR12q1MnHiw87XevCB+7Fs57DrUaNGkr5mFaeeegoLfplPdtY6XnvtFQL8/Rk58kL+WPQba1b/zeOPPYp1i1MFfH19efihB1my+C+yMjP48ovPOfroo5yPR0SE8/JLL7Jk8V9kZ61j7o8/uHSKAD6aM5vHH3uUBx+4n9Wr/mb5siWMHzd2m/XeetvtTJ/+LqtXryErO5vxd92N1Wrl2GOPcZmvtbWVsrIy519NTc02l3v44Yfx3XffM3fuTxQWFvLVV1/z88+/uOwlDAsPY+3ataSnr2X9+vWEhv3zo8/z06aSm5vHueddwNy5P5Gfn8/q1WuYOu15rr7m2m2+dldRJ88gmZmZRpcg4pGyKWakXIoZKZd7nr/F0umfL5YdntfPsmPz7qoPPvyQi0aPct6/6KJRfPjhbLf5AgMDePW11znjzOGMHn0RdpudN9943XmoX2BgIO9Of5t16zIZdsZZPDd1Kg8/9KDH17z//vuYMuVJTjjxZNLT1zqnbz4n76Ybb2DUyJGMG38X5557PuHh4ZwxbJjHZW0pICCAa6+5hptvHsMll17OkKOP5s033+CUk0/mssuv5PY77uSyyy5l+PCznM+ZPOlxDjvsUG6+ZQynnHo6X37
5FTNnvEdKSjIAfn7+rFz5N1dceSUnnXwqM2fO5D//ed7t0MqRIy+ksbGR4SNGMGnyFMaOvZPjjztuuzVvWbu3tw/V1dUu048++ihWrljGgl/m88QTU4iICN/mchYvXsKxxx5DamoKAPvtl8a//30EP82b55znxRdfZtLjj5GXm82BBx7AnDkfAXDA/vszePAgXn31NWeHe0u1tbU7/H52x75z0K7JpKUNdvlAipiFsilmpFyKGSmXe96vfdM6fWxhYx13lBU47/+YMIiATgaiWtzcwI0l+c77X8YPIMLL/WvwYflrdqnOjz/+hPsm3Ou8rMbhhx/BzTePYcjRR7vM9/XX37jcHzduPKtWrWTgwIFkZGRw3nnnYrVaGX/X3bS0tLBu3Tr69OnDU08+4faazz7zHL8sWOA23d/fn+bmZq677jpefPFFvvnmWwDunXAfJ554wnbfi6+vLxPuu5/8fEd7ffnVV1x4wQUc9K9DaGxsJDMzk99++50hQ47m88+/ID4ujtGjR3HEv4+ipKQEgFdefZWTTjqB0aNH8+STT7Fx40ZeefVV52u89fY7nHDiCZw9YjjLly93Tk9PX8vUac8DkJubx9VXXcWxxx7j8X168sAD91NSUsKCBQud0+bPm883X3/D+oICkpOSmDDhHma89x4jzj6n02sKvvjiS4QEB/PLz/Pp6OjAy8uLJ596mk8//cw5z8qVKzn0sCOIjIykrKzMOT1lU8cwKytrh2reU9TJM4yGKROzUjb3lr6nXLLrT97iF+fEE0dBj79gtXIpZqRcikNlZSVz5/7E6FEjsVgszP1pLpVVVW7zpaQkc/ddd3HIIQcTGRnpPOQxPj6OjIwMBgwYwJr0dJcRMpcsWeLxNVesXNlpPSEhIcTGxrB02XLntI6ODlasWOk2QMjWGhsbnR08gPKycgoKCmhsbHROKysvI6pXFACD0wbj7e3NwgU/uyzH19eXqqpqwHHo6O2338aI4cOJjY3F19cHX19fmpqaXJ6Tnp7ucr+0tJSoqKht1rvZrWNu4Zyzz+bCkSNd2u9/n3/uvL127VrWpKez6PdfGTLkaBYu/NXjss4eMYLzzz+PMWNuI2PdOvbffz8effQRSkpKnHvswNGmW3bwgO22796iTp5Bqjx88EXMQNnsJux26gqznLd7OuXSGM12O6cUZDhviyvlcs87Zn16p4/ZtorkqYUZnc67dXqHF3X9obYffPghkyc9DsD9D3g+xHL6O29TWFjE3ffcy8aNJVitVubPm4uvz85fwHzLTteW2tvb8fHx2enlbdbW1uZy3263u51/arfj7KAGBQXR3t7OsDPOpKPDdc9YQ0MDALfcfBPXXXsND098hLVr19LY2MSjj050e99t7e6vbdmBywTddOONjBlzC6MvumS7e9fXr19PRUUFycnJnXbyHnroAV588WVnB3Ht2rUkJCRw261jXDp5nuRk5wDQv39/Vq1evc159ySdk2eQhvp6o0sQ8UjZ7D7sto595iLoyqVxqm0dVO8jOdtZyuWe12y3d/rXulXXbVvztth3bN7dMW/efHx8fPH28WH+/J/dHo+ICKd///48/8J/WLjwV7KysggPC3OZJzMzk/3S0vDz83NOO/TQQ3eqDpvNRl1dHRs3lnDoIQc7p3t5eXHQQQfu3JvaAatWrcLb25tevaLIy8tz+du8l+uIIw7nu+++55NPPmXNmnTy8/NJTU3tkte/5eabuPPO27n0sstZuY29m5v16RNLREQEpSWlnc7jHxCAze7aYe3o6NihDueq1avJyMjgxhtv8LhXLzR074zKq06eQRISE40uQcQjZVPMSLkUM1IuZUs2m40TTjyJE0882eO5XtXVNVRWVnLZZZeQnJzMMccMYeLEh13m+fTTz7Db7TzzzFMMGDCAk08+iZtuunGn6vD1dewde/PNNxlz6xiGDR1K/379eGLK5D3SwcjJyeXjjz/hPy9M44wzhpGYmMjBBx/Mrbe
O4ZRTTnbMk5vH8ccfx+GHH0b//v15+qknid7BwzC3ZcwtN3P33XcxbvxdFBQUEh0dTXR0NIGBgYBjIJuHHnyAQw89hISEBI499hjefutNcvPymP/zPx3xDz98n6uvutJ5/4cffuT222/jlFNOJiEhgWHDhnHjDdfz7abzG7dn7Li7SE1N4bNPP+bkk0+ib9++pKUN5vbbb+Ptt97c7fe9I3S4pohIt2TBPyIagOaqMtwPRhLZfT5YGBcZA8DUyhLalDORbarfxt5du93OzbeM4fHHHuWnuT+QnZPDQw9N5JOP5zjnaWxs5MqrruapJ5/g++++ITMzk8mTp/DmG6/vdC2vvPoavWN68/zzU7HZbHzw4Wy++fZbQvfA9R3HjhvPnXfczsSHHyI2NpbKyiqWLl3Kjz/OBeCFF/5DUt++zJo5g6amJmbMnMW3332327VcccXl+Pn58cbrr7lMf+65qTw3dRo2m420tDRGjryQ0NBQSkpK+PnnX3j6mWdpbW11zp+clERkZKTz/oMPPsQ999zFE1Mm06tXFCUlJbw3YybTNg0Ksz3Lly/njDPP4vbbb+OZp58mMjKC0tJSFi9ewsSJj+zWe95Rlj5xCT16jR0cHMy6jHQGDkrb5gdvbwsKCnIepyxiJsrm3rO7A6+EJg4EoLZgnSnOy1s/d9YeW7ZyaQx/i8U5uuEx69O3ezjbtOi9s2dr7BYjKhpJuewaCfHxjBs3lqlTp1FYVGR0Od2e1WrtdNRI6R629ZnY0b6NDtc0SNhWx2CLmIWyKWakXIoZKZdiRl5eXkaXICZgaCfv1lvH8PVXX7IuI52VK5bx1ptv0K+f60mYfn5+TJk8iVWrVpK5bi2vv/bqDg+lambaMIhZKZtiRsqlmJFyKWakTp6AwZ28o486inemT2f4iHO46OJL8Pbx5v1ZMwkICHDO88gjEznttFO58cabOP+CkcTExvDmG69tY6ndg1270cWklE0xI+VSzEi5FFMyweH7YjxDB1659LLLXe7feec4Vv29goMOOog//viDkJAQLr5oNGNuvY1ff/0NgHFjx/PLL/M59NBDWLp0mRFld4m1GZ1fy0XESMqmmJFyKWakXIoZNW9xIXDZd5nqnLzNw7pWV1cDcNBBB+Lr68uCBQud82RlZ1NYWMhhhx1mRIldZvCgQUaXIOKRsilmpFyKGSmXYkb+W1xjT/ZdprmEgsVi4dFHJ/Lnn3+SsemXsd7RvWlpaaG2ttZl3rKycnpHR3tcjq+vr/P6IOAY+cqMduRiiiJGUDbFjJRLMSPlUkzJwwW4Zd9jmk7elCmTGTxoEOeed/5uLee2W8cwfvw4t+mDBg2isbGRjIwMkpOT8fPzo7GxkeLiYvr37w9ASclGLBYrvXv3BiAzM5PEhAT8AwJobm6moKCAAQMGAFBWWkqHzUZsbCwA2dnZxMbGEhQURGtrC7m5eQza9AtfeXk5ra2txMXFAZCbm0tAgD9paWm0tbWSlZVNWppjiOrKykqamhqJj08AID8vj4jISEJDQ7F1dJCxbh1paYMBC9XVVdTV1ZO46WKs69evJyw0lLDwcOw2G2szMhg0cCBWLy9qa2uoqqomKSkJgMLCQoKCAomIcFwTJD09nQEDBuDt7U1dXS3l5RWkpKQAUFxcjJ+fH7169QJg7dq1pKam4OvrR0NDAyUlG0lN7QfAxo0b8PLyJnpTJ3zdunUkJfXFz8+fpqZGioq2bO8SAGJiHNdgysrKIj4+joCAQFpamsnPX8/AgY4h4svKyujoaCc2tg8AOTnZxMT80945ObkMHjwYgIqKClpaWlzaOyqqFyEhobS3t5OZmels76qqShoaGklI2NTe+flERIQTGhrmbO/BgwZhsVqpqa6mpraWvn37AlBQUEBISDDh4RGAnfT0tVu0dy1VlZUkJScDUFRUSEBAoPMaLOnp6fTv3w8fH1/q6+soKyt3aW9fX1/nAEMZGRmkpCQ723vjxo3067e5vTf
iZbUSvWVmExPx9/enuamJgsJCZ2ZLS0ux223ExMQ62zsuLo7AwEBaWlrIy3NktlevSKKjo2hra6dPn83tnUPv3tEEB4fQ1tpKdk6Os70rKypoam4mPj4egLy8PHr1ityqvR2ZraqqoqG+3nkB4fXr1xMWFkZYWJgzs872rqmhpqbG2d6FBQUEBQcTEfFPe2+Z2YqKSpKd7V1EgL8/kVtktl9qKj6+jvYuLS0jNdUxyNOGDRvw8fEmKira2d57ax2REhMOQHVDM23tNqLDHBdvLaqoIyLYn0A/H9o7bBSU1zrnrWlsoaW1nejwIGwtZZTWNBAdGkCwvy82m438slpSeoeBxUJdYwsNLW3ERgQ78lJVT5C/LyEBvtjtdvJKa0iKDsVqtVLf1EpdUyt9Ih3zllQ3EODrTWig4xfh3JJq+kaF4uVlpaG5lZqGFuJ6hTjea00Dvt5eBG36XO2JdUSvXpH4+PhoHbGX1xGxcXHc6W+ntbWVNouFtE1tWF5e5nEdYY1Pwd7Whr2gEGuqo157dQ32lhasMY4abEXFWMLDsAQFQUc7trwCrP2SAQv22lrsjU1YYx3bBVvxBiwhwVhCQsBuw5aTjzU1ibSoYFOsI4KCAvHx8THke0R0dJRjndwDvkd4eVnx8fHB188PLy8vLBYL3t6Or6jNzc34+vo6LwvQ1taG36Y9VW1tbQD4+PgA0NLSgo+Pj3Pe1tZW/P39AWhvb8dut3uc12630dKy7Xm9vb3x8vLCbrfT0tLiMq/NZnPuYGhtbcXLy8t1Xj8/sFjo6Oigo6Njq3mteHn98179/PywbJ63vR3fTe+1tbUVq9Xq0i5bztve3u5sF7vdjre391bz+mKxeG5DC+C96b1u3d5btuHutveOtuGutjd2O83bae8dbcOdmbetrW2XM9tZezs+C1ZCw0JJ23Sk4+Z1RHQnO7q2Zorr5E2e9DhDh57OeedfSEHBP9e+OeaYIcyZ/SGD0/Z32Zv35x+/8/obb/L662+4LcvTnrxlSxfrOnkiO0jZ3Ht26zp5JqTr5Imukye7QtfJ61q6Tl731yOukzd50uMMGzaMkaNGu3TwAFau/JvW1laOPfYY57R+/VJJSEhgyZIlHpfX2tpKfX2988+sK9/NvzyKmI2yKWakXIoZKZdiRlvu7JB9l6GdvClTJnP++ecx5tbbqK9vIDo6mujoaOcu2bq6Ot7/4EMemfgwQ4YczYEHHsi0qc+xePHibj2ypohIV/ALj8YvfMcO2xDZFd7AHeG9uSO8t3nO7xARiosKGDZ0qNFliIkZ2sm76sorCAsL45OP57Bi+VLn39lnj3DO88gjj/Ljj3N5/bXX+PSTjygtLePa624wsOquUVhgjkNNRLambHYTFgt+oZH4hUbuEyfZK5fG8LZYuCIsiivCovDeB3K2s5RLmTZtKm+96X76kJFaW1uNLsEQRx55JNPfeYulSxZvsxPcv39/3nn7LdamryYrM4Ovv/qS+E3nu3oyatRIiosKXP5ysjNd5omOjmbGe++ydMliJk96HMtW68vk5ORNO6r+JDcni0W//8rLL73IQQcdtPtvvBOG/jAXF7/9Y/dbWlq4/4EHuf+BB/dCRXtPUHAwdSY6R1BkM2VTzEi5FDNSLsWM9tVz8gIDA1i9Jp33P5jNW2++7nGepKQkPvvsEz54/wOeffY56urrGTRw4HavLVhbW8txx5/ovG/f6oLz99x9FytWrmTKE09w34QJnHvOOXz62WcAHHTQQcz+8H0yMjK4994JZGVlExwcxNDTT2fiww9xwYUjd+t9d8bwc/L2VY7Rv0TMR9kUM1IuxYyUS9meo446iq++/ILcnCyWLV3M/fdNcIwEuclHc2bz+GOP8uAD97N61d8sX7aE8ePGuiwjJSWZTz7+iJzsTObPm8vxxx3n9jqDBw9m9uwPyM7KZMXypTz91JMEBgY6H9+8x/GmG29k2dLFrFq1kimTJzlHhPRk/Lix/PD9t1w
0ejR//bmIzHVrmTJlMlarlVtuvonly5awcsUybr/9NpfnhYaG8uwzT/P3yuVkrF3D7NkfsN9+ac7Hk5KSePutN1mxfCmZ69by9Vdfctxxx7os449Fv3Hbbbcy9blnWZeRzl9/LuLSS7c9WNm8efN5+uln+PbbbzudZ8K99/DTTz8xafIUVq1eTX5+Pt//8AMVFRXbXLbdbqesrMz5V15e7vJ4WHgYa9euJT19LevXryc0LNT52PPTppKbm8e5513A3Lk/kZ+fz+rVa5g67Xmuvubabb7u7lAnzzCGD2oq0gllU8xIuRQzUi73NIu3T+d/Xt5dPm9Xio2NZcZ701mxYgWnnTaU++57gIsvvog777jdZb6RIy+ksbGR4SNGMGnyFMaOvdPZkbNYLLzx+uu0tbUyfMTZ3Dvhfh544D6X5wcEBDBr5gxqqms486zh3Hrb7Rx33LFMnjzJZb4hQ44mKTmJkSNHc+edYxk1aiSjRm17L1JSUhInnXwil1x6ObeMuZWLLxrNe+9Op0+fPlxw4UgmT36CCffewyGHHOx8zmuv/peoqCguvewKhp1xJqv+XsXsDz8gPDwcgKCgQOb+9BOjRl/M6UOHMW/+fN55+223QyZvvPEGVqxcyelDz2D69Hd58okp9OuXujP/BS4sFgunnHIyOTm5zJo5g5UrlvHlF5/v0LmNQUFB/PnH7yz+6w/efutN5+V7NnvxxZeZ9Phj5OVmc+CBBzBnzkcAHLD//gwePIhXX33Nbe8f4HYt8K6k86gNkp6+1ugSRDxSNsWMlEsxI+Vyz9v/8s5P16krWEf+jzOd99Muugerj+eRJRs25JL77TvO+4NGjsXbP8htvlVvT9z1Yrdy5ZVXUFxc7DzlKCs7m5jYGB64/z6mTnve+aU/PX0tU6c9D0Bubh5XX3UVxx57DL8sWMDxxx1H//79uOTSy5zXDn3iyaeZNfM95+ucd965+Pn5cfsdd9LU1ERGRgYPPPgQ0995m8mTpzj3OtXU1PDAAw9is9nIys7mx7lzOe7YY5k16/1O34PVamXcuLtoaGggMzOT3377nX79Urns8iuw2+1kZ+cwZszNHDNkCMuWLeffRxzBwQcfzEH/OsR5buBjj09i6NChnHXWmcycOYs1a9JZsybd+RrPPPMsZwwbxumnn8bb70x3Tv/pp5+YPv1dAF586WWuv/46hgwZQnZ2zi79f0RFRREcHMytY27hqaefYfKUKZx04om88cZrXDhyNIsWLfL4vOzsbMaNv4v09HRCQkK5+aYb+Px/n3LSyaewYcNGAFauXMmhhx1BZGQkZWVlzuembLpeaFZW1i7VvDvUyTPIgAEDyMzM3P6MInuZsilmpFyKGSmXsi0D+vdnyZKlLtP++usvgoODievTh6LiYsBxIfktlZaWEhUVBUD/Af0pLi52dvAAt8uIDRgwgDXpa2hqagLAz8+Pv/5ajJeXF/369XN28jLWrXM5V6+0pJTBaYO3+R4KCgpcLkdWVl5Gh63DZa9UWVk5vTbVu99++xEUFMTqVStdluPv709yUhIAgYGB3DV+HKeccjK9e/fG29sbf39/4uPjXZ6TvmardikrI6pXr23Wuy1Wq+MAxu+++955re3Vq9dw+OGHc8Xll3XayVuyZKnL/+PixYv5ef48LrvsMp555lnn9I6ODpcOHuA2AMvepE6eQbZ1DLSIkZRNMSPlUsxIudzzVr83qfMHtzr8Lf2Dp3d43ow503anrC7V1t7mct9ut2Ox7voZVZ11LNrb2l1fBztWy7Zfp719q+fYPSzHbsdqdbxmUFAgJaWlXHjhKLdl1dbUAPDwww9y/HHH89jjk8jLy6O5uZnXX3sFH1/Xw2Xbtnpt7HZnR21XVFZW0tbWxrqtfpjJzMzk3/8+YoeX097ezqrVq0hJTt7uvDmb9jr279+fVatX71S9u0trJ4PU1e25Y3BFdoey2U3Y7dRvyHXe7umUS2O02O2MLM523hZXyuW
eZ9+qA2TEvLsqMyuLs848w2XaEUccQV1dHcUbNuzQMrIys4iLi6N3796UlpYCcOihh7q+TmYmo0aOJCAggKamJjo6Ohgy5Gg6OjrIzs7umjezg/7+exW9o6Npb2+nsLDQ4zxHHH4Es+fMcQ6QEhgYSEJCwh6vra2tjRUrVrid15eamkphYdEOL8dqtZI2eDBzf5q33XlXrV5NRkYGN954A//7/HO38/JCQ0P32Hl5GnjFIBUVlUaXIOKRstl92NpasbXtG9dDUi6NYQdy2lrIaWvRECMeKJcCEBIawv777+fyFxfXh+nT3yUuLo7Jkx6nf79+DD39dO4aP47XXnvd4yAcnvyyYAE5OTm88Pw09tsvjX//+99MuPcel3k+/eRTWlpaeOGFaQwaNIh///sIJj3+OB99/InbKJB72i8LFrBkyVLefusNTjj+eBISEjj88MO49957nNeEy83N5cwzhrH//vux335pvPzSi7u1h26zwMBAZ/sDJPZNZP/993MZ0OXl/77K2SNGcMklF5OcnMzVV13Jaaed6jz3D+CFF6Zx34R7nffH3nkHJxx/PH379uXAAw7gxf/7D/HxCds8l3FLY8fdRWpqCp99+jEnn3wSffv2JS1tMLfffhtvv/Xmbr/vzmhPnkGSk5PdjsEWMQNlU8xIuRQzUi4F4JghQ/jh++9cps2a9T533X0Pl11+JQ89+AA//PAd1dXVvP/+Bzz/wn92eNl2u51rr7ue5559lq++/ILCwkIefGgi78+a4ZynqbmZSy69jMcee4Svv/qS5uYmvvrqax559LEueoc757LLr2DCvfcwdepz9OrlGIhk0aI/KC93nK/2yKOPMXXqs3z+v8+orKzkpZf+S3Bw8G6/7r/+dRAffzTHef/RRxyD6Hw4ew5jx44D4Ntvv2XChPu59bYxPP7YY+TkZHP99Tfy519/OZ8XHxePzfZPJzwsPJxnnnmK6OhoampqWPn335xzzrk7fD7u8uXLOePMs7j99tt45umniYyMoLS0lMWLlzBx4iO7/b47Y+kTl9Cjf5wLDg5mXUY6AwelUW+iC5ampaVpwyCmpGzuPX1P2fY1f7bHL8xxAnpLzbav77O3rJ87a48tW7k0hjdwTVg0AG/VlNG+7dmZFp24x2sCGFtWsFdeZ3uUy66REB/PuHFjmTp1GoVFO37YnHjm7+9Pc3Oz0WXIbtjWZ2JH+zbak2eQIq3ExKSUzW7CYsEvzDGaWUttZY8/L0+5NIa3xcKN4Y5O3ru15bT38JztLOVSzGjzpQtk36Zz8gwS4O9vdAkiHimbYkbKpZiRcilm1BXnt0n3pxQYJHI3rvMhsicpm2JGyqWYkXIpZqRLewiokyciIiIiItKjqJNnkLVr1xpdgohHyqaYkXIpZqRcdg3bpnM9vbQHqkto0JXub/NnwbYb50Grk2eQfqmp259JxADKppiRcilmpFx2jaqqKgBSU1MMrqRn8PPzM7oE2U2bPwtVVbt+LU79ZGIQH19fo0sQ8UjZFDNSLsWMlMuu0dTUxKJFixgx/CwAcnJy6Wjf3gU7pDO+fn60trQYXYbsAi9vb1JTUxgx/CwWLVpEU9Ou75VVJ88g9fV1Rpcg4pGy2U3Y7dRvzHfe7umUS2O02u1cviHHeVtcKZddZ85HHwMwYvhwgyvp/ry9vWhv7zC6DNkNixYtcn4mdpU6eQYpLS0zugQRj5RNh929UPneYGvdd867UC6NYQPW7EM521nKZdex2+3MnvMRX3z5JRERkVgtFqNL6rZ8fH1p07XyuiWb3U5VVeVu7cHbTJ08g6SmppKenm50GSJulE0xI+VSzEi57HpNTc00NRUbXUa3lpaWRm5urtFliMHUyRMR6aZ8QyIAaK2rMrgS6am8gYtDHdeCe7+2Ap0lJSLSPaiTZ5ANGzYYXYKIR8pmN2Gx4B/RG4DW+uoef16ecmkMb4uFOyNiAJhTV0l7D8/ZzlIuxYyUSwFdQsEwPj7qX4s
5KZtiRsqlmJFyKWakXAqok2eYqKhoo0sQ8UjZFDNSLsWMlEsxI+VSQJ08ERERERGRHkWdPINkZGQYXYKIR8qmmJFyKWakXIoZKZcC6uQZJjk52egSRDxSNsWMlEsxI+VSzEi5FFAnzzB+fn5GlyDikbIpZqRcihkpl2JGyqWALqFgmMbGRqNLEPFI2ewm7HYaStY7b/d0yqUxWu12btiY57wtrpRLMSPlUkCdPMMUFxcbXYKIR8pm99HR0mR0CXuNcmkMG7CkRV8YO6NcihkplwI6XNMw/fv3N7oEEY+UTTEj5VLMSLkUM1IuBbQnT0Sk2/IJDgegrb7a0Dqk5/IGzguOAODT+irajS1HRER2kDp5Bikp2Wh0CSIeKZvdhMVCQGQMAG0NNT3+vDzl0hjeFgsTevUB4IuGatp7eM52lnIpZqRcCuhwTcNYLGp6MSdlU8xIuRQzUi7FjJRLAXXyDNO7d2+jSxDxSNkUM1IuxYyUSzEj5VJAnTwREREREZEeRZ08g2RmZhpdgohHyqaYkXIpZqRcihkplwLq5BkmMSHB6BJEPFI2xYyUSzEj5VLMSLkUUCfPMP4BAUaXIOKRsilmpFyKGSmXYkbKpYAuoWCY5uZmo0sQ8UjZ7CbsdhpLC523ezrl0hhtdjt3lK533hZXyqWYkXIpoE6eYQoKCowuQcQjZbP7aG9uMLqEvUa5NEYHsLCp3ugyTEu5FDNSLgV0uKZhBgwYYHQJIh4pm2JGyqWYkXIpZqRcCmhPnohIt+UTFApAW0OtwZVIT+UNnBEUBsA3DTW0G1uOiIjsIHXyDFJWWmp0CSIeKZvdhMVCQK8+ALQ11vX48/KUS2N4Wyw8EhUPwA+NtbT38JztLOVSzEi5FNDhmobpsNmMLkHEI2VTzEi5FDNSLsWMlEsBdfIMExsba3QJIh4pm2JGyqWYkXIpZqRcCqiTJyIiIiIi0qOok2eQ7Oxso0sQ8UjZFDNSLsWMlEsxI+VSwOBO3pFHHsn0d95i6ZLFFBcVMGzoUJfHp02bSnFRgcvfzBnvGVRt19KudDErZVPMSLkUM1IuxYyUSwGDR9cMDAxg9Zp03v9gNm+9+brHeX76aR5jx4133m9tbd1b5e1RQUFBRpcg4pGyKWakXIoZKZdiRsqlgMGdvHnz5jNv3vxtztPa2kpZWdneKWgvam1tMboEEY+UzW7CbqexrMh5u6dTLo3RZrdzb1mB87a4Ui7FjJRLgW5wnbyjjz6KlSuWUVNTw8Jff+Ppp5+mqqra6LJ2W25untEliHikbHYf7U31Rpew1yiXxugAfmysM7oM01IuxYyUSwGTD7wyf9587rhjLKNGX8zkyU9w9FFHMuO997BaOy/b19eX4OBg559Zd1kPGjTI6BJEPFI2xYyUSzEj5VLMSLkUMPmevP99/rnz9tq1a1mTns6i339lyJCjWbjwV4/Pue3WMYwfP85t+qBBg2hsbCQjI4Pk5GT8/PxobGykuLiY/v37A1BSshGLxUrv3r0ByMzMJDEhAf+AAJqbmykoKGDAgAEAlJWW0mGzOU9uzc7OJjY2lqCgIFpbW8jNzXN+yMrLy2ltbSUuLg6A3NxcoqOjgDTa2lrJysomLS0NgMrKSpqaGomPTwAgPy+PiMhIQkNDsXV0kLFuHWlpgwEL1dVV1NXVk5iYCMD69esJCw0lLDwcu83G2owMBg0ciNXLi9raGqqqqklKSgKgsLCQoKBAIiIiAUhPT2fAgAF4e3tTV1dLeXkFKSkpABQXF+Pn50evXr2c/xepqSn4+vrR0NBASclGUlP7AbBx4wa8vLyJjo4GYN26dSQl9cXPz5+mpkaKirZs7xIAYmJiAMjKyiI+Po6AgEBaWprJz1/PwIEDHe1dVkZHRzuxsX0AyMnJJibmn/bOycll8ODBAFRUVNDS0uLS3lFRvQgJCaW9vZ3MzExne1dVVdLQ0EhCwqb2zs8nIiKc0NAwZ3sPHjQIi9VKTXU1NbW19O3
bF4CCggJCQoIJD48A7KSnr92ivWupqqwkKTkZgKKiQgICAomM/Ke9+/fvh4+PL/X1dZSVlbu0t6+vL1FRUQBkZGSQkpLsbO+NGzfSr9/m9t6Il9VK9JaZTUzE39+f5qYmCgoLnZktLS3FbrcRExPrbO+4uDgCAwNpaWkhL8+R2b59EykvL6OtrZ0+fTa3dw69e0cTHBxCW2sr2Tk5zvaurKigqbmZ+Ph4APLy8ujVK3Kr9nZktqqqiob6ehK2zGxYGGFhYc7MOtu7poaamhpnexcWFBAUHExExD/tvWVmKyoqSXa2dxEB/v5EbpHZfqmp+Pg62ru0tIzU1FQANmzYgI+PN1FR0c72Tk5OJjEmnObWdspqG0mMCnVkq64JCxAZEuCov6yGmPAg/Hy8aW1rZ2N1A32jwxzZqm/CZrPTKzTQUX95Lb1CAgjw86GtvYOiyjqSe4cDUN3QTFu7jegwx7xFFXVEBPsT6OdDe4eNgvJaUmIc89Y0ttDS2k50eBCtVj+q6psItrYT7O+LzWYjv6yWlN5hYLFQ19hCQ0sbsRHBjrxU1RPk70tIgC92u5280hqSokOxWq3UN7VS19RKn0jHvCXVDQT4ehMa6Of4HJVU0zcqFC8vKw3NrdQ0tBDXK8Tx+axpwNfbi6BNn6s9sY7o2zdR6wgD1hEJcXEc7xtIW3sbc/Jz6O/crnleR1jjU7C3tWEvKMSa6qjXXl2DvaUFa4yjBltRMZbwMCxBQdDRji2vAGu/ZMCCvbYWe2MT1ljHdsFWvAFLSDCWkBCw27Dl5GNNTSItKtjwdYSfnx8xMb3Jysoy7HtEcHCIvkeg7xFbryOCgoKIjo427HuEI7P6HrGn+hqbPxvbY+kTl2CKg+yLiwq45prr+Pa777Y5398rl/PU088wY8ZMj4/7+vri6+vrvB8UFMSypYsZOCiN+nrzHNoUHR3dI881lO5P2XToe8olRpewbRYLoYmOLy+1BetMcV7e+rmz9tiylUtj+Fss/NrX8WX2mPXpNG8nZ9OiE/dGWYzddJ6g0ZRLMSPlsmcLDg5mXUb6dvs2pt6Tt7U+fWKJiIigtKS003laW1u7xQic3aFG2Tcpm2JGyqWYkXIpZqRcChh8Tl5gYCD7778f+++/HwCJfRPZf//9iN+06/ehBx/g0EMPISEhgWOPPYa333qT3Lw85v/8s5Fld4nNhwCImI2yKWakXIoZKZdiRsqlgMF78v71r4P4+KM5zvuPPjIRgA9nz+G+++4nLS2NkSMvJDQ0lJKSEn7++ReefuZZ/UIhIiIiIiLSCUM7eb//voi4+M6P37/k0sv2YjV7V25urtEliHikbIoZKZdiRsqlmJFyKWDySyj0ZI7RNUXMR9kUM1IuxYyUSzEj5VKgCzt5oaGhXbWofUJwcIjRJYh4pGyKGSmXYkbKpZiRcimwi528MbfczNlnj3Def+WVl1m9aiVLFv/FfvuldVlxPVlbm84rFHNSNrsJu52mig00VWwwxeUT9jTl0hjtdjuPlBfxSHkR7ftAznaWcilmpFwK7GIn7/LLL6O4uBiA4487juOPO47LLruCefPm8dCDD3ZpgT1VVla20SWIeKRsdh9tDbW0NdQaXcZeoVwaox34oqGGLxpqaDe6GBNSLsWMlEuBXezkRUf3dnbyTj31FL748kt+/uUXXv7vf/nXvw7q0gJ7qrQ07fEUc1I2xYyUSzEj5VLMSLkU2MXRNWtqaoiLi6O4eAMnnXQiTz39DAAWiwUvL68uLE9ERDrj7R8EQHtzg8GViNlNi+58JOvtibQ6tuuVto6uKkdERPawXerkffPNN7z04v+Rm5tLREQEP/00D4AD9j+AvLy8rqyvx6qsrDS6BBGPlM1uwmIhsHcCALUF63r8eXnKpTGswAH+gQAsbKzDZmw5pqNcihkplwK72Mmb+MijFBQUEhfXh0mTptDY2AhA75jeTJ/+bpcW2FM1NTUaXYKIR8qmmJFyKWakXIoZKZc
Cu9jJ8/Hx4ZVXX3Wb/vrrb+x2QfuK+PgEamvTjS5DxI2yKWakXIoZKZdiRsqlwC4OvLJyxTKmPvcs/z7iiK6uR0RERERERHbDLnXybrvtDsLDw5k9+wMWLPiZW8fcQkxMTFfX1qPl69xFMSllU8xIuRQzUi7FjJRLgV3s5H373Xdcc+11HHrYEbz33gzOPfdc/vzjd6ZPf5szzhimETZ3QERkpNEliHikbIoZKZdiRsqlmJFyKbCLnbzNKisree211zn1tNN59NHHOO7YY3n9tVdZtnQxd981ngB//66qs8cJDQ01ugQRj5RNMSPlUsxIuRQzUi4FdnHglc2ioqIYNfJCRo0aSUJCAl999TXvf/ABffr0Ycwtt3DooYdy8SWXdlWtPYqtQ9cbEnNSNrsJu52myhLn7Z5OuTSGHchsbXbeFlfKpZiRcimwi528M84YxkWjR3HCCSeQmZnJ9Onv8vEnn1JbW+ucZ/HiJfw8/6cuK7SnyVi3zugSRDxSNruPtvpqo0vYa5RLY9iBDe1tRpdhWsqlmJFyKbCLh2tOm/ocG0tKOOfc8znt9GG8/c50lw4eQElJCf/5z/91SZE9UVraYKNLEPFI2RQzUi7FjJRLMSPlUmAX9+QdcshhNDU3b3Oe5uZmpk57flcWv4+wGF2ASCeUze7Cyy8AgI6WJoMr2RuUS6OEWR2DqdXYdAiYO+VSzEi5lF3s5G3ZwfPz88PHx8fl8fr6+t2rah9QXV1ldAkiHimb3YTFQlBMXwBqC9b1+PPylEtjWIF/+QcCsLCxDpux5ThNi07cK68ztqxgm48rl2JGyqXALnbyAgICePCB+xkxYjgRERFujyf2Td7dunq8ujp1hMWclE0xI+VSzEi5FDNSLgV28Zy8hx58gGOOGcKE++6ntbWVu+66h2efm0pJSQm333FnF5fYMyUm7p1fIUV2lrIpZqRcihkpl2JGyqXALu7JO+20U7n9jjv5/fdFTJv6HH/8+Sd5eXkUFhZy/nnn8emnn3VxmSIiIiIiIrIjdmlPXnh4OOvz1wNQV19PeHg4AH/++RdHHXVklxXXk61fv97oEkQ8UjbFjJRLMSPlUsxIuRTYxU5efv56Evs6TvjPzsri7BHDATj9tFOp2epSCuJZWGio0SWIeKRsihkpl2JGyqWYkXIpsIudvA9nz2b//dIAePGll7nyyivJyc7kkUcm8t//vtKlBfZUYZv2foqYjbIpZqRcihkpl2JGyqXALp6T9/rrbzhvL1iwkONPOJGDDjqQvLw80tPXdllxPZndZpaBqEVcKZvdhN1Oc1Wp83ZPp1waww7ktDY7b4sr5VLMSLkU2IVOnsViYfSoUZxx5jASExKx2+0UFBTw5VdfqYO3E9ZmZBhdgohHymb30Vq371wLSbk0hh0obG8zugzTUi7FjJRLgV04XPOdd97i2Wefpk9sLGvXrmXdunXEJ8Tz/LSpvPXmG9tfgAAwaOBAo0sQ8UjZFDNSLsWMlEsxI+VSYCf35I0eNYqjjjySUaMv4rfffnd57JhjhvDWm29w4YUX8NFHH3dpkT2R1cvL6BJEPFI2uw+rrz8Atk2H0/VkyqVxgq2O34PrdQiYG+VSzEi5FNjJPXnnnnsO//d/L7p18AB+/fU3XnzpZc4/77wuK64nq62tMboEEY+UzW7CYiE4Nong2CSwWIyuZo9TLo1hBQ71D+JQ/6BdG6mth1MuxYyUS4Gd7OSlpQ1m3vz5nT4+76d57Ldp1E3ZtqqqaqNLEPFI2RQzUi7FjJRLMSPlUmAnO3nh4eGUlZV3+nhZeTlhYWG7XdS+ICkpyegSRDxSNsWMlEsxI+VSzEi5FNjJTp6Xlxft7e2dPt7R0YG39y5dlUFERERERES6wE71yCwWC88/P5XWllaPj/v6+XZJUfuCwsJCo0sQ8UjZFDNSLsWMlEsxI+VSYCc7eXPmfLTtGerQyJo7KCgokLq6OqPLEHGjbIoZKZdiRsqlmJFyKbC
Tnbyx48bvqTr2ORERkWzcWGJ0GSJulE0xI+VSzEi5FDNSLgV2spMnIiImYbfTUlPuvC2yJ9iB/LYW520REeke1MkzSHp6utEliHikbHYfLTUVRpew1yiXxnB08jyfhy/KpZiTcimwk6NrStcZMGCA0SWIeKRsihkpl2JGyqWYkXIpoE6eYXSpCTErZbP7sPr4YvXZN0Y1Vi6NE2ixEmjR1wVPlEsxI+VSQJ08w9TV1RpdgohHymY3YbEQ3CeF4D4pYLEYXc0ep1wawwocHhDE4QFB+sLggXIpZqRcCuicPMOUl+8759JI96Jsihltmctp0Yl75TXHlhXs8dfYW+9F9gytL8WMlEsB7ckzTEpKitEliHikbIoZKZdiRsqlmJFyKaBOnoiIiIiISI+iTp5BiouLjS5BxCNlU8xIuRQzUi7FjJRLAXXyDOPn52d0CSIeKZtiRsqlmJFyKWakXAqok2eYXr16GV2CiEfKppiRcilmpFyKGSmXAhpdU0Ske7LbaamtdN4W2RPsQEFbq/O2iIh0D4buyTvyyCOZ/s5bLF2ymOKiAoYNHeo2z913jWfZ0sVkZ2Xy4QezSElJ3vuF7gFr1641ugQRj5TN7qOluoyW6jKjy9grlEtj2IHcthZy21rUyfNAuRQzUi4FDO7kBQYGsHpNOvc/8KDHx8fccjPXXHM1Eybcz/ARI2hsbGLWzBk94ljj1FQNbyvmpGyKGSmXYkbKpZiRcilg8OGa8+bNZ968+Z0+ft111/LCC//Hd99/D8Dtd9zJiuVLGTZ0KP/7/PO9VOWe4evb/Tuq0jMpm92HxcuxCrd3tBtcyZ6nXBrHz2IBoEWHBbtRLsWMlEsBEw+80rdvX2JiYliwcIFzWl1dHcuWLeewww41sLKu0dDQYHQJIh4pm92ExUJIfD9C4vvBpi/hPZlyaQwrcGRAMEcGBJv3C4OBlEsxI+VSwMQDr/TuHQ1AWVm5y/Sy8jJ69+7d6fN8fX3x9fV13g8KCtozBe6mkpKNRpcg4pGyKWakXIoZKZdiRsqlgIk7ebvqtlvHMH78OLfpgwYNorGxkYyMDJKTk/Hz86OxsZHi4mL69+8POD4UFovV2YnMzMwkMSEB/4AAmpubKSgoYMCAAQCUlZbSYbMRGxsLQHZ2NrGxsQQFBdHa2kJubh6DBg0CoLy8nNbWVuLi4gDIzc3l3/8+grKyctraWsnKyiYtLQ2AyspKmpoaiY9PACA/L4+IyEhCQ0OxdXSQsW4daWmDAQvV1VXU1dWTmJgIwPr16wkLDSUsPBy7zcbajAwGDRyI1cuL2toaqqqqSUpKAqCwsJCgoEAiIiIBSE9PZ8CAAXh7e1NXV0t5eQUpKY5juouLi/Hz83MOybt27VpSU1Pw9fWjoaGBkpKNpKb2A2Djxg14eXkTHe3opK9bt46kpL74+fnT1NRIUdGW7V0CQExMDABZWVnEx8cREBBIS0sz+fnrGThwoKO9y8ro6GgnNrYPADk52cTE/NPeOTm5DB48GICKigpaWlpc2jsqqhchIaG0t7eTmZnpbO+qqkoaGhpJSNjU3vn5RESEExoa5mzvwYMGYbFaqamupqa2lr59+wJQUFBASEgw4eERgJ309LVbtHctVZWVJCUnA1BUVEhAQCCRkf+0d//+/fDx8aW+vo6ysnKX9vb19SUqKgqAjIwMUlKSne29ceNG+vXb3N4b8bJaid4ys4mJ+Pv709zUREFhoTOzpaWl2O02YmJine0dFxdHYGAgLS0t5OU5Mtu3byJLly6lra2dPn02t3cOvXtHExwcQltrK9k5Oc72rqyooKm5mfj4eADy8vLo1Styq/Z2ZLaqqoqG+noStsxsWBhhYWHOzDrbu6aGmpoaZ3sXFhQQFBxMRMQ/7b1lZisqKkl2tncRAf7+RG6R2X6pqfj4Otq7tLSM1NRUADZs2ICPjzdRUdHO9k5OTiYxJpzm1nbKahtJjAp1ZKuuCQsQGRLgqL+shpjwIPx
8vGlta2djdQN9o8Mc2apvwmaz0ys00FF/eS29QgII8POhrb2Doso6knuHA1Dd0Exbu43oMMe8RRV1RAT7E+jnQ3uHjYLyWlJiHPPWNLbQ0tpOdHgQm8bWJDo0kGB/H2w2G/lltaT0DgOLhbrGFhpa2oiNCHbkpaqeIH9fQgJ8sdvt5JXWkBQditVqpb6plbqmVvpEOuYtqW4gwNeb0EDHYT+5JdX0jQrFy8tKQ3MrNQ0txPUKcXw+axrw9fYiaNPnak+sI/r2TeSnn+YRHx+HNTYJe2sr9uINWJMd6zR7ZRX2jg6s0Y7Pja2gEEtULywBAdjbWrEXFGNNdeTDXl2NvbUN66YfE22FRVgiwrEEBUFHO7a8Aqz9UkiLCt7j6whrP8fn3lZSisXfD0uYIz+27FwsSQlYvH2wNzZir6zCmuD4jNlKy7D4+GCJcGTClpuHJSEOi48v9qYm7GUVWPs66rWVl2OxWrFsWvfY8tZj6RODxc8Pe0sL9o0lWJMc9dorKrHb7VijHJ8b2/oCLNFRUL1pr4DVijVlU3tXVWFv37K9i7D0isASGIi9rQ17QSHWTecF2atrsLe0YI1xrKdsRcVYwsO2au9kwIK9thZ7YxPWWMd2wVa8AUtIMJaQELDbsOXkY01NAosVe10d9rp6rHGO9ZRtYwmWwAAsoaGAHVt2HtbkRPDyxt7QgL26Bmt83D/t7eeHJXxTe+fkYklMwOKzqb0rqrAmxpMWFdzpOsLPz4+YmN78+utvhnyPiI6OcqyT9T1C3yO2+h4RFBREfn6+Yd8jHJkt2+e/R+ypvsbmz8b2WPrEJZjiIPviogKuueY6vv3uO8BxuOai33/ltNOHsnr1Gud8H380h9WrV/PwxEc8LsfTnrxlSxczcFAa9fX1e/Q97Iy0tDTS09ONLkPEjbLp0PeUS4wuYdssFkITHV9eagvWmeIyCuvnztpjy94yl9OiE/fY62xpbFnBHn+NvfVedpUVODbQ0Zlf2FiHzdhy9rrtZUDrSzEj5bJnCw4OZl1G+nb7NqY9xH79+vWUlJRw7LHHOqcFBwdzyCEHs2TJ0k6f19raSn19vfPPrMclb9y4wegSRDxSNsWMlEsxI+VSzEi5FDD4cM3AwECX694l9k1k//33o7qqmqLiYt54403uuP02cnNyWV9QwD1330VJSYlzb1935uXV446UlR5C2RQzUi7FjJRLMSPlUsDgTt6//nUQH380x3n/0UcmAvDh7DmMHTuOl17+L4GBgTz99JOEhoby119/celll9PS0mJUyV0mOjqa8vLy7c8ospcpm2JGyqWYkXIpZqRcChjcyfv990XExW/7fIRnnn2OZ559bi9VJCLSTdihta7KeVtkT7ADxW2tztsiItI9aH+uQdatW2d0CSIeKZvdhZ3mqlKji9hrlEtj2IGstu5/9MyeolyKGSmXAiYeeKWnS9o0ZLWI2SibYkbKpZiRcilmpFwKqJNnGD8/f6NLEPFI2ew+LFYvLFYvo8vYK5RL4/hgwQeL0WWYknIpZqRcCqiTZ5impkajSxDxSNnsJiwWQhL6E5LQHyw9/wu4cmkMK3B0YDBHBwbrC4MHyqWYkXIpoE6eYYqKio0uQcQjZVPMSLkUM1IuxYyUSwENvGKY/v37k56ebnQZIm7Mns2+p1xidAnSiT35f5MSE05uSbXjzsoFe+x1RHaG2deXsm9SLgW0J09ERERERKRHUSfPICUlJUaXIOKRsilmVFnXZHQJIm60vhQzUi4F1MkTERERERHpUdTJM0hMTIzRJYh4pGyKGUWGBBhdgogbrS/FjJRLAQ28IiLSPdmhtb7GeVtkT7ADG9vbnLdFRKR7UCfPIFlZWUaXIOKRstld2Gmu3Gh0EXtNQVmt0SXsk+zAutZmo8swLa0vxYyUSwEdrmmY+Pg4o0sQ8UjZFDPqHRZodAkibrS+FDNSLgXUyTNMQIC+sIg5KZvdiMXi+NsH+PnqwBOjWNGXhc5ofSl
mpFwKaL1tmJYWHf4i5qRsdhMWC6GJAwlNHLhPdPRa2zqMLmGfZAWODQzh2MAQfWHwQOtLMSPlUkCdPMPk5683ugQRj5RNMaMNVfVGlyDiRutLMSPlUkCdPMMMHDjQ6BJEPFI2xYySeocZXYKIG60vxYyUSwGNrikiIiLi0bToxG0+bg2Lxha9e3uZx5YV7NbzRUQ80Z48g5SVlRldgohHyqaYUVV9k9EliLixV1YZXYKIG23HBdTJM0xHR7vRJYh4pGyKGXXYdCluMR97hwYEEvPRdlxAnTzDxMb2MboEEY+UTTGjqFANCS7mY42OMroEETfajgvonDwRke7JDm2Ndc7bInuCHShrb3PeFhGR7kGdPIPk5GQbXYKIR8pmd2GnqbzY6CL2msLyWqNL2CfZgfRWXXOrM7aCQqNLEHGj7biADtc0TExMrNEliHikbIoZ9QoJMLoEETeWqF5GlyDiRttxAXXyDBMUFGR0CSIeKZtiRgF+PkaXIOLGEqAfH8R8tB0X0OGahmltbTG6BBGPlM1uwmIhNNFxwdvagnVg79lnTLW1axRDI1iBYwNDAFjYWIfN2HJMx97WanQJIm60HRfQnjzD5OTkGl2CiEfKpphRUUWd0SWIuLEX7DvnxUr3oe24gDp5hhk8eLDRJYh4pGyKGSXHhBtdgogba2qy0SWIuNF2XECdPBERERERkR5F5+QZpKKiwugSRDxSNsWMahr2/jD+06IT9/prSvdir642ugQRN9qOC2hPnmFaWnRSrJiTsilm1KqBV8SE7K1tRpcg4kbbcQF18gwTFxdndAkiHimbYkbRYRoSXMzH2jva6BJE3Gg7LqDDNUVEuic7tDXVO2+L7Al2oKKj3XlbRES6B3XyDJKbq+FtxZyUze7CTlNZkdFF7DXFuoSCIezA6pYmo8swLVvhvvMZlO5D23EBHa5pmKioXkaXIOKRsilmFBbkZ3QJIm4sEeFGlyDiRttxAXXyDBMSEmp0CSIeKZtiRkH+vkaXIOLGEqRzRcV8tB0X0OGahmlvbze6BBGPlM1uwmIhJL4/AHVFWWDv2WdMdXTYjC5hn2QFjg4IBuD3pnr0v7CVDq0vxXy0HRdQJ88wmZmZRpcg4pGy2X1YrPvOwRjry2uNLmGf5WWxGF2CadnyCowuQcSNtuMCOlzTMGlpaUaXIOKRsilmlBITbnQJIm6s/VKMLkHEjbbjAurkiYiIiIiI9Cjq5BmkqqrS6BJEPFI2xYxqG1uMLkHEjb1GhxGL+Wg7LqBOnmEaGhqNLkHEI2VTzKipVQMJiPnYm3QNQTEfbccF1MkzTEJCgtEliHikbIoZxYRrqHoxH2tsjNEliLjRdlxAo2uKiHRb7c36tVb2vGpdJkBEpNtRJ88g+fn5Rpcg4pGy2U3Y7TSW7jvDt2+orDe6hH2SDVjZokMSO2Mr3mB0CSJutB0X0OGahomICDe6BBGPlE0xo5AAX6NLEHFjCQ0xugQRN9qOC6iTZ5jQ0DCjSxDxSNkUMwpWJ09MyBIcbHQJIm60HRcweSdv/LixFBcVuPz98vM8o8vqEraODqNLEPFI2ewmLBaC4/sRHN8PLBajq9njbDab0SXsk6zAUQFBHBUQZO4vDEaxaX0p5qPtuEA3OCdv7doMRl90sfN+R3vPOAE8Y906o0sQ8UjZ7D6sXqZfhXeZ/DJdj8wovhZ17zpjy11vdAkibrQdFzD5njyAjo52ysrKnH+VVVVGl9QlBg8aZHQJIh4pm2JGyb11+JGYjzU1yegSRNxoOy7QDTp5KSkpLF2ymN9/W8iL//cf4uPijC6pS1ispm962Ucpm2JGln3gkFTphrSXU0xI23EBkx+uuXTZMu4cO47s7Gx6945h/Lg7+fTTjznp5FNpaGjw+BxfX198ff85QT8oyJwX0K2prja6BBGPlE0xo7qmVqNLEHFjr6szugQRN9qOC5i8kzdv3nzn7fT0tSxbtow///ids0cM5/0PPvT4nNtuHcP
48ePcpg8aNIjGxkYyMjJITk7Gz8+PxsZGiouL6d+/PwAlJRuxWKz07t0bgMzMTBITEvAPCKC5uZmCggIGDBgAQFlpKR02G7GxsQBkZ2cTGxtLUFAQra0t5ObmMWjT7vLy8nJaW1uJ27QXMjc3l+CQYNLC02hrayUrK5u0tDQAKisraWpqJD4+AYD8vDwiIiMJDQ3F1tFBxrp1pKUNBixUV1dRV1dPYmIiAOvXrycsNJSw8HDsNhtrMzIYNHAgVi8vamtrqKqqJinJcWhJYWEhQUGBREREbmrfdAYMGIC3tzd1dbWUl1eQkpICQHFxMX5+fvTq1QuAtWvXkpqagq+vHw0NDZSUbCQ1tR8AGzduwMvLm+joaADWrVtHUlJf/Pz8aWpqpKhoy/YuASAmJgaArKws4uPjCAgIpKWlmfz89QwcONDR3mVldHS0ExvbB4CcnGxiYv5p75ycXAYPHgxARUUFLS0tLu0dFdWLkJBQ2tvbyczMdLZ3VVUlDQ2NJCRsau/8fCIiwgkNDXO29+BBg7BYrdRUV1NTW0vfvn0BKCgoICQkmPDwCMBOevraLdq7lqrKSpKSkwEoKiokICCQyMh/2rt//374+PhSX19HWVm5S3v7+voSFRUFQEZGBikpyc723rhxI/36bW7vjXhZrURvmdnERPz9/WluaqKgsNCZ2dLSUux2GzExsc72jouLIzAwkJaWFvLyHJn19/enrb2NtrZ2+vTZ3N459O4dTXBwCG2trWTn5Djbu7KigqbmZuLj4wHIy8ujV6/Irdrbkdmqqioa6utJ2DKzYWGEhYU5M+ts75oaampqnO1dWFBAUHAwKTHhYLeTW1pD36hQvLysNDS3UtPYQlykYzjz0poG/Ly9CAvyd2SgpJqEXiH4eHvR2NJGVX0T8b1CHZ/P2ka8rVbCgx3z5pfW0CcyGF9vL5pb2ymrbSQxyjFvRV0TFiAyJMBRf1kNMeFB+Pl409rWzsbqBvpGOw4rrKpvwmaz0ys00FF/eS29QgII8POhrb2Doso6knuHA1Dd0Exbu43oMMe8RRV1RAT7E+jnQ3uHjYLyWsf7BmoaW2hpbSc6PIhKHKJDAwn298Fms5FfVktK7zCwWKhrbKGhpY3YCMcIgBur6gny9yUkwBe73U5eaQ1J0aFYrVbqm1qpa2qlT6Rj3pLqBgJ8vQkN9HO2oUt7N7QQ18vR3mU1Dfhu0d55JdXEb2rvppY2KuqaSIj6p729rBYiggP+ae+IYHx9vGhpbae0ppHEaMe8lXWO67NFhgTg5+1FdX0zvcMCWXbmBbS326iubiQqylFvQ0MLNpudkBBHDZWVDQQH++Hr601Hu43Kqgaiox31Nja20t7eQWioo4aqqgYCA/3w8/PGZrNTUVFPdHQIh2UsxV5Ti72pCWusYz1lK96AJTTEMaqirQNb7nrHIXsWK/a6Ouz1DVj7OD5jtg0bsQQFbRpm344tOw9rSl+wemGvr8deU4s13rGespWUYvH3wxLmyI8tOxdLUgIWbx/sjY3YK6uwJjg+Y7bSMiw+Plg2DZNuy83DkhCHxccXe1MT9rIKrH0d6zRbeTkWqxXLpnWPLW89lj4xWPz8sLe0YN9YgjXJ8RmzV1Rit9uxRjnW9bb1BViio6B604+qVivWFMc2xF5Vhb29A2u0Yz1lKyjC0isCS2Ag9rY27AWFWFMd6zR7dQ32lhasMY71lK2oGEt4GJagIOhox5ZXgLVfMmDBXluLvXGr9g4JxhISAnYbtpx81/auq8ca51hP2TaWYAkMwBIa+k97JyeClzf2hgbs1TWu7e3nhyV8U3vn5GJJTMDis6m9K6qwJm5q77JyLN5eWCIiNrV3Ppb4Plh8fbFjh6pqrH0TN7V3BRaLBUuvTe2dvx5L7BbtvaEEa/Km9q6sxG6zkRbl2B7t7PeI6OgoxzpZ3yP0PWKr7xHV1dVER0cb9j3Ckdky036PiIj4p723zGxFRSXJzvYuIsDfn8gtMts
vNRUfX0d7l5aWkZqaCsCGDRvw8fEmKira2d57sq+x+bOxPZY+cQn2HZrTJL7+6ksWLFjAE08+5fFxT3vyli1dzMBBadTXm+diumlpaaSnpxtdhogbs2ez7ymXGF2COVgshCY6vrzUFqwDe7dale+0lJhwckuqAdjfL2CvvObpf32/V17HzKzAsYGOzvHCxjo0xqkra78UbNm5u7WMsWUFXVSNiIPZt+Oye4KDg1mXkb7dvk23Omg3MDCQpKQkSktLO52ntbWV+vp6519nh3WKiHR3HS1NdLQ0GV2G9HB1HR3UaUh2EZFuxdSHaz780IN8/8OPFBYWEhsbw13jx2GzdfDpZ/8zurTdVlCgX+7EnJTNbsJup6Fk3xm+fWOVeY7E2JfYgGUtjUaXYVq2DRuNLkHEjbbjAibv5PXp04eXX3qRiIhwKior+evPvxg+4hwqKyu3/2STCwkJNtXhoyKbKZtiRkF+PjS19ozrpErPYQkKwt6oveliLtqOC5i8k3fzLWOMLmGPCQ+PYIN+ARQTUjbFjEIC/Siv05dpMRdLaAj2snKjyxBxoe24gMk7eT1bzx4kQbozZbNbsFgI7uMYSa1+Q26PH3ilx78/k7ICh/s7LkW0uLlBA6+4US7FjJRLUSfPMOnpa40uQcQjZbP7sHr7GF3CXpNbWmN0Cfssf11YuVO27DyjSxBxo+24QDcbXbMnGbTpui0iZqNsihklbbp2noiZWFP6Gl2CiBttxwXUyTOM1cvL6BJEPFI2xYys2pskZmTV+lLMR9txAXXyDFNbW2t0CSIeKZtiRg3NrUaXIOLGrhEMxYS0HRfQOXmGqeoBl4GQnmlXs9n3lEu6uBKRf9Q2thhdgogbe42+TIv56DumgPbkGSYpOdnoEkQ8UjbFjPpEhhhdgogba3yc0SWIuNF2XEB78kREuq2OVu3dkj2vwdZhdAkiIrKT1MkzSFFRodEliHikbHYTdjsNG/OMrmKvKa1uMLqEfZINWNLcaHQZpmUrKTW6BBE32o4L6HBNwwQEBBpdgohHyqaYkZ+vfpMU87H4+xldgogbbccF1MkzTGRkpNEliHikbIoZhQXqy7SYjyUszOgSRNxoOy6gwzVFRLoni4WgmCQAGkrywW43uCDpiazAIf6OvQLLmhuxGVuOiIjsIHXyDJKenm50CSIeKZvdh5fvvrN3K7ek2ugS9llBuuB3p2zZuUaXIOJG23EBHa5pmP79+xldgohHyqaYUWJUqNEliLixJCUYXYKIG23HBdTJM4yPj6/RJYh4pGyKGXl7aXMl5mPx9jG6BBE32o4LqJNnmPr6OqNLEPFI2RQzamxpM7oEETf2Rl1eQsxH23EBdfIMU1ZWbnQJIh4pm2JGVfXNRpcg4sZeWWV0CSJutB0X0MArhklJSdGJsWJKyqbsqv39AvbYsqOjQygr06/TYi7WhPjdHnxlWnRiF1WzbWPLCvbK64jxtB0XUCdPRKTbsrXrEEbZ85ptunCCiEh3o06eQYqLi40uQcQjZbObsNupL84xuoq9pq5Oh2sawQb82dxgdBmmZSstM7oEETfajgvonDzD+Ppq5CMxJ2VTzMhLo2uKCVl8NLqmmI+24wLq5BkmKirK6BJEPFI2xYwCA/WlRczHEhFudAkibrQdF9DhmiIi3ZPFQlBvx4ANDaUFYLcbXJD0RFbgX36BAKxoaURn54mIdA/q5BkkIyPD6BJEPFI2uw+vPTiapdmUl2tkTaOEeHkZXYJp2XLzjC5BxI224wI6XNMwKSnJRpcg4pGyKWYUER5kdAkibiwJcUaXIOJG23EBdfIM4+vrZ3QJIh4pm2JGXt7aXIn5WHx0rqiYj7bjAurkGaahQUNSizkpm2JGra3tRpcg4sbe1GR0CSJutB0XUCfPMBs3bjS6BBGPlE0xo/r6FqNLEHFjL6swugQRN9qOC6iTZ5h+/foZXYKIR8qmmFFkpM7JE/Ox9k0wugQRN9qOC2h0TRGRbsvWoUM
Y96Tvjzh9j7/G6X99v8dfY3e12nXhBBGR7kadPINoV7qYlbLZTdjt1BdlG13FXlNf12x0CfskG7CoSef3dMZWXm50CSJutB0X0OGahvGyqunFnJRNMSOL1WJ0CSJuLFpfiglpOy6gTp5honv3NroEEY+UTTGjoCANCS7mY4mMNLoEETfajgvocE0Rke7JYiEw2jHoQ2NZIdjtBhckPZEVOMAvAIBVLU3o7DwRke5BnTyDZGZmGl2CiEfKZvfh7R9odAl7TUVFvdEl7LPCvfRVoTO2vPVGlyDiRttxAR2uaZjExESjSxDxSNkUMwoLDTC6BBE3lj4xRpcg4kbbcQF18gzj7+9vdAkiHimbYkbePl5GlyDixuKnc0XFfLQdF1AnzzDNTU1GlyDikbIpZtTe1mF0CSJu7C0tRpcg4kbbcQF18gxTUFhodAkiHimbYkY1tfrSIuZj31hidAkibrQdF1AnzzADBgwwugQRj5RNMaNevYKNLkHEjTWpr9EliLjRdlxAo2uKiHRbdtuODWi/v58GLZFd16HLc4iIdDvq5BmktLTU6BJEPFI2uwm7nbrCfWeY7IZ6nftkBBvwa5MuX9EZe0Wl0SWIuNF2XECHaxrGbtclZcWclE0xIzvamyTmY9deTjEhbccF1MkzTExMrNEliHikbIoZBQdrSHAxH2tUL6NLEHGj7biADtcUEemmLARExwHQVFYM2tMle4AF2G/TOZ1rWpqUMhGRbkKdPINkZWUZXYKIR8pmN2EBnwDHiJNNFnp8H6+yQueFGcEC9PLydt7u4THbabb1BUaXIOJG23EBHa5pmLi4OKNLEPFI2RQzCgnVCKFiPpbe0UaXIOJG23GBbtLJu+rKK/lj0W/kZGfy5Refc/DBBxtd0m4LDAw0ugQRj5RNMSMfHy+jSxBxY/HXuaJiPtqOC3SDTt7ZZ49g4sSHmDr1eYYOO5M1a9Ywa+Z79OrVvU92bmnRcOBiTsqmmFFHu0aLE/Oxt7YaXYKIG23HBbpBJ++G669n1qz3+XD2bDIzM7l3wn00NTVz8UWjjS5tt+Tl5RldgohHyqaYUVV1g9EliLixF20wugQRN9qOC5i8k+fj48NBBx3IggULndPsdjsLFi7gsMMOM7Cy3Tdo0CCjSxDxSNkUM4qKCjG6BBE31pQko0sQcaPtuIDJR9eMjIzE29ubsvIyl+nlZeX079ff43N8fX3x9fV13g8KCnL51ywCAwMJDg42ugwRN7uazUB/vz1QjXTKYiHQzweAdn8/2MZFmf39uv//jZ+fL/49MGPeQeY+d8YKWAIcNXpbOtBBs66sAQHYTP5/uFlwk75z7Cv0HbNn29E+jak7ebvitlvHMH78OLfpy5YuNqAaERGRbbnP6AJ22HlGFyC75QKjCxCRLhUUFER9feeXFzJ1J6+yspL29naio1yHKI6KjqKsrMzjc/7vxZd49bXXXaZFRERQVVW1x+rcWUFBQSxbuphDDj2chgadZyLmoWyKGSmXYkbKpZiRcrlvCAoKoqSkZJvzmLqT19bWxsqVf3Psscfw7XffAWCxWDj22GN55+13PD6ntbWV1q1Gu9pWL9dIDQ0Npq1N9m3KppiRcilmpFyKGSmXPduO/N+aupMH8Nrrr/P8tKmsWLmSZcuWc/311xIYEMAHH842ujQRERERERHTMX0n7/PPv6BXZCR33zWe6OhoVq9ew6WXXU55ebnRpYmIiIiIiJiO6Tt5AG+/M52335ludBldprW1leeem+p2WKmI0ZRNMSPlUsxIuRQzUi5lM0ufuITOx90WERERERGRbsXUF0MXERERERGRnaNOnoiIiIiISA+iTp6IiIiIiEgPok6eAa668kr+WPQbOdmZfPnF5xx88MFGlyT7kCOPPJLp77zF0iWLKS4qYNjQoW7z3H3XeJYtXUx2ViYffjCLlJTkvV+o7FNuvXUMX3/1Jesy0lm5YhlvvfkG/fqluszj5+fHlMmTWLVqJZnr1vL
6a68SFRVlUMWyL7jiisv58YfvyVi7hoy1a/j888846aQTnY8rk2IGt465heKiAh59dKJzmrIp6uTtZWefPYKJEx9i6tTnGTrsTNasWcOsme/Rq1cvo0uTfURgYACr16Rz/wMPenx8zC03c801VzNhwv0MHzGCxsYmZs2cgZ+f316uVPYlRx91FO9Mn87wEedw0cWX4O3jzfuzZhIQEOCc55FHJnLaaady4403cf4FI4mJjeHNN14zsGrp6TZs2MCUJ55g2BlncsaZZ/Hrr7/x9ltvMnDgQECZFOP961//4rLLLmX1mjUu05VN0eiae9mXX3zOihUreODBhwCwWCws/utP3n77bV586WWDq5N9TXFRAddccx3ffvedc9qypYt59dXXeeXVVwEICQlhxfKljB07nv99/rlRpco+JjIyklV/r+C88y/kjz/+ICQkhL9XLmfMrbfx1VdfA9C/Xz9++WU+w0eczdKlywyuWPYVq1f9zaRJk/jyq6+VSTFUYGAg3333Dfff/wB33H47q9esZuLER7W+FEB78vYqHx8fDjroQBYsWOicZrfbWbBwAYcddpiBlYk49O3bl5iYGBYsXOCcVldXx7JlyznssEMNrEz2NaGhoQBUV1cDcNBBB+Lr6+uy/szKzqawsFDrT9krrFYr55x9NoGBASxeslSZFMNNmTKJuXN/cskgaH0pDt3iYug9RWRkJN7e3pSVl7lMLy8rp3+//gZVJfKP3r2jASgrK3eZXlZeRu/evY0o6f/bu/eoKMs8DuBfLiMwIJoynMRhQVNywbRsRdGpXTJd0gQ1wERNA+yc3dRNGDUvqeBllYtkeUERsS2VxMQ0L4UZqKEleOEeIIPgkAjiyHAZGJT9g5p6D61gCzOd8fs5Z87hfW7vb97znJnz433eZ+gxZGJigrCw1fj+++/xww8/AADsJfZoampCbW2toG1VVTXsJRJDhEmPiSFDhuDY0SOwsLBAfX09goLnoaioCEPd3DgnyWB8vL3xzNBnMHHSq+3q+HlJAJM8IiL6g9mwYT2GPP00pkydZuhQiHD9+nWMn+CFnj174tVJE7Hl/RhMe83P0GHRY8zBoR/Cw9fg9RkBaGpqMnQ49AfF5Zp6VFNTg5aWFkjshP9FsZPYoaqq6n/0ItKf27fb5qFEItyBS2Inwe3btw0REj1m1q9bi/Evj4Ov33T8+OMtXfntqtuwsLDQLeP8mURih9v8/KRupNVqUVpaiuzsbPx74ybk5eUhODiQc5IMZtgzwyCRSPDlqZMou6FA2Q0FxozxQFBgIMpuKFBVXcW5SUzy9Emr1SIrKxsy2VhdmYmJCWQyGTIzMw0YGVGbsrIyVFZWQiaT6cpsbGzw3HPPIjPzsgEjo8fB+nVr4eXlBT//6SgvLxfUZWVlo7m5WfD5+dRTAyGVSvn5SXplYmqKHj0sOCfJYM6dPw/Pl17G+AleutfVq9dwODkZ4yd44dq1LM5N4nJNfdsVF4f3YzbjWlYWrly5innzgiC2skLipwcNHRo9JsRiseB37xz/5Ag3N1eo7qqgrKjA7t3x+NfCBVCUKFBWXo4li+WorKwU7MBJ1NU2bFiPqVN88GZgMOrq6iH56bkRtVoNjUYDtVqNA4mfYs3qVVCpVFCr67B+XTgyMjK4Uxx1m2XvLsWZb1KhVCphY2ODqVN8MMbDAwEBszgnyWDq6+t1zyv/rKGhAXfv3tWVc24Skzw9O3r0GPr26YPF8lBIJBLk5uZh5qzZqK6u7rgzURcYPnwYPjuUpDsOW9P246mfHkzCokUh2LZ9B8RiMSIiNsLW1haXLl3CzFmzue6futXcOW8AAA5/liQof2dRCA4ebCtbsyYMrQ8eIG7XLlhY9EBqahqWLV+h91jp8WFnZ4cPtsTA3t4earUa+fn5CAiYhbPn2nYg5pykPyrOTeLv5BERERERERkRPpNHRERERERkRJjkERERERERGREmeUREREREREaESR4REREREZE
RYZJHRERERERkRJjkERERERERGREmeUREREREREaESR4REREREZERYZJHRERkZBYvliNi08YuG08kEuG7i+kYNmxYl41JRETdh0keERF1iwpl+UNfoSGLDB1il/vuYjqCg4MMGoNEIkFwUCC2fPChrszKygo7tm/DlcsZ2L5tK6wsLdv1Wbc2HBfSz0NRUoyMS9/ho717IJONBQBotVrExu7EihXL9PpeiIjo92GSR0RE3WL4syN0r/dWrUZtba2gbEfsTkOH2GlmZmZ6PZ9IJPrdfQMCZiAjIxNKpVJXNm9eMOrr6zEjYBY0Gg2C5wXr6qRSKU6dPIGxY8dg7br1GPfyeATMnI1v0y9gw/p1unaHk4/AfeRIuLi4/O7YiIhIP5jkERFRt6iqqtK91Go1WltbBWVTfLyRlnoGJdeLcDbtG8yZ84aur1QqRYWyHJMnv4rkw5/henERThz/AgMHDsDw4cNx8sRxFBUW4JOP/4M+ffro+sXEbMae+N0IWfQOsrOu4oeCPGzcuEGQNJmYmGD+/Ldx8cK3uF5chJSULzFp0kRdvYfHaFQoy+Hp+TecOnkcpYrrcHcfCScnJyTsice1q5dRVFiAE8e/wAsvyHT9DiUdhKOjI8LD1ujuVgJAaMgipHx1SnBtgoOD8N3F9HZxL1y4AJczM3DubCoAwMGhH2JjtyM/Lwe5OdlI2BMPqVT60Ovu4+2NlJTTgrLevXqhpKQEBQUFKC4uRi9bW13dvzesRytaMXHSZJw4cRIlJQoUFhZi1644vDrZR9fu3r17uJSRAR8f74een4iIDM/c0AEQEdHjZ+rUKZDL5VixciVycnIxdKgbIiMj0NDQgKSkQ7p28tAQrFodBqVSic2bo7Bt61bU1ddh1arVaGxsROzOHVi8WI5ly5br+shkY9HU1ITXfP3h6ChFzOZo3L2rwqZNEQCABQvm47VpU7H03eVQKBQYPXoUPvxgC+7cqcHFixd14yxfvgxrw9fhRlkZ7t27BweHfvj6zBls3BSB5uYm+Pr6Ym9CAl588a9QVlQgeN5bOJ3yJT7Ztx/79u1/5Gsik42Fuk6N12cEAADMzc2xf98nyMy8jKnTfNHS0oJ3/rUQ+/d9jHEvT4BWq203Ru/eveHiMhjXsq4Jyvck7MXBTxOxdOkSlJaWYvrrAbr2np5/w8ZNEWhsbGw3Xm1treD46pWrGOXu/sjvjYiI9ItJHhER6Z08NBTh4Wtx8mTbHa7y8nK4uLhg9qyZgiQvNnYn0tLSAADxu/dgx45t8POfjksZGQCAxAOJ8Pf3E4zdrNUiJCQUjRoNCgsLERkVjfdWrkBERCREIhEWLpiP6a/PQGbmZQBAWVkZ3EeOxOxZMwVJXlRkNM6eO6c7VqlUyMvL1x1HRkbhFS8vTJgwHgl7P4JKpcL9+/dRV1eHqqqqR74mDQ0NkMuX6JK3adOmwtTUFKHyxbo2i0JCUZCfizEeHkg7e7bdGP37O8DU1BSVlZWC8ps3b2Ks7AXY2dkJYnN2doapqSmKi693KsbKykpIpf0f+b0REZF+MckjIiK9srKywoABzoiOjkRk5CZduZmZGdRqtaBtXn6B7u+q6rbkJP/XZVXV6NvXTtgnLw+NGo3uODMzEzY2NnBwcIC1tTXEYjESDwjvtIlEIuTk5ArKrmVlCY7FYjHkoSEYN+4l2Nvbw9zcHJaWlujfv2uSnoKCAsHdOTdXVzg7O6OosEDQzsLCAk7OTkD7HA+WP22ootE0tav7ebnsr5mYPFqMjRoNrKysHq0TERHpHZM8IiLSK2trawCAfPESXLlyVVB3//59wXFLyy9JT2tr609lLb+UoRWmpp3PVKytxQCA2W/Mxa1btwR1zc3CxKihoUFwvGrVSrz4wosIX7sOpaWl0Gg0iNsVC1GPh2+S8uDBg3bZlMi8/ddvQ4NwuaTY2hpZWdmYv2Bhu7Z37tz5zXPV1NQAAHr37qX7+2EUilI8ePAAgwY91WFbAHi
id2/cudPxuEREZFhM8oiISK+qq6vx44+34OTkhOTkI10+vqurKywtLaH56W7eiBEjUFdXh4qKCqhUKmg0GvTv7yBYmtkZI/8yEgeTknDqVNsSU7FY3G4TlGattt1OnHdqamAvkQjK3NzcOjxfdnY2vCdPRnV1Nerq6joVY2npDdTW1sJlsAtKShQdtlepVEhNTcPcuXMQH7+n3XN5tra2gufynh7yNHJyczoVCxERGQ531yQiIr2Ljo7GgvlvIyjwTQwcOABDhgzBdH9/vPXWvP977B4iEaKjIjF48GC89JIn5KEhSEjYi9bWVtTX1yN25y6ErVkNPz9fODk54ZmhQxH45lz4+fk+dFyFQoGJr3jBzc0Vrq5/xvZtW2FqKvwavVl+E6NHjcKTTz6JPk88AQBIT7+Avn374u1//gNOTk6YO2cOPD09O3wfyYeTUXO3BgkJ8XB3d4ejoyM8PEZjbXgY+vV78jf7tLa24ty583B3H9nJqwUsX7ESZqamOHH8GCZOfAUDBjhj0KBBCAp8E8eOHhG0HeXujrS031gnSkREfyhM8oiISO/2H0iEXL4E06f74+vTKfjsUBL8/f1QVlb+f499/vy3UCgUSD58CLE7tuOrr1IQvTlGVx8REYmY97dgwfy3kZZ6Bvv2fYxx48Z1eO41YeFQ3buHo58fwUd7E5CamobsbOFdrcioKEgdpUj/9hxyctqe6SsuLsay5Sswd+4cnE75Es8+NxyxOzv+jcBGjQbTpvlCqVQifvcupKWeQXRUFCwsLKBW/+87e/sPHIC3jzdMOvnAXVlZGf7uNRHp6RewetV7OPP1aSQm7odMJsO7v9q19PnnR6Bnz544fvxEp8YlIiLDMennIG01dBBERERdISZmM3rZ2iIwKLjjxkbs+BfHEBe3G0c+/7zLxozdsR25eXn48MOtXTYmERF1D97JIyIiMjJLli6FmblZxw07SSQSIb+gAHFxu7tsTCIi6j7ceIWIiMjI5ObmITc3r8vG02q12LLlgy4bj4iIuheXaxIRERERERkRLtckIiIiIiIyIkzyiIiIiIiIjAiTPCIiIiIiIiPCJI+IiIiIiMiIMMkjIiIiIiIyIkzyiIiIiIiIjAiTPCIiIiIiIiPCJI+IiIiIiMiIMMkjIiIiIiIyIv8FRI3U+tKMtsAAAAAASUVORK5CYII=", + "text/plain": [ + "
" + ] + }, + "jetTransient": { + "display_id": null + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "madrid_s = climate.where(\n", + " (climate[\"city\"] == \"Madrid\") & (climate[\"day\"] >= SUMMER_START) & (climate[\"day\"] <= SUMMER_END)\n", + ")[\"temperature\"].to_numpy()\n", + "\n", + "london_s = climate.where(\n", + " (climate[\"city\"] == \"London\") & (climate[\"day\"] >= SUMMER_START) & (climate[\"day\"] <= SUMMER_END)\n", + ")[\"temperature\"].to_numpy()\n", + "\n", + "fig, ax = plt.subplots(figsize=(9, 4))\n", + "bins = np.linspace(0, 45, 30)\n", + "ax.hist(madrid_s, bins=bins, alpha=0.7, color=\"#e63946\", label=\"Madrid\")\n", + "ax.hist(london_s, bins=bins, alpha=0.7, color=\"#457b9d\", label=\"London\")\n", + "ax.axvline(\n", + " madrid_s.mean(),\n", + " color=\"#e63946\",\n", + " linestyle=\"--\",\n", + " linewidth=1.5,\n", + " label=f\"Madrid mean {madrid_s.mean():.1f}°C\",\n", + ")\n", + "ax.axvline(\n", + " london_s.mean(),\n", + " color=\"#457b9d\",\n", + " linestyle=\"--\",\n", + " linewidth=1.5,\n", + " label=f\"London mean {london_s.mean():.1f}°C\",\n", + ")\n", + "ax.set_xlabel(\"Temperature (°C)\")\n", + "ax.set_ylabel(\"Days\")\n", + "ax.set_title(\"Summer temperature distribution — Madrid vs London\")\n", + "ax.legend()\n", + "ax.grid(True, linestyle=\"--\", alpha=0.4)\n", + "plt.tight_layout()\n", + "plt.show()" + ] + }, + { + "cell_type": "markdown", + "id": "8a690193", + "metadata": {}, + "source": [ + "### 5.3 Mean summer temperature — all cities ranked" + ] + }, + { + "cell_type": "code", + "execution_count": 58, + "id": "334a833a", + "metadata": { + "ExecuteTime": { + "end_time": "2026-04-14T12:39:27.853545427Z", + "start_time": "2026-04-14T12:39:27.605095719Z" + }, + "execution": { + "iopub.execute_input": "2026-04-07T12:06:15.573465Z", + "iopub.status.busy": "2026-04-07T12:06:15.573114Z", + "iopub.status.idle": "2026-04-07T12:06:16.558429Z", + "shell.execute_reply": "2026-04-07T12:06:16.555709Z" + 
} + }, + "outputs": [ + { + "data": { + "image/png": "iVBORw0KGgoAAAANSUhEUgAAA90AAAGGCAYAAABmGOKbAAAAOnRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjEwLjgsIGh0dHBzOi8vbWF0cGxvdGxpYi5vcmcvwVt1zgAAAAlwSFlzAAAPYQAAD2EBqD+naQAApOtJREFUeJzs3XdYFNcaBvB3l94VxUIJqGBNtWs0MTcW7GJBY42JJbHElsTeoqCmWGKJMfbescfeu9iRDooUFRSR3vf+Ydy4QWDR2R04vr/n2eeyM2dmv93XS/h2Zs4oKto7qkBEREREREREklPKXQARERERERGRqNh0ExEREREREekIm24iIiIiIiIiHWHTTURERERERKQjbLqJiIiIiIiIdIRNNxEREREREZGOsOkmIiIiIiIi0hE23UREREREREQ6wqabiIiIiIiISEfYdBMRUbFx6eJ5zJs3V6ux27dtxfZtW3VckXzGjB6FmOhI2JYurbPXcHR0REx0JDw9uxU4ztOzG2KiI/H+++/rrBbS3rx5cxESHCh3GUREpCU23UREpHPOzs6YM2cWLpw/i/CwEAQF+mP3rp34+uuvYGpqmu92bm5uGDN6FBwdHfVYLRHlx6NTJwwY8LXcZRARlSiGchdARERi+/zz/2HZn0uRkZGB7dt3IDAoCMZGxqhfvx4mT5qIalWr4sex4wAATT9phtzcXPW2Vau6YcyY0Th/4QKioqI09vtFz156fR9EBHTy6Ijq1aph+fIVcpdCRFRisOkmIiKdcXJywh9LFiMqKgrdPHsgNjZWvW71mjVwcXHB55//T70sMzNT631nZWVJWuvrUCgUMDY2RkZGhtylkGDMTE2Rlp4udxlERCQBnl5OREQ6M2TIt7C0tMSY73/QaLhfuHfvHlasWKl+/vI13Z6e3fDXsj8BADu2b0NMdCRioiPRqFFDAK++ptvY2BjfjxmNc2fP4G54KHyvXMKkiRNgbGysMe6Tpk2xy2cHAvz9EBIciDOnT2LcuLGFvp+Y6Eh4zZwBD49OOHH8KO7dDcNnzZoBAL4ZPBh7dvvAz+8WwkJDcPDv/Wjbtk2++3Bv1QrHjx3F3fBQnDh+FM3+2U9BHBwccO7sGRw/dhRly5YFAFhbW2P69KnwvXIJd8NDce7sGQwd8i0UCoXGttbW1pg3by4CA+4gwN8P8+fPhY2NdaGv+TIzMzPMmTMLfn63EBTojwUL5sHGxka9fv78ufC7fROGhnm/09+0cQPOnD5Z4P4rVXLBX8v+xI3rVxEeFgJf38v4Y8liWFlZASj4GvSY6EiMGT1K/fzFNfGVK1fCwt8XIDDgDm7fuoEffvgeAGBvXxGrVq5AUKA/bly/isGDB2nsr1GjhoiJjkT79u0wetRIXPW9guCgACxbthRWVlYwNjbG9OlTcevmdYQEB2Le3N/y/DsDgM6dPXDw7/0ICw3BHb/b+GPJYtjbV9QYs33bVhw/dhTvvfcedu7YjrDQYIwbX/i/xwoVKmDliuUICQ7E7Vs3MGXyJCiVmn/aKRQKDBjwNU4cP4rwsBDcvHENc+bM0sgNAFq1bIm1a1fj2lVf3A0PxflzZzFy5AiN/W3fthUtmjeHk5OT+v+Ply6eL7ROIqK3HY90ExGRzrRo0Rz37kXA1/dqkbe9ePESli9fgQEDvsaC3xciJCQEABASEvrK8QqFAqtXrUT9+vWwfsNGhISEoEb16hg4cAAqV66Mr74eAACoWrUq1qxZhYCAQPz662/IyMxEJRcX1KtbV6u6Pv64Mdq3b4dVq1Yj/ulTREZFAgAGDPgKhw8fwc6du2BkbISOHTrgr2V/ok/ffjh27LjGPurXr4fWrVtjzdq1SE5OxtdffYXlf/2JevUb4OnThFe+rrOzM7Zt3YyEhAT06NET8U+fwszUFDt2bEPFChWwbv0GREdHo27dOhg/fh
zKlS+HqVOnq7dftXIF6tevh3Xr1iMkJATurd0xf/48rd7zC14zZyAxMRFzf5uLKlWqoG/fPnB0cESXrs+b4B07dsKzWzc0a/Ypjh49pt7Ozs4OH3/cGHPnzc9330ZGRti4YT2MjU2wctVqxMXGokKFCmjevDmsra2RlJRUpFpfWPrHEoSEhMJ71mx8/vn/MGrkCCQkJKBP7144e+48vLxnobNHJ0ydMhk3btzEpUuXNLYfPmwo0tPTsXjxYri4uOCrr/ojOysbubm5sLGxwW9z56F27Y/Qvbsn7t+/j3nzF6i3/e674fjxh++xd+8+bNy0GWVsbfHVV/2xc8d2tGzVGomJieqxpUuXwob1a7F79x7s2LkTj+MeF/i+lEoDbNywHtevX8dPM2aiadMm+OabwbgXEYG1a9epx/08ZzY8Pbthy5atWLFyFd5xckL//l/i3VrvomMnD2RnZwN4/iVXakoqli37CympKfj448b48YfvYWVpiRkzvQAAv/++ENbWVqhYsSKmTnv+bys1JfW1ciEiepuw6SYiIp2wtLSEfcWKOHjw0Gttf//+fVy6fBkDBnyN06dP48KFiwWO9/DohKZNm6BLl264fOWKenlgUBB+njMbdevWga/vVXzySVOYmJigd+8+iH/6tMh1ValSBf/7vIX6S4AXmjT9FOkvnQ68atVqHDr4NwYNGpin6XZ1dUWzzz5HREQEAOD8+fM4dvQIOnXsiFWr1+R5TdcqVbBly2Y8fPgQPXv1xrNnzwAAgwYPgouzM1q2csfdu/cAAOvXb8Cjh4/w7bff4M8/lyEm5gFatWyJRo0aYsaMmfhj6fOzB9asXVfk2d+zsrLg2b2HulGLiorC5MmT0LJFCxw+cgRnz55DTEwMunTurNF0d+rUEUqlEjt27Mx331WrusHZ2RkDBw3G/v0H1MtfbmJfx/UbNzB27HgAzz+by5cuYOqUyZg1azYWL/kDALBr125cv+aLHj2652m6DQwM0blLN/V7LlOmDDp27IATJ06iT99+AIA1a9aikosLevTorq7XwcEB348ZjTk//4KFCxep93fg74M4fOhv9OvXV2N5+fLl8ePYcVi/foNW78vMzBR79u7F/H9eb9269Th08AC+6NFD3XTXr1cPvXr1xNChw+Gza5d623PnL2DTxvVo366devnQYcM1/v2uW7ceCQkJ6NevL+b8/AsyMzNx+swZPHj4EDY2Nti500erOomIiKeXExGRjlhZWQIAklOS9fJ67du1Q0hIKEJDQ2FburT6ce7cOQBA48aNAUB9dLFVq5Z5TsHWxoWLF/M03AA0GhYbGxtYW1nh0uXLeO/dd/OMPXP2rLrhBoCAgEAkJibiHWfnPGOrVa+GHTu2ISoqEt17fKFuuAGgXbu2uHTpMp4lPNN4z2fOnoWhoSEaNGgAAPjf558hKysLa146Apqbm4uVq1YV6b2v37BB3XwCzxv3rKws/O/zzwAAKpUKO3f6oGXLFrCwsFCP6+zhAV9fX0RGRua778TE50eym336KcwKmNG+qDZu3Kz+OTc3Fzdv3oJSqcSmTf8uT0xMRFhYGJzfeSfP9tu3b9d4z9euX4dSqcTmLVs0xl27fgP29vYwMDAAALRp0xpKpRJ79+7TyCYuNhZ3797Fx40baWyfnp6OLVuK9iXIy0e0AeDSpct456X30K5dWzx79gynTp/WqOH2rVtITk5G45dqePnfr4WFBWxLl8alS5dhbm4OV9cqRaqLiIg08Ug3ERHpRFLS82bb0sJSL69XqZILqlatCj+/W69cX7ZMGQDAnj170fOLHvjtt18xYcJ4nD17Dgf+/hv79u2HSqUq9HUi77+6cWze/HOMGPEdatWsqXEbtJdnY38hOjomz7Jnz56h1H+uswWANatXIS7uMb7o2RupqZqn8lauVAm1atbM/z3/c923o4MjYmNj82wfFhb2yu3yc/fuXY3nqampiI2NhZOjk3rZtu07MGzYULRu7Y
7t23egSpXK+OCD99Uz1OcnMjISS/9chm8GD0Lnzh64dOkyDh8+gh07d772qeUAEB0drfE8MSkJaWnpec5ySExMQulX3BM9OkYzqxe1xPx3eWIiDAwMYG1thadPE1CpUiUolUqcP3fmlXVlvdTIA8DDh4+KNDlgWlo64uPjNZY9e/YMpUuXUj+vVKkSbGxs4Hf75iv38eLfB/D8souxP/6Ajz9uDGtrzWv9rayKdu0/ERFpYtNNREQ6kZycjAcPHqJatWp6eT2lUgl//wBM/+mnV65/0SSlp6fDo3NXfPxxY3z++ef4rNmn6NixA86cPYsvvuj1yib5ZemvmFG6fv36WL1qJS5evIQJEybhUewjZGdno7unJzp39sgzPjcn59U7f8WR9/0H/kZ3z27o3Nkjz6nHCoUCp06dxpI//njl7sLDwgt8L7oQEhKCmzdvoUvnzti+fQc6d+6MjIwM7N27r9Btf/ppBrZu3YZWrVri008+wYwZ0zFs+FC0b98BDx48zPdLkf9OHvayV33Wubmv/vxfdeZDTj5Z5eS8+t+JAop/alIgNzcXvXr3feXrpaSkaDx/1b+rguT3Hl6mVCoRFxeHYcO/e+X6J0+eAHg+yd7OHduQlJSEX379DREREcjIyMB7776LSZMmQqks+hkhRET0LzbdRESkM0ePHUWf3r1Rp05tXL16rcjba3Pk+YV7ERGoWbMmzpw5q9V+z549h7Nnz2H6dGD48GEYP24sPv64sVbb/1fbtq2RkZGBnr16a9z2rLunZ5H39V8zZsxETnY2Znl7ISU5RePa3IiICFhYmBdac1R0FJo0+Rjm5uYaR7urVCnaacOVKlXC+fMX1M/Nzc1Rrlw5HDuuec369u3bMXXqFJQrVw4enTrh2LHjGqfFFyQwMBCBgYFYsOB31K1bB3t270KfPn3w88+/qPfx3yOxjo6ORXof+hBxLwJKpRKRkfcRHn638A10UUNEBJo2bYIrV3wLbOobN2oEW1tbfD1gkMY17U5OTnkHF+H/k0RE9Byv6SYiIp1ZsmQpUlJS8OsvP2ucyvqCs7Mzvv76q3y3T01NAwDYWOc97fq/9u7dB/uKFdGrV88860xNTWFmZgYAKFWqVJ71d+7cAYBX3vJJGzk5uVCpVDB46Yiro6Mj3N1bvdb+NKhU+OHHsdi//wDmz5+Lli1aqFft3bsPdevWxaeffppnM2tra/X1xcePnYCRkRH69e2jXq9UKvFV//5FKqV3r14atwPr17cPjIyMcOL4SY1xPrt2Q6VS4aefpsHFxRk7duY/gdoLlpaW6npfCAgIRE5ODkz+ySU5ORlPnjxBw3+uVX/hy359i/Q+9OHA3weRnZ2N0aNGvXL9y6eB68qevftgaGiIkSNH5Fn3/FT4519e5Pxz1PzlI/1GRkav/FxTU9PUt3AjIiLt8Eg3ERHpTEREBIYOHY4//liCUydPYPuO7QgKDIKRsTHq1qmDdu3aYuu2bfluf+fOHWRnZ2PI0G9hZW2FzIxMnD13Tn1a7Mu2b9+B9u3bYc7sWfi4cWNcuXIFSgMDuLq6on37dujZszdu3bqFUaNGoGGDBjh67Diio6JQpmxZ9OvXFzExMbh8+corqijcsWPH8M3gQdiwYT18du1C2TJl8OWX/XD33j3Uqlnztfb5MpVKhWHDv8PKlcuxdOkS9OnbD+fOnccffyxFy5YtsHbNKmzdug23bt+Gubk5qlevjnZt26BBg0aIf/oUh48cweXLlzFhwng4OTkhODgErdu4w8q6aM2TkZERtm7ZjL1796JKlSro168vLl26jEOHD2uMi4+Px8mTJ9GhfXskJCTkmb39VZp8/DFmes3Avn37ER4eDkMDA3Tp0gU5OTnYf+Df2cw3btyE4cOH4ddffsbNW7fQsEEDVK5cuUjvQx8iIiLw88+//POZO+LgwUNITknBO05OcG/tjg3rN2Lpn3/qtIaLFy9i7bp1+G74MNSqWROnTp9GdlY2Kl
V2Qbu27TBl6lTs338Avr6+ePo0AQvmz8WKlaugUqnQtUvnV55uf+vWLXTs2AFTp07BzRs3kZKagiNHjur0fRARlXRsuomISKcOHzmC5i1a4NtvvkGrli3Rt08fZGZmIiAgAD/9NAMbNm7Kd9u4uDiMGzcew4YNw2+//gJDQ0N06doNFy7kbbpVKhW++moABg0cgK5du8LdvRXS0tJx//59rFi+AuHhz69vPnz4CJwcndCje3fY2pZGfPxTXLx4Eb/+9ttrT9h17tx5jB79PYYOG4Lp06YiMjISXt6z4OToKEnTDQDZ2dkYNOgbrF+3FqtWrkD3Hl/g+vUb6NylG777bjjatWuLrl27IDk5GeHh4fj1t7lI/Of9qFQqfNn/a0yfPhWdO3tApVLh8JEj+OmnGThyWPtbuk2cNBmdO3fC999/DyMjQ+zatRuTp0x95dht23egRYsW2Ltvn8Yp9/m54++PUydPoUXz5qhQoQLS0tPg7++P3n364tq16+px8+YvQJkyZdC2bRu0b98OJ06cRK/effKdLExOixYvQVh4OAYNHIjRo58f8Y6JicHpU6dx+MjhQraWxrhxE3Dr1m306d0b48eNRXZ2NiIjo7Bz505cueILAHj6NAH9vvwSU6ZMxtgff0BCwjPs3LkTZ8+ew6ZNmvMIrF6zFrVq1UJ3z24YPGggIiMj2XQTERVCUdHekRfnEBERkaRatWyJVatWoJNHF1y+fFnucoiIiGTDa7qJiIhIcj17fYF79yLYcBMR0VuPp5cTERGRZDp26IAaNWugRfPmmDx5itzlEBERyY6nlxMREZFkYqIjkZycjD179mLsuPH53ueaiIjobcGmm4iIiIiIiEhHeE03ERERERERkY6w6SYiIiIiIiLSEU6kpkPly5dHSkqK3GUQERERERGRDlhYWODRo0cFjmHTrSPly5fH9Wu+cpdBREREREREOvRR7boFNt5sunXkxRHuj2rX5dFuQVSuXAnh4XflLoMkwjzFwjzFwSzFwjzFwjzFwjzfnIWFBa5f8y2032PTrWMpKSlITk6WuwySQFZWNrMUCPMUC/MUB7MUC/MUC/MUC/PUH06kRqSl5OQkuUsgCTFPsTBPcTBLsTBPsTBPsTBP/WHTTaSluLjHcpdAEmKeYmGe4mCWYmGeYmGeYmGe+sOmm0hLlSpVkrsEkhDzFAvzFAezFAvzFAvzFAvz1B823UREREREREQ6wqabSEsxMTFyl0ASYp5iYZ7iYJZiYZ5iYZ5iYZ76w6abSEvGxsZyl0ASYp5iYZ7iYJZiYZ5iYZ5iYZ76w6abSEtly5aVuwSSEPMUC/MUB7MUC/MUC/MUC/PUHzbdRERERERERDqiqGjvqJK7CBFZWloiOCgAVavV4E3nBaFUKpGbmyt3GSQR5ikW5ikOZikW5ikW5ikW5vnmtO35eKSbSEuVKrnIXQJJiHmKhXmKg1mKhXmKhXmKhXnqj6HcBYiuVs2aSE1NlbsMkoCTkyPMzczlLoMkwjzFUpzyjI+PRzRnhH1txsYmcpdAEmKeYmGeYmGe+sOmW8d8fHbIXQJJ5OnTpyhdurTcZZBEmKdYilOeqWlp+PSTZmy8X1NKSorcJZCEmKdYmKdYmKf+sOnWsbk+ZxAaEy93GSQBAwWQwxkQhME8xVJc8nynnA3GdWsGW1tbNt2v6eHDh3KXQBJinmJhnmJhnvrDplvHouISEfrgidxlkAScrI0QmZgldxkkEeYpFuYpjipVqiAgIEDuMkgizFMszFMszFN/OJEaERERERERkY6w6SbS0tP0HLlLIAkxT7EwT3HwdEexME+xME+xME/9YdNNpCWlQu4KSErMUywlIc8GDRpgzeqVuHbVFzHRkXBv1eqV41xdXbF61UoEBtxBaEgQDuzfBwd7+3z36+nZDTHRkRqP8LCQQusZPWokfH0vY5fPDlSuXEljnZGREYZ8+w2OHDmEsNBg+N2+id27dqK7pycMDXV7ZZqBkn
+aiIR5ioV5ioV56s9b8UkX9McNkbZsTAzkLoEkxDzFUhLyNDc3wx3/AEyYOCnfMc7Ozti1aydCQ0PRtasnPm/eEvPnL0B6RkaB+05MTMQHH9ZWP+o3aFTg+Hp16+Lzzz9H//5fw2fXbnjNnKleZ2RkhI0b12Po0KHYsH4jOnTshDZt22P16rX46qsvUa1a1aK98SKyK1dOp/sn/WKeYmGeYmGe+qP3idTmzZsLG2trfPX1AH2/NBERkWxOnDiJEydOFjhm3Ngfcfz4ccz08lYvi4iIKHTfKpUKcXFxWtdiU8oGjx49QkBAAAwNDeDZrZt63cABX6NhgwZo3bot/O7cUS+/f/8+9u7bByMjI61fh4iIiN6SI91EUohO4szIImGeYhEhT4VCgc8//x/Cw+9i44b1uHXzOvbt3aPVmVoWFha4fOkCfK9cwqqVK1C1asFHo0+ePAUTExOEhQZjw/p1mDV7tnqdR2cPnDlzVqPhfiE7OxtpaWlFf3NFEBJS+KnxVHIwT7EwT7EwT/0pVk13w4YNsX/fXtwND8X1a76YMH4cDAz+PWVw+7atmPHTdEyaOAF3/G7jxvWrGDN6lMY+KlVywc4d2xEeFoKTJ47hk6ZN87xO9erVsXXrZoSFhsDP7xZ+njMb5ubm6vXz5s3FyhXL8c3gwbh+zRd+frfg7TVT59exUfFmZ878RcI8xSJCnmXLloWlpSWGDR2CEydP4ouevXDw4EEsX74MDRs2zHe7sLAwjB7zPfp/9TWGDR8BpVKBPbt9ULFihXy3yc7ORq/efVC7Tj188GFtnD17Tr2ucqVKCA0NlfS9FYWTk5Nsr03SY55iYZ5iYZ76U2z+SqlQoQLWr1uDrVu3YcSIkXB1dcUvv8xBRkYGfps7Tz2uW7euWLbsL7Rr3x516tTB/HlzceWKL06fOQOFQoHlf/2Fx4/j0K59B1hZWeOn6VM1XsfMzAwbN6zH1atX0aZtO5QtWwa//vIzvLxmYtSo0epxjRs3wqPYWHTr1h0ulVyw9I8l8LtzBxs3bnpl/cbGxjA2NlY/t7CwkPgTIrkZG5SAmZpIa8xTLCLkqfxnQptDhw7jr7+WAwDu3PFH3bp10bdPb1y8ePGV2129eg1Xr15TP/f19cWpkyfQu3dv/PLLrwW+5pMnT16xVN7P0tTUVNbXJ2kxT7EwT7EwT/0pNke6+/Xri5iYGEyYOAmhYWE4eOgQfv1tLgYPHgSF4t8/AAICAjF33nzcvXsP27fvwM2bt9CkyccAgE+aNoWraxV8N2IU/P0DcOnSJcya/bPG63h4dIKJiQm+GzESQUFBOHfuPCZOmoyuXTqjbNmy6nHPnj3DxH9qOXr0GI4eO4amTZrkW//wYUMRHBSgfly/5ivxJ0Ryy8xRyV0CSYh5ikWEPOPj45GVlYXg/5zuFxISAgeH/Gcv/6/s7Gz43fFDJReX16oj/G44XF1dX2tbKaTr+PR10i/mKRbmKRbmqT/Fpul2c3XV+KYeAK5cuQJLS0vYV6yoXhYQEKAxJjY2Vt0su7q5IiYmBo8ePVKvv3r1qubruLnBP8Bf45q0K1d8YWBggCpVqqiXBQUHIzc399/XeRSLMmXL5Fv/wkWLUbVaDfXjo9p1tXnbVII8Ts2WuwSSEPMUiwh5ZmVl4ebNm6hSpbLG8sqVKyMqKlrr/SiVStSoXh2PYmNfq45dPrvQtGkTvFurVp51hoaGMDMze639aisyKkqn+yf9Yp5iYZ5iYZ76U2yabm1lZWtOlqNSqaDQwT3msrM0/4BTQQWlIv/XyczMRHJysvqRkpIieU0kL3srztgrEuYplpKQp7m5OWrVqolatWoCAJzecUKtWjU17sG95I8/0aF9e/Ts+QVcXFzQ/8t+aNGiOdasWases2DBPIwfN1b9fNTIEfj0k0/wzjvv4L1338Wihb/DwcEx38uhCvPX8hW4csUXW7Zsxpf9+qFmzRp455130L59O+zbuzvPPb2l5ubmpt
P9k34xT7EwT7EwT/0pNtd0h4SGom2b1hrL6tWrh6SkJMQ8eKDVPkJDQmFvb49y5coh9p9v+GvXrq35OiEh8OzWDWZmZuqj3fXq1UVOTg7CwsIkeCdERER5ffDB+9ixfZv6+fRpz+cc2bJ1m3pOkYMHD2LcuAkYNnwoZvz0E8LDwzBw4GBcvnJFvZ2DvQNyc/89nd6mVCn88ssc2NnZ4dmzZ7h1+zY6duz02rPSZmZmoscXPTFo4AD07t0LkydPQlp6GkJDQrBi5SoEBga91n6JiIjeVrI03VbWVupv+l9Yv34DBg74Gl4zZ2DVqtWoUqUKvh8zGsuW/QWVSrtr9U6fOYPw8HAsmD8PM2bOhKWlFcaN/VFjjM9OH3w/ZgwWLJiH336bhzJlbDFzxgxs37ETjx8/luw9kngS0nPkLoEkxDzFUhLyvHDhIuwdCp8pdvOWLdi8ZUu+67t289R4Pm3adEybNv2N63tZZmYmFi1egkWLl0i6X23EvuZp8VQ8MU+xME+xME/9kaXp/rhxYxw5fEhj2caNm9C7Tz9MnjQRR44cQkJCAjZt2oz5C37Xer8qlQpfDxiI3379Ffv37UVUVBQmTZ6KTRvXq8ekpaejZ6/e+OmnaTiwfx/S0tNwYP8BTJv+k0TvjoiIiF6XSpVb+CAqMZinWJinWJin/igq2juW/ClfiyFLS0sEBwVg9LL98Lv/qPANqNhzsjZCZGJW4QOpRGCeYikuebpWLIMlQzuiVavWuO3nJ3c5JVKNGjXyTJpKJRfzFAvzFAvzfHMver6q1WogOTk533ElbiI1IiIiIiIiopKCTTeRlh4ky38UjaTDPMXCPMURGhoqdwkkIeYpFuYpFuapP2y6ibRka1ZsJvsnCTBPsTBPcdi/dAs1KvmYp1iYp1iYp/6w6SbSkomBQu4SSELMUyzMUxzm5uZyl0ASYp5iYZ5iYZ76w0MDOuZoZ430rGy5yyAJlDIGTCzkroKkwjzFUlzyfKecjdwllHgZGRlyl0ASYp5iYZ5iYZ76w6Zbx0Z7NJW7BJJIbm4ulEqeHCIK5imW4pRnaloa4uPj5S6jxLp3757cJZCEmKdYmKdYmKf+sOnWMQ+PLkhNTZW7DJKAk5MjIiOj5C6DJMI8xVKc8oyPj0d0TIzcZZRY1apV4y1sBMI8xcI8xcI89YdNt47d8fcv8J5tVHJk5+TwF5NAmKdYmCcREREVV8XjXDyiEuDx4zi5SyAJMU+xME9xMEuxME+xME+xME/94ZFuHatVsyZPLxeEhYUFKpSvIHcZJBHmKRbmKY4XWfI0fTFkcTJZoTBPsTBP/WHTrWM+PjvkLoEkEhsbi3LlysldBkmEeYqFeYrjRZbpaWlo+kkzNt4lXMWKFZGQkCB3GSQR5ikW5qk/bLp1LP3CKqjiI+QugySQkWuFNGWS3GWQRJinWJinODJyrZBe2gqmTQbB1taWTTcREZV4bLp1TJX4ELnx9+UugyRgDUPkgqfhiIJ5ioV5isMahlAp7eUugyQSHh4udwkkIeYpFuapP5xIjUhLqUoLuUsgCTFPsTBPcTBLsZQrZyd3CSQh5ikW5qk/bLqJtJSlMJa7BJIQ8xQL8xQHsxSLpaWV3CWQhJinWJin/rDpJtKSEjlyl0ASYp5iYZ7iYJZiycrMlLsEkhDzFAvz1B823URaKpUTL3cJJCHmKRbmKY7Cshw2bCgO7N+H4KAA3Lp5HStXLEeVKpU1xpiYmMDbayb8/G4hJDgQfy37E2XLli1wv61bu2PTxg3w87uFmOhI1KpVs9BalUolvL29cP2aL9atXYMyZcporLe0tMTYsT/i9KkTCA8LwY3rV7Fl80a0bu1e6L5FEcZrRoXCPMXCPPVH5033vHlzERMdidmzvfOs8/aaiZjoSMybN1fXZRC9sXgDXvciEuYpFuYpjsKybNSwIVavWYN27Tuixxc9YWhkiE0bN8DMzEw9Ztq0qWjRojkGD/4Gnbt0Q/kK5bFi+b
IC92tubo7Lly/D2yvv3yv56dixAxwc7NGzV2/c9vPD2B9/UK+ztrbGnt270K1rFyxctBit3Nugc5eu2L1nLyZNnAhra2utX6ckq169utwlkISYp1iYp/7oZfby6OhodOzQAdOm/YT09HQAz7+F7tSpI6KiovRRAhEREQmgV+8+Gs9HjhwNv9s38f777+PSpUuwsrLCFz26Y+iw4Th37jwAYPSoMTh9+iRq1/4I165df+V+d+zYCQBwdHTUupZSNjaIioxCYGAQ3Nzc0KZNa/W6cePGwsnJEU2afopHjx6pl4eH38WuXbuRkZGh9esQEVHJppfTy2/f9kNMzAON06natG6N6JgY+PndUS8zNjbGjJ+m49bN6wgPC8Eunx344IMP1OttbGywaOHvuH3rBsJCQ3D27Gl09/RUr69YsQKWLF6EO363ERoShL8P7MdHH32oXt+3bx+cP3cW9+6G4czpk+jSpbN63ZTJk7BmzSr18wEDvkZMdCSaNWumXnbu7Bn0/KKHVB8LlTCmqjS5SyAJMU+xME9xFDXLF0eMExISAADvv/8ejI2NcebMWfWY0LAwREVFoU6dOpLVCQA7dvqgTp3auHc3DFOmTMaCBb8DABQKBTp26ICdPj4aDfcLqampyMl5O65dj3/yRO4SSELMUyzMU3/0dp/uzVu2oEd3T/j47AIA9OjhiS1btqJxo0bqMZMmTkCbNm0wYuQoREVFY8iQb7Fxw3p83KQpEhIS8OMP36NqVTf06t0X8fHxqFTJBaampgCenxa2Y/t2PHz4EP37f4XYuDi89967UCqff6/g7u6On6ZPw9Rp03HmzBk0b94c8+b+hgcPHuD8+Qu4cPEivviiB5RKJXJzc9GoYUM8efIEjRs1xMmTJ1GhQgVUquSC8xcu6usjo2LGUJUldwkkIeYpFuYpjqJkqVAoMH36VFy+fBlBQUEAgHJ25ZCRkYHExESNsXFxj1HOTtrLEBITE+Heui3s7Ozw5MkT5ObmAgBsbW1RunQphIaGSfp6JVHaP2c4khiYp1iYp/7orenesWMnxo8bCwcHBwBA3br18O23Q9VNt5mZGfr27YNRo8bgxImTAIAffvgRn1y8gC96dMcfS/+Eg4MD/Pzu4NatWwCgcWq6h0cnlCljizZt26m/7b537556/bffDMLWrduwZs1aAMCyZX+hdu2P8M03g3H+/AVcunQZlpaWePfdd3Hr1i00aNgAS/9YilburQAAjRo1RMyDBxr7fJmxsTGMjf+9zYmFBe8zKppkpTVMcuLkLoMkwjzFwjzFkay0hlnhwwAA3t5eqF6tGjp5dC58sA7FxWn+21MoFDJVUvw4ODjk+QKESi7mKRbmqT96m708Pj4ex44dR3fPbujR3RPHjh9D/NOn6vUuLs4wNjbG5StX1Muys7Nx48YNuLm5AQDWrF2Ljh074Mjhg5g0cQLq1v33NLFatWrBz++OuuH+L1dXN1zx9dVYduWKL9xcXQE8/7ba398fjRs1Qo0a1ZGVmYn1Gzbi3Vq1YG5ujkYNG+JiAUe5hw8biuCgAPXj+jXffMcSERHRm/GaOQMtmn+Ort2648GDh+rlsXGxMDExyTNRmZ1dWcTG6eeLmSdPniAhIQGurlX08npERFS86fWWYZu3bIGnZzd069YVmzdvKfL2J06cRL36DbHsr+UoX748tmzejCmTJwGAeoK2N3H+wkU0atzweYN98RISEhIQGhqK+vXro1GjhrhwMf+me+GixaharYb68VHtum9cDxUv1jlPCx9EJQbzFAvzFIc2WXrNnAF3d3d08+yOyMhIjXW3bt1GZmYmmjT5WL2sSpXKcHR0xNWrVyWv91VUKhV279mLzh4eKF++fJ715ubmMDAw0EstcsvvDEEqmZinWJin/ui16T5x4iSMjIxhaGSEkydPaay7dy8CGRkZqF+vnnqZoaEhPvjwAwQHh6iXxcfHY9u27Rj+3QhMnTYNvXr1BAAEBASgVq2aKF
Wq1CtfOzQ0BPXqajbC9erVRXDIv/u+eOEi6terhyZNmuD8hQsAgPMXLqBTpw6oUqUKLvyz7FUyMzORnJysfqSkpGj3oVCJka40l7sEkhDzFAvzFEdhWXp7e6FzZw8MHTYcyckpsLOzg52dnXqOl6SkJGzavAXTpk5B48aN8N5772He3N/g6+urMXP56VMn4O7+7wSvpUqVQq1aNVG16vOz66pUqYJatWrC7jWvA58z52fExMRg/7496Nq1C9zc3FCpkgt6dO+Ow4cPvjWXoZUpYyt3CSQh5ikW5qk/erumGwByc3PxabPP1D+/LC0tDWvXrcOkSRPxNCEB0dHPJ1IzMzXDps2bAQA/fD8Gt27dRlBwMIyNjdGi+ecICQkFAOzatRvfDR+GlSuWY9as2XgUG4t3362FR48e4erVa/jjjz+xdOkS+N25gzNnzqBFixZo07o1uvf4Ql3DxUuXYGlpiebNP4e39ywAwIXzF7Fs2VI8fPgI4eF39fExUTGVqTCRuwSSEPMUC/MUR2FZftmvLwBg545tGstHjhqNrVufL5s2bTpUubn4a9kymJgY4+TJUxg/YaLGeFdXV1hbW6mft2zZAvPnzVU/X/rHEgDAb7/NxW9z5xX5fSQkJKBd+44YNnQIRoz4Do4ODnj27BkCAwMxc4bXW3MdpZWVNYBoucsgiTBPsTBP/dFr0w0AycnJ+a7z9p4NpUKJhb/Ph4WFBW7duoWevXrj2bNnAIDMrCyMHz8WTk5OSEtPx+VLl/HtkKEAgKysLPT4ohemTp2MdevWwNDQEMHBIZgw8fnp5wcPHcKUqdPwzeDB+Gn6NERGRmLU6DG48NJ12i/+Y1i2bFmEhj2fcfTipUtQKpW4WMCp5fR2UCC38EFUYjBPsTBPcRSWpb2DU6H7yMjIwISJk9R/A2izn61bt6mbdqkkJSVh1uw5mDV7jqT7LUmys7PlLoEkxDzFwjz1R1HR3lEldxEisrS0RHBQANIOzUJubEjhGxAREREAQGn7DszaTkOrVq1x289P7nKIiIhe6UXPV7VajQIPLuv1mm6ikuyJgbT3dyV5MU+xME9xMEux1KhRXe4SSELMUyzMU3/YdBMRERGRjvCe5WJhnmJhnvrCpptISyaqN78tHRUfzFMszFMczFIsT5/ydn4iYZ5iYZ76w6abSEvGqgy5SyAJMU+xME9xMEuxpBRwjSOVPMxTLMxTf/Q+e/nbRmFdAcps/gEhguRcK5RVJsldBkmEeYqFeYojOdcKZjZWhQ+kEsHRyQkBAQFyl0ESYZ5iYZ76w6Zbx0wb9Ze7BJKISWwszMqVk7sMkgjzFAvzFIdJbCxMy5VDeloa4uPj5S6HiIjojbHp1jEPjy5ITU2VuwySgImJCTIyeNaCKJinWJinOF5kGR8fj+iYGLnLoTd0//59uUsgCTFPsTBP/WHTrWN3/P0LvGcblRz29vaI4R+AwmCeYmGe4mCWYrGxsUFKSorcZZBEmKdYmKf+cCI1Ii3Z2NjIXQJJiHmKhXmKg1mKhXmKhXmKhXnqD5tuIi2pcnPlLoEkxDzFwjzFwSzFwjzFwjzFwjz1R1HR3lEldxEisrS0RHBQAK/pJiIiIuHwmnsion97vqrVahR4STGv6dYxH58dcpdAEomNjUU5zo4sDOYpFuYpDmZZMqSnpaHpJ80KbbyrV6uGwKAgPVVFusY8xcI89YdNt46lX1gFVXyE3GWQBDJyrZDG+wALg3mKhXmKg1kWfwobe5g2GQRbW9tCm26FklcyioR5ioV56g+bbh1TJT5Ebjyn4xeBsdIKubn8Q1AUzFMszFMczLL4K8qf6c+ePdNZHaR/zFMszFN/+PUGkZaMVelyl0ASYp5iYZ7iYJZi4R/1YmGeYmGe+sOmm0hLScpScpdAEmKeYmGe4mCWYnnnnXfkLoEkxDzFwjz1R4ime/u2rZg+farcZRARERG99YYNG4oD+/chOCgA27dtxcoVy1GlSuV8x69ftx
Yx0ZFwb9Wq0H27urpi9aqVCAy4g9CQIBzYvw8O9vb5jlcqlfD29sL1a75Yt3YNypQpo7He0tISY8f+iNOnTiA8LAQ3rl/Fls0b0bq1u/ZvmIioEHpvum1tbTFrljeuXL6Iu+GhuHH9KjZuWI96devquxSiIrHK5Sk4ImGeYmGe4mCWJV+jhg2xes0atGvfEf36fQlDI0Ns2rgBZmZmecYOHDgAKpV2d691dnbGrl07ERoaiq5dPfF585aYP38B0jMy8t2mY8cOcHCwR89evXHbzw9jf/xBvc7a2hp7du9Ct65dsHDRYrRyb4POXbpi9569mDRxIqytrYv+5gUXFRkpdwkkIeapP3qfSG35X8tgZGyEESNHISLiPuzs7NCkyccoXbq0vkshKpJMhQmMVZlyl0ESYZ5iYZ7iYJYlX6/efdQ/V6hQASNHjobf7Zt4//33cenSJfW6WrVqYvDgQWjdui1u3rhW6H7Hjf0Rx48fx0wvb/WyiIiC7xBTysYGUZFRCAwMgpubG9q0af3v/saNhZOTI5o0/RSPHj1SLw8Pv4tdu3Yjo4Bm/m1lYWmJpALuRUwlC/PUH70e6ba2tkbDhg3g5TUL589fQHR0NG7cuIFFixbj8JEjmPvbr1izZpXGNoaGhrh18zq+6NEdAGBmZoYFC+YhJDgQ16/5YvDgQXle59LF8xg+fBjm/vYrgoMCcOXyRfTq1VNjjL19RSxdugQB/n6443cbq1augKOjIwCgQYMGiLgXDjs7O41tpk+fCp+dvO/22ypDYSp3CSQh5ikW5ikOZimW0qVLq48YJyQkqJebmZpi8aKFmDhhEuLi4grdj0KhwOef/w/h4XexccN63Lp5Hfv27in0lPQdO31Qp05t3LsbhilTJmPBgt/V++vYoQN2+vhoNNwvpKamIicnpwjv9O3Ag2RiYZ76o9emOyUlBcnJyXB3bwVjY+M86zdu2oTPmjVDuXLl1MuaN28OMzMz7N6zFwAwefIkNGrYEP2/+hpf9OyNxo0a4r333s2zr8GDB+HmrVto2ao11qxZi9mzvNXXExkaGmLjhvVISU6BR+eu6NjJAykpKdi4YR2MjIxw6dIl3L9/H127dFbvz9DQEJ09PLB58xapPxYiIiIiISkUzw9aXL58GUFBQerl06ZPha/vVRw6fFir/ZQtWxaWlpYYNnQITpw8iS969sLBgwexfPkyNGzYMN/tEhMT4d66LerWa4D69RsiICAQwPPLHUuXLoXQ0LA3e4NvHe0uBaCSgnnqi16b7pycHIwcNRrdunZFgP8d7N61E+PGjUWNGtUBAL6+VxEWFqbR7Pbo7ol9+/YjNTUV5ubm+KJHd/w0YybOnj2HwMBAjBg5GoaGec+SP378ONasWYt79+5h0eIliI+PR+PGjQEAHTq0h1KpxJjvf0BgYCBCQ0MxavQYODg4oHGjRgCATZs2o3t3T/X+WrRoDhMTE+zZu/eV783Y2BiWlpbqh4WFhWSfGxUPZXIK/yaeSg7mKRbmKQ5mKZY+ffqgerVq+HbIUPWyli1a4OOPP8aUqdO03o9S+fxP1kOHDuOvv5bjzh1/LFq8BEePHkPfPr0L3T4uLg65ubnq5wqFQvs3QWovvrQgMTBP/dH7RGoHDvyN2nXqon//r3Di5Ck0btQQhw7+DU/PbgCAjS81u2XLlsVnnzXDpn+OLru4OMPExATXr11X7y8hIQFhYXm/pQzwD9B4HhsXh7L/zFhZq2ZNuLi4ICQ4UP3wv3MbJiYmcHZxBgBs2boNLi4uqF37IwBAd09P7N27D2lpaa98X8OHDUVwUID6cf2a7+t/SFQsxRuUKXwQlRjMUyzMUxzMUhxeM2egtXsrdO3WHQ8ePFQv/7hJY7g4OyMw4A7uR9zF/Yi7AIC//voT27dtfeW+4uPjkZWVheCQEI3lISEhcHDIf/by/Dx58gQJCQlwda1S5G3fZm5ubnKXQBJinvqj94nUACAjIwOnz5zB6TNnMH
/+Avz6y8/4fsxobN26Ddu3b8eE8eNQp05t1K1bF/cjI3H58uUiv0ZWdrbmApVK/S2puYUFbt26jWHDv8uz3ZMnT9T/e+TIUXTv7on79yPx2WfN0LWrZ57xLyxctBh/LvtL/dzCwoKNt2BUYtxhj/7BPMXCPMXBLMXgNXMG3N3d8f0PPyLyPzMkL1q0BBs3btZYduL4UUybNh2Hjxx95f6ysrJw8+bNPLceq1y5MqKiootcn0qlwu49e9G1S2fMnTs/z3Xd5ubmyMjI4HXd//Gqs0up5GKe+lMs/ssWHBICc3NzAMDTpwk4dOgwunt6wrNbN2zZ8u83nvfuRSAzMxMf/XP0GQBsbGxQuXL+9358ldu3b6NSpUp4/Pgx7t27p/FISkpSj9u4aRM6tG+P3r17ISIiAld882+iMzMzkZycrH6kpKQUqSYq/oxVnMVUJMxTLMxTHMyy5PP29kLnzh4YOmw44mJjYWdnBzs7O5iaPp8kLy4uDkFBQRoPAIiOjtFo0E+fOgF393/vl73kjz/RoX179Oz5BVxcXND/y35o0aI51qxZ+1p1zpnzM2JiYrB/3x507doFbm5uqFTJBT26d8fhwwd5qeArJCUlyl0CSYh56o9ev94oXboU/vxzKTZv3oKAgAAkJ6fggw/ex5Bvv8GhQ/9OpLFx4yasWbMKBgYG2LZtu3p5amoqNm3egsmTJuLp06d4/PgJxo39UeMaHW347PTBt99+g1WrVuCXX37DgwcP4OjogDatW2PJH3+oT4E6efIUkpOTMeK74fj119+k+RCoxDLNTZW7BJIQ8xQL8xQHsyz5vuzXFwCwc8c2jeUjRz0/q1Fbrq6usLa2Uj8/ePAgxo2bgGHDh2LGTz8hPDwMAwcOxuUrV16rzoSEBLRr3xHDhg7BiBHfwdHBAc+ePUNgYCBmzvBCYiIbkv968iRe7hJIQsxTf/TadKekpOL6tesYNHAAnJ2dYWRkhJiYGGzYuAkLFy5Sjzt95gxiY2MRFByc53SfGTNmwsLCHGtWr0JycjL+/HMZrKys/vtSBUpLT0fnzl0xceJ4rFi+DBYWFnj48BHOnj2LpKR/71WnUqmwdes2DB8+DNu281Zhb7tEg9Kc4EcgzFMszFMczLLks3dwUv9co0YNBAQEFDA67zYFLdu8ZQs2b5HuTjJJSUmYNXsOZs2eI9k+Rebi4qJVnlQyME/90WvTnZmZqdUvNnNzc9jY2GDTps151qWmpuK770biO4xUL/tj6Z8aYxo0bJxnuxYt3TWex8XFYeTI0YXWXKFCBRw/fgKxsbGFjiUiIiIiIiJ6WbG6el6hUMDW1hbfDB6ExMREHD58RLZarKysUKN6dXTq1An9+38lWx1UfFjm8jQzkTBPsTBPcTBLsURHF32SMyq+mKdYmKf+FKum28HBAZcvXUBMTAxGjhot64yRq1auwEcffYh169fj9JkzstVBxUe2wggmnOBHGMxTLMxTHMxSLGamprw2WiDMUyzMU3+KVdMdFRX1yut35NC1W/63B6O3U7rCDBZILnwglQjMUyzMUxzMUiy2ZcrgES/REwbzFAvz1J9iccswIiIiIiIiIhEVqyPdIlJYV4Aym6fJiaCsClAo3pG7DJII8xQL8xQHsyz+FDb2Wo8NDAzUYSWkb8xTLMxTf9h065hpo/5yl0ASefLkCcqUKSN3GSQR5ikW5ikOZlkypKelIT6+8Hv8VqlcGaFhYXqoiPSBeYqFeeoPm24d8/DogtTUVLnLIAk4OTkiMjJK7jJIIsxTLMxTHMyyZIiPj0d0TEyh44yMjfVQDekL8xQL89QfNt06dsffH8nJnBBGBAnPEviHoECYp1iYpziYpViSk5PkLoEkxDzFwjz1hxOpEWkpNjZO7hJIQsxTLMxTHMxSLMxTLMxTLMxTf9h0E2mpcuXKcpdAEmKeYmGe4mCWYmGeYmGeYmGe+sPTy3WsVs2avKZbEE5OjjA0MJC7DJII8xQL8xQHsy
w5tL2um4jobcemW8d8fHbIXQJJJC0tDWZmZnKXQRJhnmJhnuJgliVHeloamn7SrMDG+8GDB3qsiHSNeYqFeeoPm24dS7+wCqr4CLnLIAmkqowBRabcZZBEmKdYmKc4mGXJoLCxh2mTQbC1tS2w6TYy4p+aImGeYmGe+sNPWsdUiQ+RG39f7jJIAikGdjDN4YQTomCeYmGe4mCWJYO2kwKVLWuHuLjHOq2F9Id5ioV56g8nUiMiIiIiIiLSETbdRFoqncNvAkXCPMXCPMXBLMUSFBQkdwkkIeYpFuapP2y6ibSUaFBa7hJIQsxTLMxTHMyy5Bs2bCgO7N+H4KAA3L51AytXLEeVKvnfmmj9urWIiY6Ee6tWhe7b1dUVq1etRGDAHYSGBOHA/n1wsLfPd7xSqYS3txeuX/PFurVrUKZMGY31lpaWGDv2R5w+dQLhYSG4cf0qtmzeiNat3bV/w28RFxcXuUsgCTFP/WHTnQ9Pz24I8PeTuwwqRnLAW9iIhHmKhXmKg1mWfI0aNsTqNWvQrn1H/Dh2PAyNDLFp44ZXzko/cOAAqFQqrfbr7OyMXbt2IjQ0FF27euLz5i0xf/4CpGdk5LtNx44d4OBgj569euO2nx/G/viDep21tTX27N6Fbl27YOGixWjl3gadu3TF7j17MWniRFhbWxf9zQvOxMRE7hJIQsxTf96o6Z43by5ioiMxbOgQjeXurVohJjryjQrTVuXKlRAWGgyPTp00lisUCuzZ7YNly5bqpQ4Sn6EqS+4SSELMUyzMUxzMsuTr1bsPtm7dhuDgYPj5+WHkyNFwdHTE+++/rzGuVq2aGDx4EEaP+V6r/Y4b+yOOHz+OmV7e8LtzBxERETh85AiePHmS7zalbGwQFRmFwMAgBAYGwtrm30Z63LixcHJyRNt2HbBt23aEhIQgPPwuNm7chBYtWyElJeX1PgCBpaamyl0CSYh56s8bH+lOS0vHkCHfwsbGRop6iiw8/C68vWdhxsyfUK5cOfXybwYPwjvvvINxY8cXeZ+GhpzUnfKyzE2SuwSSEPMUC/MUB7MUS0xMjPqIcUJCgnq5makpFi9aiIkTJiEurvDZ6hUKBT7//H/Pm+IN63Hr5nXs27un0FPSd+z0QZ06tXHvbhimTJmMBQt+V++vY4cO2Onjg0ePHuXZLjU1FTk5OUV4p2+HmAJuD0clD/PUnzduus+ePYO4uDgMHza0wHH169WDz84dCAsNge+VS5jx03T1aUb9v+yH48eOqse+OFLep09v9bItmzfix5dOCXrZipWr4O/vj19+mQMAcK1SBd9//z1+/HEcniYkYNTIEfD1vYy74aE4cvggmjVrpt7W0dERMdGR6NChPXZs34bwsBB07uyR5zVsbW3x94H9WLH8LxgbG2v9+ZA4Egxs5S6BJMQ8xcI8xcEsxeLm5obp06fi8uXLGpM2TZs+Fb6+V3Ho8GGt9lO2bFlYWlpi2NAhOHHyJL7o2QsHDx7E8uXL0LBhw3y3S0xMhHvrtqhbrwHq12+IgIBAAM//ritduhRCQ8Pe7A2+ZVxdXeUugSTEPPXnjZvunJxczJr9M/r374+KFSu8coyzszM2bFiH/QcOoHmLFvjm2yGoX78evL1mAgAuXLyEqlXdYGv7/D+0DRs1xJMnT9C4USMAz48816lTBxcuXMi3jlGjxqBB/fro2fMLzJs/F3v27MHhI0cwYMDXGDx4EGb8NBPNW7TEyZOnsHrVClSq5KKx/YTx47B8xQp82ux/OHnylMY6e/uK2OWzA4FBQRg4aDAyMzPzvL6xsTEsLS3VDwsLC60/QyIiIiIRfffdMFSvVg3fDvn34EzLFi3w8ccfY8rUaVrvR6l8/ifroUOH8ddfy3Hnjj8WLV6Co0ePoe9LB2nyExcXh9zcXPVzhUKh/ZsgInpDkkykdvDgQdzxv4Pvx4x55frhw4Zip48Pli9fgbt378HX9yomT56Krl27wMTEBIGBgUhISECjRs
+/qWzcqCH+/HMZGjZsAAD46MMPYWhoCN8rvvnWEB0djalTp2PO7FkoX64cJk+ZCgD4ZvBgLF7yB3bv2YOwsHB4ec/CnTv+GDhggMb2fy1fgb//PojIyEjExsaql1epUhm7d/ng5MlTGDVqtMYv7P++x+CgAPXj+rX8a6WSyTw3We4SSELMUyzMUxzMUhxeM2egfr166NqtOx48eKhe/nGTxnBxdkZgwB3cj7iL+xF3AQB//fUntm/b+sp9xcfHIysrC8EhIRrLQ0JC4OCQ/+zl+Xny5AkSEhLg6lqlyNu+zR49elj4ICoxmKf+SDZ7uZfXLHTr1vWVpynUrFkTnt26ISQ4UP3YuHE9DAwM4OTkBAC4ePESGjdqBGtra7i5uWH1mrUwNjaBa5UqaNioIW7evIm09PQCa9iydSsexcZi5crVSE5OhqWlJSpWrIAr/2nWr/j6wtVNs85bN2/l2Z+pqSl8du7Agb//LvTb2IWLFqNqtRrqx0e16xY4nkoifisuFuYpFuYpDmYpAq+ZM+Du7o6vBwxCZKTm5LqLFi3B581bokVLd/UDAKZNm45Ro199ACcrKws3b97Mc+uxypUrIyoqusj1qVQq7N6zF509PFC+fPk8683NzWFgwJn0/0uh4I2PRMI89UeyT/rSpUs4eeoUJowfl2edhYU51q/foPHLtXmLVmj8cVNEREQAAC5cuIBGjRqhQYP68LtzB8nJybh06RIaNW6ERg0b4sLFS1rVkZOdjeyc7CLXn5qWd/a+zMxMnDlzFs0/b44KFV596vzLY5OTk9UPzngpnlQlLxkQCfMUC/MUB7Ms+by9vdC5sweGDhsOMzMz2NnZwc7ODqampgCen+odFBSk8QCA6OgYjQb99KkTcHf/937ZS/74Ex3at0fPnl/AxcUF/b/shxYtmmPNmrWvVeecOT8jJiYG+/ftQdeuXeDm5oZKlVzQo3t3HD58kJcKvsLLkxZTycc89UfSrze8vWejRYvmqFOntsby27f9ULWqG+7du5fnkZX1/NYgFy5eRNWqbmjXri0unH9+7fb5CxfQtGkT1KtXV72sKJKTk/HgwUPUq6d51Lle3boICQ7JZ6t/5ebmYvh3I3D79m1s27blld+EEhEREdG/vuzXFzY2Nti5Yxu2b9uCmzeu4eaNa+jQoX2R9uPq6gprayv184MHD2LcuAkYMuRbHDt6BD17foGBAwfj8pUrr1VnQkIC2rXviB07dmLEiO9w+NDf8Nm5A506dcDMGV5ITEx8rf0SEf2XpPfGCgwMxE4fH3z11VcayxcvWYJ9e/fAa+YMbNy0CampqajqVhWffNIUEydNBgD4+wcg4dkzeHTqhL79+gN4fvR7yuRJUKlUr/0L9Y+lS/H9mNGIiIjAnTt30N3TE7Vq1cSw4cO12j43NxdDhw3HkiWLsG3rZnTp6qnVrS1IPKVy8r8PKJU8zFMszFMczLLks3dwUv9saGiI7OzCz0B8eZuClm3esgWbt2x5swJfkpSUhFmz52DW7DmS7VNkISGFH7SikoN56o/kJ/L/8stvUCo1r8cKCAhE5y7dULlyZfjs3IHDhw7i+x/G4OF/7ot4+dLl5w325csAnjfiSUnJuHnrFtLS0l6rnhUrVmLZsr8wZcpkHDt6BJ991gxf9v8ad+/e03ofOTk5GDJkGIKCgrFt62aUKVPmtWqhki3ZQJ570ZNuME+xME9xMEuxODk6yl0CSYh5ioV56o+ior2jSu4iRGRpaYngoACkHZqF3Fh+iySCJwZ2KJPDsxxEwTzFwjzFwSxLBqXtOzBrOw2tWrXGbT+/fMfVqFEDAQEBeqyMdIl5ioV5vrkXPV/VajWQnJz/3Tc4ZR2RlgxQ9An6qPhinmJhnuJglmJJL+TOM1SyME+xME/9YdNNpCXrnGdyl0ASYp5iYZ7iYJZi+e/twqhkY55iYZ76w6abSEtPDXgtv0iYp1iYpziYpVjc3NzkLoEkxDzFwjz1R9LZyy
kvhXUFKLMz5C6DJKDItYJSaSZ3GSQR5ikW5ikOZlkyKGzs5S6BiKjEYNOtY6aN+stdAknENiUFZhYWcpdBEmGeYmGe4mCWJUd6Whri4+MLHBMXG6unakgfmKdYmKf+sOnWMQ+PLkhNTZW7DJKApaVlgbMSUsnCPMXCPMXBLEuO+Ph4RMfEFDgmJzdXT9WQPjBPsTBP/WHTrWN3/P35x4MgeFsFsTBPsTBPcTBLsVSoUAFPnz6VuwySCPMUC/PUH06kRkRERERERKQjbLqJtBQWFiZ3CSQh5ikW5ikOZikW5ikW5ikW5qk/PL1cx2rVrMlrugVhZ2eHuLg4ucsgiTBPsTBPcTBLsbzIU5vrv6n4q1ChAu7fvy93GSQR5qk/bLp1zMdnh9wlkERiY2NRrlw5ucsgiTBPsTBPcTBLsbzIMz0tDU0/acbGu4Sz4J0FhMI89YdNt46lX1gFVXyE3GWQBLJVFkhTpMhdBkmEeYqFeYqDWYolW2WB9FI2MG0yCLa2tmy6S7jMzAy5SyAJMU/9YdOtY6rEh8iN52kbIrCCArlQyV0GSYR5ioV5ioNZisUKCqgUTnKXQRK5e/ee3CWQhJin/nAiNSItPTUoK3cJJCHmKRbmKQ5mKRbmKZZq1arJXQJJiHnqD5tuIiIiIiIiIh1h002kJTMVZ6EXCfMUC/MUB7MUS2F5Dhs2FAf270NwUABu3byOlSuWo0qVyhpjtm/bipjoSI3H7NneBe63dWt3bNq4AX5+txATHYlatWoWWqtSqYS3txeuX/PFurVrUKZMGY31lpaWGDv2R5w+dQLhYSG4cf0qtmzeiNat3QvdtygeP34sdwkkIeapP8I23Y6Ojlr/kiXShlKVI3cJJCHmKRbmKQ5mKZbC8mzUsCFWr1mDdu07oscXPWFoZIhNGzfAzMxMY9z69RvwwYe11Y+ZMwtuus3NzXH58mV4exU87mUdO3aAg4M9evbqjdt+fhj74w/qddbW1tizexe6de2ChYsWo5V7G3Tu0hW79+zFpIkTYW1trfXrlGSZmZlyl0ASYp76U6wnUouJjixw/W+/zcVvc+fpqRp626UorWCaky53GSQR5ikW5ikOZimWFKUVzAtY36t3H43nI0eOht/tm3j//fdx6dIl9fK09LQi3b99x46dAJ4fhNFWKRsbREVGITAwCG5ubmjTprV63bhxY+Hk5IgmTT/Fo0eP1MvDw+9i167dyMh4O2aBtre3x7Nnz+QugyTCPPWnWDfdH3xYW/1zhw7t8cP3Y9D0k2bqZSkpvKUIERERkSheHDFOSEjQWN7ZwwNdOndGbGwcjhw5gvnzFyAtXdovZ3bs9MHWLZtw724Y4h4/Rp8+fQEACoUCHTt0wE4fH42G+4XUVF4SQUQFK9anl8fFxakfSUlJUKlU6uePHz/G4EED4et7GXfDQ3Hk8EE0a9Ys330plUrM/e1XnD51Ag0aNEBUZATef/99jTEDBnyNy5cuQKFQAAAaNmyI/fv24m54KK5f88WE8eNgYGCgy7dMxZhNzlO5SyAJMU+xME9xMEuxFCVPhUKB6dOn4vLlywgKClIv99m1C8OGj0DXbt2xcNEidOnaBQsX/i55rYmJiXBv3RZ16zVA/foNERAQCACwtbVF6dKlEBoaJvlrljR3796VuwSSEPPUn2J9pLsgAwZ8jcGDB2Hs2PHwu+OHHt27Y/WqFfjsf5/nueecsbExlixeBCcnR3Ty6IL4+HicOXMWPbp74tatW+px3bt7YuvWbVCpVKhQoQLWr1uDrVu3YcSIkXB1dcUvv8xBRkbGK09pNzY2hrGxsfq5hYWFzt47ySNVaQHrXJ6CIwrmKRbmKQ5mKZZUpQWMCx8GAPD29kL1atXQyaOzxvINGzaqfw4MDERsbCy2bd0CZ2dnRERESFjtc/89jf3FwRgC7OzKIjIySu4ySCLMU3+K9ZHugnwzeDAWL/kDu/fsQVhYOLy8Z+HOHX8MHDBAY5yFuQXWrV
2NMmXKoGu37oiPjwcAbNy0CR07dlQ3yu+9+y5qVK+OzVu2AgD69euLmJgYTJg4CaFhYTh46BB+/W0uBg8e9MpfvsOHDUVwUID6cf2ar44/AdK3LIW2fzZQScA8xcI8xcEsxaJtnl4zZ6BF88/RtVt3PHjwsMCx165dBwC4uLi8aXlaefLkCRISEuDqWkUvr1ecWVpayV0CSYh56k+JbLotLS1RsWIFXLmi2dhe8fWFq5urxrIlSxbBzNwcX/TshaSkJPXygwcPITc3B63dn9/mwdOzG86dP4+oqOff9ri5uuLq1Wua+79yBZaWlrCvWDFPTQsXLUbVajXUj49q15XkvVLxoUSu3CWQhJinWJinOJilWLTJ02vmDLi7u6ObZ3dERhY8iS4AvFurFgAgNjbv9dW6oFKpsHvPXnT28ED58uXzrDc3N39rLj/MyuJs1yJhnvpTIpvuojh2/Dhq1qiBOnVqayzPysrCtu070L27J4yMjODh0QmbN2957dfJzMxEcnKy+sFJ3sRTKueJ3CWQhJinWJinOJilWArL09vbC507e2DosOFITk6BnZ0d7OzsYGpqCgBwdnbGyJEj8N5778HR0REtW7TAggXzceHCRfU11wBw+tQJuLv/e7/sUqVKoVatmqha1Q0AUKVKFdSqVRN2dnav9T7mzPkZMTEx2L9vD7p27QI3NzdUquSCHt274/Dhg2/NZYW8rl0szFN/SuQ13cnJyXjw4CHq1auLixcvqpfXq1sXN27c0Bi7du06BAUGYfWqlejT90uN8Rs3bsKJ40fRr19fGBgY4O+/D6rXhYSGou1Lt4oAgHr16iEpKQkxDx7o5o1RsRZvYIcyOdrfroSKN+YpFuYpDmYplngDOxTU5n7Z7/kM4Tt3bNNYPnLUaGzdug1ZWZlo2qQJBgz4GuZmZoh58AAHDhzA/AWaE6m5urrC2vrfU2VbtmyB+fPmqp8v/WMJgNe/3WxCQgLate+IYUOHYMSI7+Do4IBnz54hMDAQM2d4ITExscj7LIlq1KiBgIAAucsgiTBP/SmRTTcA/LF0Kb4fMxoRERG4c+cOunt6olatmhg2fHiesStXrYbSwABr16xC7959cfnKFQBAaGgorl27hokTxmPzlq1If+nWE2vWrMXAAV/Da+YMrFq1GlWqVMH3Y0Zj2bK/oFKp9PY+iYiIiERl7+BU4PqYmAfo0rVbkfezdes2bN26LZ/RrycpKQmzZs/BrNlzJN0vEYmvxDbdK1ashLWVFaZMmYyyZcogJCQEX/b/Os/M5S8sX74CSqUS69atQa/efeDrexUAsGnTFtSrVy/PqeUPHz5E7z79MHnSRBw5cggJCQnYtGlznm9W6e1hqkqTuwSSEPMUC/MUB7MUy/M8zeQugyTyYkJiEgPz1B9FRXvHt/qw7ciRI9CubVs0b9FS0v1aWloiOCgAaYdmITc2RNJ9kzwyFCYwUWXIXQZJhHmKhXmKg1mKJUNhArPS5WHWdhpatWqN235+cpdEb8Da2gqJiUmFD6QSgXm+uRc9X9VqNZCcnJzvOOEnUsuPubk5qlWrhv5f9sPKVavkLodKgGSltdwlkISYp1iYpziYpViYp1gcHBzlLoEkxDz1561tur28ZuLg3/tx4cLFN5q1nIiIiIiIiCg/Jfaa7jc1atRojBo1Wu4yqASxzkmQuwSSEPMUC/MUB7MUy/M8eU23KCLu3ZO7BJIQ89Sft/ZIN1FRpSv5R4NImKdYmKc4mKVYmKdYStvayl0CSYh56s9be6RbXxTWFaDM5oQwIsjKtYJSyckmRME8xcI8xcEsxZKVawWFjVXhA6lEsLa2RnR0tNxlkESYp/6w6dYx00b95S6BJGIaFwczOzu5yyCJME+xME9xMEuxmMbFwdTODulpabw9kQByc3LkLoEkxDz1h023jnl4dEFqaqrcZRARERHJJj4+HtExMXKXQW8oKDhY7hJIQsxTf9h069gdf/8C79lGJUeNGtUREB
AodxkkEeYpFuYpDmYpFuYpFuYpFuapP5xIjUhrCrkLIEkxT7EwT3EwS7EwT7EwT7EwT33hkW4dq1WzJk8vF0Tp0qVh+O67cpdBEmGeYmGe4mCWYtEmT556XnIkJDyVuwSSEPPUHzbdOubjs0PuEkgiGRkZMDExkbsMkgjzFAvzFAezFIs2eaanpaHpJ83YeJcASUm8ZFIkzFN/2HTrWPqFVVDFR8hdBkngca4VyvI2NsJgnmJhnuJglmIpLE+FjT1MmwyCra0tm+4SwMnJCQEBAXKXQRJhnvrDplvHVIkPkRt/X+4ySAIqAzvk5sTJXQZJhHmKhXmKg1mKpbA8ObkQEb0N+LuOSEtWuc/kLoEkxDzFwjzFwSzFwjzFcv8+DySJhHnqD5tuIi1lKniNoUiYp1iYpziYpViYp1hsrK3lLoEkxDz1h003kZYyFKZyl0ASYp5iYZ7iYJZiYZ5isSlVSu4SSELMU3+EaLrnzZuLlSuW5/t8+7atmD59qhylEREREdFrGDZsKA7s34fgoADcunkdK1csR5UqlTXGbN+2FTHRkRqP2bO9tX6N2bO9ERMdiQEDvi5wnJmZGf5YshjXr/liyeJFMDPV/DLBzs4OM2f8hAvnz+JueCh8r1zCmtUr0aTJx9q/4RJAlZsrdwkkIeapP7JPpDZv3lx09+ymfh7/9Clu3riJmV5eCAgI1GofU6ZMhUKhyPf5gIGDkJWVJV3R9FYqw4l9hMI8xcI8xcEsxfImeTZq2BCr16zBjRs3YWhogHHjxmLTxg34tNn/kJaWph63fv0G/PLrb+rnL68riLu7O+rUro0HDx4WOnbgwAFISUnBFz17Y9DAARgwcAAWLlwEAHB0dMTuXT5ITHyGGTO9EBgYCENDIzRr9im8vWbik08/K+I7L74Cg4LkLoEkxDz1R/amGwCOHz+BUaPHAADKlbPDjz/+gLVrVqNe/YZabZ+UlFTg84SEBEnqpLdbvEFZ2OY8lrsMkgjzFAvzFAezFMub5Nmrdx+N5yNHjobf7Zt4//33cenSJfXytPQ0xMUVrbmvUKECZs78CT179sa6tasLHV/Kxgbh4eEIDAxEaGgobG1t1etmeXtBBRXatG2v0fAHBwdj8+YtRaqruKtWtSqCgoPlLoMkwjz1p1icXp6ZmYm4uDjExcXhzh1/LF60BA4ODupfaPb2FbF06RIE+Pvhjt9trFq5Ao6Ojurti3p6+aWL5zF8+DDM/e1XBAcF4Mrli+jVq6dGTXXr1sGRwwcRHhaCvw/sh3urVoiJjkStWjV19TFQMaeCovBBVGIwT7EwT3EwS7FImaf1P5M+/fdgSmcPD/jdvonjx45i/LixeU79/i+FQoHff5+PP/5YimAtG46Vq1ajd+/eiLgXju7dPbF8xUoAQKlSpfDZZ82wevWaVx5hT0xM1Gr/JYXSwEDuEkhCzFN/ikXT/TJzc3N07uKB8Lt38fTpUxgaGmLjhvVISU6BR+eu6NjJAykpKdi4YR2MjIxe+3UGDx6Em7duoWWr1lizZi1mz/JWXydkaWmJ1atXISAwEK3c2+DnX37BxInjpXqLVEIZqzLkLoEkxDzFwjzFwSzFIlWeCoUC06dPxeXLlxH00imxPrt2YdjwEejarTsWLlqELl27YOHC3wvc19ChQ5CTnYMV/zTO2oiKisLHTZqibr0G+LTZ//Dw4fNT0l1cXKBUKhEaGvZ6b6yESUzkLeBEwjz1p1icXt68+ecICX5+/baFhQUePnyEfv2+hEqlQocO7aFUKjHm+x/U40eNHoPAgDto3KgRTp0+/Vqvefz4caxZsxYAsGjxEgwcOACNGzdGWFg4PDw6ASoVfvhhLDIyMhASEoI/KizFr7/+ku/+jI2NYWxsrH5uYWHxWnVR8WWaq901YlQyME+xME9xMEuxSJWnt7cXqlerhk4enTWWb9iwUf1zYGAgYmNjsW3rFjg7OyMiIiLPft577z0M+PortHJvU+QaVCpVnt
PYFW/ZiRlPnybIXQJJiHnqT7E40n3+/Hm0aOmOFi3d0bpNO5w6dQrr16+Fg4MDatWsCRcXF4QEB6of/nduw8TEBM4uzq/9mgH+ARrPY+PiULZMGQBAlSpV4B8QgIyMf7+dvX7jRoH7Gz5sKIKDAtSP69d8X7s2Kp4SDUrJXQJJiHmKhXmKg1mKRYo8vWbOQIvmn6Nrt+6FTnp27dp1AM+PQL9Kgwb1UbZsWVy5fBH3I+7ifsRdODk5YeqUybh08XyRa7t79x5yc3Ph6lqlyNuWRM7Or/+3NxU/zFN/isWR7tTUNNy7d0/9fMz3PyAo0B+9evWEuYUFbt26jWHDv8uz3ZMnT177NbOyszUXqFRQKl//O4iFixbjz2V/qZ9bWFiw8SYiIiJ6A14zZ8Dd3R1du3VDZGRkoePfrVULABAb++iV63fs2IEzZ85qLNu4YT127NiBLVu3Frm+hIQEnDx5Cl9+2Q8rVqzMc123tbW1cNd1E1HRFYum+79UKhVyc3NhamqK27dvo0P79nj8+DGSk5P18vphYWHo0tkDxsbGyMzMBAB8+MEHBW6TmZmpHktisszlfzRFwjzFwjzFwSzF8iZ5ent7waNTR/T/agCSk1NgZ2cH4PldatLT0+Hs7AwPj044duw4nj59ipo1amDatKm4cOGixm1nT586Ae9Zc3Dw4EE8fZqQ55Ta7OwsxMbFISws/LXqnDBxEnbv2okD+/fil19/Q0BAAAwMDPHpJ03Rt28ffNrsf6/9GRQ3UVFRcpdAEmKe+lMsTi83NjaGnZ0d7Ozs4OrqCq+ZM2BhYYEjR47AZ6cP4p/GY9WqFahfvz6cnJzQqFFDzPhpOipWrKCTenx8dkGhVOLnn2fD1dUVn376Kb75ZjCA518I0NspW/H6E/dR8cM8xcI8xcEsxfImeX7Zry9sbGywc8c23LxxTf3o0KE9ACArKxNNmzTBpk0bcPrUCUyZOhkHDhxAvy/7a+zH1dUV1tZWb/Q+CnL//n20cm+D8+cvYOqUyTh+7Cg2b96IJk2aYNz4CTp7XTlYWJjLXQJJiHnqT7E40v2//32GmzeuAXj+7WVoaBgGDf4GFy5cBAB07twVEyeOx4rly9QTrZ09exZJSbo58p2cnIwvv+yPWbO8ceTwQQQGBmHevAVYsmSRxnXe9HZJV5jBAvo524J0j3mKhXmKg1mK5U3ytHdwKnB9TMwDdOna7Y3306Bh4yLV9SqxsbGYOGkyJk6a/Mb7Ks5Kl7bFw4evPnWfSh7mqT+yN92jRo3GqFGjCxwTFxeHkSPzH/Pf7U2MjZGSkqJ+3rWbp8b6V/1ybdHSXeO5r+9VtGjRSv3cw6MTMjMzER0dU2CtRERERERERC/I3nRLycDAAJUrV0adOrWxbv2GN9pX165dcD/iPh48fIhaNWti4sQJ2Lt3H9LT0yWqlkqaMjlxhQ+iEoN5ioV5ioNZioV5iiUgIKDwQVRiME/9KRbXdEulevVqOPj3fgQFB2PduvVvtK9ydnZYuHABTp08jmnTpmDfvv348cexElVKJVG8QRm5SyAJMU+xME9xMEuxME+xuLm5yV0CSYh56o9QR7rv3PFHFdeqkuxryR9LseSPpZLsi8SgEus7qrce8xQL8xQHsxQL8xSLoaFQrcNbj3nqD38TEmnJWMVJ9ETCPMXCPMXBLMXCPMWSlMRb+omEeeoPv97QMYV1BSiz+R8cEZirlFAqSsldBkmEeYqFeYqDWYqlsDwVNvb6K4be2OPHT+QugSTEPPWHTbeOmTbqX/ggKhGSYmNhXa6c3GWQRJinWJinOJilWLTJMz0tDfHx8XqqiN5EpUqVOPmWQJin/rDp1jEPjy5ITU2VuwySgJOTIyIjo+QugyTCPMXCPMXBLMWiTZ7x8fGIjuEtWYlIXGy6deyOvz+Sk5PlLoMkcD8yEs+ePZO7DJII8xQL8xQHsxQL8xRLDL8cEQrz1B9OpEakJRMTE7lLIAkxT7EwT3EwS7
EwT7EwT7EwT/1h002kpTJleK9RkTBPsTBPcTBLsTBPsTBPsTBP/eHp5TpWq2ZNXtMtCCcnRxgaGMhdBkmEeYqFeYqDWYpFqjx53TcRlWRsunXMx2eH3CWQRFQqFRQKhdxlkESYp1iYpziYpVikyjM9LQ1NP2nGxltmgYGBcpdAEmKe+sOmW8fSL6yCKj5C7jJIAk9VFiitSJG7DJII8xQL8xQHsxSLFHkqbOxh2mQQbG1t2XTLrHLlSggLC5e7DJII89QfNt06pkp8iNz4+3KXQRLINrBDbk6c3GWQRJinWJinOJilWKTIkxMQFR/Gxpx4SyTMU3/4e4xIS0aqLLlLIAkxT7EwT3EwS7EwT7GkpPAsFJEwT/1h002kJfPcJLlLIAkxT7EwT3EwS7EwT7E8evRQ7hJIQsxTf4Ruuh0dHRETHYlatWoWOG7M6FE4cvhggWPmzZuLlSuWS1kelTDPDGzlLoEkxDzFwjzFwSzFous8hw0bigP79yE4KAC3bl7HyhXLUaVKZY0x27dtRUx0pMZj9mzvAvdrbm4Or5kz4Ot7GWGhITh54hj69Old4DZKpRLe3l64fs0X69auyXM7JktLS4wd+yNOnzqB8LAQ3Lh+FVs2b0Tr1u6v9+ZlULlyFblLIAkxT/2RtemeN29uvr/4vL1mIiY6EvPmzdV5HX8s/ROe3Xvo/HWIiIiISDqNGjbE6jVr0K59R/T4oicMjQyxaeMGmJmZaYxbv34DPviwtvoxc2bBTfe0qVPQrFkzDB/+HT5t9hn+Wr4CXjNnoGWLFvlu07FjBzg42KNnr9647eeHsT/+oF5nbW2NPbt3oVvXLli4aDFaubdB5y5dsXvPXkyaOBHW1tZv9kEQUbEm+0Rq0dHR6NihA6ZN+wnp6ekAABMTE3Tq1BFRUVE6f30DAwOkpqbyXtpUKAueIicU5ikW5ikOZikWXefZq3cfjecjR46G3+2beP/993Hp0iX18rT0NMTFaT+hW926dbFt+3ZcuHARALBhw0b06d0LH370IQ4fOfLKbUrZ2CAqMgqBgUFwc3NDmzat1evGjRsLJydHNGn6KR49eqReHh5+F7t27UZGRobWtcnp4cMHcpdAEmKe+iP76eW3b/shJuaBxqk1bVq3RnRMDPz87qiXNWvWDLt8diDA3w9+frewZs0qODs7a+zrww8/xOFDfyM8LAR/H9iPd999V2N9o0YNERMdic8+a4aDf+/HvbthqF+/Xp7Ty5VKJaZOnaJ+rUkTJ4C3DKVc+f/vQhJinmJhnuJglmLRd54vjhgnJCRoLO/s4QG/2zdx/NhRjB83FmampgXux9fXFy1btECFChUAAI0bN0LlypVx6tTpfLfZsdMHderUxr27YZgyZTIWLPgdAKBQKNCxQwfs9PHRaLhfSE1NRU5OTlHepmwMDGQ/XkcSYp76Uyz+y7Z5yxb06O6pft6jhye2bNmqMcbc3Ax/LvsLrdu0Q/fuPaDKVWHF8r+g+KcbNjc3x9o1qxAcHAL31m3x29y5mDJ50itfb8KE8fD2no1Pm/0PAQF5bwr/zeBB8OzWDaPHfI9OnTqjVKlSaO1ecq63Id1IU1rIXQJJiHmKhXmKg1mKRZ95KhQKTJ8+FZcvX0ZQUJB6uc+uXRg2fAS6duuOhYsWoUvXLli48PcC9zVp8hQEhwTj2tUriLgXjg3r12HCxEkaR8//KzExEe6t26JuvQaoX7+h+m9MW1tblC5dCqGhYdK8URnZ2dnJXQJJiHnqT7H4emPHjp0YP24sHBwcAAB169bDt98OReNGjdRjDhz4W2Ob0aPHwM/vFqpWrYqgoCB4eHSCUqnEmO9/QEZGBoKDg1GxYkXMmT0rz+v9+stvOH3mTL71DBgwAIsWLcLffz8/+j123Hg0a/Zpge/B2NgYxsbG6ucWFvyjgYiIiEhfvL29UL1aNXTy6KyxfMOGjeqfAwMDERsbi21bt8DZ2RkRERGv3NdX/fujTu3a6Pdlf0
RFRaFhgwbw9pqJR48e4cyZswXW8d/T2BU8XZLorVcsmu74+HgcO3Yc3T27QaFQ4NjxY4h/+lRjTKVKLvjh++/x0UcfwtbWFkrl84P0Dg72CAp6fu2Mf0CAxjUxV69efeXr3bx1K99arKysUKFCeVy7fkO9LCcnBzdv3irwl+bwYUMxZsxobd4ulVClcx7LXQJJiHmKhXmKg1mKRV95es2cgRbNP4dH56548KDg2yBdu3YdAODi4vLKptvU1BTjxv2IrwcMxLFjxwEAAQGBqFWrFr4ZPLjQpvu/njx5goSEBLi6lvyZooODg+UugSTEPPWnWJxeDjw/xdzTsxu6deuKzZu35Fm/ZvUqlCpVCj/8OBZt23VA23YdAADGRsZ5xhZGF5OmLVy0GFWr1VA/PqpdV/LXIHklGpSSuwSSEPMUC/MUB7MUiz7y9Jo5A+7u7ujm2R2RkZGFjn+3Vi0AQGxs3uurAcDQ0BDGxsbIzc3VWJ6Tm6M+6FMUKpUKu/fsRWcPD5QvXz7PenNzcxgYGBR5v3Jwdn5H7hJIQsxTf4pN033ixEkYGRnD0MgIJ0+e0lhXunQpuLq6Yv6C33H27DmEhoailI2NxpiQkBDUrFEDJiYm6mW1a9cuch1JSUl4+PARan/0oXqZgYEB3n//vQK3y8zMRHJysvqRkpJS5Nem4i2neJwYQhJhnmJhnuJglmLRdZ7e3l7o3NkDQ4cNR3JyCuzs7GBnZwfTfyZKc3Z2xsiRI/Dee+/B0dERLVu0wIIF83HhwkWNeX1OnzoB93/m70lOTsb58xcwedIkNGrUEE5OTvD07IauXbri74MHX1lHYebM+RkxMTHYv28PunbtAjc3N1Sq5IIe3bvj8OGDJeayRBOTgiego5KFeepPsfkvW25uLj5t9pn655clJDxDfHw8evfuidjYWDg42GPC+PEaY3x8dmHc2B/xyy9zsHDhYjg5OeKbbwa/Vi0rVqzA0GFDcffuPYSGhmLQoIG8fyLBENlyl0ASYp5iYZ7iYJZi0XWeX/brCwDYuWObxvKRo0Zj69ZtyMrKRNMmTTBgwNcwNzNDzIMHOHDgAOYv0JxIzdXVFdbWVurn3w4Zignjx2HRwoUoVaoUoqOjMOfnn7F27brXqjMhIQHt2nfEsKFDMGLEd3B0cMCzZ88QGBiImTO8kJiY+Fr71be0NN5iVyTMU3+KTdMNPP9m8VVUKhW+HTIUM36ajuPHjiAsPByTJ0/V+AWbmpqKfl/2x5zZs3D40N8ICQmBl5c3Viz/q8h1LP1zGcqVL4f58+ciNzcXm7dsxd8HD8Laio3328wy55ncJZCEmKdYmKc4mKVYdJ2nvYNTgetjYh6gS9duRd5PXFwcRo0e80a1/VdSUhJmzZ6DWbPnSLpffYqOjpG7BJIQ89QfRUV7R5XcRYjI0tISwUEBSDs0C7mxIXKXQxJ4YmCHMjlxhQ+kEoF5ioV5ioNZikWKPJW278Cs7TS0atUat/38JKqMXkeNGjUQEBAgdxkkEeb55l70fFWr1cj3ADJQjK7pJiIiIiIiIhINm24iLZnn5v/tFZU8zFMszFMczFIszFMsjx69esZ3KpmYp/6w6SYiIiIiIiLSETbdRFpKVVrKXQJJiHmKhXmKg1mKhXmK5VX3GaeSi3nqT7GavVxECusKUGZnyF0GSUCRawWl0kzuMkgizFMszFMczFIsUuSpsLGXqBoiInmw6dYx00b95S6BJGKfkwMDAwO5yyCJME+xME9xMEuxSJVneloa4uPjJaiI3kRoaKjcJZCEmKf+sOnWMQ+PLkhN5Y3nRVCuXDnExsbKXQZJhHmKhXmKg1mKRao84+PjER3DewrLzcHBHvfuRchdBkmEeeoPm24du+PvX+A926jk4L0MxcI8xcI8xcEsxcI8xWJmZi53CSQh5qk/nEiNSEsZGelyl0ASYp5iYZ7iYJZiYZ5iYZ5iYZ76w6abSEsREfflLoEkxDzFwjzFwSzFwjzFwj
zFwjz1h6eX61itmjV5TbcgnJwcERkZJXcZJBHmKRbmKQ5mKRZ95clrvvWjatWqvFxAIMxTf9h065iPzw65SyCJxMbGoly5cnKXQRJhnmJhnuJglmLRV57paWlo+kkzNt5EVCyx6dax9AuroIrnrIAiMFAZI02RKXcZJBHmKRbmKQ5mKRZ95KmwsYdpk0GwtbVl061jcXFxcpdAEmKe+sOmW8dUiQ+RG8/rJYSgMEWuihNOCIN5ioV5ioNZikUPeXKCIv3JycmWuwSSEPPUH/6eItJSitJK7hJIQsxTLMxTHMxSLMxTLBUqVJS7BJIQ89QfNt1EREREREREOsKmm0hLNjnxcpdAEmKeYmGe4mCWYikOeQ4bNhQH9u9DcFAAbt28jpUrlqNKlcp5xtWpUxtbt25GaEgQggL9sXPHdpiamua73wYNGmDN6pW4dtUXMdGRcG/VSqt6Ro8aCV/fy9jlswOVK1fSWGdkZIQh336DI0cOISw0GH63b2L3rp3o7ukJQ0P5rwoNDw+TuwSSEPPUHzbd//D07IYAfz+5y6BiLJWnyAmFeYqFeYqDWYqlOOTZqGFDrF6zBu3ad0SPL3rC0MgQmzZugJmZmXpMnTq1sWH9Opw+dRpt2rZHm7btsGr1auTm5ua7X3NzM9zxD8CEiZO0rqVe3br4/PPP0b//1/DZtRteM2eq1xkZGWHjxvUYOnQoNqzfiA4dO6FN2/ZYvXotvvrqS1SrVvX1PgAJlS9fQe4SSELMU3+K9JWZra0tfvjhezT//H8oW7Ysnj17Bn//AMybNx9XfH11VSM8Pbth/ry5AIDc3Fw8fPQIZ06fwUwvbzx58kRnr0v0siyFkdwlkISYp1iYpziYpViKQ569evfReD5y5Gj43b6J999/H5cuXQIATJs2FStWrsKixUvU48LCwgvc74kTJ3HixMki1WJTygaPHj1CQEAADA0N4Nmtm3rdwAFfo2GDBmjdui387txRL79//z727tsHIyP5P0sLCwu5SyAJMU/9KVLTvfyvZTAyNsKIkaMQEXEfdnZ2aNLkY5QuXVpX9aklJiai6SfNoFQqUbNmDcyb+xvKly+Pnr166/y1iQDAADlyl0ASYp5iYZ7iYJZiKY55WltbAwASEhIAAGXKlEGd2rXhs9MHe3b7wNnZGaGhYZgz52dcvnJF0tc+efIU+n/5JcJCg5GSkoJBg79Rr/Po7IEzZ85qNNwvZGdnIztb/pmmMzMz5C6BJMQ89Ufr08utra3RsGEDeHnNwvnzFxAdHY0bN25g0aLFOHzkiHrcoEEDcezoEYSGBMH3yiV4e3vB3NxcY19t2rTGieNHcTc8FJcunsfgwYMKfX2VSoW4uDg8evQIJ06cxIqVq9C0aROYmpqiWbNm2OWzAwH+fvDzu4U1a1bB2dlZvW2jRg0REx2p/iULALVq1URMdCQcHR3zfc2+ffvg/LmzuHc3DGdOn0SXLp21/bhIQMXhujSSDvMUC/MUB7MUS3HLU6FQYPr0qbh8+TKCgoIAAM7O7wAARo8ZjQ0bNqFXrz647eeHLVs2oVIlF0lfPzs7G71690HtOvXwwYe1cfbsOfW6ypUqITQ0VNLXk1p4+F25SyAJMU/90brpTklJQXJyMtzdW8HY2Djfcbm5uZg8ZQqaffY5RowchSYfN8akSRPV69977z38ufQP7N6zF583b4Hf5s7Djz98D0/Pbvnu81XS09NhYGAAAwMDmJub4c9lf6F1m3bo3r0HVLkqrFj+FxQKRZH2+TJ3d3f8NH0a/ly2DP/7vDnWrd+AeXN/Q+PGjV453tjYGJaWluoHT9cQT7yBndwlkISYp1iYpziYpViKW57e3l6oXq0avh0yVL1MqXz+5/D69RuwZetW+N25g2nTpiMsLBw9unfXSR1PnjxBVlbWf5a+/t+t+lK9enW5SyAJMU/90fr08pycHIwcNRq//Pwz+vTuDT+/27hw8RJ2796NgIBA9bjly1eof46Kis
Kcn3/BnNmzMGHC88Z78KCBOHv2HObPXwDg+TcsVd3c8O03g7F16zataqlUyQV9+/TGjRs3kZKSggMH/tZYP3r0GPj53ULVqlXV32IW1bffDMLWrduwZs1aAMCyZX+hdu2P8M03g3H+/IU844cPG4oxY0a/1msRERERkW55zZyBFs0/h0fnrnjw4KF6+aNHsQCA4OBgjfGhoaFwcHDQW33hd8Ph6uqqt9cjIv0p0uzlBw78jdp16qJ//69w4uQpNG7UEIcO/q1xlLpp0ybYsmUTrvpeQXBQAH5fsAC2trYw++eWC25urrjyn+tjrlzxRaVKldTfNL6KjY0NQoIDERYajDOnTyEu7jGGDR8O4HkTvmTxIlw4fxZBgf64dOl5U+zgYF+Ut6fB1dUtz+RwV674wi2fX4YLFy1G1Wo11I+Patd97dem4slUlSp3CSQh5ikW5ikOZimW4pKn18wZcHd3RzfP7oiMjNRYFxkZiQcPHqJKlSoayytXroSo6Gi91bjLZxeaNm2Cd2vVyrPO0NBQY7Z1uXACY7EwT/0p8i3DMjIycPrMGcyfvwAdOnpg69Zt+P6fI7yOjo5Ys3oVAgICMXDQILi3boOJ/9xGwaiAU9K1kZSUhBYt3fHZ/5rD1a0aOnfpqr4OYc3qVShVqhR++HEs2rbrgLbtOgAAjI2ev2ZurgoANE43NzSUdgbIzMxMJCcnqx8pKSmS7p/kZ6gqfpPB0OtjnmJhnuJglmIpDnl6e3uhc2cPDB02HMnJKbCzs4OdnZ3GPbj/WLoUX3/VH23btoGLiwt++OF7VKniik2bNqvHbNmyCf2/7Kd+bm5ujlq1aqJWrZoAAKd3nFCrVk042L/eQZ+/lq/AlSu+2LJlM77s1w81a9bAO++8g/bt22Hf3t157ukth4wMTrwlEuapP0WavfxVgkNC4O7eCgDw/vvvQalUYvr0n6BSPW9027dvrzE+JCQU9erV01hWr15dhIffLfBeiLm5ubh3716e5aVLl4Krqyu+/2EsLl++DACo/5/9v/gWp1y5cnj27BkAqH9B5ic0NAT16tbFtm3bNeoMDgkpcDsSV7LSCiY56XKXQRJhnmJhnuJglmIpDnl+2a8vAGDnDs3LGEeOGq2+tHH58hUwNTHB9GlTUapUKfj7++OLL3oiIiJCPd7F2Rm2trbq5x988D52bP93n9OnTQUAbNm6DaNGFf2Sw8zMTPT4oicGDRyA3r17YfLkSUhLT0NoSAhWrFyFwMDXu2RSSvb29uq/pankY576o3XTXbp0Kfz551Js3rwFAQEBSE5OwQcfvI8h336DQ4cOAwDu3bsHY2NjfPVVfxw5chT16tVFnz6at/T6889lOHBgH0aOHIE9e/agTp066N//S4yfMPFVL1uohIRniI+PR+/ePREbGwsHB3tMGD9eY8y9e/cQHR2NMWNGYc6cn1G5cmV8U8iM6X/88SeWLl0Cvzt3cObMGbRo0QJtWrdG9x5fvFadRERERKR/9g5OWo1btHiJxn26/6tBw8Yazy9cuKj1vrWVmZlZaB1EVPIUYfbyVFy/dh2DBg7Azh3bceL4Ufz4w/fYsHETJk6aDADw9w/A1GnTMXTIEJw4fhSdPTwwa9Zsjf3c9vPD4G++RccO7XH82FH88P0Y/PLLb1pPovZfKpUK3w4Zivffew/Hjx3BtGlTMWOml8aY7OxsDBkyDK5VXHH0yBEMHTIEc37+pcD9Hjx0CFOmTsM3gwfjxPFj6NO7F0aNHoMLFy6+Vp1U8tnkPJW7BJIQ8xQL8xQHsxQL8xTL3bu8xZRImKf+KCraO6rkLkJElpaWCA4KQNqhWciN5SnpIkhSWsMqN1HuMkgizFMszFMczFIs+shTafsOzNpOQ6tWrXHbz0+nr/W2c3R0QFSU/iaXI91inm/uRc9XtVoNJCcn5zuuyBOpEb2tMhUmcpdAEmKeYmGe4mCWYmGeYrGyspa7BJIQ89QfNt1EWlIg/4n+qORhnm
JhnuJglmJhnmLJzs6WuwSSEPPUHzbdRFqyzeG9DEXCPMXCPMXBLMXCPMUSwrv4CIV56s8b3zKMCqawrgBlNu+BJ4LHuVYoq0ySuwySCPMUC/MUB7MUiz7yVNi83n2xqehq1KiBgIAAucsgiTBP/WHTrWOmjfrLXQJJxCQ2FmblysldBkmEeYqFeYqDWYpFX3mmp6UhPj5e569DRPQ62HTrmIdHF6SmpspdBkmgdOlSePo0Qe4ySCLMUyzMUxzMUiz6yjM+Ph7RMTE6f5233dOn/GJDJMxTf9h069gdf/8Cp4+nksPKygpJSTzlURTMUyzMUxzMUizMUywpKTyQJBLmqT+cSI1IS46OjnKXQBJinmJhnuJglmJhnmJhnmJhnvrDppuIiIiIiIhIRxQV7R1VchchIktLSwQHBfCaboGYmJggI4Mz0YuCeYqFeYqDWYqFeYqlOOXJ6/jfnLm5OfuUN/Si56tarUaBlxTzmm4d8/HZIXcJJJFnz57BxsZG7jJIIsxTLMxTHMxSLMxTLMUpz/TUVDT99DM23m+gdOlSbLr1hE23jj2dNRfZQWFyl0ESeFquDDJjn8hdBkmEeYqFeYqDWYqFeYqluORp6OKE0tPGwdbWlk33G7C2tkF0ND8/fWDTrWM5EdHIDg6VuwySgCo9Ddn3o+UugyTCPMXCPMXBLMXCPMXCPMWSm5MjdwlvDU6kRqQla/5HRijMUyzMUxzMUizMUyzMUyxBwcFyl/DWYNNNpKVnzrytgkiYp1iYpziYpViYp1iYp1iqV6smdwlvDTbdRFpSKRVyl0ASYp5iYZ7iYJZiYZ5iKSl59u3bB0ePHEZQoD+CAv2xZ88ufPZZM40xvXr1xPZtWxEU6I+Y6EhYW1sXut8xo0chJjpS43H61IkCt1EqlfD29sL1a75Yt3YNypQpo7He0tISY8f+iNOnTiA8LAQ3rl/Fls0b0bq1exHfddEplGwF9eWt+KTHjB6FI4cPvvF+YqIj4d6qlQQVUUlknJQidwkkIeYpFuYpDmYpFuYplpKS54MHD+A9axbcW7dB6zZtce7ceaxauQJVq1ZVjzEzM8PJkyexcOGiIu07MDAIH3xYW/3o1KlzgeM7duwABwd79OzVG7f9/DD2xx/U66ytrbFn9y5069oFCxctRiv3NujcpSt279mLSRMnavVFwJt4lpCg0/3Tv3Q+kdq8eXPR3bMb1q5bh3HjJmis8/aaiS+/7IctW7dh1KjRui7ljX3wYW08e/ZM7jJIJkYpvKWCSJinWJinOJilWJinWEpKnkeOHNV4PmfOz+jbpw/q1P4Iwf9cx7x8+QoAQKNGDYu075ycbMTFxWk9vpSNDaIioxAYGAQ3Nze0adNavW7cuLFwcnJEk6af4tGjR+rl4eF3sWvXbp3fE/1ZYqJO90//0suR7ujoaHTs0AGmpqbqZSYmJujUqSOioqL0UYIk4uLikJmZKXcZJJOUCnZyl0ASYp5iYZ7iYJZiYZ5iKYl5KpVKdOzQAebmZvC9eu2N91epUiVcu+qLC+fPYtHC3+Fgb1/g+B07fVCnTm3cuxuGKVMmY8GC3wEACoUCHTt0wE4fH42G+4XU1FTk6Hh28XfeeUen+6d/6aXpvn3bDzExDzSuTWjTujWiY2Lg53dHvezSxfMYMOBrjW2PHD6IMaNHqZ/HREeid+9eWLNmFcJCg3Hq5HHUqVMbLi4u2L5tK0JDgrBntw+cnZ3z1NG7dy/4XrmEsNBgLF26BFZWVup1H3zwATZv2gC/2zcRGHAHO7Zvw3vvvquxPU8vJyIiIiIq/qpXr46Q4EDcuxuG2bO98fWAgQgJCXmjfV67fh0jR41Gr969MW78RLzzjhN8fHbAwsIi320SExPh3rot6tZrgPr1GyIgIBAAYGtri9KlSyE0NOyNaqKSQW/XdG/esgU9unuqn/fo4YktW7a+1r5GjhyB7dt3oEXLVggNDcPiRQsxZ84sLFy0GO6t2wIKBbxmzt
DYxsXFBe3bt0O/L/ujZ68+ePfddzHL20u93tLSAlu3bUenTp3Rrn1H3L17F+vWrSnw/0QvMzY2hqWlpfqh7XZUclg81P5UIir+mKdYmKc4mKVYmKdYSlKeYWFhaNHSHW3bdcDateuwYP48uLm5vdE+T5w4iX379iMgIBCnTp1C7z79YG1tjQ7t2xW6bVxcHHJzc9XPFQr5J6WLjIyUu4S3ht6a7h07dqJevXpwcHCAg4MD6tath507dr7WvrZs2Yq9e/chPPwuFi9ZgnfeeQc+O3fh1KlTCA0NxYrlK9GoUSONbUxMTDBixCjcueOPS5cuYdKkKejYsQPs7J6fJnPu3Hns3OmD0LAwhIaG4ocfx8LMzEzr6zyGDxuK4KAA9eP6Nd/Xem9UfGVZmMldAkmIeYqFeYqDWYqFeYqlJOWZlZWFe/fu4fbt25g1ew78/f0xYMBXkr5GYmIiwsPvwsXFpcjbPnnyBAkJCXB1rSJpTUVhZWUp22u/bfTWdMfHx+PYsePo7tkNPbp74tjxY4h/+vS19hUQEKD+OS7u8fNlgYH/LnscBzMzU1ha/vsPKTo6Gg8fPlQ/v3r1KgwMDFClyvN/6GXLlsUvP8/B2bOnERhwB8FBAbCwsICDg4NWNS1ctBhVq9VQPz6qXfe13hsVX5n8xSQU5ikW5ikOZikW5imWkpynQqmEsbGJpPs0NzeHs7MzYmNji7ytSqXC7j170dnDA+XLl3/lvg0MDKQoM1+lSpXW6f7pXzqfvfxlm7dsUZ/2PWHipDzrc3Nz85xqYWiYt8TsrGz1zyqV6vmy7Kw8y5RFuPfcgvnzULp0aUyZMhVRUdHIzMzE3j27YGRkpNX2mZmZnGRNcAqV3BWQlJinWJinOJilWJinWEpKnuPHjcXxEycRHR0NS0tLeHTqiMaNGqFnz97qMXZ2dihXzg6V/jlKXb16daSkJCM6OgYJ/9xKa8uWTTj490GsWr0GADBl8iQcPnIUUVFRqFChPL4fMxq5uTnw2bX7teqcM+dnNG7UEPv37cHsOT/j5s1byM7OQoP6DTBs+FC0adMOiTqdYbyEBCoAvTbdJ06chJGRMVRQ4eTJU3nWP3kSj/LlyqmfW1paSjarnoODA8qXL6+eHbB27drIyclBWNjzyQvq1auL8RMm4vjx5ze4t7evmOfm9fR2s7nH615EwjzFwjzFwSzFwjzFUlLyLFu2LH5fMA/lypVDUlISAgIC0LNnb5w+c0Y9pm+f3hgz5t9bFu/y2QEAGDlqNLZu3QYAcHF2hq2trXpMxYoVsWTxIpQuXQpP4uNx5fIVtGvfEfHx8a9VZ0JCAtq174hhQ4dgxIjv4OjggGfPniEwMBAzZ3jpuOGGelI30j29Nt25ubn4tNln6p//69y5c/D07IbDR44iMTERP3w/RrKp8jMyMrBg/lz8NGMmLC2tMHPGdOzdu099n727d++ia5cuuHnzFqysLDF50iSkpaVJ8tokhsR3HGB9P1ruMkgizFMszFMczFIszFMsJSXPMd//UOiY3+bOw29z5xU4pkHDxhrPvx0y9I3qepWkpCTMmj0Hs2bPkXzfhalWtSqC/rlvOemW3q7pfiE5ORnJycmvXLdw0WJcvHgJa9eswrq1q3Hw0CFERERI8rr37t3Dgb8PYt3atdi0cQP8AwIwfsJE9foxY36AjY0NDh38G7//vgArVq7E48ePJXltEkOugd7/70I6xDzFwjzFwSzFwjzFwjzFotTxNeP0L0VFe0eezK8DlpaWCA4KwONvxiDrlp/c5ZAEUu3KwDzuidxlkESYp1iYpziYpViYp1iKS56GVV1ht3oxWrVqjdt+/Dv7dTk4OCA6uvifuVCcvej5qlarke+BZUCGI91EJZVxYv7/R6KSh3mKhXmKg1mKhXmKhXmK5elrXotORcemm0hLyfblCh9EJQbzFAvzFAezFAvzFAvzFIvza9xfnF4Pm24iIiIiIiIiHWHTTaQl81hOrCcS5i
kW5ikOZikW5ikW5imW6OgouUt4a+j1lmFvIwNnB6jS0+UugySQaWkOw+RUucsgiTBPsTBPcTBLsTBPsRSXPA1dnOQuQQhmZuZITEySu4y3AptuHSs9frTcJZBEYmNjYVeO1zKJgnmKhXmKg1mKhXmKpTjlmZ6ainhOBPZGbG1t8ejRI7nLeCuw6dYxD48uSE2V/xtBenNOTo6IjORpOKJgnmJhnuJglmJhnmIpTnnGx8cjOiZG7jKItML7dOuItvdsIyIiIiIiopKH9+kmkpiraxW5SyAJMU+xME9xMEuxME+xME+xME/94enlOlarZk2eXi4IJydHmJmayV0GSYR5ioV5ioNZioV5ioV5SqO4nBpvZGQsdwlvDTbdOubjs0PuEkgiCQkJKFWqlNxlkESYp1iYpziYpViYp1iYpzTSU1PR9NPPZG+8k5M5c7m+sOnWsaez5iI7KEzuMkgC2YYGyMrOkbsMkgjzFAvzFAezFAvzFAvzfHOGLk4oPW0cbG1tZW+64+J433V9YdOtYzkR0cgODpW7DJJAQiUnlLobKXcZJBHmKRbmKQ5mKRbmKRbmKZZKlSohICBA7jLeCpxIjYiIiIiIiEhH2HQTack8Ll7uEkhCzFMszFMczFIszFMszFMsMcVgMre3BZtuIi3lGvFqDJEwT7EwT3EwS7EwT7EwT7EYG3P2cn1565puT89uCPD3k7sMKoHSS1nLXQJJiHmKhXmKg1mKhXmKhXnqVt++fXD0yGEEBfojKNAfe/bswmefNdMYM2fOLJw/dxZhoSG4fesGVq1cAdcqBd9ve968uYiJjtR4bFi/DmXLls13GzMzM/yxZDGuX/PFksWLYGZqqrHezs4OM2f8hAvnz+JueCh8r1zCmtUr0aTJx6/57sVW4pruNw14z569aNL0Ux1XSUREREREpL0HDx7Ae9YsuLdug9Zt2uLcufNYtXIFqlatqh5z69ZtjBo9Bp82+ww9e/aGQqHApk0boFQW3NYdP34CH3xYW/0YMnRYgeMHDhyAlJQUfNGzN9LT0zFg4AD1OkdHRxz8+wA+/rgxZsz0wufNW6Bnrz44d/4CvL1mvtmHIKgSdY6Io6Mjdu/yQWLiM8yY6YXAwEAYGhqhWbNP4e01E598+lmh+0hPT0d6enq+642MjJCVlSVl2SQIm3tRcpdAEmKeYmGe4mCWYmGeYmGeunXkyFGN53Pm/Iy+ffqgTu2PEBwcDADYsGGjen1UVBTm/Pwzjh09AicnJ0REROS778zMTMTFxWksS0rK/z7dpWxsEB4ejsDAQISGhsLW1la9bpa3F1RQoU3b9khLS1MvDw4OxubNW7R7s2+ZEnWk++WADxz4G+HhdxEcHIxly/5Cu/YdAQCDBg3EsaNHEBoSBN8rl+Dt7QVzc3P1Pv57evmY0aNw5PBB9PyiBy5eOIe74c9v7+Vgb49VK1cgJDgQQYH+WLp0SYGnYJD4ku3Ly10CSYh5ioV5ioNZioV5ioV56o9SqUTHDh1gbm4G36vXXjnGzMwM3bt3R0RERKGTojVq1BC3bl7HmdMnMWuWN0qXLoVKlVzyHb9y1Wr07t0bEffC0b27J5avWAkAKFWqFD77rBlWr16j0XC/kJiYqO1bfKuUmCPdLwKePefnAgPOzc3F5ClTcP9+JJyd38Esby9MmjQREyZMzHffLi4uaNOmDQYMGISc3BwoFAqsWrUCKSmp6NylGwwNDeDt5YWlfyxB126eOnqHVNzlGBvJXQJJiHmKhXmKg1mKhXmKhXnqXvXq1bF3zy6YmJggJSUFXw8YiJCQEI0x/fr1xaSJE2BhYYHQ0FD0+KJXgWfqnjxxEn8f+Bv3IyPh4uyMceN+xPp16/Dj2HH5bhMVFYWPmzRF2bJlNY6Qu7i4QKlUIjQ07M3f7FukxDTd2ga8fPkK9c/PT7n4BXNmzyqw6TYyMsJ3I0YiPv75bRA+adoU1atXR8NGjRET8wAA8N2IkTh18jg++OAD3L
x5M88+jI2NNWYAtLCwKNL7o+LPMC3/yxKo5GGeYmGe4mCWYmGeYmGeuhcWFoYWLd1hZWWFdm3bYMH8eejcpZtG471zpw9Onz6NcuXK49tvBuPPpUvQsVNnZGRkvHKfu/fsUf8cGBgI/4AAXLxwDtWqVcWdO3fyrUWlUuU5JV2heMM3+JYqMU23tgE3bdoEw4YNhWsVV1hZWcLAwBBmZqYwMzVFWj7XckdFR6sbbgBwc3NFTEyMuuEGgJCQECQkJMDNzfWVTffwYUMxZszoor0pKlHMnjyVuwSSEPMUC/MUB7MUC/MUC/PUvaysLNy7dw8AcPv2bXz44QcYMOArjB07Xj0mKSkJSUlJuHv3Hq5du4YAfz+0dnfHrt27tXqN+/fv48mTJ7C2Kvps9Hfv3kNubi5cXQueMZ00lZhrurUJ2NHREWtWr0JAQCAGDhoE99ZtMHHiJACAUQH3oUtLTX3j+hYuWoyq1WqoHx/VrvvG+6TiJcmxotwlkISYp1iYpziYpViYp1iYp/4plEoYG5vkv16hgEKhgLGJ9vfcrlixAkqXLg2lQdFbwYSEBJw8eQpfftkPZmZmedZbW/O2cq9SYppubQJ+//33oFQqMX36T7h27TrCw++ifIWiT/gQEhIKe3t72Nv/+4vFzc0NpUqVQnBwyCu3yczMRHJysvqRkpJS5NclIiIiIqK30/hxY9GgQQM4OjqievXqGD9uLBo3agSfnT4AgHfeeQfDhg3Fe++9Bwd7e9StWwfL/lyKtPR0HDt2XL2f06dOwN3dHQBgbm6OyZMmonbtj+Do6IgmTT7GqpUrcPfePfj6Xn2tOidMnAQDpRIH9u9FmzatUamSC1xdXfH1V/2xd8+uN/4cRFRiTi8Hnge8e9dOHNi/F7/8+hsCAgJgYGCITz9pir59++DbIUNhbGyMr77qjyNHjqJevbro06d3kV/n9JkzCAwMxKKFCzF16jQYGBpilrcXzp+/gFu3bungnVFJYPaYp1SJhHmKhXmKg1mKhXmKhXnqVtmyZfH7gnkoV64ckpKSEBAQgJ49e+P0mTMAgIyMDDSoXx8DB3wNGxsbPH78GBcvXkLHjp3w5MkT9X5cXV1hbW0F4Pkk0zVq1EC3bl1hbW2NR48e4dSp0/j5l1+Rk5PzWnXev38frdzbYMR3wzF1ymSUK1cOT+LjcfvWbYwbP+HNPwgBlaimu7CA/f0DMHXadAwdMgQTxo/DxYuXMGvWbCz8fUGRX6t//68xc+YM7Ny5Hbm5uThx8iQmTZqig3dFJYVKWWJODCEtME+xME9xMEuxME+xME/dGvP9DwWuf/ToEfr07VfofuwdnNQ/p6eno2evVx+ELFumTNEKfElsbCwmTpqMiZMmv/Y+3iaKivaOKrmLEJGlpSWCgwLw+JsxyLrlV/gGVOwlVHJCqbuRcpdBEmGeYmGe4mCWYmGeYmGeb86wqivsVi9Gq1atcdtP3h6hRo0aCAgIkLWGku5Fz1e1Wg0kJyfnO45fVxERERERERHpCJtuIi1Z34+WuwSSEPMUC/MUB7MUC/MUC/MUy8v3/ibdYtNNpKWU8nZyl0ASYp5iYZ7iYJZiYZ5iYZ5icXJyKnwQSYJNN5GWcopw/0Mq/pinWJinOJilWJinWJinWExNTeUu4a1RomYvL4kMnB2gSk+XuwySgElpGxgamchdBkmEeYqFeYqDWYqFeYqFeb45Q5fic3Q5PS1N7hLeGmy6daz0+NFyl0ASsc3JgYGBgdxlkESYp1iYpziYpViYp1iYpzTSU1MRHx8vdxmIjIqSu4S3BptuHfPw6ILU1FS5yyAJODk5IjKSv5xEwTzFwjzFwSzFwjzFwjylER8fj+iYGLnLgJubG28ZpidsunXsjr9/gfdso5IjOyeHv5gEwjzFwjzFwSzFwjzFwjyJXg8nUiPSUmxsrNwlkISYp1iYpziYpViYp1iYp1iYp/6w6SbSkkqVK3cJJCHmKRbmKQ5mKRbmKRbmKR
bmqT9suom0VL58BblLIAkxT7EwT3EwS7EwT7EwT7EwT/1h001ERERERESkI2y6ibQUGhoqdwkkIeYpFuYpDmYpFuYpFuYpFuapP2y6ibRkb28vdwkkIeYpFuYpDmYpFuYpFuYpFuapP2y6ibRkbm4udwkkIeYpFuYpDmYpFuYpFuYpFuapP2y6ibSUkZEhdwkkIeYpFuYpDmYpFuYpFuYpFuapP2y6ibR07949uUsgCTFPsTBPcTBLsTBPsTBPsTBP/WHTTaSlatWqyV0CSYh5ioV5ioNZioV5ioV5ioV56o+h3AWIzsLCQu4SSCLm5uawtLSUuwySCPMUC/MUB7MUC/MUC/MUC/N8c9r2emy6daR06dIAgOvXfGWuhIiIiIiIiHTFwsICycnJ+a5n060jT58+BQB8VLsuUlJSZK6G3pSFhQWuX/NlnoJgnmJhnuJglmJhnmJhnmJhntKxsLDAo0ePChzDplvHUlJSCvzWg0oW5ikW5ikW5ikOZikW5ikW5ikW5vnmtPn8OJEaERERERERkY6w6SYiIiIiIiLSETbdOpKZmYnffpuLzMxMuUshCTBPsTBPsTBPcTBLsTBPsTBPsTBP/VJUtHdUyV0EERERERERkYh4pJuIiIiIiIhIR9h0ExEREREREekIm24iIiIi+n97dx7XU/Y/cPxVaZUIYdQoyxiDL4NiGN/vjNmsla0Yy5jFLMiWvaKxzpCybzOYMYgYYxtkbIUZmSlFqU872rSnRRH1+8P4jI+iovpkfu/n49Hj4XPvuee+zz2P8/h4f+459wohhKgiknRXkY/HjOGi3x/EREfy6+FDvP766+oOSTyDaY5TSUyIU/k763tG3WGJcurWrRvbftzKpQB/EhPi6NO7d4kyM6ZPI/CSP9FRkXjt9qR5c4vqD1SUqay+XLHCo8RY3blju5qiFWVxcJjA0SO/EhEexpXLgWzdspmWLVuolNHV1WXJ4kWEhFwhMkLB999tomHDhmqKWDxJefry5717SozPb79doqaIxdN89NFoTp74jXBFKOGKUA4dOkCvXm8r98u4fLGU1Z8yNquPJN1VwMbGGlfXuXh4rKR3n36EhobiuXM7DRo0UHdo4hkoFOF0fL2z8m/gwMHqDkmUk4GBPldDw3Bydil1/4Tx4/j000+YPduJAdbW3L6dj+fOHejq6lZzpKIsZfUlwOnTZ1TG6vgJDtUYoaiI7m+8wY/btjHA2pbhH46glnYtdnnuRF9fX1nm669def/99/jyy68YPMSOxk0as2Xzd2qMWpSmPH0JsGPHTpXxuWiR/Me+JkpKSmLJN9/Qp28/+vbrz++//8EPW7fQunVrQMbli6as/gQZm9WllroD+Df64vPP8fTchdeePQDMmj2Hd999lw+HD2PtuvVqjk5U1P3790hNTVV3GOIZnDnjw5kzPk/cP3bsZ6xatYbjv/0GwKTJU7gcdIk+vXtz8NChaopSlEdZfQkPXn8iY/XFMHLUaJXPU6Y4EhJ8mQ4dOnDx4kXq1KnDh8OHMcFhIr///gcAjlOncfasD507d+LSpUB1hC1KUVZfPpRfkC/j8wVw4sRJlc9Lly7jo9Gj6dK5E0lJSTIuXzBP68+IiAhAxmZ1kTvdlUxbW5sOHf7DuXPnlduKi4s5d/4cXbp0UWNk4lk1b96cSwH+XPjjPGvXrMa0aVN1hyQqQbNmzWjcuDHnzp9TbsvJySEwMIguXTqrMTLxrLp3f4MrlwM5d9aHb75ZgrFxPXWHJMrJyMgIgKysLAA6dPgPOjo6Kt+lUdHRxMfHy3dpDfd4Xz40eNAgQoIvc/rUSebMnoW+np4aohMVoampia2NDQYG+vgHXJJx+YJ7vD8fkrFZPeROdyWrX78+tWrVIjVN9RejtNQ0WrVspaaoxLO6FBjIlKmOREdH06hRY6Y5TmH//n30euc98vLy1B2eeA6NGpkAkJqaprI9NS2VRo0aqSMk8Rx8zvhw7OgxbsTFYWFuzu
zZM9mxfTvWNrYUFRWpOzzxFBoaGsyf78qff/5JeHg4AI1MGnHnzh2ys7NVyqamptHIxEQdYYpyKK0vAfYfOEB8fALJycm89lobnJ2daNmyJWM//0KN0YonadOmDYcPHUBXV5e8vDw+G/s5kZGRtG/XTsblC+hJ/QkyNquTJN1CPMWj01nDwhQEBgby58UL2FgPYNduL/UFJoRQ8ehyAIVCQWhYGH4XfqdHj+6cP/+7GiMTZVmyZDFtXn2VgYPkeRkvuif15c6dnsp/KxQKUlJS2LvHC3Nzc65fv17dYYoyREdH8/4HfahTpw4D+vdj1coVDB5ip+6wxDN6Un9GRkbK2KxGMr28kmVkZHDv3j1MGqr+4tfQpKGsl/gXyM7OJiYmFgsLC3WHIp5TSsqD8WhiovrUVZOGJqSkpKgjJFGJbty4QXp6uozVGm7xooW8/967DLUbRlLSTeX2lNQUdHV1lVOVHzIxaUiKfJfWSE/qy9I8XPsr47NmKiws5Nq1awQHB/PNt0sJDQ1l7NhPZVy+oJ7Un6WRsVl1JOmuZIWFhVy5EkzPnm8qt2loaNCzZ08CAgLUGJmoDAYGBpibm0tS9i9w48YNkpOT6dmzp3KboaEhnTq9TsAja53Ei+mll5pgbGxMSrKM1Zpq8aKF9OnTBzv7YcTFxansu3IlmLt376p8l7Zs2QIzMzP5Lq2BntaXpWnfrh0AKSnJVR2aqAQampro6OjKuPyXeNifpZGxWXVkenkV+O7771m5woPLV64QGBjE559/hoG+Pru99qg7NFFB8+a68NuJk8THx9OkSWOmT3OkqOg++w8cVHdoohwMDAxU3rv9crOXadeuLVmZWSQkJrJ58xYmT5pIbEwsN+LimDljOsnJyXgfP66+oEWpntaXmVlZTHOcypGjR0lJScXCwhwXZydir13Dx9dXfUGLJ1qyZDGDBtryyadjyc3Nw+Tv9aA5OTkUFBSQk5PDrt1efO06j6ysLHJyclm8aAH+/v7yhOQapqy+NDc3Z9CggZw6dZrMzEzavvYaX3/tyoULfoSFKdQcvXjcnNmzOH3Gh4SEBAwNDRk00JYe3bszYsQoGZcvoKf1p4zN6qXxUlOzYnUH8W/0ycdjGDfuK0xMTLh6NZS58+YRGBik7rBEBW1Yv45u3bphbFyP9IwM/vrzL75dukzWubwgund/g30/7y2x3WvPXqZOdQRgxvRpjBw5AiMjI/766y/mODkTExNb3aGKMjytL+fMcWLrls20b98OIyMjkpOT8fU9yzK35aSlpZVSm1C3xITS74ZOmerInj0P+llXVxfXeXOxtbVFV1cHHx9f5jg5y1KtGqasvmza9CXWrF7Nq21exUBfn8SkJLyPebNy1Wpyc3OrOVpRFvflbvTs+SaNGjUiJyeHsLAw1q3bwNlzD970IePyxfK0/pSxWb0k6RZCCCGEEEIIIaqIrOkWQgghhBBCCCGqiCTdQgghhBBCCCFEFZGkWwghhBBCCCGEqCKSdAshhBBCCCGEEFVEkm4hhBBCCCGEEKKKSNIthBBCCCGEEEJUEUm6hRBCCCGEEEKIKiJJtxBCCCGEEEIIUUUk6RZCCCGEqOE+HD6MXZ47K7XOw4cP0q9f30qtUwghREmSdAshhKhxVqzwIDEhjm+/XVJi35LFi0hMiGPFCg81RCYAEhPi6NO7t7rDqBY/793D/Pmuao1BV1eXGTNm4OGxQrlNU1OTJUsWE3jJn+0/baNBgwYqxxgaGjJr1kzO+p4hJjqSoMAAvHZ70rdvH2WZVatW4+Q0Bw0NjWprixBC/H8kSbcQQogaKSEhAVsbG/T09JTbdHV1GTjQlvj4eDVG9v9HrVq11B1Clanutmlraz/zsf379yM3N4e//P2V22xtbTA1bcqIkaMIDglh1swZyn1GRkYcOngAu6FDWLN2Hb379GPwkKEcPHQYF2dnjIyMADh9+gyGtWvzzju9nr1hQgghyiRJtxBCiBopODiExM
QklTtz/fr2JSExkZCQqyplNTQ0cHCYgN+F34mOiuTEieP0799PuV9TUxP35W7K/efO+vDZZ5+q1LFihQdbt2zmqy+/JPCSPyEhV1iyeNFTk7O2bV9j714vIsLDCFeE4n3sCB06dABgmuNUTvzmrVJ+7NjPuOj3R4lzTpzowOWgS4SFhjB1ymS0tLSY6+LM1ZBg/P3/ZJi9vfIYMzMzEhPisLYewP5f9hEdFcnRI7/SokVzOnbsyLGjR4iMULBj+0/Ur19f5fwjPhyOr89pYqIjOet7hjFjPipRr42NNft+3ktMdCSDBw8q0eaH8W/dupnEhDiV9vT+4AOOex8lJjqSC3+cx3HqFLS0tJT7ExPiGDVqJNu2/UB0VAS+Pqfp0qUzFhYW/Lx3D1GR4Rw6uB9zc3PlMQ+v46hRI/H/6yLRURFs3LieOnXqPHfbjI3rsX7dWgL8/yI6KoJTJ08w0NZWpX969OjO52PHkpgQR2JCHGZmZtjb2xEWGqJy/j69e5OYEFci7hEfDsfvwu/ExkQBDxLi5W7LCL4SRLgilD17dtO27WslrvOjbG1tOHHipMq2enXrEh8Xj0IRjkKhwKiukXLf7NmzePllM/oPsGHv3p+JjIwkJiYWT89dvP9Bb/Ly8gAoKiri9Okz2NraPPX8Qgghns+/9ydsIYQQL7zdXl4MH2bP/v0HABg+3B4vrz306N5dpdzEiQ4MGTyIWbOdiI2N5Y03urFm9SrS0zPw8/NDU1OTpKQkvvhyHJmZmVhadsFt2VJSUlI4fPhXZT09enQnOSUFO7thWDS3YOOG9YRcvYqn565S41u7Zg0hV0OYM9uJ+0X3adeuHffuFVaojW++2YOkpCQGDxmKlaUVHh7LsbS0xO/iRQZYW2NjY8PSpd9w9txZkpJuKo+bPs2Rea7zSUhIwMNjOevWriU3L5d581zJz89n46YNzJgxnTlznAAYNGgg06dPx9nFhZCQq7Rv3w43t2Xcvn2bvXt/VtbrNGc28xcsJCTkKnfu3CkRb99+AwgJvsyUqY6cOePD/fv3AejatSurVq1g7jxXLl78Ewtzc5Yt+xYAjxUrlcdPmTKZ+fMXMH/+ApydnFi3dg3Xb9xgzdp1yrYsXrSQUaP/SZotLCywth7AmI8/wdCwDu7ubnyzZDEOEyc9V9t0dfW4ciWYdevXk5OTy3vvvsPq1Su5dv06QUFBzJvnSssWzVEownFb7g5Aenp6ufvWwsKCfv36MXbsF9wvenCdvtu0gYKCO4wc9RE5OdmMHjWKPV676fnft8jKyiq1nq5WVuzb94vKtn2/7GeP1y6uxUaTmpbG6L+vl4aGBrY2Nvyyfz/Jyckl6rp9+7bK58CgIBwmjC93m4QQQlScJN1CCCFqrH37fmHO7FmYmpoCYGlpxbhxE1SSbh0dHSZNdGDY8A8JCLgEwI0bN+hqZcXoUSPx8/Pj3r17LHf/Zw14XFwcll26YG09QCXpvnXrFs7OLhQVFREVHc3JU6f4b8+eT0y6TU2bsmHjRqKiowGIjb1W4TZmZWXhMncexcXFREfHMH78V+jr67NmzVoA1qxZi8OE8XS16srBQ4eUx23cuAlfX18AtmzeyoYN67CzH6acgrx7127s7e2U5adPm8aCBQs5dsxbeQ1at27N6FEjVRLT7zdvUZYpTUZGBgDZt7JJTU1Vbp/mOIW169Yr67px4wbL3Jbj4uysknR7ee1RXvN169fz6+FDrFy5WqUtHh7uKufU1dVl8uSp3Lz54EcHF5d5bP/pR+YvWEhqaupztW3jpk3Kf2/94UfeevstbKwHEBQURE5ODnfvFpJfkK/S1vLS1tZm0uQpymvW1cqK119/nQ4dO3H37l0AFixcRO/evenfvx87d3qWqMPIyIi6dety86ZqAp2dnU2fvv0xMTEhPT2doqIiAOrXr4+xcT2ioqLLFWPyzWSaNm2KhoYGxcXFFW6jEEKIsknSLYQQosbKyMjg1KnTDLO3Q0
NDg1OnT5GRmalSxsLCAgMDA3bvUk1YtLW1VaahfzxmDMOH22Nqaoqenh7a2tpcvRqqckx4RIQyeQFISU6hzWttnhjfd999z3K3ZQwdMphz585z+NcjXL9+vUJtDI+IUEl2UlPTCA8PV34uKioiMzOThg1VH5QVGqb455i0Bwlh2KPbUtNo0KAhAPr6+jRvboG7uxtubkuVZbS0tMjJyVGp98rlKxWK/6G2bdtiaWnF5EkTlds0NbXQ19dDX0+P/IKCv2MMU4kRIEyh2hZ9fT0MDQ3Jzc0FHqzvf5hwAwQEBKClpUXLli3Jzc195rZpamoyadJErAcMoEmTJujoaKOjo0N+fv4zXYPHxSckKBNueHCNateuzdUQ1Tj09PSweGRK/eP7gFJnHQAlfgyo6EPRCgoK0NLSQldXl4K/+0gIIUTlkqRbCCFEjbbby4vFixYC4OTsUmJ/7doGAIz+6GOVxAzg7t0HiYqtjQ1z57qwYOFCAvwDyM3LY9y4L+ncqZNK+XuF91Q+F1OMpsaTH3/i7rGC/QcO8O677/JOr15Mm+bIuPEOeHt7P0jeH0uAtEtZH17inMXFFD42Rb24uBgNTdU4Hp3G/jBpv3fvn7qKKUZT88H5a9euDcD0GTMJDAxSqefh9PCHbuerTj8uLwOD2ri7u3O0lLvkBY8kjI+295+4S7ZFU7N8j515nraNH/cVYz/7lHmuX6NQKLh9O5/5813R0dZ56jmLiopKJLe1tEv2bf5jU7lr1zYgOSWFoUPtS5TNvnWr1HNlZmZSVFREvbp1nxrTQ+np6WRlZdGqVctyla9nXI+8vDxJuIUQogpJ0i2EEKJGO3PGB21tHYopxsfHt8T+iIhICgoKMDVtip+fX6l1WFlZ4h/gz7ZtPym3PenOYkXFxMQSE7OZ77/fzPp1axk+zB5vb2/SMzJoZGKiUrZdu3aVcs6KSktLIynpJubm5sr18c/j7t27aGqpJsUhIcG0bNmSa9euPXf9jzM1NaVx48bKNcqdO3fm/v37REdHP1fbrKwsOX78N375ZT/w4C5xixYtiIyIVJYpLCxES1NL5bj09HQMDQ3R19dX3hUvT98GB4fQyMSEe/fulfsJ/IWFhURERPJK61fwPXu2zPLFxcUcPHSYoUMG4+GxssS6bgMDA+7cuaP8QeLVV18t8WBCIYQQlUueXi6EEKJGKyoq4q23e/H22++oTP1+KC8vj42bvmP+167Y2Q3F3Nyc/7Rvz6effIyd3VAAYmNj6dihA2+99RYtWjRnxozpdOzY8bni0tPTY/GihXTv/gampqZYWVrSsWNHIiMfJGx//HGBBg0aMGH8OMzNzfl4zBh69VLfq5nc3d2Z6DCBzz79hBYtmtOmTRuG2dvzxRefV7iuuPh4evbsiYmJCXX/vgPrsWIVQ4cOwXHqFFq3bk2rVq2wtbFh5iOvsnpWd+7cYdVKD9q2fY2uXbuyaOF8Dh/+VTm1+lnbFhN7jf/9779YWnahVatWLFv6LSYNG6q2NS6OTp06YWZmRn1jYzQ0NAgMDCI/P585s2dhbm7OoIEDsbeze8JZ/nH23DkCAi7xw9bNvPW//2FmZoalZRdmzZqpfOp9aXx8fena1aocV+qBpUuXkZiYyJFfDzF06BBeeeUVmje3YPiwYfz2m7dydgBAt65dy5XMCyGEeHZyp1sIIUSN93Bt75MsW+ZGeno6Ex0m0KxZM7KzswkODmH13w8j275jJ+3bt2fjhnUUFxdz4OAhtm376bneT3z//n2MjY1ZvWolDRs2JCMjk2PHjikf2BYVFcUcJ2cmTXRgypTJHDl6lI2bNjFq5IhnPufz8Ny1m/z8AsaN+xIXF2du385HoVDw/eYtFa5rwYKFuLrOY+SID7l58ybd3uiBr68vH435BMepk5kwYTyFhYVERUXjuav0h9BVxLVr1zh6zJvtP/1EvXr1OHnqJHOcnJ+7batWrca8WTM8d+4gPz+fHTs98T5+HKM6/7
x+a+OmTaxcuQJfn9Po6+vTtVt34uPjmThxMi5znRk5cgTnz5/H3cOD5W7LymzLqNEfMXvWTDw83GnQoD6pqan4+V0kLe3JD2rbtWs33seOUKdOnRLr1EuTlZXFAGtbHCaMZ/LkSZiZmnLr1i0UCgWLFi4mOzsbgCZNmmBp2YWJkyaVWacQQohnp/FSUzN5VKUQQgghaqRpjlPp06c373/Qp+zC/2KbNm0gODiEtWvXVVqdzk5zqFu3LjNnza60OoUQQpQk08uFEEIIIWq4hQsXczsvr1LrTEtPZ5nb8kqtUwghREkyvVwIIYQQooaLj49n6w8/VmqdmzZ9V6n1CSGEKJ1MLxdCCCGEEEIIIaqITC8XQgghhBBCCCGqiCTdQgghhBBCCCFEFZGkWwghhBBCCCGEqCKSdAshhBBCCCGEEFVEkm4hhBBCCCGEEKKKSNIthBBCCCGEEEJUEUm6hRBCCCGEEEKIKiJJtxBCCCGEEEIIUUUk6RZCCCGEEEIIIarI/wEm1o98YdcydgAAAABJRU5ErkJggg==", + "text/plain": [ + "
" + ] + }, + "jetTransient": { + "display_id": null + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "city_summer_means = {}\n", + "for city in CITY_PROFILES:\n", + " v = climate.where(climate[\"city\"] == city)\n", + " if city == \"Sydney\" or city == \"Sao Paulo\":\n", + " s = v.where((v[\"day\"] <= 80) | (v[\"day\"] >= 355))\n", + " else:\n", + " s = v.where((v[\"day\"] >= SUMMER_START) & (v[\"day\"] <= SUMMER_END))\n", + " city_summer_means[city] = s[\"temperature\"].mean()\n", + "\n", + "sorted_cities = sorted(city_summer_means.items(), key=lambda x: x[1], reverse=True)\n", + "names = [c for c, _ in sorted_cities]\n", + "means = [m for _, m in sorted_cities]\n", + "colors = [\"#e63946\" if m > 30 else \"#f4a261\" if m > 20 else \"#457b9d\" for m in means]\n", + "\n", + "fig, ax = plt.subplots(figsize=(10, 4))\n", + "bars = ax.barh(names, means, color=colors, edgecolor=\"white\")\n", + "ax.bar_label(bars, fmt=\"%.1f °C\", padding=4)\n", + "ax.set_xlabel(\"Mean summer temperature (°C)\")\n", + "ax.set_title(\"Cities ranked by summer heat\")\n", + "ax.set_xlim(0, max(means) * 1.15)\n", + "ax.grid(True, axis=\"x\", linestyle=\"--\", alpha=0.4)\n", + "plt.tight_layout()\n", + "plt.show()" + ] + }, + { + "cell_type": "markdown", + "id": "d1601d7b", + "metadata": {}, + "source": [ + "---\n", + "## Part 6 — Mutations\n", + "\n", + "CTable supports structural and value mutations: adding/dropping columns, deleting rows, sorting in place." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 59, + "id": "2c3a977b", + "metadata": { + "ExecuteTime": { + "end_time": "2026-04-14T12:39:27.972265822Z", + "start_time": "2026-04-14T12:39:27.857019185Z" + }, + "execution": { + "iopub.execute_input": "2026-04-07T12:06:16.566822Z", + "iopub.status.busy": "2026-04-07T12:06:16.563241Z", + "iopub.status.idle": "2026-04-07T12:06:16.715871Z", + "shell.execute_reply": "2026-04-07T12:06:16.712493Z" + } + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Table with feels_like column:\n", + " city day temperature humidity wind_speed pressure feels_like \n", + " +# All rights reserved. +# +# SPDX-License-Identifier: BSD-3-Clause +####################################################################### + +# CTable indexing with mixed dtypes, persistent sidecars, and a packed .b2z bundle. + +import shutil +import tempfile +from dataclasses import dataclass +from pathlib import Path + +import blosc2 + + +@dataclass +class Measurement: + sensor_id: int = blosc2.field(blosc2.int32()) + temperature: float = blosc2.field(blosc2.float64()) + region: str = blosc2.field(blosc2.string(max_length=12), default="") + active: bool = blosc2.field(blosc2.bool(), default=True) + status: str = blosc2.field(blosc2.string(max_length=12), default="") + + +def load_rows(table: blosc2.CTable, nrows: int = 240) -> None: + regions = ["north", "south", "east", "west"] + for i in range(nrows): + region = regions[i % len(regions)] + active = (i % 7) not in (0, 6) + status = "alert" if i % 23 == 0 else ("warm" if i % 11 == 0 else "ok") + table.append([i, 12.5 + (i % 40) * 0.35, region, active, status]) + + +bundle_path = Path("indexed_measurements.b2z").resolve() +workspace = Path(tempfile.mkdtemp()) +table_path = workspace / "indexed_measurements.b2d" + +pt = None +packed = None + +try: + print("Creating a CTable with mixed dtypes...") + pt = blosc2.CTable(Measurement, urlpath=str(table_path), mode="w") + 
load_rows(pt) + + # Create a couple of indexes on columns with different dtypes. + print("\nCreating indexes...") + idx_sensor = pt.create_index("sensor_id", kind=blosc2.IndexKind.FULL) + idx_active = pt.create_index("active") + print("Indexes created:", pt.indexes) + print("sensor_id stale?", idx_sensor.stale) + print("active stale?", idx_active.stale) + + # Queries can combine indexed and non-indexed predicates. + recent_active = pt.where((pt["sensor_id"] >= 180) & pt["active"] & (pt["region"] == "north")) + print("\nLive rows with sensor_id >= 180, active=True, region='north':", len(recent_active)) + print("sensor_ids:", recent_active["sensor_id"]) + print("statuses:", recent_active["status"].to_numpy()) + + # Close the table, pack the TreeStore into a single .b2z file, and reopen it. + del pt + pt = None + + if bundle_path.exists(): + bundle_path.unlink() + + store = blosc2.TreeStore(str(table_path), mode="r") + try: + packed_path = store.to_b2z(filename=str(bundle_path), overwrite=True) + finally: + store.close() + + print(f"\nPacked bundle created at: {packed_path}") + + packed = blosc2.open(str(bundle_path), mode="r") + print("Reopened object type:", type(packed).__name__) + print("Indexes after reopen from .b2z:", packed.indexes) + + # Query directly against the .b2z bundle; no unpack step is needed. 
+ warm_active = packed.where(packed["active"] & (packed["status"] == "warm") & (packed["sensor_id"] > 100)) + print("\nRows from .b2z with active=True, status='warm', sensor_id > 100:", len(warm_active)) + print("sensor_ids:", warm_active["sensor_id"]) + print("regions:", warm_active["region"].to_numpy()) + + print("\nThe packed file is kept on disk.") + print(f"Inspect it later with: f = blosc2.open({bundle_path.name!r}, mode='r')") + print("Then call: f.info()") + print("For a quick check of the available info entry point, print: f.info") + +finally: + if packed is not None: + del packed + if pt is not None: + del pt + shutil.rmtree(workspace, ignore_errors=True) diff --git a/examples/ctable/mutations.py b/examples/ctable/mutations.py new file mode 100644 index 00000000..a4e70831 --- /dev/null +++ b/examples/ctable/mutations.py @@ -0,0 +1,72 @@ +####################################################################### +# Copyright (c) 2019-present, Blosc Development Team +# All rights reserved. +# +# SPDX-License-Identifier: BSD-3-Clause +####################################################################### + +# Mutations: delete, compact, sort_by, add/drop/rename columns, assign. 
+ +from dataclasses import dataclass + +import blosc2 + + +@dataclass +class Employee: + id: int = blosc2.field(blosc2.int64(ge=0)) + name: str = blosc2.field(blosc2.string(max_length=16), default="") + salary: float = blosc2.field(blosc2.float64(ge=0), default=0.0) + + +data = [ + (0, "Alice", 85_000.0), + (1, "Bob", 72_000.0), + (2, "Carol", 91_000.0), + (3, "Dave", 65_000.0), + (4, "Eve", 110_000.0), + (5, "Frank", 78_000.0), +] + +t = blosc2.CTable(Employee, new_data=data) +print("Original:") +print(t) + +# -- delete(): logical deletion (tombstone) --------------------------------- +t.delete([1, 3]) # remove Bob and Dave +print(f"After deleting rows 1 and 3: {len(t)} live rows") +print(t) + +# -- compact(): physically close the gaps ----------------------------------- +t.compact() +print("After compact():") +print(t) + +# -- sort_by(): returns a sorted copy by default ---------------------------- +sorted_t = t.sort_by("salary", ascending=False) +print("Sorted by salary descending:") +print(sorted_t) + +# -- sort_by(inplace=True) -------------------------------------------------- +t.sort_by("name", inplace=True) +print("Sorted in-place by name:") +print(t) + +# -- add_column(): new column filled with a default ------------------------- +t.add_column("bonus", blosc2.float64(ge=0), default=0.0) +print("After add_column('bonus'):") +print(t) + +# -- assign(): fill the new column with computed values --------------------- +bonuses = t["salary"].to_numpy() * 0.10 +t["bonus"].assign(bonuses) +print("After assigning 10% bonuses:") +print(t) + +# -- rename_column() -------------------------------------------------------- +t.rename_column("bonus", "annual_bonus") +print(f"Column names after rename: {t.col_names}") + +# -- drop_column() ---------------------------------------------------------- +t.drop_column("annual_bonus") +print(f"Column names after drop: {t.col_names}") diff --git a/examples/ctable/nullable.py b/examples/ctable/nullable.py new file mode 100644 index 
00000000..cdbd1bf6 --- /dev/null +++ b/examples/ctable/nullable.py @@ -0,0 +1,148 @@ +####################################################################### +# Copyright (c) 2019-present, Blosc Development Team +# All rights reserved. +# +# SPDX-License-Identifier: BSD-3-Clause +####################################################################### + +# Nullable columns: null_value sentinels, null-aware aggregates, +# is_null / notnull, sort nulls-last, Arrow null masking, CSV empty cells. +# +# CTable does not have a built-in "missing" bit per row like pandas does. +# Instead it uses a *sentinel value* approach: you choose a specific value +# that represents "null" for a column, and the library treats it +# transparently in aggregates, sorting, unique(), value_counts(), and +# Arrow export. +# +# This is especially useful for integer and string columns that have no +# natural null (unlike float, which can use NaN). + +import os +import tempfile +from dataclasses import dataclass + +import blosc2 + +# --------------------------------------------------------------------------- +# Schema with nullable columns +# --------------------------------------------------------------------------- +# Use null_value= on any spec to declare the sentinel. +# The sentinel bypasses validation constraints (ge/le etc.) so you can +# store it even when it would otherwise violate them. 
+ + +@dataclass +class Reading: + sensor_id: int = blosc2.field(blosc2.int32(ge=0)) + # -999 is "no reading" for temperature (normally ge=-50, le=60) + temperature: float = blosc2.field(blosc2.float64(ge=-50.0, le=60.0, null_value=-999.0), default=-999.0) + # "" is "unknown" for location (string) + location: str = blosc2.field(blosc2.string(max_length=16, null_value=""), default="") + # -1 is "not measured" for signal strength (normally ge=0, le=100) + signal: int = blosc2.field(blosc2.int8(ge=0, le=100, null_value=-1), default=-1) + + +data = [ + (0, 22.3, "roof", 87), + (1, -999.0, "cellar", 41), # temperature unknown + (2, 18.7, "", -1), # location and signal unknown + (3, 31.5, "garage", -1), # signal unknown + (4, -999.0, "", 62), # temperature and location unknown + (5, 15.1, "roof", 95), +] + +t = blosc2.CTable(Reading, new_data=data) +print("Table with nullable columns:") +print(t) + +# --------------------------------------------------------------------------- +# Detecting nulls +# --------------------------------------------------------------------------- +print("\n--- is_null() / notnull() ---") +temp_null = t["temperature"].is_null() +print(f"temperature is_null : {temp_null.tolist()}") +print(f"temperature null_count: {t['temperature'].null_count()}") + +loc_null = t["location"].is_null() +print(f"location is_null : {loc_null.tolist()}") + +# Use notnull() as a filter mask +valid_temps = t["temperature"].to_numpy()[t["temperature"].notnull()] +print(f"Valid temperatures : {valid_temps}") + +# --------------------------------------------------------------------------- +# Null-aware aggregates +# --------------------------------------------------------------------------- +print("\n--- Aggregates skip null sentinels ---") +print(f"temperature.mean() = {t['temperature'].mean():.2f} (only 4 non-null readings)") +print(f"temperature.min() = {t['temperature'].min():.2f}") +print(f"temperature.max() = {t['temperature'].max():.2f}") +print(f"signal.sum() = 
{t['signal'].sum()} (non-null: 87+41+62+95 = 285)") + +# --------------------------------------------------------------------------- +# unique() and value_counts() exclude the null sentinel +# --------------------------------------------------------------------------- +print("\n--- unique / value_counts exclude null ---") +print(f"location unique : {t['location'].unique().tolist()}") +print(f"signal value_counts : {t['signal'].value_counts()}") + +# --------------------------------------------------------------------------- +# Appending: the sentinel bypasses validation constraints +# --------------------------------------------------------------------------- +print("\n--- Append with sentinel bypasses ge/le constraints ---") +# temperature has ge=-50, le=60; normally -999 would fail — but it's the sentinel +t.append((6, -999.0, "attic", 55)) +print(f"Appended sensor 6 (temperature=null). Rows: {len(t)}") +assert t["temperature"].null_count() == 3 + +# --------------------------------------------------------------------------- +# Sorting: nulls always go last, regardless of ascending/descending +# --------------------------------------------------------------------------- +print("\n--- sort_by: nulls go last ---") +s_asc = t.sort_by("temperature") +print("Ascending (nulls last):") +print([round(v, 1) for v in s_asc["temperature"].to_numpy().tolist()]) + +s_desc = t.sort_by("temperature", ascending=False) +print("Descending (nulls still last):") +print([round(v, 1) for v in s_desc["temperature"].to_numpy().tolist()]) + +# --------------------------------------------------------------------------- +# Arrow interop: null sentinels become proper Arrow nulls +# --------------------------------------------------------------------------- +try: + import pyarrow as _pa # noqa: F401 + + arrow = t.to_arrow() + temp_col = arrow.column("temperature") + loc_col = arrow.column("location") + print("\n--- Arrow export ---") + print(f"Arrow temperature null_count: 
{temp_col.null_count}") + print(f"Arrow location null_count : {loc_col.null_count}") + print(f"Arrow temperature values : {temp_col.to_pylist()}") +except ImportError: + print("\npyarrow not installed — skipping Arrow demo.") + +# --------------------------------------------------------------------------- +# CSV round-trip: empty cells become the null sentinel on import +# --------------------------------------------------------------------------- +with tempfile.NamedTemporaryFile(mode="w", suffix=".csv", delete=False) as f: + f.write("sensor_id,temperature,location,signal\n") + f.write("10,25.1,lab,80\n") + f.write("11,,office,\n") # temperature and signal missing → sentinels + f.write("12,18.3,,70\n") # location missing → sentinel "" + csv_path = f.name + +print("\n--- from_csv with empty cells ---") +t2 = blosc2.CTable.from_csv(csv_path, Reading) +print(t2) +print(f"temperature null_count: {t2['temperature'].null_count()}") +print(f"signal null_count : {t2['signal'].null_count()}") +print(f"location null_count : {t2['location'].null_count()}") +os.unlink(csv_path) + +# --------------------------------------------------------------------------- +# describe() shows null count for nullable columns +# --------------------------------------------------------------------------- +print("\n--- describe() ---") +t.describe() diff --git a/examples/ctable/persistence.py b/examples/ctable/persistence.py new file mode 100644 index 00000000..81f4ffcd --- /dev/null +++ b/examples/ctable/persistence.py @@ -0,0 +1,79 @@ +####################################################################### +# Copyright (c) 2019-present, Blosc Development Team +# All rights reserved. +# +# SPDX-License-Identifier: BSD-3-Clause +####################################################################### + +# Persistence: write to disk, open read-only/read-write, generic open(), save, load. 
+ +import shutil +import tempfile +from dataclasses import dataclass + +import numpy as np + +import blosc2 + + +@dataclass +class Measurement: + sensor_id: int = blosc2.field(blosc2.int32(ge=0)) + temperature: float = blosc2.field(blosc2.float64(), default=0.0) + day: int = blosc2.field(blosc2.int16(ge=1, le=365), default=1) + + +rng = np.random.default_rng(0) +N = 10_000 +data = [ + (int(rng.integers(0, 20)), float(rng.normal(15.0, 10.0)), int(rng.integers(1, 366))) for _ in range(N) +] + +tmpdir = tempfile.mkdtemp(prefix="blosc2_ctable_") +disk_path = f"{tmpdir}/measurements" +copy_path = f"{tmpdir}/measurements_copy" + +try: + # -- Create directly on disk (mode="w") --------------------------------- + # Extensionless paths default to a directory-backed TreeStore. + t = blosc2.CTable(Measurement, new_data=data, urlpath=disk_path, mode="w") + print(f"Created on disk: {len(t):,} rows at '{disk_path}'") + t.info() + t.close() + + # -- Open read-only (default) ------------------------------------------- + ro = blosc2.CTable.open(disk_path) # mode="r" by default + print(f"Opened read-only: {len(ro):,} rows") + print(f" mean temperature: {ro['temperature'].mean():.2f}") + + try: + ro.append(Measurement(sensor_id=0, temperature=20.0, day=1)) + except ValueError as e: + print(f" Write blocked (read-only): {e}") + ro.close() + + # -- Generic open() materializes the CTable ----------------------------- + opened = blosc2.open(disk_path, mode="r") + print(f"Generic open(): {type(opened).__name__} with {len(opened):,} rows") + opened.close() + + # -- Open read-write and mutate ----------------------------------------- + rw = blosc2.CTable.open(disk_path, mode="a") + rw.append(Measurement(sensor_id=99, temperature=99.0, day=100)) + print(f"\nAfter append (read-write): {len(rw):,} rows") + rw.close() + + # -- save(): copy in-memory table to disk ------------------------------- + mem = blosc2.CTable(Measurement, new_data=data[:100]) + mem.save(copy_path) + print(f"In-memory 
table saved to '{copy_path}'") + + # -- load(): pull a disk table fully into RAM --------------------------- + ram = blosc2.CTable.load(disk_path) + print(f"Loaded into RAM: {len(ram):,} rows (cbytes={ram.cbytes:,})") + with blosc2.CTable.open(disk_path) as check: + assert len(ram) == len(check) + +finally: + shutil.rmtree(tmpdir) + print("\nTemporary files removed.") diff --git a/examples/ctable/querying.py b/examples/ctable/querying.py new file mode 100644 index 00000000..433c5110 --- /dev/null +++ b/examples/ctable/querying.py @@ -0,0 +1,60 @@ +####################################################################### +# Copyright (c) 2019-present, Blosc Development Team +# All rights reserved. +# +# SPDX-License-Identifier: BSD-3-Clause +####################################################################### + +# Querying: where() filters, select() column projection, and chaining. + +from dataclasses import dataclass + +import blosc2 + + +@dataclass +class Sale: + id: int = blosc2.field(blosc2.int64(ge=0)) + region: str = blosc2.field(blosc2.string(max_length=16), default="") + amount: float = blosc2.field(blosc2.float64(ge=0), default=0.0) + returned: bool = blosc2.field(blosc2.bool(), default=False) + + +data = [ + (0, "North", 120.0, False), + (1, "South", 340.0, False), + (2, "North", 85.0, True), + (3, "East", 210.0, False), + (4, "West", 430.0, False), + (5, "South", 60.0, True), + (6, "East", 300.0, False), + (7, "North", 500.0, False), + (8, "West", 175.0, True), + (9, "South", 220.0, False), +] + +t = blosc2.CTable(Sale, new_data=data) + +# -- where(): row filter ---------------------------------------------------- +high_value = t.where(t["amount"] > 200) +print(f"Sales > $200: {len(high_value)} rows") +print(high_value) + +not_returned = t.where(t["returned"] == False)  # noqa: E712 +print(f"Not returned: {len(not_returned)} rows") + +# -- chained filters (views are composable) --------------------------------- +north = t.where(t["region"] == "North") +north_big = 
north.where(north["amount"] > 100) +print(f"North region + amount > 100: {len(north_big)} rows") +print(north_big) + +# -- select(): column projection (no data copy) ----------------------------- +slim = t.select(["id", "amount"]) +print("id + amount only:") +print(slim) + +# -- combined: select columns, then filter rows ----------------------------- +result = t.select(["region", "amount"]).where(t["returned"] == False)  # noqa: E712 +print("Region + amount for non-returned sales:") +print(result) diff --git a/examples/ctable/real_world.py b/examples/ctable/real_world.py new file mode 100644 index 00000000..927f05bd --- /dev/null +++ b/examples/ctable/real_world.py @@ -0,0 +1,114 @@ +####################################################################### +# Copyright (c) 2019-present, Blosc Development Team +# All rights reserved. +# +# SPDX-License-Identifier: BSD-3-Clause +####################################################################### + +# Real-world example: weather station data. +# +# Simulates a year of readings from 10 stations, then: +# - filters to a single station +# - finds the 5 hottest days +# - computes correlations between meteorological variables +# - saves the filtered data to disk and reloads it + +import shutil +import tempfile +from dataclasses import dataclass + +import numpy as np + +import blosc2 + + +@dataclass +class WeatherReading: + station_id: int = blosc2.field(blosc2.int32(ge=0, le=9)) + temperature: float = blosc2.field(blosc2.float32(ge=-80.0, le=60.0), default=20.0) + humidity: float = blosc2.field(blosc2.float32(ge=0.0, le=100.0), default=50.0) + wind_speed: float = blosc2.field(blosc2.float32(ge=0.0, le=200.0), default=0.0) + pressure: float = blosc2.field(blosc2.float32(ge=800.0, le=1100.0), default=1013.0) + day_of_year: int = blosc2.field(blosc2.int16(ge=1, le=365), default=1) + + +# -- Generate a full year of readings for 10 stations ---------------------- +rng = np.random.default_rng(42) +N_STATIONS = 10 +N_DAYS = 365 +N = N_STATIONS * 
N_DAYS # 3 650 rows + +station_ids = np.tile(np.arange(N_STATIONS, dtype=np.int32), N_DAYS) +temperatures = rng.normal(15.0, 12.0, N).clip(-80, 60).astype(np.float32) +humidities = rng.uniform(20.0, 95.0, N).astype(np.float32) +wind_speeds = rng.exponential(10.0, N).clip(0, 200).astype(np.float32) +pressures = rng.normal(1013.0, 8.0, N).clip(800, 1100).astype(np.float32) +days = np.repeat(np.arange(1, N_DAYS + 1, dtype=np.int16), N_STATIONS) + +arr = np.zeros( + N, + dtype=[ + ("station_id", np.int32), + ("temperature", np.float32), + ("humidity", np.float32), + ("wind_speed", np.float32), + ("pressure", np.float32), + ("day_of_year", np.int16), + ], +) +for col, val in [ + ("station_id", station_ids), + ("temperature", temperatures), + ("humidity", humidities), + ("wind_speed", wind_speeds), + ("pressure", pressures), + ("day_of_year", days), +]: + arr[col] = val + +t = blosc2.CTable(WeatherReading, new_data=arr, validate=False) +print(f"Full dataset: {len(t):,} rows ({N_STATIONS} stations × {N_DAYS} days)") +t.info() + +# -- Filter to station 3 ---------------------------------------------------- +station3 = t.where(t["station_id"] == 3) +print(f"Station 3: {len(station3)} readings") +print(f" mean temperature : {station3['temperature'].mean():.1f} °C") +print(f" mean humidity : {station3['humidity'].mean():.1f} %") +print(f" mean wind speed : {station3['wind_speed'].mean():.1f} km/h\n") + +# -- 5 hottest days at station 3 (sort full table, then filter) ------------ +sorted_by_temp = t.sort_by("temperature", ascending=False) +hottest_s3 = sorted_by_temp.where(sorted_by_temp["station_id"] == 3) +print("5 hottest days at station 3:") +print(hottest_s3.head(5)) + +# -- Covariance of numeric variables (all stations) ------------------------- +numeric = t.select(["temperature", "humidity", "wind_speed", "pressure"]) +cov = numeric.cov() +labels = ["temp", "humidity", "wind", "pressure"] +col_w = 11 +print("Covariance matrix (all stations):") +print(" " * 10 + 
"".join(f"{lbl:>{col_w}}" for lbl in labels)) +for i, lbl in enumerate(labels): + print(f"{lbl:<10}" + "".join(f"{cov[i, j]:>{col_w}.3f}" for j in range(4))) + +# -- Save station 3 data to disk and reload --------------------------------- +tmpdir = tempfile.mkdtemp(prefix="blosc2_weather_") +path = f"{tmpdir}/station3" +try: + # Views cannot be sorted or saved directly — materialise via Arrow first + s3_copy = blosc2.CTable.from_arrow(station3.to_arrow()) + s3_copy.sort_by("day_of_year", inplace=True) + sorted_s3 = s3_copy + sorted_s3.save(path, overwrite=True) + print(f"\nStation 3 data saved to '{path}'") + + reloaded = blosc2.CTable.load(path) + print( + f"Reloaded: {len(reloaded)} rows, " + f"days {reloaded['day_of_year'].min()}–{reloaded['day_of_year'].max()}" + ) +finally: + shutil.rmtree(tmpdir) + print("Temporary files removed.") diff --git a/examples/ctable/schema.py b/examples/ctable/schema.py new file mode 100644 index 00000000..5a5b0d05 --- /dev/null +++ b/examples/ctable/schema.py @@ -0,0 +1,61 @@ +####################################################################### +# Copyright (c) 2019-present, Blosc Development Team +# All rights reserved. +# +# SPDX-License-Identifier: BSD-3-Clause +####################################################################### + +# Schema layer: dataclass field specs, constraints, validation, and null_value. 
+ +from dataclasses import dataclass + +import blosc2 + + +@dataclass +class Product: + id: int = blosc2.field(blosc2.int64(ge=0)) + price: float = blosc2.field(blosc2.float64(ge=0.0, le=10_000.0), default=0.0) + stock: int = blosc2.field(blosc2.int32(ge=0), default=0) + # null_value="" means an empty string represents "unknown category" + category: str = blosc2.field(blosc2.string(max_length=32, null_value=""), default="") + on_sale: bool = blosc2.field(blosc2.bool(), default=False) + + +t = blosc2.CTable(Product) + +# Valid row +t.append(Product(id=1, price=29.99, stock=100, category="electronics", on_sale=False)) +t.append(Product(id=2, price=4.50, stock=200, category="food", on_sale=True)) +# "" is the null sentinel for category — stored as-is, not rejected +t.append(Product(id=3, price=0.0, stock=0, category="", on_sale=False)) +print("Valid rows appended successfully.") +print(t) + +# Inspect the compiled schema +print("Schema:") +for col in t.schema.columns: + nv = getattr(col.spec, "null_value", None) + nv_str = f" null_value={nv!r}" if nv is not None else "" + print(f" {col.name:<12} dtype={col.dtype} spec={col.spec}{nv_str}") + +# Constraint violation: price < 0 +try: + t.append(Product(id=4, price=-1.0, stock=10, category="misc", on_sale=False)) +except Exception as e: + print(f"\nCaught validation error (price < 0): {e}") + +# Constraint violation: id < 0 +try: + t.append(Product(id=-5, price=10.0, stock=10, category="misc", on_sale=False)) +except Exception as e: + print(f"Caught validation error (id < 0): {e}") + +# String too long (max_length=32) +try: + t.append(Product(id=5, price=1.0, stock=1, category="a" * 50, on_sale=False)) +except Exception as e: + print(f"Caught validation error (string too long): {e}") + +print(f"\nTable still has {len(t)} valid rows.") +print(f"category null_count: {t['category'].null_count()} (the row with category='' is null)") diff --git a/examples/mmap-rw.py b/examples/mmap-rw.py index 2fee6d7c..f27e6c19 100644 --- 
a/examples/mmap-rw.py +++ b/examples/mmap-rw.py @@ -24,7 +24,7 @@ blosc2.asarray(a, urlpath=urlpath, mmap_mode="w+", initial_mapping_size=initial_mapping_size) # Read the ndarray back via the general open function -a_read = blosc2.open(urlpath, mmap_mode="r") +a_read = blosc2.open(urlpath, mode="r", mmap_mode="r") assert np.all(a == a_read) blosc2.remove_urlpath(urlpath) diff --git a/examples/ndarray/blosc2_3_10_demo.py b/examples/ndarray/blosc2_3_10_demo.py index 17a1c8b9..00725cd0 100644 --- a/examples/ndarray/blosc2_3_10_demo.py +++ b/examples/ndarray/blosc2_3_10_demo.py @@ -33,7 +33,7 @@ # Reopen persistent expression, compute, and write to disk with blosc2 t0 = time.time() -lexpr = blosc2.open(urlpath=url_path) +lexpr = blosc2.open(urlpath=url_path, mode="r") dt = time.time() - t0 print(f"In {round(dt * 1000, 3)} ms opened lazy expression: shape = {lexpr.shape}, dtype = {lexpr.dtype}") t1 = time.time() diff --git a/examples/ndarray/compute_expr.py b/examples/ndarray/compute_expr.py index 60ecf606..b6ed8ce3 100644 --- a/examples/ndarray/compute_expr.py +++ b/examples/ndarray/compute_expr.py @@ -50,7 +50,7 @@ # Get a LazyExpr instance (da**2 + db**2 + 2 * da * db + 1).save(urlpath="c.b2nd") -dc = blosc2.open("c.b2nd") +dc = blosc2.open("c.b2nd", mode="r") # Evaluate: output is a NDArray dc2 = dc.compute() diff --git a/examples/ndarray/dsl_save.py b/examples/ndarray/dsl_save.py index 24870afb..55e975e9 100644 --- a/examples/ndarray/dsl_save.py +++ b/examples/ndarray/dsl_save.py @@ -46,7 +46,7 @@ def heat_step(u, v): print("LazyUDF saved to heat_step.b2nd") # ── reload in a 'fresh' context (no reference to heat_step) ───────── -reloaded = blosc2.open("heat_step.b2nd") +reloaded = blosc2.open("heat_step.b2nd", mode="r") assert isinstance(reloaded, blosc2.LazyUDF), "Expected a LazyUDF after open()" assert isinstance(reloaded.func, DSLKernel), "func must be a DSLKernel after reload" assert reloaded.func.dsl_source is not None, "dsl_source must survive the round-trip" 
@@ -64,7 +64,7 @@ def heat_step(u, v): lazy2 = blosc2.lazyudf(heat_step, (u2, v), dtype=np.float64) lazy2.save(urlpath="heat_step2.b2nd") -reloaded2 = blosc2.open("heat_step2.b2nd") +reloaded2 = blosc2.open("heat_step2.b2nd", mode="r") result2 = reloaded2.compute() expected2 = u2[()] + 0.1 * (v[()] - u2[()]) assert np.allclose(result2[()], expected2) diff --git a/examples/ndarray/meta.py b/examples/ndarray/meta.py index c9d4a498..5fa13d12 100644 --- a/examples/ndarray/meta.py +++ b/examples/ndarray/meta.py @@ -27,7 +27,7 @@ print(a.info) # Read a b2nd array from disk -b = blosc2.open(urlpath) +b = blosc2.open(urlpath, mode="r") # Deal with meta m1 = b.schunk.meta.get("m5", b"0000") diff --git a/examples/ndarray/persistency.py b/examples/ndarray/persistency.py index 014519a5..4541da47 100644 --- a/examples/ndarray/persistency.py +++ b/examples/ndarray/persistency.py @@ -20,6 +20,6 @@ a = blosc2.asarray(nparray, urlpath=urlpath, mode="w") # Read the array from disk -b = blosc2.open(urlpath) +b = blosc2.open(urlpath, mode="r") # And see its contents print(b[...]) diff --git a/examples/ndarray/reduce_and_enlarge.py b/examples/ndarray/reduce_and_enlarge.py index e6841441..928a4fe8 100644 --- a/examples/ndarray/reduce_and_enlarge.py +++ b/examples/ndarray/reduce_and_enlarge.py @@ -49,7 +49,7 @@ url_path = "my_expr.b2nd" # Open the saved file -lazy_expr = blosc2.open(urlpath=url_path) +lazy_expr = blosc2.open(urlpath=url_path, mode="r") print(lazy_expr) print(f"expr (after open) shape: {lazy_expr.shape}; dtype: {lazy_expr.dtype}") # Evaluate and print the result of the lazy expression (should be a 2x4 arr) diff --git a/examples/ndarray/reduce_expr_save.py b/examples/ndarray/reduce_expr_save.py index 99f5c58a..1fdb257e 100644 --- a/examples/ndarray/reduce_expr_save.py +++ b/examples/ndarray/reduce_expr_save.py @@ -24,13 +24,13 @@ # Get a LazyExpr instance c = a**2 + b**2 + 2 * a * b + 1 c.save(urlpath="c.b2nd") -c = blosc2.open("c.b2nd") +c = blosc2.open("c.b2nd", 
mode="r") # Evaluate: output is a NDArray d = blosc2.lazyexpr("a + c.sum() + a.std()", operands={"a": a, "c": c}) d.save(urlpath="lazy-d.b2nd") # Load the expression from disk -d = blosc2.open("lazy-d.b2nd") +d = blosc2.open("lazy-d.b2nd", mode="r") print(f"Expression: {d}") assert isinstance(d, blosc2.LazyExpr) e = d.compute() diff --git a/plans/changing-default-open-mode.md b/plans/changing-default-open-mode.md new file mode 100644 index 00000000..9b97cc84 --- /dev/null +++ b/plans/changing-default-open-mode.md @@ -0,0 +1,341 @@ +# Plan For Changing `blosc2.open()` Default Mode To Read-Only + +## Goal + +Change the default mode for `blosc2.open(...)` from `"a"` to `"r"` so that +opening an existing object is non-mutating and unsurprising by default. + +The change should: + +- reduce accidental write access +- avoid implicit unpack / rewrite work for store-backed containers +- align with user expectations for a generic `open(...)` API +- preserve a smooth migration path for existing code that relied on writable + opens without an explicit `mode=` + +This plan is for later consideration and rollout design. It does not assume +that the change should land immediately. + +## Motivation + +Today, `blosc2.open(...)` defaults to `"a"` in +[src/blosc2/schunk.py](../src/blosc2/schunk.py). + +That means: + +- opening a `.b2z` store without `mode=` may create a writable working copy +- append-mode store opens may unpack zip-backed stores into a temporary working + directory immediately +- code that only intends to inspect metadata or query data can still enter a + mutation-capable path by accident + +This is especially surprising for: + +- `TreeStore` +- `DictStore` +- `CTable` +- other container-like objects opened through the generic dispatcher + +By contrast, users generally expect a bare `open(path)` call to be safe for +inspection unless they explicitly request write access. 
+ +## Current Situation + +### Default values today + +The following default to `"a"` today: + +- `blosc2.open(...)` +- `DictStore(...)` +- `TreeStore(...)` +- `CTable(...)` constructor when opening/creating through `urlpath` + +At the same time: + +- `CTable.open(...)` already defaults to `"r"` + +This creates an inconsistency where: + +- `blosc2.open("table.b2z")` is writable by default +- `blosc2.CTable.open("table.b2z")` is read-only by default + +### Concrete user surprise + +For a `.b2z` store, append mode currently does extra work: + +1. create a working directory (usually temporary) +2. extract the archive into that working directory +3. serve reads/writes from the extracted layout +4. repack on close + +This is implemented in +[src/blosc2/dict_store.py](../src/blosc2/dict_store.py). + +That behavior is reasonable when the caller explicitly asked for `"a"`, but +surprising when it is triggered only because `mode` was omitted. + +## Desired End State + +The target behavior is: + +```python +blosc2.open(path) +``` + +should behave as if the user had written: + +```python +blosc2.open(path, mode="r") +``` + +unless the object category does not support read-only opening for technical +reasons. In such cases, the exception should be explicit and documented. + +The user should need to opt into mutation with: + +- `mode="a"` +- `mode="w"` + +## Design Principles + +The migration should follow these rules: + +- do not silently change semantics without a warning phase +- make the warning text concrete and actionable +- update all docs and examples before flipping the default +- keep the opt-in writable paths unchanged +- avoid introducing ambiguity about whether a store may be mutated +- prefer explicit `mode=` in library docs even after the default changes + +## Recommended Rollout + +### Phase 0: prepare the codebase + +Before warning users: + +1. audit internal calls to `blosc2.open(...)` +2. 
make all internal call sites spell out `mode=` +3. update examples, docs, and tests to use explicit modes +4. document the difference between: + - `mode="r"`: inspect/query only + - `mode="a"`: may unpack and repack stores + - `mode="w"`: overwrite/create + +This phase reduces ambiguity and makes later warning noise much more useful. + +### Phase 1: deprecation warning + +Keep the runtime default as `"a"`, but emit a `FutureWarning` when: + +- `blosc2.open(...)` is called without an explicit `mode=` + +The warning should fire only when `mode` was omitted, not when the caller +explicitly requested `"a"`. + +Recommended warning text: + +```python +FutureWarning( + "blosc2.open() currently defaults to mode='a', but this will change " + "to mode='r' in a future release. Pass mode='a' explicitly to keep " + "writable behavior, or mode='r' for read-only access." +) +``` + +Notes: + +- the wording should mention both the current and future defaults +- the wording should explain how to preserve current behavior +- the wording should not be container-specific + +### Phase 2: flip the default + +In the next planned release window in which a breaking change is acceptable: + +- change the default mode in `blosc2.open(...)` from `"a"` to `"r"` + +At that point: + +- calls with omitted `mode` become read-only +- code that needs writable behavior must use `mode="a"` explicitly + +### Phase 3: remove warning-specific scaffolding + +After the default flip has been out for one full release cycle: + +- remove temporary warning helpers and migration notes that are no longer + useful +- keep release notes and changelog entries for historical context + +## Implementation Notes + +### Tracking whether `mode` was omitted + +To emit a warning only when appropriate, `blosc2.open(...)` needs to +distinguish: + +- caller omitted `mode` +- caller passed `mode="a"` explicitly + +A practical implementation is: + +1. change the function signature internally to use a sentinel +2. 
resolve the effective mode inside the function +3. warn only when the sentinel path is used + +For example: + +```python +_MODE_SENTINEL = object() + + +def open(urlpath, mode=_MODE_SENTINEL, **kwargs): + mode_was_omitted = mode is _MODE_SENTINEL + if mode_was_omitted: + mode = "a" # Phase 1 + warnings.warn(...) +``` + +Later, in Phase 2: + +```python +if mode_was_omitted: + mode = "r" +``` + +This is better than relying on `mode="a"` in the signature because that +signature cannot tell whether the user explicitly passed `"a"`. + +### Scope of change + +This plan is specifically about `blosc2.open(...)`. + +It does **not** require changing the defaults of: + +- `DictStore(...)` +- `TreeStore(...)` +- `CTable(...)` + +at the same time. + +However, the docs should explain that: + +- constructor-style APIs may still default to `"a"` +- generic `blosc2.open(...)` becomes read-only by default + +This narrower scope reduces breakage and focuses on the highest-surprise entry +point first. + +## Compatibility Risks + +The main breakage risk is downstream code that relies on: + +```python +obj = blosc2.open(path) +obj[...] = ... +``` + +without ever spelling out `mode="a"`. + +After the default flip, that code may: + +- fail with a read-only error +- stop persisting modifications +- expose behavior differences only at runtime + +This is why the warning phase is important. + +### Secondary risk: tests that mutate after open + +Internal and downstream tests may open objects generically and then mutate +them. These need to be found and updated during Phase 0. + +### Secondary risk: docs and notebooks + +Tutorials that currently omit `mode=` may accidentally teach users the old +behavior. These should be updated before the warning phase begins. 
+ +## Documentation Changes + +### API docs + +Update the docstring for `blosc2.open(...)` to: + +- describe the migration +- clearly document the meaning of each mode +- mention that read-only is the recommended mode for inspection/querying + +### Examples + +Update examples to use explicit modes consistently: + +- inspection/querying: `mode="r"` +- mutation of existing stores: `mode="a"` +- create/overwrite: `mode="w"` + +### User-facing migration note + +Add a short migration note to release notes: + +- “`blosc2.open()` now defaults to read-only; pass `mode='a'` explicitly if + you need writable behavior.” + +## Testing Plan + +### Phase 1 tests + +Add tests that verify: + +- omitted `mode` emits `FutureWarning` +- explicit `mode="a"` does not warn +- explicit `mode="r"` does not warn +- effective behavior remains writable during the warning phase + +### Phase 2 tests + +After the flip, add/update tests that verify: + +- omitted `mode` is read-only +- writes after omitted-mode open fail clearly +- explicit `mode="a"` still allows mutation +- `.b2z` omitted-mode open does not enter append-style write setup + +### Documentation tests + +Where practical, examples should use explicit `mode=` so doctests remain clear +and stable across the transition. + +## Optional Compatibility Escape Hatch + +If downstream breakage risk is considered high, one temporary option is an +environment-variable override for one transition cycle, for example: + +- `BLOSC2_OPEN_DEFAULT_MODE=a` + +This should only be used if needed. It adds complexity and should not become a +permanent configuration surface unless there is a strong operational reason. + +## Related Follow-Up Worth Considering + +Even if the default changes to `"r"`, append mode for `.b2z` may still be more +eager than desirable. 
+ +A separate improvement could make `.b2z` append behavior lazier: + +- open in `"a"` without extracting immediately +- extract only on first mutation +- keep read-only-style fast paths for pure reads + +That is orthogonal to the default-mode change and can be planned separately. + +## Summary + +The recommended path is: + +1. make internal/docs/example usage explicit +2. add a `FutureWarning` when `blosc2.open(...)` is called without `mode=` +3. flip the default from `"a"` to `"r"` in the next suitable release window +4. keep writable behavior available via explicit `mode="a"` + +This delivers a safer and less surprising user experience while still giving +existing code a clear migration path. diff --git a/plans/ctable-implementation-log.md b/plans/ctable-implementation-log.md new file mode 100644 index 00000000..ab81a188 --- /dev/null +++ b/plans/ctable-implementation-log.md @@ -0,0 +1,415 @@ +# CTable Implementation Log + +This document records everything implemented across the CTable feature: +the `ctable-schema.md` redesign (schema, validation, serialization, optimizations) +and the `ctable-persistency.md` phase (file-backed storage, `open()`, read-only mode). + +--- + +## Phase 1 — Schema redesign (`ctable-schema.md`) + +The goal was to replace the original Pydantic-`BaseModel`-based schema API with a +**dataclass-first schema API** using declarative spec objects (`b2.int64()`, +`b2.float64()`, etc.) and to wire in full constraint validation on insert. + +--- + +## New files + +### `src/blosc2/schema.py` + +Defines the public schema vocabulary. + +**Contents:** + +- `SchemaSpec` — abstract base class for all column type descriptors. +- `int64`, `float64`, `bool`, `complex64`, `complex128`, `string`, `bytes` — + concrete spec classes. 
Each carries: + - `dtype` — the NumPy storage dtype + - `python_type` — the corresponding Python type + - Constraint attributes: `ge`, `gt`, `le`, `lt` (numeric); `min_length`, + `max_length`, `pattern` (string/bytes) + - `to_pydantic_kwargs()` — returns only the non-`None` constraints as a dict, + used internally to build Pydantic validator models + - `to_metadata_dict()` — returns a JSON-compatible dict used for serialization +- `field(spec, *, default, cparams, dparams, chunks, blocks)` — attaches a spec + and per-column storage options to a dataclass field via + `dataclasses.field(metadata={"blosc2": {...}})`. +- `BLOSC2_FIELD_METADATA_KEY = "blosc2"` — stable key for the metadata dict. + +**Key design note:** `bool` and `bytes` shadow Python builtins inside this module. +Private aliases `_builtin_bool` and `_builtin_bytes` are used where the originals +are needed. + +--- + +### `src/blosc2/schema_compiler.py` + +Compiles a dataclass row definition into an internal `CompiledSchema`. + +**Contents:** + +- `ColumnConfig(slots=True)` — holds per-column NDArray storage options: + `cparams`, `dparams`, `chunks`, `blocks`. +- `CompiledColumn(slots=True)` — holds everything about one column: + `name`, `py_type`, `spec`, `dtype`, `default`, `config`, `display_width`. +- `CompiledSchema(slots=True)` — holds the full compiled schema: + `row_cls`, `columns`, `columns_by_name`, `validator_model` (filled lazily by + `schema_validation.py`). +- `compile_schema(row_cls)` — main entry point. Walks `dataclasses.fields()`, + reads `blosc2` metadata from each field, infers specs from plain annotations + where no `b2.field()` is present, validates annotation/spec compatibility, and + returns a `CompiledSchema`. +- `get_blosc2_field_metadata(dc_field)` — extracts the `"blosc2"` metadata dict + from a dataclass field, or returns `None`. +- `infer_spec_from_annotation(annotation)` — builds a default spec from a plain + Python type (`int` → `int64()`, `float` → `float64()`, etc.). 
Used for inferred + shorthand fields like `id: int` (no `b2.field()`). +- `validate_annotation_matches_spec(name, annotation, spec)` — rejects + declarations where the Python annotation is incompatible with the spec (e.g. + `id: str = b2.field(b2.int64())`). +- `compute_display_width(spec)` — returns a sensible terminal column width based + on dtype kind. +- `schema_to_dict(schema)` — serializes a `CompiledSchema` to a JSON-compatible + dict. Handles `MISSING` defaults (→ `None`), complex defaults + (→ `{"__complex__": True, "real": ..., "imag": ...}`), and optional per-column + storage config fields. +- `schema_from_dict(data)` — reconstructs a `CompiledSchema` from a serialized + dict. Does not require the original Python dataclass. Returns `row_cls=None`. + Raises `ValueError` on unknown `kind` or unsupported `version`. + +--- + +### `src/blosc2/schema_validation.py` + +Row-level constraint validation backed by Pydantic. All Pydantic imports are +isolated here so the rest of the codebase never touches Pydantic directly. + +**Contents:** + +- `build_validator_model(schema)` — builds a `pydantic.create_model(...)` class + from the compiled schema. Each column's `to_pydantic_kwargs()` result is passed + to `pydantic.Field(...)`. The result is cached on `schema.validator_model` so it + is built only once per schema. +- `validate_row(schema, row_dict)` — validates one `{col_name: value}` dict. + Calls the cached Pydantic model, catches `ValidationError`, and re-raises as a + plain `ValueError` so callers never need to import Pydantic. +- `validate_rows_rowwise(schema, rows)` — validates a list of row dicts. Raises + `ValueError` on the first violation, including the row index. + +**When used:** called by `CTable.append()` when `self._validate` is `True`. + +--- + +### `src/blosc2/schema_vectorized.py` + +NumPy-based constraint validation for bulk inserts. Used by `CTable.extend()` to +check entire column arrays at once without per-row Python overhead. 
+ +**Contents:** + +- `validate_column_values(col, values)` — checks all constraint attributes + present on `col.spec` against a NumPy array of values. Uses `np.any(arr < ge)` + style checks. For string/bytes, the initial implementation used `np.vectorize(len)` to check lengths + (later replaced by `np.char.str_len()`; see "Phase 1 optimizations" below). + Reports the first offending value in the error message. +- `validate_column_batch(schema, columns)` — calls `validate_column_values` for + every column present in the `columns` dict. + +**Why separate from Pydantic validation:** `extend()` can receive millions of +rows. Row-by-row Pydantic validation would be unacceptably slow for large batches. +NumPy operations run in C with no per-element Python overhead. + +--- + +## Changes to existing files + +### `src/blosc2/ctable.py` + +**Schema detection at construction:** + +```python +if dataclasses.is_dataclass(row_type) and isinstance(row_type, type): + self._schema = compile_schema(row_type) +else: + self._schema = _compile_pydantic_schema(row_type) # legacy path +``` + +**New constructor parameters:** `validate=True`, `cparams=None`, `dparams=None`. +Stored as `self._validate`, `self._table_cparams`, `self._table_dparams`. + +**`_init_columns`:** builds NDArrays from `self._schema.columns` instead of +iterating `row_type.model_fields`. + +**`_resolve_column_storage`:** merges column-level and table-level storage +settings. Column-level wins. + +**`_normalize_row_input`:** normalizes list/tuple/dict/dataclass instance/ +`np.void` to a `{col_name: value}` dict. + +**`_coerce_row_to_storage`:** coerces each value to the column's NumPy dtype +using `np.array(val, dtype=col.dtype).item()`. + +**`append()` new flow:** +1. `_normalize_row_input(data)` → dict +2. `validate_row(schema, row)` if `self._validate` (Pydantic row validation) +3. `_coerce_row_to_storage(row)` → storage dict +4. Find write position, resize if needed, write column by column. + +**`extend()` new signature:** `extend(data, *, validate=None)`. +- `validate=None` uses `self._validate` (table default). 
+- `validate=True/False` overrides for this call only. +- Vectorized validation runs on raw column arrays before `blosc2.asarray` conversion. + +**Schema introspection (new):** +- `table.schema` property — returns `self._schema`. +- `table.column_schema(name)` — returns `CompiledColumn` for a given column name. +- `table.schema_dict()` — delegates to `schema_to_dict(self._schema)`. + +**Legacy Pydantic adapter kept:** +- `NumpyDtype`, `MaxLen`, `_resolve_field_dtype`, `_LegacySpec`, + `_compile_pydantic_schema` all remain so existing Pydantic-`BaseModel`-based + schemas continue to work during the transition. + +### `src/blosc2/__init__.py` + +Added to delayed imports: + +```python +from .schema import bool, bytes, complex64, complex128, field, float64, int64, string +``` + +Added to `__all__`: +`"bool"`, `"bytes"`, `"complex64"`, `"complex128"`, `"field"`, `"float64"`, +`"int64"`, `"string"`. + +--- + +## Tests + +All tests live in `tests/ctable/`. + +| File | Covers | +|---|---| +| `test_schema_specs.py` | Spec dtypes, python types, constraint storage, `to_pydantic_kwargs`, `to_metadata_dict`, `blosc2` namespace exports | +| `test_schema_compiler.py` | `compile_schema` with explicit `b2.field()`, inferred shorthand, mismatch rejection, defaults, cparams; `schema_to_dict` / `schema_from_dict` roundtrip | +| `test_schema_validation.py` | `append` and `extend` constraint enforcement; boundary values; `validate=False` bypass; `gt`/`lt` exclusive bounds; NumPy structured array path | +| `test_ctable_dataclass_schema.py` | End-to-end CTable construction, `append` with tuple/list/dict, `extend` with iterable and structured array, per-call `validate=` override, schema introspection | +| `test_construct.py` | Construction variants, `append`/`extend`/resize, column integrity, `_valid_rows` | +| `test_column.py` | Column indexing, slicing, iteration, `to_numpy()`, mask independence | +| `test_compact.py` | Manual and auto compaction | +| `test_delete_rows.py` | 
Single/list/slice deletion, out-of-bounds, edge cases, stress | +| `test_extend_delete.py` | Interleaved extend/delete cycles, mask correctness, resize behavior | +| `test_row_logic.py` | Row indexer (int/slice/list), views, chained views | + +Total: **135 tests, all passing** (after Phase 1 + optimizations). + +--- + +## Phase 1 design decisions + +**Why two validation paths?** +`append()` handles one row at a time — Pydantic is fast enough and also performs +type coercion and default filling. `extend()` handles bulk data — vectorized NumPy +checks are orders of magnitude faster for large batches. + +**Why `validate=None` as the default on `extend()`?** +`None` means "inherit the table-level flag". `True`/`False` are explicit overrides. +This avoids a boolean that accidentally silences the table-level setting. + +**Why keep the Pydantic adapter?** +Existing code using `class RowModel(BaseModel)` continues to work without +modification. The adapter is not on the critical path for new code. + +**Why `schema_to_dict` / `schema_from_dict` now?** +Persistence requires a self-contained schema representation that survives without +the original Python dataclass. Establishing the serialization format before +persistence was built ensured the format was stable before anything depended on it. + +--- + +## Phase 1 optimizations (post-schema) + +Several performance improvements were made after the schema work was complete: + +**`_last_pos` cache** +Added `_last_pos: int | None` to `CTable`. Tracks the physical index of the next +write slot so that `append()` and `extend()` no longer need to scan backward through +chunk metadata on every call. Set to `None` after any deletion (triggers one lazy +recalculation on the next write). Set to `_n_rows` after `compact()`. Eliminated a +backward O(n_chunks) scan per insert. + +**`_grow()` helper** +Extracted the capacity-doubling logic into `_grow()`. Removes duplication between +`append()` and `extend()`. 
+ +**In-place delete** +`delete()` now writes the updated boolean array back with `self._valid_rows[:] = +valid_rows_np` (in-place slice assignment) instead of creating a new NDArray. +Avoids a full allocation on each delete. + +**`head()` / `tail()` refactored** +Both methods now reuse `_find_physical_index()` instead of containing their own +chunk-walk loops. + +**`_make_view()` classmethod** +Added to construct view CTables without going through `__init__`. Avoids +allocating and immediately discarding NDArrays that were never used. + +**`_NumericSpec` mixin + new spec types** +All numeric specs (`int8` through `uint64`, `float32`, `float64`) share a common +`_NumericSpec` mixin for `ge`/`gt`/`le`/`lt` constraint handling, eliminating +boilerplate. New specs added: `int8`, `int16`, `int32`, `uint8`, `uint16`, +`uint32`, `uint64`, `float32`. + +**String vectorized validation** +`validate_column_values` uses `np.char.str_len()` (true C-level) for `U`/`S` dtype +arrays instead of `np.vectorize(len)` (Python loop in disguise). The length check was also +extracted into `_validate_string_lengths()` to reduce cyclomatic complexity. + +**Column name validation** +`compile_schema` now calls `_validate_column_name()` on every field. Rejects names +that are empty, start with `_`, or contain `/` — rules that apply equally to +in-memory and persistent tables. + +--- + +## Phase 2 — Persistency (`ctable-persistency.md`) + +### New file: `src/blosc2/ctable_storage.py` + +A storage-backend abstraction that keeps all file I/O out of `ctable.py`. + +**`TableStorage`** — interface class defining: +`create_column`, `open_column`, `create_valid_rows`, `open_valid_rows`, +`save_schema`, `load_schema`, `table_exists`, `is_read_only`. + +**`InMemoryTableStorage`** — trivial implementation that creates plain in-memory +`blosc2.NDArray` objects and is a no-op for `save_schema`. Used when `urlpath` is +not provided (existing default behaviour, unchanged). 
+ +**`FileTableStorage`** — file-backed implementation. + +Disk layout: + +``` +<urlpath>/ + _meta.b2frame ← blosc2.SChunk; vlmeta holds kind, version, schema JSON + _valid_rows.b2nd ← file-backed boolean NDArray (tombstone mask) + _cols/ + <name>.b2nd ← one file-backed NDArray per column +``` + +Key implementation notes: +- `save_schema` always opens `_meta.b2frame` with `mode="w"` (create path only). +- `load_schema` / `check_kind` use `blosc2.open()` (not `blosc2.SChunk(..., + mode="a")`), which is the correct API for reopening an existing SChunk file. +- File-backed NDArrays (`urlpath=..., mode="w"`) support in-place writes + (`col[pos] = value`, `col[start:end] = arr`) that persist immediately. This is + why resize (`_grow()`), append, extend, and delete all work transparently on + persistent tables. +- `_n_rows` on reopen is reconstructed as `blosc2.count_nonzero(valid_rows)` — + always correct because unwritten slots are `False`, same as deleted slots. +- `_last_pos` is set to `None` on reopen and resolved lazily by `_resolve_last_pos()` + on the first write. + +### Changes to `src/blosc2/ctable.py` + +**Constructor** + +New parameters: `urlpath: str | None = None`, `mode: str = "a"`. + +Logic: +- `urlpath=None` → `InMemoryTableStorage` → existing behaviour unchanged. +- `urlpath` + existing table + `mode != "w"` → open existing (load schema from + disk, open file-backed arrays, reconstruct state). +- `urlpath` + `mode="w"` or no existing table → create new (compile schema, + save to disk, create file-backed arrays). +- Passing `new_data` when opening an existing table raises `ValueError`. + +**`CTable.open(cls, urlpath, *, mode="r")`** + +New classmethod for ergonomic read-only access. Opens the table, verifies +`kind="ctable"` in vlmeta, reconstructs schema from JSON (no dataclass needed), +returns a fully usable `CTable`. + +**Read-only enforcement** + +`_read_only: bool` flag set from `storage.is_read_only()`. 
Guards added to the top +of `append()`, `extend()`, `delete()`, `compact()` — each raises +`ValueError("Table is read-only (opened with mode='r').")`. + +**`_make_view(cls, parent, new_valid_rows)`** + +New classmethod that constructs a view `CTable` directly via `cls.__new__` without +calling `__init__`. Replaces the old `CTable(self._row_type, expected_size=...)` + +`retval._cols = self._cols` pattern, which was wasteful (allocated NDArrays then +discarded them) and broke when `_row_type` is `None` (tables opened via `open()`). + +**`schema_dict()`** + +No longer needs a local import of `schema_to_dict` — now imported at the module top. + +### New test file: `tests/ctable/test_persistency.py` + +23 tests covering: + +| Test group | What it checks | +|---|---| +| Layout | `_meta.b2frame`, `_valid_rows.b2nd`, `_cols/<name>.b2nd` all exist after creation | +| Metadata | `kind`, `version`, `schema` in vlmeta; column names and order in schema JSON | +| Round-trips | Data survives reopen via both `CTable(Row, urlpath=..., mode="a")` and `CTable.open()` | +| Column order | Preserved exactly from schema JSON, not from filesystem order | +| Constraints | Validation re-enabled after reopen (schema reconstructed from disk) | +| Append/extend/delete after reopen | Mutations visible in subsequent opens | +| `_valid_rows` on disk | Tombstone mask correctly stored and loaded | +| `mode="w"` | Overwrites existing table; subsequent open sees empty table | +| Read-only | `append`, `extend`, `delete`, `compact` all raise on `mode="r"` | +| Read-only reads | `row[]`, column access, `head()`, `tail()`, `where()` all work | +| Error cases | `FileNotFoundError` for missing path; `ValueError` for wrong kind | +| Column name validation | Empty, `_`-prefixed, `/`-containing names rejected | +| `new_data` guard | `ValueError` when `new_data` passed to open-existing path | +| Capacity growth | `_grow()` (resize) works on file-backed arrays and survives reopen | + +Total: **158 tests, all 
passing**. + +### New benchmark: `bench/ctable/bench_persistency.py` + +Four sections: + +1. **`extend()` bulk insert** — in-memory vs file-backed at 1k–1M rows. + Overhead converges to ~1x at 1M rows (compression dominates, not I/O). +2. **`open()` / reopen time** — ~4–10 ms regardless of table size. Fixed cost: + open 3 files (meta, valid_rows, one column) + parse schema JSON. +3. **`append()` single-row** — file-backed is ~6x slower per row (~3 ms vs ~0.5 ms). + Recommendation: batch inserts via `extend()` for persistent tables. +4. **Column `to_numpy()`** — essentially identical between backends (≤1.06x ratio). + Decompression dominates; file I/O is negligible once data is loaded. + +--- + +## Phase 2 design decisions + +**Why direct files instead of TreeStore?** +TreeStore stores snapshots of in-memory arrays. In-place writes to a +TreeStore-retrieved NDArray do not persist after reopen. File-backed NDArrays +created with `urlpath=...` support in-place writes natively. Using direct `.b2nd` +files aligns with how the rest of blosc2 handles persistent arrays. + +**Why `blosc2.SChunk` vlmeta for metadata, not JSON files?** +`vlmeta` is compressed and is already part of the blosc2 ecosystem. +`blosc2.open()` works on `.b2frame` files the same way it works on `.b2nd` files, +keeping the open path uniform. + +**Why not store `_last_pos` in metadata?** +`_resolve_last_pos()` reconstructs it in O(n_chunks) with no full decompression. +Storing it would create a write on every `append()` just to update a counter in the +SChunk — not worth the extra I/O. + +**Why `_make_view()` instead of calling `__init__`?** +`__init__` now has storage-routing logic and would try to create new NDArrays even +for views (which immediately get thrown away). `_make_view()` via `__new__` is +explicit and zero-waste. + +**Why `CTable.open()` defaults to `mode="r"`?** +The most common read-back scenario is inspection or analysis, not modification. 
+Defaulting to read-only prevents accidental mutations on shared or archived tables. diff --git a/plans/ctable-indexing.md b/plans/ctable-indexing.md new file mode 100644 index 00000000..077db1cc --- /dev/null +++ b/plans/ctable-indexing.md @@ -0,0 +1,713 @@ +# CTable Indexing Integration Plan + +## Goal + +Add persistent, table-owned indexing to `CTable` so that: + +- indexes can be created on `CTable` columns +- persistent indexes live inside the `TreeStore` that backs the table +- `CTable.where(...)` can reuse the existing index machinery as directly as possible +- index management feels aligned with the current `NDArray` indexing API + +This plan is for design and implementation guidance only. It does not assume +that all pieces must land in one patch. + +## Current Situation + +### What already exists + +`CTable` already supports persistent storage on top of `TreeStore`: + +- `/_meta` +- `/_valid_rows` +- `/_cols/` + +This is implemented in [src/blosc2/ctable_storage.py](/Users/faltet/blosc/python-blosc2/src/blosc2/ctable_storage.py) +and used by [src/blosc2/ctable.py](/Users/faltet/blosc/python-blosc2/src/blosc2/ctable.py). + +The generic indexing engine already exists for 1-D `NDArray` targets: + +- summary / bucket / partial / full indexes +- persistent descriptors in `array.schunk.vlmeta` +- sidecar arrays stored next to the indexed array +- query planning via `plan_query(...)` +- ordered reuse via `plan_ordered_query(...)` + +This lives in [src/blosc2/indexing.py](/Users/faltet/blosc/python-blosc2/src/blosc2/indexing.py) +and is exposed through `NDArray.create_index()`, `drop_index()`, `rebuild_index()`, +`compact_index()`, `index()`, and `indexes`. + +### What is missing + +`CTable` cannot currently reuse that machinery cleanly because: + +1. `CTable.where(...)` eagerly computes a boolean filter and never gives the + planner a table-aware lazy query shape. +2. 
the current index engine assumes that one index belongs to one `NDArray` + and stores its descriptor in that array's `vlmeta`. +3. persistent sidecar path derivation is based on `array.urlpath`, which places + index files next to the array file rather than inside a table-owned subtree. +4. `CTable` has row visibility semantics through `_valid_rows`, which means + "row still exists" and "row currently matches" are distinct concerns. + +## Design Principles + +The implementation should follow these rules: + +- indexes are table-managed, not column-autonomous +- column indexes are still built from and logically targeted at individual column arrays +- persistent index artifacts must be part of the table store layout +- the public API should mirror existing `NDArray` indexing names where possible +- delete visibility should not force index rebuilds when it can be handled by + post-filtering with `_valid_rows` +- planner and evaluator logic should be reused, not reimplemented from scratch +- unsupported queries must keep a correct scan fallback + +## Proposed Storage Layout + +Extend the persistent `CTable` layout with a reserved index subtree: + +- `/_meta` +- `/_valid_rows` +- `/_cols/` +- `/_indexes/<token>/...` + +Recommended concrete shape: + +- `/_indexes/<token>/_meta` +- `/_indexes/<token>/summary.chunk` +- `/_indexes/<token>/summary.block` +- `/_indexes/<token>/bucket.values` +- `/_indexes/<token>/bucket.bucket_positions` +- `/_indexes/<token>/bucket.offsets` +- `/_indexes/<token>/bucket_nav.l1` +- `/_indexes/<token>/bucket_nav.l2` +- `/_indexes/<token>/partial.values` +- `/_indexes/<token>/partial.positions` +- `/_indexes/<token>/partial.offsets` +- `/_indexes/<token>/partial_nav.l1` +- `/_indexes/<token>/partial_nav.l2` +- `/_indexes/<token>/full.values` +- `/_indexes/<token>/full.positions` +- `/_indexes/<token>/full_nav.l1` +- `/_indexes/<token>/full_nav.l2` +- `/_indexes/<token>/full_run.<run_id>.values` +- `/_indexes/<token>/full_run.<run_id>.positions` + +Notes: + +- `token` should match the current indexing token model: + - field token for column indexes + - normalized expression token for expression 
indexes +- all index payloads should stay under `/_indexes/<token>/` +- query-cache payloads, if reused for `CTable`, should also be table-owned and + not emitted as sibling files outside the table root + +## Metadata Placement + +The top-level table manifest in `/_meta.vlmeta` should gain index catalog +entries and epoch counters. + +Recommended fields: + +``` +{ + "kind": "ctable", + "version": 1, + "schema": {...}, + "index_catalog_version": 1, + "value_epoch": 0, + "visibility_epoch": 0, + "indexes": { + "id": { + "name": "id", + "token": "id", + "target": {"source": "column", "column": "id"}, + "kind": "full", + "version": 1, + "persistent": True, + "stale": False, + "built_value_epoch": 3, + ... + } + } +} +``` + +Notes: + +- do not keep a historical list of epochs +- overwrite descriptor metadata on rebuild +- descriptors remain small; large payloads stay in `/_indexes/...` +- index catalog ownership remains at the table level, not per-column + +## Public API + +The `CTable` surface should mirror `NDArray` as closely as possible: + +```python +table.create_index("id", kind=blosc2.IndexKind.FULL) +table.drop_index("id") +table.rebuild_index("id") +table.compact_index("id") +table.index("id") +table.indexes +``` + +### Initial target support + +Phase 1 should support column indexes only: + +- `table.create_index("id", kind=...)` +- `table.create_index(field="id", kind=...)` + +Phase 2 can add expression indexes: + +- `table.create_index(expression="abs(score - baseline)", operands=...)` + +but only when all operands resolve to columns from the same `CTable`. 
+ +### Descriptor identity + +Use one active index per target, matching current `NDArray` behavior: + +- one index per column token +- one index per normalized expression token +- optional `name=` remains a label, not identity + +## Query Integration Model + +### Current `CTable` behavior + +Today, `CTable` column comparisons produce `NDArray` or `LazyExpr` results over +physical rows, and `CTable.where(...)` then: + +1. computes the filter +2. pads or trims it +3. intersects it with `_valid_rows` +4. returns a view + +This is correct but fully scan-based. + +### Proposed behavior + +Teach `CTable.where(...)` to detect when the incoming predicate is a `LazyExpr` +that can be interpreted as a table query over table-owned columns. + +For such predicates: + +1. normalize the expression into a table-query descriptor +2. ask a new `CTable` planner for candidate physical row positions +3. intersect candidates with `_valid_rows` +4. evaluate any residual predicate only on surviving candidates +5. produce the final boolean mask or direct row-position set +6. return the usual `CTable` view + +If any step is unsupported, fall back to the current eager full-filter path. + +## Planner Strategy + +Do not build a second independent indexing engine for `CTable`. 
+ +Instead, refactor the current engine into: + +- reusable target normalization +- reusable index build logic +- reusable query plan primitives +- storage backends: + - `NDArrayIndexStorage` + - `CTableIndexStorage` + +### Reusable concepts from current `indexing.py` + +The following should be kept conceptually unchanged: + +- index kinds: summary / bucket / partial / full +- descriptor structure where practical +- target token resolution +- exact and segment planning +- ordered full-index reuse +- full-index compaction model + +### New `CTable` planner layer + +Add a thin planner layer that: + +- maps expression operands back to `CTable` columns +- resolves which indexed columns can participate +- requests index plans from the underlying column index implementation +- intersects or combines candidate physical positions +- reports a reason when indexed planning is not possible + +For v1: + +- single-column predicates should be first-class +- multi-column conjunctions should be supported when each term can be planned independently +- disjunctions can initially fall back to scan if they complicate correctness + +## Row Visibility Semantics + +`CTable` indexes should be defined over physical row positions, not over the +current live-row numbering. + +That means: + +- index payloads refer to physical positions in the backing arrays +- `_valid_rows` remains the source of truth for row visibility +- deleted rows are filtered at query execution time + +This is important because deletes in `CTable` do not rewrite columns; they only +flip visibility bits. + +## Epoch Model + +The epoch model is intentionally small. + +### Table-level counters + +Store only: + +- `value_epoch` +- `visibility_epoch` + +Both are monotonically increasing integers in top-level table metadata. + +### Per-index metadata + +Each descriptor stores: + +- `built_value_epoch` + +Optionally later: + +- `built_visibility_epoch` + +but this is not required in the first implementation. 
+ +### Why this is enough + +- if indexed values or row order change, the index may be invalid: + bump `value_epoch` +- if only `_valid_rows` changes, the index still points to correct physical + rows; execution can intersect with current visibility: + bump `visibility_epoch` + +No epoch history is retained. There is no cleanup problem because only current +scalar values are stored. + +## Mutation Rules + +### Mutations that should bump `value_epoch` + +- `append(...)` +- `extend(...)` +- column writes through `Column.__setitem__` +- `Column.assign(...)` +- `compact()` +- `sort_by(inplace=True)` +- any future row rewrite / reorder operation +- add / drop / rename column for affected targets + +### Mutations that should bump `visibility_epoch` only + +- `delete(...)` + +### Initial stale policy + +For a first implementation, keep rebuild behavior conservative: + +- if a mutation changes indexed values or row positions: + - set affected indexes stale + - bump `value_epoch` +- if only visibility changes: + - do not set indexes stale + - bump `visibility_epoch` + +This is simpler than trying to preserve append-compatible incremental +maintenance on day one. + +## Incremental Maintenance Policy + +The current `NDArray` engine supports limited append maintenance for some index +types. `CTable` does not need to replicate all of that immediately. + +Recommended rollout: + +### Phase 1 + +- create / drop / rebuild / compact indexes +- mark value-changing mutations stale +- keep deletes valid via `_valid_rows` + +### Phase 2 + +- optimize append / extend maintenance for column indexes +- reuse full-index append-run logic where practical +- decide whether summary / bucket / partial can be refreshed incrementally for + appended ranges without rebuilding everything + +The plan should prefer correctness and clear ownership before maintenance +optimizations. 
+ +## Ordered Queries + +The smoothest integration with current `CTable` querying is: + +- filtering remains `table.where(predicate)` +- ordered access is added later in a table-appropriate way + +Possible later APIs: + +- `table.where(expr).sort_by("id")` with index reuse +- `table.where(expr).argsort(order="id")` on a row-index result abstraction +- dedicated row-position helpers for internal use + +For the first version, the main target should be indexed filtering, not full +ordered traversal. + +However, the storage format should not block future ordered reuse, so `full` +indexes should still store enough information to support: + +- ordered filtered row positions +- stable tie handling +- secondary refinement + +## Refactoring Needed in `indexing.py` + +The current implementation mixes three concerns: + +1. planner / evaluator logic +2. metadata ownership +3. sidecar path naming and opening + +To support `CTable`, split these concerns. + +### Step A: storage abstraction + +Introduce an internal storage protocol with responsibilities like: + +- load/save index catalog +- derive payload location for a component +- open/store/remove sidecar arrays +- load/save query-cache catalog and payloads + +Concrete implementations: + +- `NDArrayIndexStorage` +- `CTableIndexStorage` + +### Step B: generic target abstraction + +Introduce an internal target wrapper that represents: + +- base length +- dtype +- chunks / blocks +- slice access for the indexed value stream +- optional block-read helpers +- identity for query cache keys + +For `CTable`, the target for a column index is the column `NDArray`, but +descriptor ownership and sidecar storage are table-owned. + +### Step C: planner entry points + +Keep the existing `NDArray` public entry points intact, but allow internal +planner functions to accept the new abstractions rather than hard-coded raw +`NDArray` ownership assumptions. 
+ +## `CTable` Internal Changes + +### New helpers on `CTable` + +Add private helpers for: + +- resolving the root table from a view +- checking whether a `LazyExpr` is table-plannable +- mapping operands back to column names +- building a physical-position result into a boolean mask +- reading and writing index metadata via storage + +### New helpers on `FileTableStorage` + +Add persistent helpers for: + +- `index_root(token)` +- `index_component_key(token, component_name)` +- create/open/delete index sidecars under `/_indexes/...` +- load/save index catalog in `/_meta` +- load/save table epoch counters + +### View behavior + +Views should not own indexes. + +Rules: + +- creating or dropping indexes on a view should raise +- querying a view may reuse root-table indexes +- planner must always combine indexed matches with the view's current mask + +## Expression Index Scope + +Expression indexes are valuable but should not be part of the first patch +unless the column-index path is already stable. + +Recommended sequence: + +1. column indexes only +2. exact-match multi-column filtering using multiple column indexes +3. expression indexes over same-table columns +4. ordered reuse + +When expression indexes are added, require: + +- all operands belong to the same base `CTable` +- expression normalization produces a stable token +- dependencies are stored by column name, not transient operand aliases + +## Query Cache Scope + +The existing query cache in `indexing.py` is array-owned. + +For `CTable`, if reused, it should be table-owned as well: + +- cache identity should include the table root plus query descriptor +- cache invalidation should happen on `value_epoch` changes +- visibility-only changes can either: + - invalidate conservatively in v1, or + - be ignored if cached results are always post-filtered through current `_valid_rows` + +To keep the first version smaller, query-cache reuse can be deferred entirely. 
+ +## Validation and Reserved Names + +Extend reserved internal names for persistent `CTable` layout: + +- `_meta` +- `_valid_rows` +- `_cols` +- `_indexes` + +If the schema compiler already blocks these, document it. If not, extend the +reserved-name validation explicitly. + +## Error Handling + +Recommended behavior: + +- creating an index on a view: `ValueError` +- creating an index on a missing column: `KeyError` +- creating an unsupported index target: `TypeError` or `ValueError` +- querying with a non-plannable expression: silent scan fallback +- querying with malformed index metadata: clear error on open/use +- compacting a non-`full` index: same semantics as current engine + +## Testing Plan + +### Storage and metadata + +Add tests for: + +- create persistent `CTable` column index +- reopen table and see the index catalog +- verify index payloads are stored under `/_indexes/...` +- verify no sidecar siblings are emitted outside the table root layout +- drop index removes `/_indexes/<token>/...` + +### Query correctness + +Add tests for: + +- equality and range predicates on indexed columns +- same queries on reopened persistent tables +- results match scan-based filtering +- deleted rows are excluded without rebuilding the index +- appending after index creation follows the chosen stale policy + +### View semantics + +Add tests for: + +- view queries can reuse parent indexes +- creating indexes on views is rejected +- view mask and `_valid_rows` are both respected + +### Mutation semantics + +Add tests for: + +- delete bumps visibility only and keeps index query correctness +- overwrite of indexed column marks index stale +- compact marks index stale +- inplace sort marks index stale +- rebuild refreshes `built_value_epoch` + +### Multi-column planning + +Add tests for: + +- one indexed term + one unindexed residual term +- two indexed conjunctive terms +- unsupported disjunction falls back correctly + +## Documentation Plan + +The feature should not land with 
code and tests only. It needs user-facing +documentation from the start. + +### Examples + +Add runnable examples under `examples/ctable` covering at least: + +- creating a `CTable` index on one column +- querying a `CTable` with an indexed predicate +- reopening a persistent table and reusing the index +- basic index management such as `indexes`, `index(...)`, `drop_index(...)`, + and `rebuild_index(...)` + +### Tutorial + +Add a dedicated tutorial notebook at: + +- `doc/getting_started/tutorials/15.indexing-ctables.ipynb` + +The tutorial should explain: + +- what a `CTable` index is +- how indexes relate to columns and to the table as a whole +- how persistence works for indexed tables +- what kinds of queries benefit from indexes +- what happens after deletes and other mutations +- how to inspect and maintain indexes + +### API docstrings and Sphinx integration + +Do not treat docstrings as optional follow-up work. + +For every new public `CTable` indexing API entry point, add fully descriptive +docstrings with small examples, following the style already used elsewhere in +the codebase. + +This includes, as applicable: + +- `CTable.create_index(...)` +- `CTable.drop_index(...)` +- `CTable.rebuild_index(...)` +- `CTable.compact_index(...)` +- `CTable.index(...)` +- `CTable.indexes` + +The docstrings should cover: + +- parameters +- return values +- persistence behavior +- mutation / stale behavior where relevant +- short examples that show the intended usage + +These APIs should also be integrated into the Sphinx docs so they are reachable +from the generated documentation, not only from source docstrings. + +## Recommended Implementation Order + +### Phase 1: storage foundations + +1. add `/_indexes` reserved subtree conventions +2. extend `FileTableStorage` with index catalog and sidecar helpers +3. add table-level epoch metadata + +### Phase 2: API skeleton + +4. 
add `CTable.create_index`, `drop_index`, `rebuild_index`, `compact_index`, + `index`, and `indexes` +5. implement build/drop/rebuild against column targets only +6. keep query path unchanged initially + +### Phase 3: planner integration + +7. refactor `indexing.py` storage ownership assumptions +8. add `CTable` query planner shim +9. teach `CTable.where(...)` to use indexed planning when possible +10. keep scan fallback for everything else + +### Phase 4: mutation policy + +11. wire `value_epoch` / `visibility_epoch` +12. mark affected indexes stale on value-changing mutations +13. keep delete visibility index-safe without rebuild + +### Phase 5: follow-up optimizations + +14. consider append-aware maintenance +15. consider expression indexes +16. consider ordered reuse for table queries +17. consider query-cache reuse + +### Phase 6: documentation + +18. add `examples/ctable` indexing examples +19. add `doc/getting_started/tutorials/15.indexing-ctables.ipynb` +20. add full public docstrings with examples for the `CTable` indexing API +21. integrate the new API and tutorial into Sphinx documentation + +## Non-Goals for the First Implementation + +Do not include these in the first patch unless they come almost for free: + +- full expression-index support +- ordered query reuse for `CTable` +- disjunction planning across multiple indexes +- aggressive incremental maintenance for all index kinds +- index-aware query caching +- cross-table expression operands + +## Future Work + +One possible future storage evolution would be to make each persisted column a +subtree root instead of a single leaf object. 
+ +That would allow a layout more like: + +- `/_cols/id/data` +- `/_cols/id/indexes/...` +- `/_cols/id/missing/...` +- `/_cols/id/sidecars/...` +- `/_cols/score/data` +- `/_cols/score/indexes/...` + +Potential benefits: + +- stronger locality between a column and its derived artifacts +- easier `rename_column()` and `drop_column()` handling +- a natural home for future per-column sidecars beyond indexes +- room for explicit missing-value bitmaps, nullability metadata, sketches, or + other derived column structures + +Potential costs: + +- this would be a real `CTable` storage-schema change, not just an indexing feature +- current persisted tables would need migration or dual-layout support +- `FileTableStorage` and open/materialization logic would become more complex +- the benefit is broader than indexing, so it is better considered as part of a + larger storage-layout revision + +For that reason, this plan does not assume that redesign. It keeps the current +column-leaf layout and places indexes in a table-owned `/_indexes` subtree. + +## Summary + +The right model is: + +- indexes are table-managed, not column-autonomous +- column indexes are still built from and logically targeted at individual + column arrays +- persistent index artifacts live under `/_indexes` +- existing `indexing.py` logic is reused through refactoring, not duplicated +- deletes remain cheap by treating indexes as physical-row structures and + applying `_valid_rows` at execution time +- epoch tracking stays minimal: a small number of table-level counters, not a + growing history + +This keeps the user model coherent with current `CTable` persistence and as +close as possible to the existing `NDArray` indexing API. 
diff --git a/plans/ctable-persistency.md b/plans/ctable-persistency.md new file mode 100644 index 00000000..a2ff6db2 --- /dev/null +++ b/plans/ctable-persistency.md @@ -0,0 +1,536 @@ +# CTable Persistency Plan + +## Goal + +Add persistent `CTable` support on top of `TreeStore` while keeping the public +API simple: + +* in-memory tables when `urlpath is None` +* persistent tables when `urlpath` is provided + +The first persistency iteration should support: + +* creating a persistent table +* opening an existing persistent table +* reading rows, columns, and views from persisted tables +* appending rows + +The first persistency iteration should **not** promise: + +* full schema evolution +* dropping columns +* renaming columns +* transactional multi-entry updates + +For now, the supported schema evolution story is: + +* append rows only + +--- + +## Storage layout + +Each persisted `CTable` lives under a table root inside a `TreeStore`. + +Confirmed layout: + +* `table_root/_meta` +* `table_root/_valid_rows` +* `table_root/_cols/<name>` + +Example: + +* `people/_meta` +* `people/_valid_rows` +* `people/_cols/id` +* `people/_cols/score` +* `people/_cols/active` + +Rationale: + +* `_meta` holds mutable metadata in `vlmeta` +* `_valid_rows` is real table data and should be stored as a normal persisted array +* `_cols/<name>` stores one persisted NDArray per column + +The underscore-prefixed names form the internal namespace for a table root and +must be treated as reserved. + +--- + +## `_meta` entry + +`_meta` should be a small serialized `SChunk` used primarily to hold mutable +`vlmeta`.
+ +This is preferable to immutable metalayers because: + +* we may want to evolve metadata over time +* multiple `CTable` objects may live in the same `TreeStore` +* schema and table metadata should be updateable without rewriting the entire table + +For the first version: + +* `tree_store["<table_root>/_meta"].vlmeta["kind"] = "ctable"` +* `tree_store["<table_root>/_meta"].vlmeta["version"] = 1` +* `tree_store["<table_root>/_meta"].vlmeta["schema"] = {...}` + +This gives `open()` a minimal, reliable contract for introspection. + +--- + +## Schema persistence format + +The schema should be stored as JSON-compatible data in: + +* `tree_store["<table_root>/_meta"].vlmeta["schema"]` + +The schema document should be versioned and explicit. + +Recommended shape: + +```python +{ + "version": 1, + "columns": [ + { + "name": "id", + "py_type": "int", + "spec": {"kind": "int64", "ge": 0}, + "default": None, + }, + { + "name": "score", + "py_type": "float", + "spec": {"kind": "float64", "ge": 0, "le": 100}, + "default": None, + }, + { + "name": "active", + "py_type": "bool", + "spec": {"kind": "bool"}, + "default": True, + }, + ], +} +``` + +Notes: + +* `columns` must be an ordered list, not a dict. +* The order of the list is the source of truth for column order. +* Do not rely on dict ordering or TreeStore iteration order. +* The schema JSON should capture logical schema information only. + +For the first version, do **not** duplicate: + +* per-column `cparams` +* per-column `dparams` +* array chunk/block layout +* `expected_size` +* compaction settings + +Those can be introspected directly from the stored arrays when needed.
+ +--- + +## `_valid_rows` persistence + +`_valid_rows` should be stored as a normal persisted boolean NDArray under: + +* `table_root/_valid_rows` + +This is the correct representation because `_valid_rows` is: + +* table data, not metadata +* potentially large +* used in normal row visibility semantics +* already aligned with current delete/view/compaction logic + +Do not encode `_valid_rows` into schema JSON or small metadata blobs. + +--- + +## Column persistence + +Each column should be stored as its own persisted NDArray under: + +* `table_root/_cols/<name>` + +This means: + +* each column can be opened independently +* column-level array settings remain attached to the actual stored array +* persistence layout matches the internal columnar design cleanly + +The schema JSON provides the logical order and type constraints; the arrays under +`_cols` provide the physical stored data. + +--- + +## Constructor semantics + +The recommended constructor shape is: + +```python +table = b2.CTable( + Row, + urlpath=None, + mode="a", + expected_size=1_048_576, + compact=False, + validate=True, +) +``` + +Semantics: + +* `urlpath is None` + create an in-memory `CTable` +* `urlpath is not None` + use persistent storage rooted at that path + +Recommended `mode` meanings: + +* `mode="w"` + create a new persistent table, overwriting any existing table root if the API + already supports that pattern elsewhere +* `mode="a"` + open existing or create new +* `mode="r"` + open existing read-only table + +The important public signal is: + +* `urlpath` chooses persistence +* `mode` chooses creation/open behavior + +Users should not need to pass a `TreeStore` object explicitly for the common path. + +--- + +## `open()` support + +An explicit `open()` API should be supported.
+ +Recommended shape: + +```python +table = b2.open(urlpath) +``` + +or, if needed for clarity: + +```python +table = b2.CTable.open(urlpath, mode="r") +``` + +For `open()` to detect a persisted `CTable`, it should inspect: + +* `urlpath/_meta` +* `vlmeta["kind"]` on `urlpath/_meta` + +If: + +* `_meta` exists +* `vlmeta["kind"] == "ctable"` + +then the object should be recognized as a persisted `CTable`. + +This keeps `urlpath` simple: it points to the table root, and `_meta` provides +the type marker and schema. + +--- + +## Multiple tables in one TreeStore + +The design must support multiple `CTable` objects in the same `TreeStore`. + +That is one reason `_meta` is a good choice: + +* each table root has its own `_meta` +* each table root can be introspected independently +* schema metadata is naturally scoped to one table subtree + +Example shared TreeStore: + +* `users/_meta` +* `users/_valid_rows` +* `users/_cols/id` +* `orders/_meta` +* `orders/_valid_rows` +* `orders/_cols/order_id` + +No additional global registry is required in the first version. + +--- + +## Column name validation + +Column name validation should be explicit and should be shared between: + +* in-memory `CTable` +* persistent `CTable` + +Reason: + +* a schema should not be valid in memory and then fail only when persisted + +Recommended first-rule constraints for column names: + +* must be a non-empty string +* must not contain `/` +* must not start with `_` +* must not collide with reserved internal names + +Reserved internal names for the table root layout: + +* `_meta` +* `_valid_rows` +* `_cols` + +This validation should happen during schema compilation, not only during +persistent-table creation. + +--- + +## Column order + +Column order should be preserved explicitly in the schema JSON.
+ +The source of truth is: + +* the order of `schema["columns"]` + +Do not rely on: + +* dict ordering as a persistence contract +* lexical ordering of `_cols/` +* TreeStore iteration order + +On load: + +* reconstruct `table.col_names` from the schema list order +* rebuild any name-to-column map separately + +--- + +## Read-only mode + +When `mode="r"`: + +Allowed: + +* opening the table +* reading rows +* reading columns +* creating non-mutating views +* `head()`, `tail()`, filtering, and other read-only operations + +Disallowed: + +* `append()` +* `delete()` +* `compact()` +* any operation that mutates stored arrays or metadata + +These should fail immediately with a clear error. + +If some existing view path currently requires mutation internally, that should be +cleaned up rather than weakening the read-only contract. + +--- + +## Failure model + +The first persistency version does not need full transactional semantics. + +Be explicit in the implementation and docs: + +* updates touching multiple entries are not guaranteed to be atomic +* partial writes are possible if a failure occurs mid-update + +That is acceptable for the first version as long as it is not hidden. + +The initial goal is a correct and understandable persistent layout, not a full +transaction layer. + +--- + +## Internal API sketch + +This is a proposed internal storage split, not a final public API requirement. + +Possible internal helpers: + +```python +class TableStorage: + def open_column(self, name: str): ... + def create_column( + self, + name: str, + *, + dtype, + shape, + chunks=None, + blocks=None, + cparams=None, + dparams=None + ): ... + def open_valid_rows(self): ... + def create_valid_rows( + self, *, shape, chunks=None, blocks=None, cparams=None, dparams=None + ): ... + def load_schema(self) -> dict: ... + def save_schema(self, schema: dict) -> None: ... + def exists(self) -> bool: ... + def is_read_only(self) -> bool: ... + + +class InMemoryTableStorage(TableStorage): ... 
+ + +class TreeStoreTableStorage(TableStorage): ... +``` + +Then `CTable` can route based on `urlpath`: + +* `urlpath is None` -> `InMemoryTableStorage` +* `urlpath is not None` -> `TreeStoreTableStorage` + +This keeps persistence a backend concern instead of scattering TreeStore logic +throughout all of `CTable`. + +--- + +## Concrete implementation sequence + +### Step 1: extend constructor/open signatures + +Update `src/blosc2/ctable.py` to accept: + +```python +class CTable: + def __init__( + self, + row_type, + new_data=None, + *, + urlpath: str | None = None, + mode: str = "a", + expected_size: int = 1_048_576, + compact: bool = False, + validate: bool = True, + ) -> None: ... +``` + +And add: + +```python +@classmethod +def open(cls, urlpath: str, *, mode: str = "r") -> "CTable": ... +``` + +### Step 2: add storage backend abstraction + +Create a new module: + +* `src/blosc2/ctable_storage.py` + +Add: + +* `TableStorage` +* `InMemoryTableStorage` +* `TreeStoreTableStorage` + +### Step 3: implement TreeStore layout helpers + +In `TreeStoreTableStorage`, add helpers for: + +* `_meta` path +* `_valid_rows` path +* `_cols/` paths +* reading/writing `vlmeta["kind"]` +* reading/writing `vlmeta["version"]` +* reading/writing `vlmeta["schema"]` + +### Step 4: persist schema JSON + +Connect compiled schema export/import to `_meta.vlmeta["schema"]`. + +The schema compiler work should provide: + +```python +def schema_to_dict(schema: CompiledSchema) -> dict: ... +def schema_from_dict(data: dict) -> CompiledSchema: ... +``` + +### Step 5: create/open persistent arrays + +Wire `CTable` initialization so that: + +* create path creates `_meta`, `_valid_rows`, and `_cols/` +* open path loads schema first, then opens `_valid_rows` and columns + +### Step 6: enforce read-only behavior + +Add an internal read-only flag so mutating methods fail early when opened with +`mode="r"`. 
+ +Methods to guard first: + +* `append` +* `extend` +* `delete` +* `compact` + +### Step 7: test persistency layout and round-trips + +Add tests covering: + +* create persistent `CTable` +* reopen persistent `CTable` +* schema JSON present in `_meta.vlmeta` +* `_valid_rows` persisted correctly +* column order preserved after reopen +* multiple tables inside one TreeStore +* read-only mode errors on mutation + +--- + +## Proposed tests + +Suggested test file: + +* `tests/ctable/test_persistency.py` + +Suggested test cases: + +* `test_create_persistent_ctable_layout` +* `test_open_persistent_ctable` +* `test_schema_saved_in_meta_vlmeta` +* `test_valid_rows_persisted` +* `test_column_order_roundtrip` +* `test_multiple_ctables_in_same_treestore` +* `test_read_only_mode_rejects_mutation` + +--- + +## Recommendation + +The recommended persistency design is: + +1. use `urlpath` to switch between in-memory and persistent `CTable` +2. store one table per TreeStore subtree +3. use: + * `_meta` + * `_valid_rows` + * `_cols/` +4. store schema JSON in `_meta.vlmeta["schema"]` +5. store explicit markers in `_meta.vlmeta`: + * `"kind": "ctable"` + * `"version": 1` +6. preserve column order in the schema JSON as an ordered `columns` list +7. keep the first version limited to append-row persistence, not full schema evolution + +This gives `CTable` a clear persistent layout, keeps `open()` introspection +simple, and stays consistent with the existing columnar design. diff --git a/plans/ctable-schema.md b/plans/ctable-schema.md new file mode 100644 index 00000000..d9cd3fb1 --- /dev/null +++ b/plans/ctable-schema.md @@ -0,0 +1,1258 @@ +# CTable Schema Redesign + +## Motivation + +The current `CTable` prototype in PR #598 uses `pydantic.BaseModel` plus +`Annotated[...]` metadata to define table schemas. That works, but it is not the +best long-term API for a columnar container in `python-blosc2`. 
+ +The main issues with the current shape are: + +* It mixes row validation concerns with physical storage concerns. +* It relies on custom metadata objects (`NumpyDtype`, `MaxLen`) embedded in + Pydantic annotations. +* It is verbose for simple schemas. +* It does not provide an obvious place for NDArray-specific per-column options + such as `cparams`, `dparams`, `chunks`, `blocks`, or future indexing hints. + +What we want instead is: + +* A schema API that is easy to read and write. +* A place to attach Blosc2-specific per-column configuration. +* A way to express logical constraints such as `ge=0`, `le=100`, `max_length=10`. +* Internal validation without forcing the public API to be Pydantic-shaped. +* A clean distinction between: + * logical field type and constraints + * physical storage type + * per-column storage options + +The proposed solution is a **dataclass-first schema API** with **declarative field +spec objects** and **optional internal Pydantic-backed validation**. + +The intended usage style is: + +* canonical form for constrained or storage-tuned columns: + `id: int = b2.field(b2.int64(ge=0))` +* shorthand for simple inferred columns: + `id: int` +* not preferred as a primary style: + `id = b2.field(b2.int64(ge=0))` + +The reason is that the canonical form preserves normal Python type annotations, +which are valuable for readability, static tooling, and schema inspection. + +--- + +## Proposed public API + +### Schema declaration + +The intended schema declaration style is: + +```python +from dataclasses import dataclass + +import blosc2 as b2 + + +@dataclass +class Row: + id: int = b2.field(b2.int64(ge=0)) + score: float = b2.field( + b2.float64(ge=0, le=100), + cparams={"codec": b2.Codec.LZ4, "clevel": 5}, + ) + active: bool = b2.field(b2.bool(), default=True) +``` + +This is the target user-facing API for `CTable`. + +This should be documented as the **canonical** schema declaration style. 
+ +For simple unconstrained cases, `CTable` may support an inferred shorthand: + +```python +@dataclass +class Row: + id: int + score: float + active: bool = True +``` + +which is interpreted approximately as: + +```python +@dataclass +class Row: + id: int = b2.field(b2.int64()) + score: float = b2.field(b2.float64()) + active: bool = b2.field(b2.bool(), default=True) +``` + +This shorthand should be limited to simple built-in Python types where the +mapping is obvious. + +### Naming convention + +Use **lowercase names** for schema descriptor objects: + +* `b2.int64` +* `b2.float64` +* `b2.bool` +* later: `b2.string(max_length=...)`, `b2.bytes(max_length=...)`, `b2.complex128` + +Reason: + +* `b2.int64(...)` is not just a dtype; it is a schema descriptor with constraints. +* The lowercase form keeps the API closer in spirit to NumPy and PyTorch. +* If plain NumPy dtypes are needed, callers can use `np.int64`, `np.float64`, + `np.bool_`, etc. +* `b2.bool(...)` is preferred over `b2.bool_(...)` for readability, even though + NumPy uses `bool_`. This is closer to PyTorch style and fits better for a + schema-builder API. + +### Field helper + +`b2.field(...)` should be the standard way to attach schema metadata to a +dataclass field. + +Expected shape: + +```python +b2.field( + b2.float64(ge=0, le=100), + default=..., + cparams=..., + dparams=..., + chunks=..., + blocks=..., +) +``` + +At minimum for the first version: + +* `spec` +* `default` +* `cparams` +* `dparams` +* `chunks` +* `blocks` + +The implementation should store these in `dataclasses.field(metadata=...)`. + +The unannotated form: + +```python +id = b2.field(b2.int64(ge=0)) +``` + +should not be the primary API. It may be supported later only if there is a +strong reason, but the preferred style should retain: + +* a Python type annotation in the annotation slot +* `b2.field(...)` in the field/default slot + +That keeps the schema aligned with normal dataclass usage. + +--- + +## Core design + +### 1. 
Dataclass is the schema carrier + +The dataclass defines: + +* field names +* Python-level row shape +* user-visible defaults + +Example: + +```python +@dataclass +class Row: + id: int = b2.field(b2.int64(ge=0)) + score: float = b2.field(b2.float64(ge=0, le=100)) + active: bool = b2.field(b2.bool(), default=True) +``` + +This keeps the declaration small and idiomatic. + +The Python annotation should remain part of the design, not be replaced by +`b2.field(...)` alone. The annotation provides value independently of the +Blosc2 schema descriptor. + +### 2. Schema spec objects are the source of truth + +Each lowercase builder object is a lightweight immutable schema descriptor. + +Examples: + +```python +b2.int64(ge=0) +b2.float64(ge=0, le=100) +b2.bool() +b2.string(max_length=32) +b2.bytes(max_length=64) +``` + +Each spec object should carry only schema-level metadata, for example: + +* logical kind +* storage dtype +* numeric constraints (`ge`, `gt`, `le`, `lt`, `multiple_of`) +* string constraints (`max_length`, `min_length`, `pattern`) +* nullability +* maybe logical annotations later (`categorical`, `timezone`, `unit`) + +They should **not** directly carry per-column NDArray instance settings such as +`cparams` or `chunks`; those belong in `b2.field(...)`. + +### 3. Column field metadata carries NDArray-specific configuration + +`b2.field(...)` metadata should be the place for: + +* column storage options +* per-column compression settings +* chunk/block tuning +* persistence options in future versions + +This keeps the separation clean: + +* `b2.float64(ge=0, le=100)` answers: "what values are valid?" +* `b2.field(..., cparams=..., chunks=...)` answers: "how is this column stored?" + +### 4. Schema compilation step inside CTable + +`CTable` should not consume raw dataclass fields repeatedly. On construction, it +should compile the row class into an internal schema representation. 
+ +For example: + +```python +compiled = CompiledSchema( + row_cls=Row, + columns=[ + CompiledColumn( + name="id", + py_type=int, + spec=b2.int64(ge=0), + dtype=np.int64, + default=MISSING, + cparams=..., + dparams=..., + chunks=..., + blocks=..., + validator_info=..., + ), + ..., + ], + validator_model=..., +) +``` + +This compiled form should drive: + +* NDArray creation +* row validation +* bulk validation +* introspection and future serialization + +--- + +## Validation strategy + +### Use Pydantic internally, but do not make it the public schema API + +Pydantic is a good fit for validation because it is: + +* mature +* well-tested +* expressive +* fast enough for row-level operations + +However, it should be an **implementation detail**, not the public schema surface. + +The public schema should remain: + +* dataclass-based +* Blosc2-specific +* independent of any one validation library + +### Why not use Pydantic as the schema source directly? + +Because storage and validation are overlapping but not identical concerns. + +Examples: + +* `dtype=np.int16` is both logical and physical. +* `cparams`, `chunks`, `blocks`, `dparams` are not Pydantic concepts. +* a future column index, bloom filter, or codec hint is not a validation concept. + +Therefore, the internal architecture should be: + +* user declares a dataclass + `b2.field(...)` +* `CTable` compiles it into: + * storage schema + * validation schema + +### Row-level validation + +For `append(row)` and other row-wise inserts: + +* compile a cached internal Pydantic model once per schema +* validate incoming rows against that model +* convert the validated row into column values + +This is the simplest and safest path. + +Expected behavior: + +* `table.append(Row(...))` +* `table.append({"id": 1, "score": 2.0, "active": True})` +* `table.append((1, 2.0, True))` + +All may be accepted, but internally normalized through one validator path. 
+ +### Bulk validation + +For `extend(...)`, row-by-row Pydantic validation may be too expensive for large +batches. Bulk inserts need a separate strategy. + +Recommended modes: + +* `validate=True` + Full validation. May use row-wise Pydantic validation for smaller inputs and + vectorized checks where available. +* `validate=False` + Trust caller, perform dtype coercion only. +* optional later: `validate="sample"` or `validate="vectorized"` + +For numeric and simple string constraints, vectorized checks are preferable when +possible: + +* `ge`, `gt`, `le`, `lt` +* `max_length`, `min_length` +* null checks +* dtype coercion checks + +This means the architecture should support both: + +* Pydantic row validation +* vectorized array validation + +The compiled schema should expose enough information for both. + +### Performance stance + +Pydantic should be treated as: + +* a strong default for correctness +* fast enough for row-wise validation +* not necessarily the fastest choice for large batch validation + +This is important because the performance bottleneck for `extend()` is more about +per-row Python overhead than about Pydantic specifically. + +--- + +## Detailed API proposal + +### Schema spec classes + +Add schema descriptor classes under `blosc2`, for example: + +* `int8`, `int16`, `int32`, `int64` +* `uint8`, `uint16`, `uint32`, `uint64` +* `float32`, `float64` +* `bool` +* `complex64`, `complex128` +* `string` +* `bytes` + +Minimal constructor examples: + +```python +b2.int64(ge=0) +b2.float64(ge=0, le=100) +b2.string(max_length=32) +b2.bytes(max_length=64) +b2.bool() +``` + +Internal common fields: + +* `dtype` +* `constraints` +* `python_type` + +### Field helper + +`b2.field(spec, **kwargs)` should return a `dataclasses.field(...)` object with +Blosc2 metadata attached. 
+ +Example metadata layout: + +```python +{ + "blosc2": { + "spec": ..., + "cparams": ..., + "dparams": ..., + "chunks": ..., + "blocks": ..., + } +} +``` + +This metadata key should be stable and reserved. + +### CTable constructor + +The desired constructor remains: + +```python +table = b2.CTable(Row) +``` + +Optional overrides: + +```python +table = b2.CTable( + Row, + expected_size=1_000_000, + compact=False, + validate=True, +) +``` + +`CTable` should detect that `Row` is a dataclass schema and compile it. + +### Possible compatibility layer + +If needed temporarily, `CTable` may continue accepting the old Pydantic model +style during a transition period: + +```python +table = b2.CTable(LegacyPydanticRow) +``` + +But that should be documented as legacy or transitional once the dataclass API +lands. + +--- + +## Internal compilation pipeline + +### Step 1. Inspect dataclass fields + +For each dataclass field: + +* field name +* Python annotation +* default or default factory +* Blosc2 metadata from `b2.field(...)` + +Reject invalid shapes early: + +* missing `b2.field(...)` +* missing schema spec +* incompatible Python annotation vs schema spec +* unsupported defaults + +If inferred shorthand is supported, refine the first two rules to: + +* either a supported plain annotation, or an explicit `b2.field(...)` +* if `b2.field(...)` is present, it must contain a schema spec + +### Step 2. Build compiled column descriptors + +For each field, produce a `CompiledColumn` object containing: + +* `name` +* `py_type` +* `spec` +* `dtype` +* `default` +* `cparams` +* `dparams` +* `chunks` +* `blocks` +* validation constraints + +### Step 3. Derive physical NDArray creation arguments + +From the compiled column descriptor, derive: + +* `dtype` +* shape +* chunks +* blocks +* `cparams` +* `dparams` + +This should happen once during table initialization. + +### Step 4. Derive validation model + +Translate each schema spec into a Pydantic field definition. 
+ +Examples: + +* `int64(ge=0)` -> integer field with `ge=0` +* `float64(ge=0, le=100)` -> float field with `ge=0`, `le=100` +* `string(max_length=32)` -> string field with `max_length=32` + +Cache the compiled Pydantic model class per row schema. + +### Step 5. Expose introspection hooks + +Expose enough metadata for: + +* debugging +* `table.info()` +* future schema serialization +* future schema-driven docs and reprs + +Possible user-facing hooks later: + +* `table.schema` +* `table.schema.columns` +* `table.schema.as_dict()` + +--- + +## Handling defaults + +Defaults should follow dataclass semantics as closely as possible. + +Examples: + +```python +active: bool = b2.field(b2.bool(), default=True) +``` + +For the first implementation, keep this conservative: + +* support scalar defaults +* reject mutable defaults directly + +On insert: + +* omitted values should be filled from defaults + +--- + +## Insert semantics + +### append() + +`append()` should accept a small set of normalized shapes: + +* dataclass row instance +* dict-like row +* tuple/list in schema order + +Recommended internal path: + +1. normalize the input to a field mapping +2. validate with cached validator model +3. coerce to final column values +4. append into underlying NDArrays + +### extend() + +`extend()` should accept: + +* iterable of row objects +* dict-of-arrays +* structured NumPy array +* maybe another `CTable` + +Recommended internal path: + +1. normalize to column batches where possible +2. validate according to `validate=` mode +3. coerce dtypes +4. write in bulk + +For `dict-of-arrays` and structured arrays, vectorized validation should be the +preferred long-term path. + +--- + +## Per-column NDArray options + +One of the main reasons for `b2.field(...)` is that different columns may want +different storage settings. 
+ +Examples: + +* a boolean column may want different compression parameters from a float column +* a high-cardinality string column may need different chunk sizes +* a metric column may use a specific codec or filter tuning + +So the schema system must allow: + +```python +@dataclass +class Row: + id: int = b2.field(b2.int64(ge=0), cparams={"codec": b2.Codec.ZSTD, "clevel": 1}) + score: float = b2.field( + b2.float64(ge=0, le=100), cparams={"codec": b2.Codec.LZ4HC, "clevel": 9} + ) + active: bool = b2.field(b2.bool(), cparams={"codec": b2.Codec.LZ4}) +``` + +The implementation should define precedence rules clearly: + +* column-level options override table defaults +* table-level options fill in unspecified values + +This implies `CTable(...)` may also take default storage options: + +```python +table = b2.CTable(Row, cparams=..., dparams=...) +``` + +Column-level overrides should merge against those defaults, not replace them +blindly. + +--- + +## Compatibility and migration + +### Goal + +Move toward the dataclass-based schema API without locking the project into the +current Pydantic-shaped declaration model. + +### Migration path + +Phase 1: + +* introduce schema spec classes and `b2.field(...)` +* support dataclass schemas in `CTable` +* keep existing prototype behavior separate + +Phase 2: + +* add row validation via cached internal Pydantic model +* add bulk validation modes +* document the dataclass schema API as preferred + +Phase 3: + +* optionally add a compatibility adapter for existing Pydantic models +* deprecate ad hoc `Annotated[...]` metadata conventions if they remain exposed + +### Non-goal + +Do not make the first implementation solve every possible schema feature. The +first goal is to get the schema shape and internal architecture right. + +--- + +## Serialization implications + +Even if `save()` / `load()` are not implemented yet, this schema design should +anticipate persistence. 
+ +Eventually a persisted `CTable` will need to store: + +* column names +* logical schema descriptors +* per-column defaults +* per-column NDArray storage options +* maybe validation constraints + +That argues strongly for having a stable compiled schema representation early. + +The compiled schema should be serializable to: + +* JSON-compatible metadata +* or a small msgpack payload + +The public dataclass itself does not need to be serialized directly. Only the +compiled schema matters for persistence. + +--- + +## Open questions + +### 1. Should Python annotations be required to match the schema spec? + +Example: + +```python +id: int = b2.field(b2.int64(ge=0)) +``` + +Recommended answer: yes, broadly, with sensible compatibility rules. + +Allowed: + +* `int` with `int64` +* `float` with `float64` +* `bool` with `bool` + +Potentially allowed later: + +* `str` with `string` +* `bytes` with `bytes` + +Reject obviously inconsistent declarations early. + +In other words: + +* `id: int = b2.field(b2.int64(ge=0))` is good +* `id: int` is acceptable shorthand for inferred `b2.int64()` +* `id = b2.field(b2.int64(ge=0))` is not the preferred style because it drops + the Python annotation + +### 2. Should `b2.field()` require a spec? + +Recommended answer: yes for the first version. + +Allowing `b2.field(default=True)` without a spec means we must infer too much +from the Python annotation and lose clarity. + +This still allows fully inferred fields that do not use `b2.field(...)` at all: + +```python +active: bool = True +``` + +but once `b2.field(...)` is used, it should carry an explicit schema spec. + +### 3. How much should Pydantic-specific behavior leak? + +Recommended answer: as little as possible. + +Users should not need to know whether validation is backed by Pydantic, +vectorized NumPy checks, or another mechanism. 
+ +--- + +## Concrete implementation sequence + +This section turns the design into a proposed execution order with concrete +files, class names, and function signatures. + +### Step 1: add schema descriptor primitives + +Create a new module: + +* `src/blosc2/schema.py` + +Primary contents: + +```python +from __future__ import annotations + +from dataclasses import MISSING, Field as DataclassField, field as dc_field +from typing import Any + +import numpy as np +``` + +Proposed public classes and functions: + +```python +class SchemaSpec: + dtype: np.dtype + python_type: type[Any] + + def to_pydantic_kwargs(self) -> dict[str, Any]: ... + def to_metadata_dict(self) -> dict[str, Any]: ... + + +class int64(SchemaSpec): + def __init__(self, *, ge=None, gt=None, le=None, lt=None): ... + + +class float64(SchemaSpec): + def __init__(self, *, ge=None, gt=None, le=None, lt=None): ... + + +class bool(SchemaSpec): + def __init__(self): ... + + +class string(SchemaSpec): + def __init__(self, *, min_length=None, max_length=None, pattern=None): ... + + +class bytes(SchemaSpec): + def __init__(self, *, min_length=None, max_length=None): ... + + +def field( + spec: SchemaSpec, + *, + default=MISSING, + cparams: dict[str, Any] | None = None, + dparams: dict[str, Any] | None = None, + chunks: tuple[int, ...] | None = None, + blocks: tuple[int, ...] | None = None, +) -> DataclassField: ... +``` + +Internal helper constants: + +```python +BLOSC2_FIELD_METADATA_KEY = "blosc2" +``` + +Notes: + +* Start with only the spec classes needed for the first `CTable` iteration: + `int64`, `float64`, `bool`. +* Add `string` and `bytes` only if needed in the same slice of work. +* Avoid over-generalizing the first implementation. 
+ +### Step 2: add schema compiler and compiled representations + +Create a new module: + +* `src/blosc2/schema_compiler.py` + +Primary internal dataclasses: + +```python +from dataclasses import dataclass +from typing import Any + + +@dataclass(slots=True) +class ColumnConfig: + cparams: dict[str, Any] | None + dparams: dict[str, Any] | None + chunks: tuple[int, ...] | None + blocks: tuple[int, ...] | None + + +@dataclass(slots=True) +class CompiledColumn: + name: str + py_type: Any + spec: Any + dtype: np.dtype + default: Any + config: ColumnConfig + + +@dataclass(slots=True) +class CompiledSchema: + row_cls: type[Any] + columns: list[CompiledColumn] + columns_by_name: dict[str, CompiledColumn] + validator_model: type[Any] | None = None +``` + +Primary internal functions: + +```python +def compile_schema(row_cls: type[Any]) -> CompiledSchema: ... +def infer_spec_from_annotation(annotation: Any, default: Any = MISSING) -> Any: ... +def validate_annotation_matches_spec(annotation: Any, spec: Any) -> None: ... +def get_blosc2_field_metadata(dc_field) -> dict[str, Any] | None: ... +``` + +Behavior: + +* accept a dataclass type only +* for explicit `b2.field(...)`, read the spec from metadata +* for inferred fields like `id: int`, derive `b2.int64()` +* reject unsupported annotations early +* normalize all defaults/config into `CompiledSchema` + +### Step 3: export the schema API from `blosc2` + +Update: + +* `src/blosc2/__init__.py` + +Exports to add: + +```python +from .schema import bool, bytes, field, float64, int64, string +``` + +And in `__all__`: + +```python +"bool", +"bytes", +"field", +"float64", +"int64", +"string", +``` + +Notes: + +* Be careful with `bool` and `bytes` in `__init__.py` because they shadow + builtins within the module namespace. That is acceptable if done deliberately, + but it should be reviewed explicitly. +* If shadowing proves too awkward internally, keep the implementation names + private and re-export the public names only. 
+ +### Step 4: refactor `CTable` to consume compiled schemas + +Update: + +* `src/blosc2/ctable.py` + +Primary constructor signature: + +```python +class CTable(Generic[RowT]): + def __init__( + self, + row_type: type[RowT], + new_data=None, + *, + expected_size: int = 1_048_576, + compact: bool = False, + validate: bool = True, + cparams: dict[str, Any] | None = None, + dparams: dict[str, Any] | None = None, + ) -> None: ... +``` + +New internal state: + +```python +self._schema: CompiledSchema +self._validate: bool +self._table_cparams: dict[str, Any] | None +self._table_dparams: dict[str, Any] | None +``` + +New internal helper methods: + +```python +def _init_columns(self, expected_size: int) -> None: ... +def _resolve_column_storage(self, col: CompiledColumn) -> dict[str, Any]: ... +def _normalize_row_input(self, data: Any) -> dict[str, Any]: ... +def _coerce_row_to_storage(self, row: dict[str, Any]) -> dict[str, Any]: ... +``` + +Behavior changes: + +* replace direct inspection of `row_type.model_fields` +* build columns from `self._schema.columns` +* derive column dtypes from compiled schema +* merge table-level and field-level storage settings + +### Step 5: implement row validation adapter + +Create a new internal module: + +* `src/blosc2/schema_validation.py` + +Primary functions: + +```python +from typing import Any + + +def build_validator_model(schema: CompiledSchema) -> type[Any]: ... +def validate_row(schema: CompiledSchema, row: dict[str, Any]) -> dict[str, Any]: ... +def validate_rows_rowwise( + schema: CompiledSchema, rows: list[dict[str, Any]] +) -> list[dict[str, Any]]: ... +``` + +Behavior: + +* build and cache a Pydantic model per compiled schema +* map `SchemaSpec` constraints into Pydantic field definitions +* return normalized Python values ready for storage coercion + +Implementation note: + +* Cache the generated validator model on `CompiledSchema.validator_model`. +* Keep all Pydantic-specific logic isolated in this module. 
+ +### Step 6: wire validation into `append()` + +Update: + +* `src/blosc2/ctable.py` + +Target signatures: + +```python +def append(self, data: Any) -> None: ... +def _append_validated_row(self, row: dict[str, Any]) -> None: ... +``` + +Concrete behavior: + +1. normalize incoming row shape +2. if `self._validate` is true, validate via `schema_validation.validate_row` +3. coerce to storage values +4. append into column NDArrays + +Inputs to support in the first cut: + +* dataclass row instance +* dict +* tuple/list in schema order + +Inputs that can wait until later if needed: + +* structured NumPy scalar +* Pydantic model instance + +### Step 7: add `extend(..., validate=...)` + +Update: + +* `src/blosc2/ctable.py` + +Proposed signature: + +```python +def extend(self, data: Any, *, validate: bool | None = None) -> None: ... +``` + +Supporting internal helpers: + +```python +def _normalize_rows_input( + self, data: Any +) -> tuple[list[dict[str, Any]] | None, dict[str, Any] | None]: ... +def _extend_rowwise(self, rows: list[dict[str, Any]], *, validate: bool) -> None: ... +def _extend_columnwise(self, columns: dict[str, Any], *, validate: bool) -> None: ... +``` + +First implementation target: + +* support iterable of rows via `_extend_rowwise` +* preserve correctness first, optimize later + +Second implementation target: + +* add `_extend_columnwise` for structured arrays and dict-of-arrays +* add vectorized validation for simple constraints + +### Step 8: add vectorized validation helpers + +Create a new internal module: + +* `src/blosc2/schema_vectorized.py` + +Primary functions: + +```python +from typing import Any + + +def validate_column_values(col: CompiledColumn, values: Any) -> None: ... +def validate_column_batch(schema: CompiledSchema, columns: dict[str, Any]) -> None: ... 
+``` + +Initial checks to support: + +* numeric `ge`, `gt`, `le`, `lt` +* string and bytes `min_length`, `max_length` +* dtype compatibility after coercion + +This module should remain optional in the first PR if the rowwise path is enough +to land the architecture cleanly. + +### Step 9: add schema introspection to `CTable` + +Update: + +* `src/blosc2/ctable.py` + +Proposed property: + +```python +@property +def schema(self) -> CompiledSchema: ... +``` + +Optional helper methods: + +```python +def schema_dict(self) -> dict[str, Any]: ... +def column_schema(self, name: str) -> CompiledColumn: ... +``` + +Goal: + +* make the new schema layer visible and debuggable +* provide a stable base for future save/load work + +### Step 10: add tests in focused modules + +Add: + +* `tests/ctable/test_schema_specs.py` +* `tests/ctable/test_schema_compiler.py` +* `tests/ctable/test_schema_validation.py` +* `tests/ctable/test_ctable_dataclass_schema.py` + +Test scope by file: + +`tests/ctable/test_schema_specs.py` + +* spec construction +* dtype mapping +* metadata export + +`tests/ctable/test_schema_compiler.py` + +* explicit `b2.field(...)` +* inferred shorthand from plain annotations +* annotation/spec mismatch rejection +* defaults handling + +`tests/ctable/test_schema_validation.py` + +* Pydantic validator generation +* constraint enforcement + +`tests/ctable/test_ctable_dataclass_schema.py` + +* `CTable(Row)` construction +* append with dataclass/dict/tuple +* extend with iterable of rows +* per-column `cparams` override plumbing + +### Step 11: keep the legacy prototype isolated during transition + +Short-term implementation choice: + +* if the current `ctable.py` prototype is still in active flux, prefer landing + the schema/compiler modules first and then refactoring `CTable` over them +* do not expand the old Pydantic-specific schema path further + +Possible follow-up helper: + +```python +def compile_legacy_pydantic_schema(row_cls: type[Any]) -> CompiledSchema: ... 
+``` + +But only add that if compatibility becomes necessary. + +### Step 12: persistence groundwork + +No need to implement `save()` / `load()` immediately, but define serialization +hooks on the schema side now. + +Add to `CompiledSchema` or a related helper: + +```python +def schema_to_dict(schema: CompiledSchema) -> dict[str, Any]: ... +def schema_from_dict(data: dict[str, Any]) -> CompiledSchema: ... +``` + +This should remain internal until the persisted format is stable. + +The persistency design itself is specified in: + +* [ctable-persistency.md](ctable-persistency.md) + +The schema-layer contract for persistency is: + +* schema must serialize to a versioned JSON-compatible dict +* column order must be preserved explicitly in the serialized `columns` list +* the serialized schema must be sufficient to reconstruct `CompiledSchema` + without requiring the original Python dataclass definition at load time + +### Step 13: delivery order across PRs + +Recommended PR slicing: + +PR 1: + +* `src/blosc2/schema.py` +* `src/blosc2/schema_compiler.py` +* exports in `src/blosc2/__init__.py` +* tests for schema specs and compiler + +PR 2: + +* `CTable` constructor refactor to use compiled schema +* `append()` row normalization +* row-wise validation module +* `tests/ctable/test_ctable_dataclass_schema.py` + +PR 3: + +* `extend(..., validate=...)` +* vectorized validation helpers +* schema introspection property +* more tests for batch validation and overrides + +PR 4: + +* persistence groundwork on the schema side +* optional compatibility adapter for legacy Pydantic model declarations + +PR 5: + +* TreeStore-backed persistency as described in + [ctable-persistency.md](ctable-persistency.md) +* `urlpath` / `mode` constructor semantics +* explicit `open()` support +* `_meta`, `_valid_rows`, `_cols/` storage layout +* persistency tests + +### Step 14: concrete first-PR checklist + +The
smallest coherent first implementation should be: + +1. add `src/blosc2/schema.py` +2. add `src/blosc2/schema_compiler.py` +3. export `field`, `int64`, `float64`, `bool` +4. add tests for: + * explicit field specs + * inferred shorthand + * mismatch rejection +5. stop there + +That first PR gives the project: + +* the public schema vocabulary +* the internal compiled representation +* confidence in the canonical API shape + +before touching too much `CTable` mutation logic. + +After that first PR lands, follow the later phases in this order: + +1. dataclass-driven `CTable` construction and append path +2. validation and batch-insert behavior +3. schema introspection +4. TreeStore-backed persistency + +--- + +## Recommendation + +The recommended direction is: + +1. Make **dataclasses** the public schema declaration mechanism for `CTable`. +2. Introduce **lowercase schema spec objects** such as `b2.int64(...)`. +3. Use **`b2.field(...)`** to carry both the schema spec and per-column NDArray + configuration. +4. Compile the schema once into an internal representation. +5. Use **Pydantic internally for row validation**, but keep it hidden behind the + Blosc2 schema API. +6. Add a separate **bulk validation path** for large inserts so `extend()` does + not depend entirely on per-row Pydantic validation. + +This design gives the project: + +* a cleaner user API +* a better place for columnar storage configuration +* a clear boundary between schema, validation, and storage +* flexibility to evolve validation internals later +* a strong base for future persistence and schema introspection diff --git a/plans/ctable-user-guide.md b/plans/ctable-user-guide.md new file mode 100644 index 00000000..a49aea7a --- /dev/null +++ b/plans/ctable-user-guide.md @@ -0,0 +1,497 @@ +# CTable User Guide + +This document explains how to use `CTable` as it currently stands. + +--- + +## What is CTable? + +`CTable` is a columnar compressed table built on top of `blosc2.NDArray`. 
Each +column is stored as a separate compressed array. Rows are never physically removed +on deletion — instead a boolean mask (`_valid_rows`) marks live rows, and +compaction can be triggered manually or automatically. + +--- + +## Defining a schema + +A schema is a Python `@dataclass` where each field uses `b2.field()` to declare +the column type and constraints. + +```python +from dataclasses import dataclass +import blosc2 as b2 + + +@dataclass +class Row: + id: int = b2.field(b2.int64(ge=0)) + score: float = b2.field(b2.float64(ge=0, le=100), default=0.0) + active: bool = b2.field(b2.bool(), default=True) +``` + +### Available spec types + +| Spec | NumPy dtype | Constraints | +|---|---|---| +| `b2.int64(ge, gt, le, lt)` | `int64` | numeric bounds | +| `b2.float64(ge, gt, le, lt)` | `float64` | numeric bounds | +| `b2.bool()` | `bool_` | — | +| `b2.complex64()` | `complex64` | — | +| `b2.complex128()` | `complex128` | — | +| `b2.string(min_length, max_length, pattern)` | `U` | length / regex | +| `b2.bytes(min_length, max_length)` | `S` | length | + +Constraints are enforced on every insert (see **Validation** below). + +### Inferred shorthand + +For columns with no constraints and no per-column storage options, you can omit +`b2.field()` entirely: + +```python +@dataclass +class Row: + id: int # inferred as b2.int64() + score: float # inferred as b2.float64() + flag: bool = True # inferred as b2.bool(), default=True +``` + +### Dataclass field ordering rule + +Python dataclasses require that fields **with defaults come after fields without +defaults**. 
Plan your schema accordingly: + +```python +@dataclass +class Row: + id: int = b2.field(b2.int64()) # required — no default + score: float = b2.field(b2.float64(), default=0.0) # optional + active: bool = b2.field(b2.bool(), default=True) # optional +``` + +--- + +## Creating a table + +```python +import blosc2 as b2 + +# Empty table (in-memory) +t = b2.CTable(Row) + +# Table pre-loaded with data +t = b2.CTable(Row, new_data=[(1, 95.0, True), (2, 80.0, False)]) + +# Reserve space upfront (avoids resizes) +t = b2.CTable(Row, expected_size=1_000_000) + +# Disable constraint validation (faster for trusted data) +t = b2.CTable(Row, validate=False) + +# Enable auto-compaction (fills gaps before resizing) +t = b2.CTable(Row, compact=True) + +# Table-level compression settings (applied to all columns unless overridden) +t = b2.CTable(Row, cparams={"codec": b2.Codec.ZSTD, "clevel": 5}) +``` + +### Persistent tables + +Pass `urlpath` to store the table on disk. Persistent `CTable` is backed by a +`TreeStore`, and `blosc2.open(urlpath)` can materialize it directly from the +root `/_meta` manifest. 
+ +```python +# Create a new persistent table (overwrites any existing table at that path) +t = b2.CTable(Row, urlpath="people", mode="w", expected_size=1_000_000) +t.extend([(i, float(i % 100), True) for i in range(10_000)]) + +# Open an existing persistent table for reading and writing +t = b2.CTable(Row, urlpath="people", mode="a") +t.append((99999, 50.0, True)) + +# Open read-only (default for CTable.open) +t = b2.CTable.open("people") # mode="r" by default +t = b2.CTable.open("people", mode="r") # explicit + +# Open read/write via the classmethod +t = b2.CTable.open("people", mode="a") + +# Generic open() also materializes the richer object +t = b2.open("people") +``` + +`mode` values: + +| mode | behaviour | +|---|---| +| `"w"` | create (overwrite if the path already exists) | +| `"a"` | open existing or create new | +| `"r"` | open existing read-only | + +In-memory tables (`urlpath=None`, the default) behave exactly as before — no +`mode` or path handling is involved. + +Recommended conventions: + +- extensionless paths default to directory-backed stores +- `.b2d` and `.b2z` are still valid and useful conventions, but no longer required + +### Store layout + +``` +people/ ← TreeStore root (extensionless directory-backed example) + embed.b2e ← internal store metadata + _meta.b2f ← SChunk manifest with kind/version/schema in vlmeta + _valid_rows.b2nd ← tombstone mask + _cols/ + id.b2nd + score.b2nd + active.b2nd +``` + +You can inspect the raw metadata: + +```python +import blosc2, json + +store = blosc2.TreeStore("people", mode="r") +meta = store["/_meta"] +print(meta.vlmeta["kind"]) # "ctable" +print(meta.vlmeta["version"]) # 1 +schema = json.loads(meta.vlmeta["schema"]) +``` + +### Per-column storage options + +```python +@dataclass +class Row: + id: int = b2.field(b2.int64(), cparams={"codec": b2.Codec.LZ4, "clevel": 1}) + score: float = b2.field( + b2.float64(ge=0, le=100), + cparams={"codec": b2.Codec.ZSTD, "clevel": 9}, + default=0.0, + ) +``` + 
+Column-level `cparams`/`dparams`/`chunks`/`blocks` override the table-level +defaults for that column only. + +--- + +## Inserting data + +### `append()` — one row at a time + +Accepts a tuple, list, dict, or dataclass instance: + +```python +t.append((1, 95.0, True)) +t.append([2, 80.0, False]) +t.append({"id": 3, "score": 50.0, "active": True}) +``` + +Fields with defaults can be omitted: + +```python +t.append((4,)) # score=0.0 and active=True filled from defaults +``` + +### `extend()` — bulk insert + +Accepts a list of tuples, a NumPy structured array, or another `CTable`: + +```python +# List of tuples +t.extend([(i, float(i), True) for i in range(1000)]) + +# NumPy structured array +import numpy as np + +dtype = np.dtype([("id", np.int64), ("score", np.float64), ("active", np.bool_)]) +arr = np.array([(1, 50.0, True), (2, 75.0, False)], dtype=dtype) +t.extend(arr) + +# Another CTable +t.extend(other_table) +``` + +#### Per-call validation override + +```python +# Skip validation for one trusted batch (even if table was built with validate=True) +t.extend(trusted_data, validate=False) + +# Force validation for one batch (even if table was built with validate=False) +t.extend(external_data, validate=True) +``` + +--- + +## Validation + +When `validate=True` (the default), constraints declared in the schema are +enforced on every insert: + +```python +t.append((-1, 50.0, True)) # ValueError: id violates ge=0 +t.append((1, 150.0, True)) # ValueError: score violates le=100 +t.extend([(-1, 50.0, True)]) # ValueError: id violates ge=0 +``` + +Boundary values are accepted: + +```python +t.append((0, 0.0, True)) # ok — id=0 satisfies ge=0, score=0.0 satisfies ge=0 +t.append((1, 100.0, False)) # ok — score=100.0 satisfies le=100 +``` + +To skip validation entirely: + +```python +t = b2.CTable(Row, validate=False) +``` + +--- + +## Reading data + +### Row access + +```python +t.row[0] # first row → returns a single-row CTable view +t.row[-1] # last row +t.row[2:5] # 
slice → CTable view with rows 2, 3, 4 +t.row[::2] # every other row +t.row[[0, 5, 10]] # specific rows by logical index +``` + +Row access always uses **logical indices** (i.e. index 0 is the first live row, +not the first physical slot). + +### Column access + +```python +t["id"] # returns a Column object +t.score # attribute-style access also works + +# Iterate values +for val in t["score"]: + print(val) + +# Convert to NumPy array +arr = t["score"].to_numpy() + +# Single value +val = t["id"][5] # logical index 5 +``` + +### Column slicing + +```python +col_view = t["id"][0:10] # returns a Column view (mask applied) +arr = col_view.to_numpy() # materialise to NumPy +``` + +### head / tail + +```python +t.head(10) # CTable view of first 10 rows +t.tail(5) # CTable view of last 5 rows +``` + +--- + +## Deleting rows + +`delete()` marks rows as invalid in the tombstone mask — data is not physically +removed. + +```python +t.delete(0) # delete first live row +t.delete(-1) # delete last live row +t.delete([0, 2, 4]) # delete multiple rows by logical index +t.delete(list(range(10))) # delete first 10 live rows +``` + +Negative indices and mixed positive/negative lists are supported. + +--- + +## Compaction + +After many deletions, physical storage has gaps. Compaction moves all live rows +to the front and clears the rest. 
+ +```python +t.compact() # manual compaction +``` + +Auto-compaction runs before a resize when the table is created with `compact=True`: + +```python +t = b2.CTable(Row, compact=True) +``` + +--- + +## Read-only mode + +When a table is opened with `mode="r"` (or via `CTable.open()` without specifying +mode), all mutating operations raise immediately: + +```python +t = b2.CTable.open("people") # read-only + +t.append((1, 50.0, True)) # ValueError: Table is read-only +t.extend([(1, 50.0, True)]) # ValueError: Table is read-only +t.delete(0) # ValueError: Table is read-only +t.compact() # ValueError: Table is read-only +``` + +All read operations work normally: `row[]`, column access, `head()`, `tail()`, +`where()`, `len()`, `info()`, `schema_dict()`. + +--- + +## Filtering + +`where()` applies a boolean expression and returns a read-only view: + +```python +view = t.where(t["score"] > 50) +view = t.where((t["id"] > 10) & (t["active"] == True)) +``` + +Views share `_cols` with the parent table and cannot be mutated (no `append` or +`extend`).
+ +--- + +## Table info + +```python +len(t) # number of live rows +t.nrows # same +t.ncols # number of columns +t.col_names # list of column names + +t.info() # prints a formatted summary with dtypes and memory usage +print(t) # prints the first rows in a table format +``` + +--- + +## Schema introspection + +```python +t.schema # CompiledSchema object +t.column_schema("id") # CompiledColumn for column "id" +t.schema_dict() # JSON-compatible dict of the full schema +``` + +`schema_dict()` example output: + +```python +{ + "version": 1, + "row_cls": "Row", + "columns": [ + {"name": "id", "kind": "int64", "ge": 0, "default": None}, + {"name": "score", "kind": "float64", "ge": 0, "le": 100, "default": 0.0}, + {"name": "active", "kind": "bool", "default": True}, + ], +} +``` + +The dict can be restored to a `CompiledSchema` without the original Python class: + +```python +from blosc2.schema_compiler import schema_from_dict + +restored = schema_from_dict(t.schema_dict()) +``` + +--- + +## Memory and compression + +```python +# Compressed size of all columns + valid_rows mask +cbytes = sum(col.cbytes for col in t._cols.values()) + t._valid_rows.cbytes + +# Uncompressed size +nbytes = sum(col.nbytes for col in t._cols.values()) + t._valid_rows.nbytes + +print(f"Compression ratio: {nbytes / cbytes:.2f}x") +``` + +--- + +## Complete example + +```python +from dataclasses import dataclass +import numpy as np +import blosc2 as b2 + + +@dataclass +class Measurement: + sensor_id: int = b2.field(b2.int64(ge=0)) + value: float = b2.field(b2.float64(ge=-1000, le=1000), default=0.0) + valid: bool = b2.field(b2.bool(), default=True) + + +# Create and populate (in-memory) +t = b2.CTable(Measurement, expected_size=10_000) +t.extend([(i, float(i % 200 - 100), i % 3 != 0) for i in range(5000)]) + +# Query +hot = t.where(t["value"] > 50) +print(f"Hot readings: {len(hot)}") + +# Delete invalid +invalid_indices = [i for i in range(len(t)) if not t.row[i].valid[0]] +if invalid_indices: + 
t.delete(invalid_indices) + +# Inspect +t.info() +print(t.schema_dict()) +``` + +## Persistency example + +```python +from dataclasses import dataclass +import blosc2 as b2 + + +@dataclass +class Measurement: + sensor_id: int = b2.field(b2.int64(ge=0)) + value: float = b2.field(b2.float64(ge=-1000, le=1000), default=0.0) + valid: bool = b2.field(b2.bool(), default=True) + + +# --- Session 1: create and populate --- +t = b2.CTable(Measurement, urlpath="sensors", mode="w", expected_size=100_000) +t.extend([(i, float(i % 200 - 100), i % 3 != 0) for i in range(50_000)]) +print(f"Saved {len(t)} rows to disk") +# Table is automatically persisted — no explicit save() needed. + +# --- Session 2: reopen and query --- +t = b2.CTable.open("sensors") # read-only by default +hot = t.where(t["value"] > 50) +print(f"Hot readings: {len(hot)}") +arr = t["sensor_id"].to_numpy() +print(f"First 5 sensor IDs: {arr[:5]}") + +# --- Session 3: reopen and append more data --- +t = b2.CTable(Measurement, urlpath="sensors", mode="a") +t.extend([(50_000 + i, float(i), True) for i in range(1_000)]) +print(f"Total rows: {len(t)}") +``` diff --git a/plans/tree_store_extensions.md b/plans/tree_store_extensions.md new file mode 100644 index 00000000..400be499 --- /dev/null +++ b/plans/tree_store_extensions.md @@ -0,0 +1,258 @@ +# TreeStore Extension Objects + +## Goal + +Define a general mechanism for representing richer logical objects on top of a +`TreeStore`, while keeping the underlying persisted container recognizable as a +plain `TreeStore`. + +The initial driver is persisted `CTable`, but the mechanism should be generic +enough to support other logical object kinds later. + +## Core Idea + +`TreeStore` already has a low-level container identity: + +- `storage.meta["b2tree"] = {"version": 1}` + +That answers: + +- "what physical container is stored here?" + +However, a `TreeStore` can also act as a substrate for a richer logical object. 
+For that, we introduce a reserved manifest entry: + +- `/_meta` + +This answers: + +- "what logical object is represented by this store subtree?" + +The manifest is a small persisted `SChunk` whose `vlmeta` is the source of +truth for logical-object identity and configuration. + +## Object Root Model + +An object root is any `TreeStore` subtree that contains: + +- `/_meta` + +Examples: + +- whole-store object: + - `/_meta` +- subtree object: + - `/users/_meta` + - `/orders/_meta` + +This gives one uniform rule: + +- if a subtree has `/_meta`, it may represent a richer logical object + +The whole-store case is just the special case where the object root is the +store root. + +## Why Use `/_meta` + +### Separation Of Roles + +- container `.meta` remains about the low-level container type (`b2tree`) +- `/_meta` is about higher-level logical identity +- user-facing `tstore.vlmeta` remains available for user metadata + +### Mutable Object Metadata + +Unlike fixed container metalayers, `/_meta.vlmeta` can evolve over time. +That matters for store-backed logical objects that may need mutable metadata, +such as: + +- schema evolution state +- object versioning +- feature flags +- migration markers + +### Generic Store Extension Point + +`/_meta` should not be a one-off `CTable` special case. +It should be the general manifest contract for any richer object represented on +top of a store subtree. + +## Manifest Representation + +`/_meta` should be: + +- a small persisted `SChunk` +- primarily used through `vlmeta` + +The initial required fields in `/_meta.vlmeta` are: + +- `kind` +- `version` + +Example: + +```python +tree_store["/_meta"].vlmeta["kind"] = "ctable" +tree_store["/_meta"].vlmeta["version"] = 1 +``` + +Additional fields are object-kind-specific. 
+ +For example, a `CTable` manifest may add: + +- `schema` + +## Reserved Internal Names + +Within an object root, the following path is reserved: + +- `/_meta` + +Logical objects may reserve additional internal paths under the same root. + +For example, `CTable` is expected to reserve: + +- `/_valid_rows` +- `/_cols` + +These reserved names are internal implementation details and must not be treated +as user data nodes. + +## `blosc2.open()` Contract + +When opening a persisted path: + +1. low-level store detection happens first +2. if the opened object is a `TreeStore`, object-manifest detection may happen +3. if a recognized manifest is found, materialize the richer logical object +4. otherwise, return the raw `TreeStore` + +For the whole-store case, the detection rule is: + +- open the path as a `TreeStore` +- look for `/_meta` +- if `/_meta.vlmeta["kind"]` is recognized, dispatch to the corresponding + higher-level constructor/open path + +This preserves the current layering: + +- low-level open still discovers a `TreeStore` +- logical-object open is an extra step on top + +## Root-Only First Implementation + +The design should anticipate subtree object roots, but the first implementation +does not need to support them yet. + +Initial scope: + +- only the store root may be materialized as a richer object +- only `/_meta` at store root is consulted by `blosc2.open(urlpath)` + +Deferred scope: + +- subtree object roots such as `/users/_meta` +- multiple richer objects in one `TreeStore` +- automatic materialization of `tstore["/subtree"]` +- explicit references to store-subtree logical objects + +This staged approach keeps the first implementation simple while preserving a +clear path toward multi-object stores later.
+ +## Dispatch API Shape + +The first implementation should support: + +```python +obj = blosc2.open(urlpath) +``` + +Behavior: + +- if `urlpath` resolves to a plain store with no recognized root manifest, + return `TreeStore` +- if `urlpath` resolves to a `TreeStore` with recognized `/_meta`, return the + richer object + +For the deferred subtree-aware model, the API question is still open: + +- `blosc2.open(urlpath, key="/users")` +- `blosc2.Ref` support for store-subtree objects +- other path-addressing schemes + +These should be designed in a later phase. + +## Error Handling + +The generic manifest contract should distinguish: + +- no `/_meta` present: + - return raw `TreeStore` +- `/_meta` present but missing required fields: + - error clearly +- `/_meta` present with unknown `kind`: + - either return raw `TreeStore` or raise a dedicated error + +Recommended first behavior: + +- missing manifest: return raw `TreeStore` +- malformed recognized manifest: raise error +- unknown manifest kind: return raw `TreeStore` + +This is conservative and avoids breaking forward compatibility unnecessarily. + +## Recommended Invariants + +- `/_meta` must always be a persisted `SChunk` +- `/_meta.vlmeta["kind"]` must be a string +- `/_meta.vlmeta["version"]` must be an integer +- logical object implementations own the schema of additional fields +- object materialization should not depend on `TreeStore` iteration order + +## Example: `CTable` + +With this contract, a root-level `CTable` would look like: + +- `/_meta` +- `/_valid_rows` +- `/_cols/id` +- `/_cols/score` + +And the manifest would contain: + +```python +{ + "kind": "ctable", + "version": 1, + "schema": {...}, +} +``` + +`blosc2.open(urlpath)` would: + +1. detect `b2tree` +2. open `TreeStore` +3. inspect `/_meta` +4. see `kind == "ctable"` +5. return `CTable` + +## Open Questions + +- Should unknown manifest kinds return raw `TreeStore`, warn, or raise? 
+- Should there eventually be a helper such as `blosc2.open_store_object(...)` + for explicit manifest-driven dispatch? +- Should `TreeStore` grow a helper for probing object roots, e.g. + `get_object_manifest("/")` or `has_object_manifest(path)`? +- Should object-manifest detection be limited to `TreeStore`, or later be + generalized to other store-like containers? + +## Recommended Next Step + +Use this contract for the first root-level `CTable` implementation: + +- generic manifest mechanism defined here +- `CTable` as the first supported manifest `kind` +- root-only dispatch in `blosc2.open()` + +Once that is stable, subtree object roots can be added without changing the +basic meaning of `/_meta`. diff --git a/plans/treestore-ctable-extension.md b/plans/treestore-ctable-extension.md new file mode 100644 index 00000000..993439f9 --- /dev/null +++ b/plans/treestore-ctable-extension.md @@ -0,0 +1,344 @@ +# TreeStore Root-Level `CTable` Extension Plan + +## Goal + +Allow a `CTable` stored as the sole logical object inside a `TreeStore` to be +opened directly via: + +```python +table = blosc2.open(urlpath) +``` + +That is, if a `TreeStore` at `urlpath` carries a recognized root manifest for +`CTable`, `blosc2.open(urlpath)` should return a `CTable` instance instead of a +raw `TreeStore`. + +This plan intentionally covers only the simple first round: + +- one `CTable` per `TreeStore` +- object root is the store root +- `/_meta` at store root is the manifest + +Subtree object roots and multiple tables per store are deferred. + +## Background + +`TreeStore` now has persistent low-level container metadata through: + +- `storage.meta["b2tree"] = {"version": 1}` + +That is enough for `blosc2.open()` to recognize the path as a `TreeStore`, but +not enough to know whether the store should materialize as a richer object. 
+ +The generic extension contract in `tree_store_extensions.md` +introduces: + +- `/_meta` as the logical-object manifest for store-backed objects + +This plan applies that contract to `CTable`. + +## Storage Layout + +The persisted root-level `CTable` layout should be: + +- `/_meta` +- `/_valid_rows` +- `/_cols/` + +Example: + +- `/_meta` +- `/_valid_rows` +- `/_cols/id` +- `/_cols/score` +- `/_cols/active` + +Rationale: + +- `/_meta` stores logical-object manifest data +- `/_valid_rows` stores real row-visibility data +- `/_cols/` stores one persisted column array per field + +## Root Manifest + +`/_meta` should be a small persisted `SChunk` used primarily through `vlmeta`. + +Initial required manifest fields: + +- `kind` +- `version` +- `schema` + +Initial `CTable` manifest: + +```python +{ + "kind": "ctable", + "version": 1, + "schema": {...}, +} +``` + +Recommended concrete writes: + +```python +tstore["/_meta"].vlmeta["kind"] = "ctable" +tstore["/_meta"].vlmeta["version"] = 1 +tstore["/_meta"].vlmeta["schema"] = schema_payload +``` + +## Schema Persistence Format + +The schema should be stored in: + +- `/_meta.vlmeta["schema"]` + +The schema document should be JSON-compatible, explicit, and versioned.
+ +Recommended shape: + +```python +{ + "version": 1, + "columns": [ + { + "name": "id", + "py_type": "int", + "spec": {"kind": "int64", "ge": 0}, + "default": None, + }, + { + "name": "score", + "py_type": "float", + "spec": {"kind": "float64", "ge": 0, "le": 100}, + "default": None, + }, + { + "name": "active", + "py_type": "bool", + "spec": {"kind": "bool"}, + "default": True, + }, + ], +} +``` + +Notes: + +- `columns` must be an ordered list, not a dict +- column order comes from the schema list +- `TreeStore` iteration order must not be used as schema authority + +For the first version, do not duplicate data that can be inspected from the +stored column arrays: + +- per-column `cparams` +- per-column `dparams` +- chunk/block layout +- `expected_size` +- compaction settings + +## `_valid_rows` Persistence + +`/_valid_rows` should be a normal persisted boolean array. + +This is correct because: + +- it is table data, not metadata +- it may grow large +- it participates in normal row visibility semantics + +It should not be folded into `/_meta`. + +## Column Persistence + +Each column should be stored as its own persisted array under: + +- `/_cols/` + +This keeps the physical layout aligned with the internal columnar design and +lets per-column storage details remain attached to the actual persisted array. + +## Constructor Semantics + +The intended public constructor remains: + +```python +table = blosc2.CTable( + Row, + urlpath=None, + mode="a", + expected_size=1_048_576, + compact=False, + validate=True, +) +``` + +For the persistent path: + +- `urlpath is None`: + - in-memory `CTable` +- `urlpath is not None`: + - root-level `CTable` persisted on top of a `TreeStore` + +Recommended mode behavior: + +- `mode="w"`: + - create a fresh store-root `CTable` +- `mode="a"`: + - open existing or create new +- `mode="r"`: + - open existing read-only + +## `blosc2.open()` Materialization + +The root-level dispatch behavior should be: + +1. 
`blosc2.open(urlpath)` detects a `TreeStore` +2. it opens the `TreeStore` +3. it checks for `/_meta` +4. if `/_meta.vlmeta["kind"] == "ctable"`, it materializes `CTable` +5. otherwise it returns the raw `TreeStore` + +This preserves the current open layering: + +- first detect the low-level container +- then optionally materialize a richer object + +## Suggested Implementation Shape + +### Step 1: Add Root Manifest Helpers + +Add private helper(s) for root-manifest probing, e.g.: + +- `_open_treestore_root_object(store)` +- `_read_treestore_root_manifest(store)` + +Responsibilities: + +- check whether `/_meta` exists +- open `/_meta` +- validate that it is an `SChunk` +- read `kind` / `version` +- return a manifest payload suitable for dispatch + +### Step 2: Extend `blosc2.open()` + +In the special-store open path: + +- if opening yields a `TreeStore` +- probe the root manifest +- if recognized as `ctable`, return `CTable.open(...)` or equivalent internal + constructor +- otherwise return the `TreeStore` + +This logic should be localized so the generic `open()` path remains easy to +follow. + +### Step 3: Add `CTable` Root-Manifest Read/Write Helpers + +In the `CTable` persistence layer, add helpers for: + +- creating `/_meta` +- writing `kind` +- writing `version` +- writing `schema` +- reading and validating the root manifest + +This should be the only place that knows the `CTable` manifest schema. 
+ +### Step 4: Wire Creation + +When a persistent `CTable` is created: + +- create/open the backing `TreeStore` +- create `/_meta` +- write the root manifest +- create `/_valid_rows` +- create `/_cols/` arrays + +### Step 5: Wire Reopen + +When a persistent `CTable` is reopened: + +- read `/_meta.vlmeta["schema"]` +- rebuild the compiled schema +- reopen `/_valid_rows` +- reopen each persisted column from `/_cols/` + +### Step 6: Keep Internal Names Reserved + +Validation should reject user column names that collide with internal names: + +- `_meta` +- `_valid_rows` +- `_cols` + +This already aligns with the existing schema compiler reserved-name logic. + +## Validation Rules + +For `CTable` root-manifest detection: + +- if `/_meta` does not exist: + - not a persisted `CTable` +- if `/_meta` exists but is malformed: + - raise clear error on attempted `CTable` materialization +- if `kind != "ctable"`: + - return raw `TreeStore` +- if `kind == "ctable"` but required fields are missing: + - raise clear error + +Recommended required fields for version 1: + +- `kind` +- `version` +- `schema` + +## Deferred Scope + +This plan intentionally does not cover: + +- multiple `CTable` objects in one `TreeStore` +- subtree object roots such as `/users/_meta` +- automatic materialization when indexing a subtree from `TreeStore` +- `Ref` support for store-subtree logical objects +- schema evolution beyond append-only behavior + +These should be handled in later phases after the root-level path is stable. 
+ +## Tests + +Add coverage for: + +- create persistent root-level `CTable` +- reopen via `blosc2.open(urlpath)` and get `CTable` +- reopen via `CTable.open(urlpath, mode="r")` +- root manifest present and schema readable from `/_meta.vlmeta` +- store with no `/_meta` still opens as raw `TreeStore` +- store with unknown root-manifest `kind` still opens as raw `TreeStore` +- malformed `CTable` manifest raises clear error +- append rows after reopen +- read-only reopen rejects writes + +## Recommended Implementation Order + +1. write root-manifest probe helpers for `TreeStore` +2. extend `blosc2.open()` with root-manifest dispatch +3. add `CTable` manifest read/write helpers +4. wire persistent create/open around the manifest +5. add tests for dispatch and round-trip + +## Summary + +The first `TreeStore` extension should treat root `/_meta` as the logical +manifest for the whole store. + +For `CTable`, this yields a simple and coherent open story: + +- low-level metadata says "this is a `TreeStore`" +- root `/_meta` says "this store materializes as a `CTable`" +- `blosc2.open(urlpath)` returns the richer object directly + +This keeps the first implementation small while staying compatible with a later +generalization to subtree object roots. 
diff --git a/pyproject.toml b/pyproject.toml index ddd53b3d..04d0c275 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -37,6 +37,7 @@ dependencies = [ "ndindex", "msgpack", "numexpr>=2.14.1; platform_machine != 'wasm32'", + "pydantic", "requests", "threadpoolctl; platform_machine != 'wasm32'", ] diff --git a/src/blosc2/__init__.py b/src/blosc2/__init__.py index dfd98c17..b28cb218 100644 --- a/src/blosc2/__init__.py +++ b/src/blosc2/__init__.py @@ -376,29 +376,18 @@ def isdtype(a_dtype: np.dtype, kind: str | np.dtype | tuple): from numpy import ( bool_, - complex64, complex128, e, euler_gamma, float16, - float32, float64, inf, - int8, - int16, - int32, int64, nan, newaxis, pi, - uint8, - uint16, - uint32, - uint64, ) -bool = bool - DEFAULT_COMPLEX = complex128 """ Default complex floating dtype.""" @@ -638,7 +627,10 @@ def _raise(exc): Disable the overloaded equal operator. """ -# Delayed imports for avoiding overwriting of python builtins +# Delayed imports for avoiding overwriting of python builtins. +# Note: bool, bytes, string shadow builtins in the blosc2 namespace by design — +# they are schema spec constructors (b2.bool(), b2.bytes(), etc.). 
+from .ctable import Column, CTable from .ndarray import ( abs, acos, @@ -740,6 +732,24 @@ def _raise(exc): var, where, ) +from .schema import ( + bool, + bytes, + complex64, + complex128, + field, + float32, + float64, + int8, + int16, + int32, + int64, + string, + uint8, + uint16, + uint32, + uint64, +) __all__ = [ # noqa : RUF022 # Constants @@ -760,6 +770,23 @@ def _raise(exc): "inf", "nan", "newaxis", + # Schema API (CTable) + "bool", + "bytes", + "complex64", + "complex128", + "field", + "float32", + "float64", + "int8", + "int16", + "int32", + "int64", + "string", + "uint8", + "uint16", + "uint32", + "uint64", # Classes "C2Array", "CParams", diff --git a/src/blosc2/core.py b/src/blosc2/core.py index 809c209a..ceb78acd 100644 --- a/src/blosc2/core.py +++ b/src/blosc2/core.py @@ -834,7 +834,7 @@ def load_tensor(urlpath: str, dparams: dict | None = None) -> tensorflow.Tensor :func:`~blosc2.save_tensor` :func:`~blosc2.pack_tensor` """ - schunk = blosc2.open(urlpath, dparams=dparams) + schunk = blosc2.open(urlpath, mode="r", dparams=dparams) return _unpack_tensor(schunk) diff --git a/src/blosc2/ctable.py b/src/blosc2/ctable.py new file mode 100644 index 00000000..b4a2d331 --- /dev/null +++ b/src/blosc2/ctable.py @@ -0,0 +1,3535 @@ +####################################################################### +# Copyright (c) 2019-present, Blosc Development Team +# All rights reserved. 
+# +# This source code is licensed under a BSD-style license (found in the +# LICENSE file in the root directory of this source tree) +####################################################################### + +"""CTable: a columnar compressed table built on top of blosc2.NDArray.""" + +from __future__ import annotations + +import contextlib +import dataclasses +import itertools +import os +import pprint +import shutil +import weakref +from collections.abc import Iterable +from dataclasses import MISSING +from textwrap import TextWrapper +from typing import Any, Generic, TypeVar + +import numpy as np + +from blosc2 import compute_chunks_blocks +from blosc2.ctable_storage import FileTableStorage, InMemoryTableStorage, TableStorage +from blosc2.schema_compiler import schema_from_dict, schema_to_dict + +try: + from line_profiler import profile +except ImportError: + + def profile(func): + def wrapper(*args, **kwargs): + return func(*args, **kwargs) + + wrapper.__name__ = func.__name__ + return wrapper + + +import blosc2 +from blosc2.info import InfoReporter, format_nbytes_info +from blosc2.schema import SchemaSpec +from blosc2.schema_compiler import ( + ColumnConfig, + CompiledColumn, + CompiledSchema, + _validate_column_name, + compile_schema, + compute_display_width, +) + +# --------------------------------------------------------------------------- +# Index proxy and CTableIndex +# --------------------------------------------------------------------------- + + +class _FakeVlMeta: + """Minimal vlmeta stand-in that accepts writes without touching a real SChunk.""" + + def __init__(self): + self._data: dict = {} + + def __getitem__(self, key): + return self._data[key] + + def __setitem__(self, key, value): + self._data[key] = value + + def get(self, key, default=None): + return self._data.get(key, default) + + +class _FakeSchunk: + """Minimal SChunk stand-in whose vlmeta stores in memory.""" + + def __init__(self): + self.vlmeta = _FakeVlMeta() + + +class 
class CTableIndex:
    """A handle on an index attached to a :class:`CTable` column.

    Returned by :meth:`CTable.index` and items of :attr:`CTable.indexes`.
    Provides :meth:`drop`, :meth:`rebuild`, and :meth:`compact` convenience
    methods that delegate back to the owning table.

    Parameters
    ----------
    table:
        The table that owns the index.
    col_name:
        Name of the indexed column.
    descriptor:
        Index descriptor dict; the keys ``"kind"``, ``"stale"`` and
        ``"name"`` are read by the properties below.
    """

    def __init__(self, table: CTable, col_name: str, descriptor: dict) -> None:
        self._table = table
        self._col_name = col_name
        self._descriptor = descriptor

    @property
    def col_name(self) -> str:
        """Column name this index targets."""
        return self._col_name

    @property
    def kind(self) -> str:
        """Index kind string (``'bucket'``, ``'partial'``, or ``'full'``)."""
        return self._descriptor.get("kind", "")

    @property
    def stale(self) -> bool:
        """True if the index is stale and needs rebuilding."""
        return bool(self._descriptor.get("stale", False))

    @property
    def name(self) -> str | None:
        """Optional human-readable name assigned at creation time."""
        return self._descriptor.get("name") or None

    @property
    def nbytes(self) -> int:
        """Total uncompressed size in bytes for this index payload."""
        from blosc2.indexing import _component_nbytes, iter_index_components

        root = self._table._root_table
        col_arr = root._cols[self._col_name]
        descriptor = self._descriptor
        return sum(
            _component_nbytes(col_arr, descriptor, component)
            for component in iter_index_components(col_arr, descriptor)
        )

    @property
    def cbytes(self) -> int:
        """Total compressed size in bytes for this index payload."""
        from blosc2.indexing import _component_cbytes, iter_index_components

        root = self._table._root_table
        col_arr = root._cols[self._col_name]
        descriptor = self._descriptor
        return sum(
            _component_cbytes(col_arr, descriptor, component)
            for component in iter_index_components(col_arr, descriptor)
        )

    @property
    def cratio(self) -> float:
        """Compression ratio for this index payload (``inf`` when ``cbytes`` is 0)."""
        cbytes = self.cbytes
        if cbytes == 0:
            return float("inf")
        return self.nbytes / cbytes

    def storage_stats(self) -> tuple[int, int, float] | None:
        """Return ``(nbytes, cbytes, cratio)`` when sidecars are directly measurable.

        The sizes are first taken from :attr:`nbytes` / :attr:`cbytes`.  If
        that fails and the root table is file-backed, every index component
        is looked up inside the table store instead.  ``None`` is returned
        when neither route can measure the index.
        """
        try:
            nbytes = self.nbytes
            cbytes = self.cbytes
        except (FileNotFoundError, OSError, RuntimeError, KeyError, ValueError):
            root = self._table._root_table
            if not isinstance(root._storage, FileTableStorage):
                return None

            from blosc2.indexing import iter_index_components

            descriptor = self._descriptor
            col_arr = root._cols[self._col_name]
            store = root._storage._open_store()
            nbytes = 0
            cbytes = 0
            try:
                for component in iter_index_components(col_arr, descriptor):
                    if component.path is None:
                        # A component without a path cannot be located in the store.
                        return None
                    key = self._component_store_key(component.path)
                    obj = store[key]
                    nbytes += int(obj.nbytes)
                    cbytes += int(obj.cbytes)
            except (FileNotFoundError, OSError, RuntimeError, KeyError, ValueError):
                return None
        cratio = float("inf") if cbytes == 0 else nbytes / cbytes
        return nbytes, cbytes, cratio

    @staticmethod
    def _component_store_key(path: str) -> str:
        """Return the logical TreeStore key for an index component path.

        Everything before the ``_indexes/`` segment is dropped, backslashes
        are normalised to ``/`` and a trailing ``.b2nd`` / ``.b2f`` suffix is
        removed.

        Raises
        ------
        KeyError
            If *path* contains no ``_indexes/`` segment.
        """
        normalized = path.replace("\\", "/")
        marker = "_indexes/"
        idx = normalized.find(marker)
        if idx < 0:
            raise KeyError(f"Cannot resolve index component path {path!r} inside table store.")
        relpath = normalized[idx:]
        for suffix in (".b2nd", ".b2f"):
            if relpath.endswith(suffix):
                relpath = relpath[: -len(suffix)]
                break
        return "/" + relpath.lstrip("/")

    def drop(self) -> None:
        """Drop this index from the owning table."""
        self._table.drop_index(self._col_name)

    def rebuild(self) -> CTableIndex:
        """Rebuild this index and return the updated handle."""
        return self._table.rebuild_index(self._col_name)

    def compact(self) -> CTableIndex:
        """Compact this index (merge incremental runs) and return the updated handle."""
        return self._table.compact_index(self._col_name)

    def __repr__(self) -> str:
        """Return a short description: column, kind, optional name, stale flag."""
        # The name and stale parts only appear when they carry information.
        name_str = f", name={self.name!r}" if self.name else ""
        stale_str = ", stale=True" if self.stale else ""
        return f"CTableIndex(col={self._col_name!r}, kind={self.kind!r}{name_str}{stale_str})"
self.obj.info_items + max_key_len = max(len(k) for k, _ in items) + parts = [] + for key, value in items: + if isinstance(value, dict): + parts.append(f"{key.ljust(max_key_len)} :") + pretty = pprint.pformat(value, sort_dicts=False) + parts.extend(f" {line}" for line in pretty.splitlines()) + continue + + wrapper = TextWrapper( + width=96, + initial_indent=key.ljust(max_key_len) + " : ", + subsequent_indent=" " * max_key_len + " : ", + ) + parts.append(wrapper.fill(str(value))) + return "\n".join(parts) + "\n" + + def __call__(self) -> None: + print(repr(self), end="") + + +class _InfoLiteral: + """Pretty-printer helper for unquoted literal values inside info dicts.""" + + def __init__(self, text: str) -> None: + self.text = text + + def __repr__(self) -> str: + return self.text + + +# RowT is intentionally left unbound so CTable works with both dataclasses +# and legacy Pydantic models during the transition period. +RowT = TypeVar("RowT") + +# Arrays larger than this threshold use blosc2.arange instead of np.arange to +# avoid large transient allocations when mapping logical to physical row positions. +_BLOSC2_ARANGE_THRESHOLD = 1_000_000 + + +def _arange(start, stop=None, step=1) -> blosc2.NDArray | np.ndarray: + """Return a range array, using blosc2 for large n to save memory.""" + if stop is None: + start, stop = 0, start + n = len(range(start, stop, step)) + return ( + blosc2.arange(start, stop, step) if n >= _BLOSC2_ARANGE_THRESHOLD else np.arange(start, stop, step) + ) + + +# --------------------------------------------------------------------------- +# Legacy Pydantic-compat helpers +# Keep these so existing code that uses Annotated[type, NumpyDtype(...)] or +# Annotated[str, MaxLen(...)] on a pydantic.BaseModel continues to work. 
+# --------------------------------------------------------------------------- + + +class NumpyDtype: + """Metadata tag for Pydantic-based schemas (legacy).""" + + def __init__(self, dtype): + self.dtype = dtype + + +class MaxLen: + """Metadata tag for fixed-width string/bytes columns in Pydantic-based schemas (legacy).""" + + def __init__(self, length: int): + self.length = int(length) + + +def _default_display_width(origin) -> int: + """Return a sensible display column width for a given Python type (legacy).""" + return {int: 12, float: 15, bool: 6, complex: 25}.get(origin, 20) + + +def _resolve_field_dtype(field) -> tuple[np.dtype, int]: + """Return (numpy dtype, display_width) for a Pydantic model field (legacy). + + Extracts dtype from NumpyDtype metadata when present (same class), otherwise + falls back to a sensible default for each Python primitive type. + """ + annotation = field.annotation + origin = getattr(annotation, "__origin__", annotation) + + # str / bytes → look for MaxLen metadata, build fixed-width dtype + if origin in (str, bytes) or annotation in (str, bytes): + is_bytes = origin is bytes or annotation is bytes + max_len = 32 + if hasattr(annotation, "__metadata__"): + for meta in annotation.__metadata__: + if isinstance(meta, MaxLen): + max_len = meta.length + break + kind = "S" if is_bytes else "U" + dt = np.dtype(f"{kind}{max_len}") + display_width = max(10, min(max_len, 50)) + return dt, display_width + + # Check for explicit NumpyDtype metadata (same class as defined here) + if hasattr(annotation, "__metadata__"): + for meta in annotation.__metadata__: + if isinstance(meta, NumpyDtype): + dt = np.dtype(meta.dtype) + display_width = _default_display_width(origin) + return dt, display_width + + # Primitive defaults + _PRIMITIVE_MAP = { + int: (np.int64, 12), + float: (np.float64, 15), + bool: (np.bool_, 6), + complex: (np.complex128, 25), + } + if origin in _PRIMITIVE_MAP: + dt_raw, display_width = _PRIMITIVE_MAP[origin] + return 
np.dtype(dt_raw), display_width + + return np.dtype(np.object_), 20 + + +class _LegacySpec(SchemaSpec): + """Internal compatibility spec wrapping a dtype extracted from a Pydantic schema.""" + + def __init__(self, dtype: np.dtype): + self.dtype = np.dtype(dtype) + self.python_type = object + + def to_pydantic_kwargs(self) -> dict[str, Any]: + return {} + + def to_metadata_dict(self) -> dict[str, Any]: + return {"kind": "legacy", "dtype": str(self.dtype)} + + +def _compile_pydantic_schema(row_cls: type) -> CompiledSchema: + """Compatibility adapter: build a CompiledSchema from a Pydantic BaseModel subclass.""" + columns: list[CompiledColumn] = [] + for name, pyd_field in row_cls.model_fields.items(): + dtype, display_width = _resolve_field_dtype(pyd_field) + spec = _LegacySpec(dtype) + col = CompiledColumn( + name=name, + py_type=object, + spec=spec, + dtype=dtype, + default=MISSING, + config=ColumnConfig(cparams=None, dparams=None, chunks=None, blocks=None), + display_width=display_width, + ) + columns.append(col) + return CompiledSchema( + row_cls=row_cls, + columns=columns, + columns_by_name={col.name: col for col in columns}, + ) + + +# --------------------------------------------------------------------------- +# Internal row/indexing helpers (unchanged) +# --------------------------------------------------------------------------- + + +def _find_physical_index(arr: blosc2.NDArray, logical_key: int) -> int: + """Translate a logical (valid-row) index into a physical array index. + + Iterates chunk metadata of the boolean *arr* (valid_rows) to locate the + *logical_key*-th True value without fully decompressing the array. + + Returns + ------- + int + Physical position in the underlying storage array. + + Raises + ------ + IndexError + If the logical index is out of range or the array is inconsistent. 
+ """ + count = 0 + chunk_size = arr.chunks[0] + + for info in arr.iterchunks_info(): + actual_size = min(chunk_size, arr.shape[0] - info.nchunk * chunk_size) + chunk_start = info.nchunk * chunk_size + + if info.special == blosc2.SpecialValue.ZERO: + continue + + if info.special == blosc2.SpecialValue.VALUE: + val = np.frombuffer(info.repeated_value, dtype=arr.dtype)[0] + if not val: + continue + if count + actual_size <= logical_key: + count += actual_size + continue + return chunk_start + (logical_key - count) + + chunk_data = arr[chunk_start : chunk_start + actual_size] + n_true = int(np.count_nonzero(chunk_data)) + if count + n_true <= logical_key: + count += n_true + continue + + return chunk_start + int(np.flatnonzero(chunk_data)[logical_key - count]) + + raise IndexError("Unexpected error finding physical index.") + + +class _RowIndexer: + def __init__(self, table): + self._table_ref = weakref.ref(table) + + def __getitem__(self, item): + table = self._table_ref() + if table is None: + raise ReferenceError("owning CTable has been released") + return table._run_row_logic(item) + + +class _Row: + def __init__(self, table: CTable, nrow: int): + self._table = table + self._nrow = nrow + self._real_pos = None + + def _get_real_pos(self) -> int: + self._real_pos = _find_physical_index(self._table._valid_rows, self._nrow) + return self._real_pos + + def __getitem__(self, col_name: str): + if self._real_pos is None: + self._get_real_pos() + return self._table._cols[col_name][self._real_pos] + + +# --------------------------------------------------------------------------- +# Column +# --------------------------------------------------------------------------- + + +class Column: + _REPR_PREVIEW_ITEMS = 8 + + def __init__(self, table: CTable, col_name: str, mask=None): + self._table = table + self._col_name = col_name + self._mask = mask + + @property + def _raw_col(self): + return self._table._cols[self._col_name] + + @property + def _valid_rows(self): + if 
self._mask is None: + return self._table._valid_rows + + return (self._table._valid_rows & self._mask).compute() + + def __getitem__(self, key: int | slice | list | np.ndarray): + if isinstance(key, int): + n_rows = len(self) + if key < 0: + key += n_rows + if not (0 <= key < n_rows): + raise IndexError(f"index {key} is out of bounds for column with size {n_rows}") + pos_true = _find_physical_index(self._valid_rows, key) + return self._raw_col[int(pos_true)] + + elif isinstance(key, slice): + real_pos = blosc2.where(self._valid_rows, _arange(len(self._valid_rows))).compute() + start, stop, step = key.indices(len(real_pos)) + mask = blosc2.zeros(len(self._table._valid_rows), dtype=np.bool_) + if step == 1: + phys_start = real_pos[start] + phys_stop = real_pos[stop - 1] + mask[phys_start : phys_stop + 1] = True + else: + lindices = _arange(start, stop, step) + phys_indices = real_pos[lindices] + mask[phys_indices[:]] = True + return Column(self._table, self._col_name, mask=mask) + + elif isinstance(key, np.ndarray) and key.dtype == np.bool_: + # Boolean mask in logical space — same convention as numpy/pandas. + # key[i] == True means "include logical row i". + n_live = len(self) + if len(key) != n_live: + raise IndexError( + f"Boolean mask length {len(key)} does not match number of live rows {n_live}." 
+ ) + all_pos = np.where(self._valid_rows[:])[0] + phys_indices = all_pos[key] + return self._raw_col[phys_indices] + + elif isinstance(key, (list, tuple, np.ndarray)): + real_pos = blosc2.where(self._valid_rows, _arange(len(self._valid_rows))).compute() + phys_indices = np.array([real_pos[i] for i in key], dtype=np.int64) + return self._raw_col[phys_indices] + + raise TypeError(f"Invalid index type: {type(key)}") + + def __setitem__(self, key: int | slice | list | np.ndarray, value): + if self._table._read_only: + raise ValueError("Table is read-only (opened with mode='r').") + if isinstance(key, int): + n_rows = len(self) + if key < 0: + key += n_rows + if not (0 <= key < n_rows): + raise IndexError(f"index {key} is out of bounds for column with size {n_rows}") + pos_true = _find_physical_index(self._valid_rows, key) + self._raw_col[int(pos_true)] = value + + elif isinstance(key, np.ndarray) and key.dtype == np.bool_: + # Boolean mask in logical space. + n_live = len(self) + if len(key) != n_live: + raise IndexError( + f"Boolean mask length {len(key)} does not match number of live rows {n_live}." 
+ ) + all_pos = np.where(self._valid_rows[:])[0] + phys_indices = all_pos[key] + if isinstance(value, (list, tuple)): + value = np.array(value, dtype=self._raw_col.dtype) + self._raw_col[phys_indices] = value + + elif isinstance(key, (slice, list, tuple, np.ndarray)): + real_pos = blosc2.where(self._valid_rows, _arange(len(self._valid_rows))).compute() + if isinstance(key, slice): + lindices = range(*key.indices(len(real_pos))) + phys_indices = np.array([real_pos[i] for i in lindices], dtype=np.int64) + else: + phys_indices = np.array([real_pos[i] for i in key], dtype=np.int64) + + if isinstance(value, (list, tuple)): + value = np.array(value, dtype=self._raw_col.dtype) + self._raw_col[phys_indices] = value + + else: + raise TypeError(f"Invalid index type: {type(key)}") + self._table._root_table._mark_all_indexes_stale() + + def __iter__(self): + arr = self._valid_rows + chunk_size = arr.chunks[0] + + for info in arr.iterchunks_info(): + actual_size = min(chunk_size, arr.shape[0] - info.nchunk * chunk_size) + chunk_start = info.nchunk * chunk_size + + if info.special == blosc2.SpecialValue.ZERO: + continue + + if info.special == blosc2.SpecialValue.VALUE: + val = np.frombuffer(info.repeated_value, dtype=arr.dtype)[0] + if not val: + continue + yield from self._raw_col[chunk_start : chunk_start + actual_size] + continue + + mask_chunk = arr[chunk_start : chunk_start + actual_size] + data_chunk = self._raw_col[chunk_start : chunk_start + actual_size] + yield from data_chunk[mask_chunk] + + def __repr__(self) -> str: + preview_items = [] + for value in itertools.islice(self, self._REPR_PREVIEW_ITEMS + 1): + if isinstance(value, np.generic): + value = value.item() + preview_items.append(repr(value)) + + truncated = len(preview_items) > self._REPR_PREVIEW_ITEMS + if truncated: + preview_items = preview_items[: self._REPR_PREVIEW_ITEMS] + preview_items.append("...") + + preview = ", ".join(preview_items) + return f"Column({self._col_name!r}, dtype={self.dtype}, 
len={len(self)}, values=[{preview}])" + + def __len__(self): + return blosc2.count_nonzero(self._valid_rows) + + def __lt__(self, other): + return self._raw_col < other + + def __le__(self, other): + return self._raw_col <= other + + def __eq__(self, other): + return self._raw_col == other + + def __ne__(self, other): + return self._raw_col != other + + def __gt__(self, other): + return self._raw_col > other + + def __ge__(self, other): + return self._raw_col >= other + + @property + def dtype(self): + return self._raw_col.dtype + + def iter_chunks(self, size: int = 65536): + """Iterate over live column values in chunks of *size* rows. + + Yields numpy arrays of at most *size* elements each, skipping deleted + rows. The last chunk may be smaller than *size*. + + Parameters + ---------- + size: + Number of live rows per yielded chunk. Defaults to 65 536. + + Yields + ------ + numpy.ndarray + A 1-D array of up to *size* live values with this column's dtype. + + Examples + -------- + >>> for chunk in t["score"].iter_chunks(size=100_000): + ... 
process(chunk) + """ + valid = self._valid_rows + raw = self._raw_col + arr_len = len(valid) + phys_chunk = valid.chunks[0] + + pending: list[np.ndarray] = [] + pending_count = 0 + + for info in valid.iterchunks_info(): + actual = min(phys_chunk, arr_len - info.nchunk * phys_chunk) + start = info.nchunk * phys_chunk + + if info.special == blosc2.SpecialValue.ZERO: + continue + + if info.special == blosc2.SpecialValue.VALUE: + val = np.frombuffer(info.repeated_value, dtype=valid.dtype)[0] + if not val: + continue + segment = raw[start : start + actual] + else: + mask = valid[start : start + actual] + segment = raw[start : start + actual][mask] + + if len(segment) == 0: + continue + + pending.append(segment) + pending_count += len(segment) + + while pending_count >= size: + combined = np.concatenate(pending) + yield combined[:size] + rest = combined[size:] + pending = [rest] if len(rest) > 0 else [] + pending_count = len(rest) + + if pending: + yield np.concatenate(pending) + + def to_numpy(self) -> np.ndarray: + """Return all live values as a NumPy array.""" + parts = list(self.iter_chunks(size=max(1, len(self)))) + if not parts: + return np.array([], dtype=self.dtype) + return np.concatenate(parts) if len(parts) > 1 else parts[0] + + def assign(self, data) -> None: + """Replace all live values in this column with *data*. + + Works on both full tables and views — on a view, only the rows + visible through the view's mask are overwritten. + + Parameters + ---------- + data: + List, numpy array, or any iterable. Must have exactly as many + elements as there are live rows in this column. Values are + coerced to the column's dtype if possible. + + Raises + ------ + ValueError + If ``len(data)`` does not match the number of live rows, or the + table is opened read-only. + TypeError + If values cannot be coerced to the column's dtype. 
+ """ + if self._table._read_only: + raise ValueError("Table is read-only (opened with mode='r').") + n_live = len(self) + arr = np.asarray(data) + if len(arr) != n_live: + raise ValueError(f"assign() requires {n_live} values (live rows), got {len(arr)}.") + try: + arr = arr.astype(self.dtype) + except (ValueError, OverflowError) as exc: + raise TypeError(f"Cannot coerce data to column dtype {self.dtype!r}: {exc}") from exc + live_pos = np.where(self._valid_rows[:])[0] + self._raw_col[live_pos] = arr + self._table._root_table._mark_all_indexes_stale() + + # ------------------------------------------------------------------ + # Null sentinel support + # ------------------------------------------------------------------ + + @property + def null_value(self): + """The sentinel value that represents NULL for this column, or ``None``.""" + col_info = self._table._schema.columns_by_name.get(self._col_name) + if col_info is None: + return None + return getattr(col_info.spec, "null_value", None) + + def _null_mask_for(self, arr: np.ndarray) -> np.ndarray: + """Return a bool array True where *arr* contains the null sentinel. + + Always returns an array of the same length as *arr*; all False when + no null_value is configured. + """ + nv = self.null_value + if nv is None: + return np.zeros(len(arr), dtype=np.bool_) + if isinstance(nv, float) and np.isnan(nv): + return np.isnan(arr) + return arr == nv + + def is_null(self) -> np.ndarray: + """Return a boolean array True where the live value is the null sentinel.""" + return self._null_mask_for(self.to_numpy()) + + def notnull(self) -> np.ndarray: + """Return a boolean array True where the live value is *not* the null sentinel.""" + return ~self.is_null() + + def null_count(self) -> int: + """Return the number of live rows whose value equals the null sentinel. + + Returns ``0`` in O(1) if no ``null_value`` is configured for this column. 
+ """ + if self.null_value is None: + return 0 + return int(self.is_null().sum()) + + def _nonnull_chunks(self): + """Yield chunks of live, non-null values. + + Each yielded array has the null sentinel values removed. If no + null_value is configured this behaves identically to + :meth:`iter_chunks`. + """ + nv = self.null_value + if nv is None: + yield from self.iter_chunks() + return + is_nan_nv = isinstance(nv, float) and np.isnan(nv) + for chunk in self.iter_chunks(): + if is_nan_nv: + mask = ~np.isnan(chunk) + else: + mask = chunk != nv + filtered = chunk[mask] + if len(filtered) > 0: + yield filtered + + def unique(self) -> np.ndarray: + """Return sorted array of unique live, non-null values. + + Null sentinel values are excluded. + Processes data in chunks — never loads the full column at once. + """ + seen: set = set() + for chunk in self._nonnull_chunks(): + seen.update(chunk.tolist()) + return np.array(sorted(seen), dtype=self.dtype) + + def value_counts(self) -> dict: + """Return a ``{value: count}`` dict sorted by count descending. + + Null sentinel values are excluded. + Processes data in chunks — never loads the full column at once. 
def sum(self):
    """Sum of all live, non-null values.

    Supported dtypes: bool, int, uint, float, complex.
    Bool values are counted as 0 / 1.
    Null sentinel values are skipped.

    Returns
    -------
    int, numpy.float64 or numpy.complex128
        A Python ``int`` for bool/int/uint columns, otherwise the wide
        NumPy accumulator value.

    Raises
    ------
    TypeError
        If the column dtype is not one of the supported kinds.
    ValueError
        If the column has no live rows.
    """
    self._require_kind("biufc", "sum")
    self._require_nonempty("sum")
    kind = self.dtype.kind

    if kind in "biu":
        # Reduce each chunk in the widest integer of matching signedness
        # (an int64 accumulator would wrap uint64 values >= 2**63), then
        # combine the per-chunk totals as Python ints, which cannot overflow.
        acc_dtype = np.uint64 if kind == "u" else np.int64
        total = 0
        for chunk in self._nonnull_chunks():
            total += int(chunk.sum(dtype=acc_dtype))
        return total

    # Float / complex: accumulate in the widest type of the same kind.
    acc_dtype = np.complex128 if kind == "c" else np.float64
    result = acc_dtype(0)
    for chunk in self._nonnull_chunks():
        result += chunk.sum(dtype=acc_dtype)
    return result
def mean(self) -> float:
    """Arithmetic mean of all live, non-null values.

    Supported dtypes: bool, int, uint, float.
    Null sentinel values are skipped.
    Always returns a Python float; the result is NaN when every live
    value is null.
    """
    self._require_kind("biuf", "mean")
    self._require_nonempty("mean")

    running = np.float64(0)
    n_seen = 0
    for values in self._nonnull_chunks():
        n_seen += len(values)
        running += values.sum(dtype=np.float64)

    # No surviving values means there is nothing to average.
    return float(running / n_seen) if n_seen else float("nan")
``0`` (default) gives the population + std; ``1`` gives the sample std (divides by N-1). + + Supported dtypes: bool, int, uint, float. + Null sentinel values are skipped. + Always returns a Python float. + """ + self._require_kind("biuf", "std") + self._require_nonempty("std") + + # Chan's parallel update — combines per-chunk (n, mean, M2) tuples. + # This is numerically stable and requires only a single pass. + n_total = np.int64(0) + mean_total = np.float64(0) + M2_total = np.float64(0) + + for chunk in self._nonnull_chunks(): + chunk = chunk.astype(np.float64) + n_b = np.int64(len(chunk)) + mean_b = chunk.mean() + M2_b = np.float64(((chunk - mean_b) ** 2).sum()) + + if n_total == 0: + n_total, mean_total, M2_total = n_b, mean_b, M2_b + else: + delta = mean_b - mean_total + n_new = n_total + n_b + mean_total = (n_total * mean_total + n_b * mean_b) / n_new + M2_total += M2_b + delta**2 * n_total * n_b / n_new + n_total = n_new + + divisor = n_total - ddof + if divisor <= 0: + return float("nan") + return float(np.sqrt(M2_total / divisor)) + + def any(self) -> bool: + """Return True if at least one live, non-null value is True. + + Supported dtypes: bool. + Null sentinel values are skipped. + Short-circuits on the first True found. + """ + self._require_kind("b", "any") + return any(chunk.any() for chunk in self._nonnull_chunks()) + + def all(self) -> bool: + """Return True if every live, non-null value is True. + + Supported dtypes: bool. + Null sentinel values are skipped. + Short-circuits on the first False found. + """ + self._require_kind("b", "all") + return all(chunk.all() for chunk in self._nonnull_chunks()) + + +# --------------------------------------------------------------------------- +# CTable +# --------------------------------------------------------------------------- + + +def _fmt_bytes(n: int) -> str: + """Human-readable byte count (e.g. 
'1.23 MB').""" + if n < 1024: + return f"{n} B" + if n < 1024**2: + return f"{n / 1024:.2f} KB" + if n < 1024**3: + return f"{n / 1024**2:.2f} MB" + return f"{n / 1024**3:.2f} GB" + + +_EXPECTED_SIZE_DEFAULT = 1_048_576 + + +class CTable(Generic[RowT]): + def __init__( + self, + row_type: type[RowT], + new_data=None, + *, + urlpath: str | None = None, + mode: str = "a", + expected_size: int | None = None, + compact: bool = False, + validate: bool = True, + cparams: dict[str, Any] | None = None, + dparams: dict[str, Any] | None = None, + ) -> None: + # Auto-size: if the caller didn't specify expected_size and new_data has a + # known length, pre-allocate just enough (×2 for headroom, min 64). + # Fall back to 1 M when new_data has no __len__ or is absent. + if expected_size is None: + if new_data is not None and hasattr(new_data, "__len__"): + expected_size = max(len(new_data) * 2, 64) + else: + expected_size = _EXPECTED_SIZE_DEFAULT + self._row_type = row_type + self._validate = validate + self._table_cparams = cparams + self._table_dparams = dparams + self._cols: dict[str, blosc2.NDArray] = {} + self._col_widths: dict[str, int] = {} + self.col_names: list[str] = [] + self.row = _RowIndexer(self) + self.auto_compact = compact + self.base = None + + # Choose storage backend + if urlpath is not None: + if mode == "w" and os.path.exists(urlpath): + if os.path.isdir(urlpath): + shutil.rmtree(urlpath) + else: + os.remove(urlpath) + storage: TableStorage = FileTableStorage(urlpath, mode) + else: + storage = InMemoryTableStorage() + self._storage = storage + self._read_only = storage.is_read_only() + + if storage.table_exists() and mode != "w": + # ---- Open existing persistent table ---- + if new_data is not None: + raise ValueError( + "Cannot pass new_data when opening an existing table. Use mode='w' to overwrite." 
def close(self) -> None:
    """Close any persistent backing store held by this table.

    Does nothing when the table has no ``_storage`` attribute, when it is
    ``None``, or when the storage object exposes no ``close`` method.
    """
    backend = getattr(self, "_storage", None)
    if backend is None or not hasattr(backend, "close"):
        return
    backend.close()
+ storage.discard() + elif storage is not None and hasattr(storage, "close"): + storage.close() + + def _init_columns( + self, expected_size: int, default_chunks, default_blocks, storage: TableStorage + ) -> None: + """Create one NDArray per column using the compiled schema.""" + for col in self._schema.columns: + self.col_names.append(col.name) + self._col_widths[col.name] = max(len(col.name), col.display_width) + col_storage = self._resolve_column_storage(col, default_chunks, default_blocks) + self._cols[col.name] = storage.create_column( + col.name, + dtype=col.dtype, + shape=(expected_size,), + chunks=col_storage["chunks"], + blocks=col_storage["blocks"], + cparams=col_storage.get("cparams"), + dparams=col_storage.get("dparams"), + ) + + def _resolve_column_storage( + self, + col: CompiledColumn, + default_chunks, + default_blocks, + ) -> dict[str, Any]: + """Merge table-level and column-level storage settings. + + Column-level settings (from ``b2.field(...)``) take precedence over + table-level defaults passed to ``CTable.__init__``. + """ + result: dict[str, Any] = { + "chunks": col.config.chunks if col.config.chunks is not None else default_chunks, + "blocks": col.config.blocks if col.config.blocks is not None else default_blocks, + } + cparams = col.config.cparams if col.config.cparams is not None else self._table_cparams + dparams = col.config.dparams if col.config.dparams is not None else self._table_dparams + if cparams is not None: + result["cparams"] = cparams + if dparams is not None: + result["dparams"] = dparams + return result + + def _normalize_row_input(self, data: Any) -> dict[str, Any]: + """Normalize a row input to a ``{col_name: value}`` dict. 
+ + Accepted shapes: + - list / tuple → positional, zipped with ``col_names`` + - dict → used as-is + - dataclass → ``dataclasses.asdict`` + - np.void / structured scalar → field-name access + """ + if isinstance(data, dict): + return data + if isinstance(data, (list, tuple)): + return dict(zip(self.col_names, data, strict=False)) + if dataclasses.is_dataclass(data) and not isinstance(data, type): + return dataclasses.asdict(data) + if isinstance(data, (np.void, np.record)): + return {name: data[name] for name in self.col_names} + # Fallback: try positional indexing + return {name: data[i] for i, name in enumerate(self.col_names)} + + def _coerce_row_to_storage(self, row: dict[str, Any]) -> dict[str, Any]: + """Coerce each value in *row* to the column's storage dtype.""" + result = {} + for col in self._schema.columns: + val = row[col.name] + result[col.name] = np.array(val, dtype=col.dtype).item() + return result + + def _resolve_last_pos(self) -> int: + """Return the physical index of the next write slot. + + Returns the cached ``_last_pos`` when available. After a deletion + ``_last_pos`` is ``None``; this method then walks chunk metadata of + ``_valid_rows`` from the end (no full decompression) to find the last + ``True`` position, caches the result, and returns it. 
+ """ + if self._last_pos is not None: + return self._last_pos + + arr = self._valid_rows + chunk_size = arr.chunks[0] + last_true_pos = -1 + + for info in reversed(list(arr.iterchunks_info())): + actual_size = min(chunk_size, arr.shape[0] - info.nchunk * chunk_size) + chunk_start = info.nchunk * chunk_size + + if info.special == blosc2.SpecialValue.ZERO: + continue + if info.special == blosc2.SpecialValue.VALUE: + val = np.frombuffer(info.repeated_value, dtype=arr.dtype)[0] + if not val: + continue + last_true_pos = chunk_start + actual_size - 1 + break + + chunk_data = arr[chunk_start : chunk_start + actual_size] + nonzero = np.flatnonzero(chunk_data) + if len(nonzero) == 0: + continue + last_true_pos = chunk_start + int(nonzero[-1]) + break + + self._last_pos = last_true_pos + 1 + return self._last_pos + + def _grow(self) -> None: + """Double the physical capacity of all columns and the valid_rows mask.""" + c = len(self._valid_rows) + for col_arr in self._cols.values(): + col_arr.resize((c * 2,)) + self._valid_rows.resize((c * 2,)) + + # ------------------------------------------------------------------ + # Display + # ------------------------------------------------------------------ + + def __str__(self) -> str: + _HEAD_TAIL = 10 # rows shown at each end + + nrows = self._n_rows + ncols = len(self.col_names) + hidden = max(0, nrows - _HEAD_TAIL * 2) + + # -- physical positions for head and tail rows -- + valid_np = self._valid_rows[:] + all_pos = np.where(valid_np)[0] + + if nrows <= _HEAD_TAIL * 2: + head_pos = all_pos + tail_pos = np.array([], dtype=all_pos.dtype) + hidden = 0 + else: + head_pos = all_pos[:_HEAD_TAIL] + tail_pos = all_pos[-_HEAD_TAIL:] + + # -- per-column display widths -- + widths: dict[str, int] = {} + for name in self.col_names: + widths[name] = max( + self._col_widths[name], + len(str(self._cols[name].dtype)), + ) + + sep = " ".join("─" * (w + 2) for w in widths.values()) + + def fmt_cell(value, width: int) -> str: + s = str(value) + if 
len(s) > width: + s = s[: width - 1] + "…" + return f" {s:<{width}} " + + def fmt_row(values: dict) -> str: + return " ".join(fmt_cell(values[n], widths[n]) for n in self.col_names) + + # -- batch-fetch values (one read per column, not one per cell) -- + def rows_to_dicts(positions) -> list[dict]: + if len(positions) == 0: + return [] + col_data = {n: self._cols[n][positions] for n in self.col_names} + return [{n: col_data[n][i].item() for n in self.col_names} for i in range(len(positions))] + + lines = [ + fmt_row({n: n for n in self.col_names}), + fmt_row({n: str(self._cols[n].dtype) for n in self.col_names}), + sep, + ] + + for row in rows_to_dicts(head_pos): + lines.append(fmt_row(row)) + + if hidden > 0: + lines.append(fmt_row(dict.fromkeys(self.col_names, "..."))) + + for row in rows_to_dicts(tail_pos): + lines.append(fmt_row(row)) + + lines.append(sep) + footer = f"{nrows:,} rows × {ncols} columns" + if hidden > 0: + footer += f" ({hidden:,} rows hidden)" + lines.append(footer) + + return "\n".join(lines) + + def __repr__(self) -> str: + cols = ", ".join(self.col_names) + return f"CTable<{cols}>({self._n_rows:,} rows, {_fmt_bytes(self.cbytes)} compressed)" + + def __len__(self): + return self._n_rows + + def __iter__(self): + for i in range(self.nrows): + yield _Row(self, i) + + # ------------------------------------------------------------------ + # Open existing table (classmethod) + # ------------------------------------------------------------------ + + @classmethod + def open(cls, urlpath: str, *, mode: str = "r") -> CTable: + """Open a persistent CTable from *urlpath*. + + Parameters + ---------- + urlpath: + Path to the table root directory (created by passing ``urlpath`` + to :class:`CTable`). + mode: + ``'r'`` (default) — read-only. + ``'a'`` — read/write. + + Raises + ------ + FileNotFoundError + If *urlpath* does not contain a CTable. + ValueError + If the metadata at *urlpath* does not identify a CTable. 
+ """ + storage = FileTableStorage(urlpath, mode) + if not storage.table_exists(): + raise FileNotFoundError(f"No CTable found at {urlpath!r}") + storage.check_kind() + schema_dict = storage.load_schema() + schema = schema_from_dict(schema_dict) + col_names = [c["name"] for c in schema_dict["columns"]] + + obj = cls.__new__(cls) + obj._row_type = None + obj._validate = True + obj._table_cparams = None + obj._table_dparams = None + obj._storage = storage + obj._read_only = storage.is_read_only() + obj._schema = schema + obj._cols = {} + obj._col_widths = {} + obj.col_names = col_names + obj.row = _RowIndexer(obj) + obj.auto_compact = False + obj.base = None + + obj._valid_rows = storage.open_valid_rows() + for name in col_names: + obj._cols[name] = storage.open_column(name) + cc = schema.columns_by_name[name] + obj._col_widths[name] = max(len(name), cc.display_width) + + obj._n_rows = int(blosc2.count_nonzero(obj._valid_rows)) + obj._last_pos = None # resolve lazily on first write + return obj + + # ------------------------------------------------------------------ + # Save / Load (in-memory ↔ disk) + # ------------------------------------------------------------------ + + def save(self, urlpath: str, *, overwrite: bool = False) -> None: + """Copy this (in-memory) table to disk at *urlpath*. + + Only live rows are written — the on-disk table is always compacted. + + Parameters + ---------- + urlpath: + Destination directory path. + overwrite: + If ``False`` (default), raise :exc:`ValueError` when *urlpath* + already exists. Set to ``True`` to replace an existing table. + + Raises + ------ + ValueError + If *urlpath* already exists and ``overwrite=False``, or if called + on a view. 
+ """ + if self.base is not None: + raise ValueError("Cannot save a view — save the parent table instead.") + file_storage = FileTableStorage(urlpath, "w") + target_path = file_storage._root + if os.path.exists(target_path): + if not overwrite: + raise ValueError(f"Path {target_path!r} already exists. Use overwrite=True to replace.") + if os.path.isdir(target_path): + shutil.rmtree(target_path) + else: + os.remove(target_path) + + # Collect live physical positions + valid_np = self._valid_rows[:] + live_pos = np.where(valid_np)[0] + n_live = len(live_pos) + capacity = max(n_live, 1) + + default_chunks, default_blocks = compute_chunks_blocks((capacity,)) + + # --- valid_rows (all True, compacted) --- + disk_valid = file_storage.create_valid_rows( + shape=(capacity,), + chunks=default_chunks, + blocks=default_blocks, + ) + if n_live > 0: + disk_valid[:n_live] = True + + # --- columns --- + for col in self._schema.columns: + name = col.name + # Use dtype-aware defaults so large-itemsize columns (e.g. U4096) get + # sensible chunk/block sizes rather than the uint8-based defaults. + dtype_chunks, dtype_blocks = compute_chunks_blocks((capacity,), dtype=col.dtype) + col_storage = self._resolve_column_storage(col, dtype_chunks, dtype_blocks) + disk_col = file_storage.create_column( + name, + dtype=col.dtype, + shape=(capacity,), + chunks=col_storage["chunks"], + blocks=col_storage["blocks"], + cparams=col_storage.get("cparams"), + dparams=col_storage.get("dparams"), + ) + if n_live > 0: + disk_col[:n_live] = self._cols[name][live_pos] + + file_storage.save_schema(schema_to_dict(self._schema)) + file_storage.close() + + @classmethod + def load(cls, urlpath: str) -> CTable: + """Load a persistent table from *urlpath* into RAM. + + The schema is read from the table's metadata — the original Python + dataclass is not required. The returned table is fully in-memory and + read/write. + + Parameters + ---------- + urlpath: + Path to the table root directory. 
+ + Raises + ------ + FileNotFoundError + If *urlpath* does not contain a CTable. + ValueError + If the metadata at *urlpath* does not identify a CTable. + """ + file_storage = FileTableStorage(urlpath, "r") + if not file_storage.table_exists(): + raise FileNotFoundError(f"No CTable found at {urlpath!r}") + file_storage.check_kind() + schema_dict = file_storage.load_schema() + schema = schema_from_dict(schema_dict) + col_names = [c["name"] for c in schema_dict["columns"]] + + disk_valid = file_storage.open_valid_rows() + disk_cols = {name: file_storage.open_column(name) for name in col_names} + phys_size = len(disk_valid) + n_live = int(blosc2.count_nonzero(disk_valid)) + capacity = max(phys_size, 1) + + mem_storage = InMemoryTableStorage() + bool_chunks, bool_blocks = compute_chunks_blocks((capacity,), dtype=np.dtype(np.bool_)) + + mem_valid = mem_storage.create_valid_rows( + shape=(capacity,), + chunks=bool_chunks, + blocks=bool_blocks, + ) + if phys_size > 0: + mem_valid[:phys_size] = disk_valid[:] + + mem_cols: dict[str, blosc2.NDArray] = {} + for col in schema.columns: + name = col.name + col_chunks, col_blocks = compute_chunks_blocks((capacity,), dtype=col.dtype) + mem_col = mem_storage.create_column( + name, + dtype=col.dtype, + shape=(capacity,), + chunks=col_chunks, + blocks=col_blocks, + cparams=None, + dparams=None, + ) + if phys_size > 0: + mem_col[:phys_size] = disk_cols[name][:] + mem_cols[name] = mem_col + + file_storage.close() + + obj = cls.__new__(cls) + obj._row_type = None + obj._validate = True + obj._table_cparams = None + obj._table_dparams = None + obj._storage = mem_storage + obj._read_only = False + obj._schema = schema + obj._cols = mem_cols + obj._col_widths = {col.name: max(len(col.name), col.display_width) for col in schema.columns} + obj.col_names = col_names + obj.row = _RowIndexer(obj) + obj.auto_compact = False + obj.base = None + obj._valid_rows = mem_valid + obj._n_rows = n_live + obj._last_pos = None # resolve lazily on first 
def view(self, new_valid_rows):
    """Return a view of this table restricted to the rows in a boolean mask.

    Parameters
    ----------
    new_valid_rows:
        Boolean row mask.  A boolean ``numpy.ndarray`` is converted with
        ``blosc2.asarray``; a boolean ``blosc2.LazyExpr`` is computed
        first; a boolean ``blosc2.NDArray`` is used as-is.  Its length
        must equal the length of this table's own ``_valid_rows`` mask.

    Returns
    -------
    CTable
        A view built by ``CTable._make_view`` that shares this table's
        columns.

    Raises
    ------
    TypeError
        If *new_valid_rows* is not a boolean array of a supported type.
    ValueError
        If the mask length does not match this table's row mask.
    """
    if isinstance(new_valid_rows, np.ndarray) and new_valid_rows.dtype == np.bool_:
        new_valid_rows = blosc2.asarray(new_valid_rows)

    is_supported = isinstance(new_valid_rows, (blosc2.NDArray, blosc2.LazyExpr))
    if not (is_supported and getattr(new_valid_rows, "dtype", None) == np.bool_):
        raise TypeError(
            f"Expected boolean blosc2.NDArray or LazyExpr, got {type(new_valid_rows).__name__}"
        )

    if isinstance(new_valid_rows, blosc2.LazyExpr):
        new_valid_rows = new_valid_rows.compute()

    expected_len = len(self._valid_rows)
    if expected_len != len(new_valid_rows):
        # Say what went wrong instead of raising an empty ValueError.
        raise ValueError(
            f"Row mask has length {len(new_valid_rows)}, "
            f"but this table's row mask has length {expected_len}."
        )

    return CTable._make_view(self, new_valid_rows)
def sample(self, n: int, *, seed: int | None = None) -> CTable:
    """Return a read-only view of *n* randomly chosen live rows.

    Parameters
    ----------
    n:
        Number of rows to sample.  If *n* >= number of live rows,
        returns a view of the whole table; if *n* <= 0, an empty view.
    seed:
        Optional random seed for reproducibility.

    Returns
    -------
    CTable
        A read-only view sharing columns with this table.
    """
    capacity = len(self._valid_rows)
    if n <= 0:
        return self.view(blosc2.zeros(shape=capacity, dtype=np.bool_))
    if n >= self._n_rows:
        return self.view(self._valid_rows)

    # Draw n distinct physical positions from the currently live rows.
    live_positions = np.flatnonzero(self._valid_rows[:])
    picked = np.random.default_rng(seed).choice(live_positions, size=n, replace=False)

    selection = np.zeros(capacity, dtype=np.bool_)
    selection[picked] = True
    return self.view(blosc2.asarray(selection))
+ + The returned object shares the underlying NDArrays with this table + (no data is copied). Row filtering and value writes work as usual; + structural mutations (add/drop/rename column, append, …) are blocked. + + Parameters + ---------- + cols: + Ordered list of column names to keep. + + Raises + ------ + KeyError + If any name in *cols* is not a column of this table. + ValueError + If *cols* is empty. + """ + if not cols: + raise ValueError("select() requires at least one column name.") + for name in cols: + if name not in self._cols: + raise KeyError(f"No column named {name!r}. Available: {self.col_names}") + + obj = CTable.__new__(CTable) + obj._row_type = self._row_type + obj._validate = self._validate + obj._table_cparams = self._table_cparams + obj._table_dparams = self._table_dparams + obj._storage = None + obj._read_only = self._read_only + obj._valid_rows = self._valid_rows + obj._n_rows = self._n_rows + obj._last_pos = self._last_pos + obj.auto_compact = self.auto_compact + obj.base = self + + # Subset of columns — same NDArray objects, no copy + obj._cols = {name: self._cols[name] for name in cols} + obj.col_names = list(cols) + + # Rebuild schema for the selected columns only + sel_set = set(cols) + sel_compiled = [c for c in self._schema.columns if c.name in sel_set] + # Preserve caller-specified order + order = {name: i for i, name in enumerate(cols)} + sel_compiled.sort(key=lambda c: order[c.name]) + obj._schema = CompiledSchema( + columns=sel_compiled, + columns_by_name={c.name: c for c in sel_compiled}, + row_cls=self._schema.row_cls, + ) + obj._col_widths = {name: self._col_widths[name] for name in cols if name in self._col_widths} + obj.row = _RowIndexer(obj) + return obj + + def describe(self) -> None: + """Print a per-column statistical summary. + + Numeric columns (int, float): count, mean, std, min, max. + Bool columns: count, true-count, true-%. + String columns: count, min (lex), max (lex), n-unique. 
def cov(self) -> np.ndarray:
    """Return the covariance matrix as a numpy array.

    Only int, float, and bool columns are supported.  Bool columns are
    treated as 0/1 values.  Complex columns raise :exc:`TypeError`.
    Rows that are null in *any* column are dropped from all columns
    (listwise deletion) before the covariance is computed.

    Returns
    -------
    numpy.ndarray
        Shape ``(ncols, ncols)``.  Column order matches
        :attr:`col_names`.

    Raises
    ------
    TypeError
        If any column has an unsupported dtype (complex, string, …).
    ValueError
        If the table has fewer than 2 live rows, or fewer than 2 rows
        remain after excluding nulls (covariance undefined).
    """
    # Reject unsupported dtypes before touching any data.
    for name in self.col_names:
        dtype = self._cols[name].dtype
        supported = (
            np.issubdtype(dtype, np.integer) or np.issubdtype(dtype, np.floating) or dtype == np.bool_
        )
        if not supported:
            raise TypeError(
                f"Column {name!r} has dtype {dtype} which is not supported by cov(). "
                "Only int, float, and bool columns are allowed."
            )

    if self._n_rows < 2:
        raise ValueError(f"cov() requires at least 2 live rows, got {self._n_rows}.")

    # Gather live values per column and the union of all per-column null masks.
    live_values = []
    drop = None
    for name in self.col_names:
        column = self[name]
        values = column.to_numpy()
        nulls = column._null_mask_for(values)
        if nulls.any():
            drop = nulls if drop is None else (drop | nulls)
        live_values.append(values)

    if drop is not None:
        keep = ~drop
        live_values = [values[keep] for values in live_values]

    # One float64 row per column; bool becomes 0.0 / 1.0.
    rows = [values.astype(np.float64) for values in live_values]

    n_valid = len(rows[0]) if rows else 0
    if n_valid < 2:
        raise ValueError(
            f"cov() requires at least 2 non-null rows, got {n_valid} after excluding nulls."
        )

    data = np.stack(rows, axis=0)  # shape (ncols, n_valid)
    return np.atleast_2d(np.cov(data))
@classmethod
def from_arrow(cls, arrow_table) -> CTable:
    """Build a :class:`CTable` from a :class:`pyarrow.Table`.

    The schema is inferred from the Arrow field types. String columns
    (``pa.string()``, ``pa.large_string()``) are stored with
    ``max_length`` set to the longest non-null value found in the data
    (never less than 1).

    The resulting table has no row class (``_row_type`` is ``None``) and
    row validation is switched off (``_validate`` is ``False``).

    Parameters
    ----------
    arrow_table:
        A ``pyarrow.Table`` instance.

    Returns
    -------
    CTable
        A new in-memory CTable containing all rows from *arrow_table*.

    Raises
    ------
    ImportError
        If ``pyarrow`` is not installed.
    TypeError
        If an Arrow field type has no corresponding blosc2 spec.
    """
    try:
        import pyarrow as pa
    except ImportError:
        raise ImportError(
            "pyarrow is required for from_arrow(). Install it with: pip install pyarrow"
        ) from None

    import blosc2.schema as b2s

    def _arrow_type_to_spec(pa_type, arrow_col):
        """Map a pyarrow DataType to a blosc2 SchemaSpec.

        *arrow_col* is only consulted for string types, where the data
        determines ``max_length``.
        """
        # Fixed-size scalar types map one-to-one onto spec classes.
        mapping = [
            (pa.int8(), b2s.int8),
            (pa.int16(), b2s.int16),
            (pa.int32(), b2s.int32),
            (pa.int64(), b2s.int64),
            (pa.uint8(), b2s.uint8),
            (pa.uint16(), b2s.uint16),
            (pa.uint32(), b2s.uint32),
            (pa.uint64(), b2s.uint64),
            (pa.float32(), b2s.float32),
            (pa.float64(), b2s.float64),
            (pa.bool_(), b2s.bool),
        ]
        for arrow_t, spec_cls in mapping:
            if pa_type == arrow_t:
                return spec_cls()

        # String types: determine max_length from the data.
        # None entries are skipped; an empty or all-None column falls back to 1.
        if pa_type in (pa.string(), pa.large_string(), pa.utf8(), pa.large_utf8()):
            values = [v for v in arrow_col.to_pylist() if v is not None]
            max_len = max((len(v) for v in values), default=1)
            return b2s.string(max_length=max(max_len, 1))

        raise TypeError(
            f"No blosc2 spec for Arrow type {pa_type!r}. "
            "Supported: int8/16/32/64, uint8/16/32/64, float32/64, bool, string."
        )

    # Build CompiledSchema from Arrow schema: one CompiledColumn per field,
    # with no default value and no per-column storage overrides.
    columns: list[CompiledColumn] = []
    for field in arrow_table.schema:
        name = field.name
        _validate_column_name(name)
        spec = _arrow_type_to_spec(field.type, arrow_table.column(name))
        col_config = ColumnConfig(cparams=None, dparams=None, chunks=None, blocks=None)
        columns.append(
            CompiledColumn(
                name=name,
                py_type=spec.python_type,
                spec=spec,
                dtype=spec.dtype,
                default=MISSING,
                config=col_config,
                display_width=compute_display_width(spec),
            )
        )

    schema = CompiledSchema(
        row_cls=None,
        columns=columns,
        columns_by_name={col.name: col for col in columns},
    )

    n = len(arrow_table)
    # Allocate at least one slot so an empty Arrow table still gets
    # non-empty backing arrays.
    capacity = max(n, 1)
    default_chunks, default_blocks = compute_chunks_blocks((capacity,))
    mem_storage = InMemoryTableStorage()

    new_valid = mem_storage.create_valid_rows(
        shape=(capacity,),
        chunks=default_chunks,
        blocks=default_blocks,
    )
    new_cols: dict[str, blosc2.NDArray] = {}
    for col in columns:
        new_cols[col.name] = mem_storage.create_column(
            col.name,
            dtype=col.dtype,
            shape=(capacity,),
            chunks=default_chunks,
            blocks=default_blocks,
            cparams=None,
            dparams=None,
        )

    # Assemble the instance attribute by attribute; __init__ is bypassed
    # because storage and schema have already been prepared above.
    obj = cls.__new__(cls)
    obj._row_type = None
    obj._validate = False
    obj._table_cparams = None
    obj._table_dparams = None
    obj._storage = mem_storage
    obj._read_only = False
    obj._schema = schema
    obj._cols = new_cols
    obj._col_widths = {col.name: max(len(col.name), col.display_width) for col in columns}
    obj.col_names = [col.name for col in columns]
    obj.row = _RowIndexer(obj)
    obj.auto_compact = False
    obj.base = None
    obj._valid_rows = new_valid
    obj._n_rows = 0
    obj._last_pos = 0

    if n > 0:
        # Write each column directly: one bulk slice assignment per column.
        # String/bytes columns (dtype.kind in "US") go through to_pylist() so
        # that NumPy performs the fixed-width coercion. Every other column is
        # converted with to_numpy(zero_copy_only=False) and then cast to the
        # column dtype.
        # NOTE(review): Arrow nulls get no special treatment here. Presumably
        # to_pylist() yields None for them (NumPy would store the text 'None'
        # in a unicode column), and numeric columns containing nulls may not
        # survive the astype() cast intact. Confirm the intended behaviour.
        for col in columns:
            arrow_col = arrow_table.column(col.name)
            if col.dtype.kind in "US":
                arr = np.array(arrow_col.to_pylist(), dtype=col.dtype)
            else:
                arr = arrow_col.to_numpy(zero_copy_only=False).astype(col.dtype)
            new_cols[col.name][:n] = arr

        # Mark the first n slots live and point the write cursor just past them.
        new_valid[:n] = True
        obj._n_rows = n
        obj._last_pos = n

    return obj
+ """ + import csv + + arrays = [self[name].to_numpy() for name in self.col_names] + + with open(path, "w", newline="") as f: + writer = csv.writer(f, delimiter=sep) + if header: + writer.writerow(self.col_names) + for row in zip(*arrays, strict=True): + writer.writerow(row) + + @staticmethod + def _csv_col_to_array(raw: list[str], col, nv) -> np.ndarray: + """Convert a list of raw CSV strings to a numpy array for *col*.""" + if col.dtype == np.bool_: + + def _parse(v, _nv=nv): + stripped = v.strip() + if stripped == "" and _nv is not None: + return _nv + return stripped in ("True", "true", "1") + + return np.array([_parse(v) for v in raw], dtype=np.bool_) + if col.dtype.kind == "S": + prepared: list = [nv if (v.strip() == "" and nv is not None) else v.encode() for v in raw] + return np.array(prepared, dtype=col.dtype) + prepared2 = [nv if (v.strip() == "" and nv is not None) else v for v in raw] + return np.array(prepared2, dtype=col.dtype) + + @classmethod + def from_csv( + cls, + path: str, + row_cls, + *, + header: bool = True, + sep: str = ",", + ) -> CTable: + """Build a :class:`CTable` from a CSV file. + + Schema comes from *row_cls* (a dataclass) — CTable is always typed. + All rows are read in a single pass into per-column Python lists, then + each column is bulk-written into a pre-allocated NDArray (one slice + assignment per column, no ``extend()``). + + Parameters + ---------- + path: + Source CSV file path. + row_cls: + A dataclass whose fields define the column names and types. + header: + If ``True`` (default), the first row is treated as a header and + skipped. Column order in the file must match *row_cls* field + order regardless. + sep: + Field delimiter. Defaults to ``","``; use ``"\\t"`` for TSV. + + Returns + ------- + CTable + A new in-memory CTable containing all rows from the CSV file. + + Raises + ------ + TypeError + If *row_cls* is not a dataclass. + ValueError + If a row has a different number of fields than the schema. 
+ """ + import csv + + schema = compile_schema(row_cls) + ncols = len(schema.columns) + + # Accumulate values per column as Python lists (one pass through file) + col_data: list[list] = [[] for _ in range(ncols)] + + with open(path, newline="") as f: + reader = csv.reader(f, delimiter=sep) + if header: + next(reader) + for lineno, row in enumerate(reader, start=2 if header else 1): + if len(row) != ncols: + raise ValueError(f"Line {lineno}: expected {ncols} fields, got {len(row)}.") + for i, val in enumerate(row): + col_data[i].append(val) + + n = len(col_data[0]) if ncols > 0 else 0 + capacity = max(n, 1) + default_chunks, default_blocks = compute_chunks_blocks((capacity,)) + mem_storage = InMemoryTableStorage() + + new_valid = mem_storage.create_valid_rows( + shape=(capacity,), + chunks=default_chunks, + blocks=default_blocks, + ) + new_cols: dict[str, blosc2.NDArray] = {} + for col in schema.columns: + new_cols[col.name] = mem_storage.create_column( + col.name, + dtype=col.dtype, + shape=(capacity,), + chunks=default_chunks, + blocks=default_blocks, + cparams=None, + dparams=None, + ) + + obj = cls.__new__(cls) + obj._row_type = row_cls + obj._validate = True + obj._table_cparams = None + obj._table_dparams = None + obj._storage = mem_storage + obj._read_only = False + obj._schema = schema + obj._cols = new_cols + obj._col_widths = {col.name: max(len(col.name), col.display_width) for col in schema.columns} + obj.col_names = [col.name for col in schema.columns] + obj.row = _RowIndexer(obj) + obj.auto_compact = False + obj.base = None + obj._valid_rows = new_valid + obj._n_rows = 0 + obj._last_pos = 0 + + if n > 0: + for i, col in enumerate(schema.columns): + nv = getattr(col.spec, "null_value", None) + arr = cls._csv_col_to_array(col_data[i], col, nv) + new_cols[col.name][:n] = arr + new_valid[:n] = True + obj._n_rows = n + obj._last_pos = n + + return obj + + # ------------------------------------------------------------------ + # Schema mutations: add / drop / 
def add_column(
    self,
    name: str,
    spec: SchemaSpec,
    default,
    *,
    cparams: dict | None = None,
) -> None:
    """Append a new column whose live rows all hold *default*.

    Parameters
    ----------
    name:
        Name of the new column; checked with the same naming rules as
        schema fields.
    spec:
        Schema descriptor for the column, e.g. ``b2.int64(ge=0)`` or
        ``b2.string()``.
    default:
        Value stored in every currently live row. It is converted to
        *spec*'s dtype before being written.
    cparams:
        Optional compression parameters for the new column's NDArray.

    Raises
    ------
    ValueError
        If the table is read-only, is a view, or already has a column
        called *name*.
    TypeError
        If *default* cannot be converted to *spec*'s dtype.
    """
    # Preconditions, checked in a fixed order.
    if self._read_only:
        raise ValueError("Table is read-only (opened with mode='r').")
    if self.base is not None:
        raise ValueError("Cannot add a column to a view.")
    _validate_column_name(name)
    if name in self._cols:
        raise ValueError(f"Column {name!r} already exists.")

    # Convert the default once, surfacing conversion failures as TypeError.
    try:
        fill_value = spec.dtype.type(default)
    except (ValueError, OverflowError) as exc:
        raise TypeError(f"Cannot coerce default {default!r} to dtype {spec.dtype!r}: {exc}") from exc

    # The new column spans as many slots as the validity mask.
    n_slots = len(self._valid_rows)
    chunks, blocks = compute_chunks_blocks((n_slots,))
    column_arr = self._storage.create_column(
        name,
        dtype=spec.dtype,
        shape=(n_slots,),
        chunks=chunks,
        blocks=blocks,
        cparams=cparams,
        dparams=None,
    )

    # Only live positions receive the default value.
    live = np.flatnonzero(self._valid_rows[:])
    if live.size:
        column_arr[live] = fill_value

    descriptor = CompiledColumn(
        name=name,
        py_type=spec.python_type,
        spec=spec,
        dtype=spec.dtype,
        default=default,
        config=ColumnConfig(cparams=cparams, dparams=None, chunks=None, blocks=None),
        display_width=compute_display_width(spec),
    )

    # Register the column on the table ...
    self._cols[name] = column_arr
    self.col_names.append(name)
    self._col_widths[name] = max(len(name), descriptor.display_width)

    # ... and swap in a schema that includes it.
    extended_columns = self._schema.columns + [descriptor]
    self._schema = CompiledSchema(
        row_cls=self._schema.row_cls,
        columns=extended_columns,
        columns_by_name={**self._schema.columns_by_name, name: descriptor},
    )

    # File-backed tables persist the updated schema.
    if isinstance(self._storage, FileTableStorage):
        self._storage.save_schema(schema_to_dict(self._schema))
+ + Raises + ------ + ValueError + If the table is read-only, is a view, or *new* already exists. + KeyError + If *old* does not exist. + """ + if self._read_only: + raise ValueError("Table is read-only (opened with mode='r').") + if self.base is not None: + raise ValueError("Cannot rename a column in a view.") + if old not in self._cols: + raise KeyError(f"No column named {old!r}. Available: {self.col_names}") + if new in self._cols: + raise ValueError(f"Column {new!r} already exists.") + _validate_column_name(new) + + catalog = self._storage.load_index_catalog() + rebuild_kwargs = None + if old in catalog: + descriptor = catalog.pop(old) + self._validate_index_descriptor(old, descriptor) + rebuild_kwargs = self._index_create_kwargs_from_descriptor(descriptor) + self._drop_index_descriptor(old, descriptor) + self._storage.save_index_catalog(catalog) + + if isinstance(self._storage, FileTableStorage): + self._cols[new] = self._storage.rename_column(old, new) + else: + self._cols[new] = self._cols[old] + del self._cols[old] + + idx = self.col_names.index(old) + self.col_names[idx] = new + self._col_widths[new] = max(len(new), self._col_widths.pop(old)) + + old_compiled = self._schema.columns_by_name[old] + renamed = CompiledColumn( + name=new, + py_type=old_compiled.py_type, + spec=old_compiled.spec, + dtype=old_compiled.dtype, + default=old_compiled.default, + config=old_compiled.config, + display_width=old_compiled.display_width, + ) + new_columns = [renamed if c.name == old else c for c in self._schema.columns] + self._schema = CompiledSchema( + row_cls=self._schema.row_cls, + columns=new_columns, + columns_by_name={c.name: c for c in new_columns}, + ) + if isinstance(self._storage, FileTableStorage): + self._storage.save_schema(schema_to_dict(self._schema)) + if rebuild_kwargs is not None: + self.create_index(new, **rebuild_kwargs) + + # ------------------------------------------------------------------ + # Column access + # 
def __getattr__(self, s: str):
    """Expose columns as attributes, so ``table.colname`` returns a ``Column``.

    Python calls ``__getattr__`` only after normal attribute lookup has
    failed. Names that are not columns fall through to the default lookup,
    which raises :exc:`AttributeError`.
    """
    # Read ``_cols`` without going through ``self._cols``. On an instance
    # whose ``_cols`` is not set yet (fresh from ``__new__``, or while
    # ``copy`` / ``pickle`` probe it for optional hooks), ``self._cols``
    # would call this method again and recurse until RecursionError.
    try:
        cols = object.__getattribute__(self, "_cols")
    except AttributeError:
        cols = None
    if cols is not None and s in cols:
        return Column(self, s)
    return super().__getattribute__(s)
+ ) + return cols, ascending + + def _build_lex_keys( + self, + cols: list[str], + ascending: list[bool], + live_pos: np.ndarray, + n: int, + ) -> list[np.ndarray]: + """Build the key list for np.lexsort (innermost = last = primary key). + + For nullable columns a null-indicator key (0=non-null, 1=null) is + inserted immediately after the value key, making it more significant. + This ensures nulls sort last regardless of ascending/descending order. + """ + lex_keys = [] + for name, asc in zip(reversed(cols), reversed(ascending), strict=True): + raw = self._cols[name][live_pos] + col_info = self._schema.columns_by_name.get(name) + nv = getattr(col_info.spec, "null_value", None) if col_info else None + + # Value key + if not asc: + if raw.dtype.kind in "US": + # strings can't be negated — invert via rank + rank = np.argsort(np.argsort(raw, kind="stable"), kind="stable") + lex_keys.append((n - 1 - rank).astype(np.intp)) + elif np.issubdtype(raw.dtype, np.unsignedinteger): + lex_keys.append(-raw.astype(np.int64)) + else: + lex_keys.append(-raw) + else: + lex_keys.append(raw) + + # Null indicator key — more significant than the value key above, + # so nulls always sort last (0 before 1 → non-null before null). + if nv is not None: + if isinstance(nv, float) and np.isnan(nv): + null_ind = np.isnan(raw).astype(np.intp) + else: + null_ind = (raw == nv).astype(np.intp) + lex_keys.append(null_ind) + + return lex_keys + + def sort_by( + self, + cols: str | list[str], + ascending: bool | list[bool] = True, + *, + inplace: bool = False, + ) -> CTable: + """Return a copy of the table sorted by one or more columns. + + Parameters + ---------- + cols: + Column name or list of column names to sort by. When multiple + columns are given, the first is the primary key, the second is + the tiebreaker, and so on. + ascending: + Sort direction. A single bool applies to all keys; a list must + have the same length as *cols*. 
def _empty_copy(self) -> CTable:
    """Return a new empty in-memory CTable with the same schema.

    The copy is sized for the current number of live rows (at least one
    slot), not for the physical capacity of this table. Per-column storage
    settings come from ``_resolve_column_storage``.

    The schema object and the row type are shared with this table. The
    column-width mapping is copied, so later column renames or drops on
    either table do not affect the other.
    """
    from blosc2 import compute_chunks_blocks

    capacity = max(self._n_rows, 1)
    default_chunks, default_blocks = compute_chunks_blocks((capacity,))
    mem_storage = InMemoryTableStorage()

    new_valid = mem_storage.create_valid_rows(
        shape=(capacity,),
        chunks=default_chunks,
        blocks=default_blocks,
    )
    new_cols = {}
    for col in self._schema.columns:
        col_storage = self._resolve_column_storage(col, default_chunks, default_blocks)
        new_cols[col.name] = mem_storage.create_column(
            col.name,
            dtype=col.dtype,
            shape=(capacity,),
            chunks=col_storage["chunks"],
            blocks=col_storage["blocks"],
            cparams=col_storage.get("cparams"),
            dparams=col_storage.get("dparams"),
        )

    # Assemble the instance attribute by attribute, bypassing __init__.
    obj = CTable.__new__(CTable)
    obj._schema = self._schema
    obj._row_type = self._row_type
    obj._table_cparams = self._table_cparams
    obj._table_dparams = self._table_dparams
    obj._storage = mem_storage
    obj._valid_rows = new_valid
    obj._cols = new_cols
    # Copy rather than alias: the schema-mutation methods update this dict
    # in place, which would otherwise leak into the source table.
    obj._col_widths = dict(self._col_widths)
    obj.col_names = [col.name for col in self._schema.columns]
    obj.row = _RowIndexer(obj)
    obj._n_rows = 0
    # NOTE(review): other constructors start with _last_pos = 0; None here is
    # presumably a "not yet known" marker that callers overwrite — confirm.
    obj._last_pos = None
    obj._read_only = False
    obj.base = None
    obj.auto_compact = self.auto_compact
    obj._validate = self._validate
    return obj
column_schema(self, name: str) -> CompiledColumn: + """Return the :class:`CompiledColumn` descriptor for *name*. + + Raises + ------ + KeyError + If *name* is not a column in this table. + """ + try: + return self._schema.columns_by_name[name] + except KeyError: + raise KeyError(f"No column named {name!r}. Available: {self.col_names}") from None + + def schema_dict(self) -> dict[str, Any]: + """Return a JSON-compatible dict describing this table's schema.""" + return schema_to_dict(self._schema) + + # ------------------------------------------------------------------ + # Index management + # ------------------------------------------------------------------ + + @property + def _root_table(self) -> CTable: + """Return the root (non-view) table; *self* if not a view.""" + t = self + while t.base is not None: + t = t.base + return t + + def _mark_all_indexes_stale(self) -> None: + """Bump value_epoch and mark every catalog entry stale on the root table.""" + root = self._root_table + root._storage.bump_value_epoch() + catalog = root._storage.load_index_catalog() + if not catalog: + return + changed = False + for desc in catalog.values(): + if not desc.get("stale", False): + desc["stale"] = True + changed = True + if changed: + root._storage.save_index_catalog(catalog) + + @staticmethod + def _validate_index_descriptor(col_name: str, descriptor: dict) -> None: + """Raise ValueError when an index catalog entry is malformed.""" + if not isinstance(descriptor, dict): + raise ValueError(f"Malformed index metadata for column {col_name!r}: descriptor must be a dict.") + token = descriptor.get("token") + if not isinstance(token, str) or not token: + raise ValueError(f"Malformed index metadata for column {col_name!r}: missing token.") + kind = descriptor.get("kind") + if kind not in {"summary", "bucket", "partial", "full"}: + raise ValueError(f"Malformed index metadata for column {col_name!r}: invalid kind {kind!r}.") + if kind == "bucket" and not 
isinstance(descriptor.get("bucket"), dict): + raise ValueError(f"Malformed index metadata for column {col_name!r}: missing bucket payload.") + if kind == "partial" and not isinstance(descriptor.get("partial"), dict): + raise ValueError(f"Malformed index metadata for column {col_name!r}: missing partial payload.") + if kind == "full" and not isinstance(descriptor.get("full"), dict): + raise ValueError(f"Malformed index metadata for column {col_name!r}: missing full payload.") + + def _drop_index_descriptor(self, col_name: str, descriptor: dict) -> None: + """Delete sidecars/cache for a catalog descriptor without touching the column mapping.""" + from pathlib import Path + + from blosc2.indexing import ( + _IN_MEMORY_INDEXES, + _PERSISTENT_INDEXES, + _array_key, + _clear_cached_data, + _drop_descriptor_sidecars, + _is_persistent_array, + ) + + col_arr = self._cols.get(col_name) + token = descriptor["token"] + + if col_arr is not None: + _clear_cached_data(col_arr, token) + + if col_arr is not None and _is_persistent_array(col_arr): + arr_key = _array_key(col_arr) + store = _PERSISTENT_INDEXES.get(arr_key) + if store is not None: + store["indexes"].pop(token, None) + elif col_arr is not None: + store = _IN_MEMORY_INDEXES.get(id(col_arr)) + if store is not None: + store["indexes"].pop(token, None) + + _drop_descriptor_sidecars(descriptor) + + anchor = self._storage.index_anchor_path(col_name) + if anchor is not None: + proxy_key = ("persistent", str(Path(anchor).resolve())) + _PERSISTENT_INDEXES.pop(proxy_key, None) + with contextlib.suppress(OSError): + os.rmdir(os.path.dirname(anchor)) + + def _index_create_kwargs_from_descriptor(self, descriptor: dict) -> dict[str, Any]: + """Return create_index kwargs that rebuild an existing descriptor.""" + build = "ooc" if bool(descriptor.get("ooc", False)) else "memory" + return { + "kind": descriptor["kind"], + "optlevel": int(descriptor.get("optlevel", 5)), + "name": descriptor.get("name") or None, + "build": build, + 
"cparams": descriptor.get("cparams"), + } + + def _build_index_persistent( + self, + col_name: str, + col_arr: blosc2.NDArray, + *, + kind: str, + optlevel: int, + name_hint: str | None, + build: str, + tmpdir: str | None, + cparams_obj, + ) -> dict: + """Build index sidecar files for a persistent-table column; return the descriptor.""" + import tempfile + from pathlib import Path + + from blosc2.indexing import ( + _PERSISTENT_INDEXES, + _array_key, + _build_bucket_descriptor, + _build_bucket_descriptor_ooc, + _build_descriptor, + _build_full_descriptor, + _build_full_descriptor_ooc, + _build_levels_descriptor, + _build_levels_descriptor_ooc, + _build_partial_descriptor, + _build_partial_descriptor_ooc, + _copy_descriptor, + _field_target_descriptor, + _resolve_full_index_tmpdir, + _resolve_ooc_mode, + _target_token, + _values_for_target, + ) + + anchor = self._storage.index_anchor_path(col_name) + os.makedirs(os.path.dirname(anchor), exist_ok=True) + proxy = _CTableIndexProxy(col_arr, anchor) + proxy_key = _array_key(proxy) + _PERSISTENT_INDEXES.pop(proxy_key, None) # clear any stale cache entry + + target = _field_target_descriptor(None) + token = _target_token(target) + persistent = True + dtype = col_arr.dtype + use_ooc = _resolve_ooc_mode(kind, build) + + if use_ooc: + resolved_tmpdir = _resolve_full_index_tmpdir(proxy, tmpdir) + levels = _build_levels_descriptor_ooc(proxy, target, token, kind, dtype, persistent, cparams_obj) + bucket = ( + _build_bucket_descriptor_ooc( + proxy, target, token, kind, dtype, optlevel, persistent, cparams_obj + ) + if kind == "bucket" + else None + ) + partial = ( + _build_partial_descriptor_ooc( + proxy, target, token, kind, dtype, optlevel, persistent, cparams_obj + ) + if kind == "partial" + else None + ) + full = None + if kind == "full": + with tempfile.TemporaryDirectory(prefix="blosc2-index-ooc-", dir=resolved_tmpdir) as td: + full = _build_full_descriptor_ooc( + proxy, target, token, kind, dtype, persistent, Path(td), 
cparams_obj + ) + descriptor = _build_descriptor( + proxy, + target, + token, + kind, + optlevel, + persistent, + True, + name_hint, + dtype, + levels, + bucket, + partial, + full, + cparams_obj, + ) + else: + values = _values_for_target(proxy, target) + levels = _build_levels_descriptor( + proxy, target, token, kind, dtype, values, persistent, cparams_obj + ) + bucket = ( + _build_bucket_descriptor(proxy, token, kind, values, optlevel, persistent, cparams_obj) + if kind == "bucket" + else None + ) + partial = ( + _build_partial_descriptor(proxy, token, kind, values, optlevel, persistent, cparams_obj) + if kind == "partial" + else None + ) + full = ( + _build_full_descriptor(proxy, token, kind, values, persistent, cparams_obj) + if kind == "full" + else None + ) + descriptor = _build_descriptor( + proxy, + target, + token, + kind, + optlevel, + persistent, + False, + name_hint, + dtype, + levels, + bucket, + partial, + full, + cparams_obj, + ) + + result = _copy_descriptor(descriptor) + _PERSISTENT_INDEXES.pop(proxy_key, None) # evict proxy to avoid memory leak + return result + + def create_index( + self, + col_name: str, + *, + kind: blosc2.IndexKind = blosc2.IndexKind.BUCKET, + optlevel: int = 5, + name: str | None = None, + build: str = "auto", + tmpdir: str | None = None, + **kwargs, + ) -> CTableIndex: + """Build and register an index for a column. + + Parameters + ---------- + col_name: + Name of the column to index. + kind: + Index kind. One of :attr:`blosc2.IndexKind.BUCKET` (default), + :attr:`blosc2.IndexKind.PARTIAL`, or :attr:`blosc2.IndexKind.FULL`. + optlevel: + Optimisation level (1–9). Higher values give more precise pruning + at the cost of larger index files. Default is 5. + name: + Optional human-readable label for the index. + build: + Build strategy: ``'auto'``, ``'memory'``, or ``'ooc'`` (out-of-core). + tmpdir: + Temporary directory for out-of-core builds. 
``None`` means use the + column's own directory (persistent tables) or the system temporary + directory (in-memory tables). + **kwargs: + Pass ``cparams=`` to customise index compression. + + Returns + ------- + CTableIndex + A handle on the newly created index. + + Raises + ------ + ValueError + If called on a view. + KeyError + If *col_name* is not a column of this table. + """ + if self.base is not None: + raise ValueError("Cannot create an index on a view.") + if col_name not in self._cols: + raise KeyError(f"No column named {col_name!r}. Available: {self.col_names}") + catalog = self._storage.load_index_catalog() + if col_name in catalog: + raise ValueError( + f"Index already exists for column {col_name!r}. " + "Call rebuild_index() to replace it or drop_index() first." + ) + + from blosc2.indexing import ( + _IN_MEMORY_INDEXES, + _copy_descriptor, + _normalize_build_mode, + _normalize_index_cparams, + _normalize_index_kind, + ) + from blosc2.indexing import ( + create_index as _ix_create_index, + ) + + cparams_obj = _normalize_index_cparams(kwargs.pop("cparams", None)) + if kwargs: + raise TypeError(f"unexpected keyword argument(s): {', '.join(sorted(kwargs))}") + + kind_str = _normalize_index_kind(kind) + build_str = _normalize_build_mode(build) + col_arr = self._cols[col_name] + is_persistent = self._storage.index_anchor_path(col_name) is not None + + if is_persistent: + descriptor = self._build_index_persistent( + col_name, + col_arr, + kind=kind_str, + optlevel=optlevel, + name_hint=name, + build=build_str, + tmpdir=tmpdir, + cparams_obj=cparams_obj, + ) + else: + _ix_create_index( + col_arr, + field=None, + kind=blosc2.IndexKind(kind_str), + optlevel=optlevel, + name=name, + build=build, + tmpdir=tmpdir, + cparams=cparams_obj, + ) + store = _IN_MEMORY_INDEXES[id(col_arr)] + descriptor = _copy_descriptor(store["indexes"]["__self__"]) + + value_epoch, _ = self._storage.get_epoch_counters() + descriptor["built_value_epoch"] = value_epoch + + catalog = 
def drop_index(self, col_name: str) -> None:
    """Remove the index registered for *col_name*.

    The catalog entry is validated, handed to ``_drop_index_descriptor``
    for cleanup, and the catalog without that entry is then saved.

    Parameters
    ----------
    col_name:
        Column whose index should be dropped.

    Raises
    ------
    ValueError
        If called on a view.
    KeyError
        If no index exists for *col_name*.
    """
    if self.base is not None:
        raise ValueError("Cannot drop an index from a view.")

    entries = self._storage.load_index_catalog()
    if col_name in entries:
        # Work on the loaded copy; it is only written back once validation
        # and descriptor cleanup have both completed.
        removed = entries.pop(col_name)
        self._validate_index_descriptor(col_name, removed)
        self._drop_index_descriptor(col_name, removed)
        self._storage.save_index_catalog(entries)
        return

    raise KeyError(f"No index found for column {col_name!r}.")


def rebuild_index(self, col_name: str) -> CTableIndex:
    """Drop the index for *col_name* and create it again with the same parameters.

    The creation keyword arguments are derived from the existing descriptor
    before the index is dropped.

    Parameters
    ----------
    col_name:
        Column whose index should be rebuilt.

    Returns
    -------
    CTableIndex
        A handle on the newly built index.

    Raises
    ------
    ValueError
        If called on a view.
    KeyError
        If no index exists for *col_name*.
    """
    if self.base is not None:
        raise ValueError("Cannot rebuild an index on a view.")

    entries = self._storage.load_index_catalog()
    if col_name not in entries:
        raise KeyError(f"No index found for column {col_name!r}.")

    previous = entries[col_name]
    self._validate_index_descriptor(col_name, previous)
    # Capture the creation arguments first: the descriptor is gone after the drop.
    recreate_args = self._index_create_kwargs_from_descriptor(previous)

    self.drop_index(col_name)
    return self.create_index(col_name, **recreate_args)
+ + Parameters + ---------- + col_name: + Column whose index should be compacted. + + Returns + ------- + CTableIndex + A handle reflecting the (possibly updated) index descriptor. + + Raises + ------ + ValueError + If called on a view. + KeyError + If no index exists for *col_name*. + """ + if self.base is not None: + raise ValueError("Cannot compact an index on a view.") + + from blosc2.indexing import ( + _IN_MEMORY_INDEXES, + _PERSISTENT_INDEXES, + _array_key, + _copy_descriptor, + _default_index_store, + _is_persistent_array, + ) + from blosc2.indexing import ( + compact_index as _ix_compact_index, + ) + + catalog = self._storage.load_index_catalog() + if col_name not in catalog: + raise KeyError(f"No index found for column {col_name!r}.") + + col_arr = self._cols[col_name] + descriptor = catalog[col_name] + + if _is_persistent_array(col_arr): + anchor = self._storage.index_anchor_path(col_name) + proxy = _CTableIndexProxy(col_arr, anchor) + proxy_key = _array_key(proxy) + store = _default_index_store() + store["indexes"][descriptor["token"]] = descriptor + _PERSISTENT_INDEXES[proxy_key] = store + try: + _ix_compact_index(proxy) + updated_store = _PERSISTENT_INDEXES.get(proxy_key) or store + updated_desc = _copy_descriptor(updated_store["indexes"][descriptor["token"]]) + finally: + _PERSISTENT_INDEXES.pop(proxy_key, None) + updated_desc["built_value_epoch"] = descriptor.get("built_value_epoch", 0) + catalog[col_name] = updated_desc + self._storage.save_index_catalog(catalog) + return CTableIndex(self, col_name, updated_desc) + else: + _ix_compact_index(col_arr) + store = _IN_MEMORY_INDEXES.get(id(col_arr)) + if store: + token = descriptor["token"] + updated_desc = _copy_descriptor(store["indexes"].get(token, descriptor)) + updated_desc["built_value_epoch"] = descriptor.get("built_value_epoch", 0) + catalog[col_name] = updated_desc + self._storage.save_index_catalog(catalog) + return CTableIndex(self, col_name, updated_desc) + return CTableIndex(self, col_name, 
def index(self, col_name: str) -> CTableIndex:
    """Return the index handle for *col_name*.

    The catalog is read from the root table's storage.

    Parameters
    ----------
    col_name:
        Column name to look up.

    Returns
    -------
    CTableIndex

    Raises
    ------
    KeyError
        If no index exists for *col_name*.
    """
    entries = self._root_table._storage.load_index_catalog()
    if col_name in entries:
        return CTableIndex(self, col_name, entries[col_name])
    raise KeyError(f"No index found for column {col_name!r}.")


@property
def indexes(self) -> list[CTableIndex]:
    """Return one :class:`CTableIndex` handle per entry in the root table's index catalog."""
    handles = []
    for name, descriptor in self._root_table._storage.load_index_catalog().items():
        handles.append(CTableIndex(self, name, descriptor))
    return handles


@staticmethod
def _find_indexed_columns(root_cols, catalog, operands):
    """Return live indexed columns referenced by *operands* in expression order.

    An operand matches a column only by identity (``is``) with the column's
    array in *root_cols*. Each match is validated and skipped when its
    descriptor is flagged ``stale``. Every column is reported at most once,
    as a ``(col_name, col_arr, descriptor)`` tuple.
    """
    matches = []
    reported = set()
    ndarray_operands = (op for op in operands.values() if isinstance(op, blosc2.NDArray))
    for operand in ndarray_operands:
        for col_name, col_arr in root_cols.items():
            if col_arr is operand and col_name not in reported and col_name in catalog:
                descriptor = catalog[col_name]
                CTable._validate_index_descriptor(col_name, descriptor)
                if not descriptor.get("stale", False):
                    matches.append((col_name, col_arr, descriptor))
                    reported.add(col_name)
    return matches
+ """ + from blosc2.indexing import ( + _IN_MEMORY_INDEXES, + _PERSISTENT_INDEXES, + _array_key, + _default_index_store, + _is_persistent_array, + evaluate_bucket_query, + evaluate_segment_query, + plan_query, + ) + + root = self._root_table + catalog = root._storage.load_index_catalog() + if not catalog: + return None + + expression = expr_result.expression + operands = dict(expr_result.operands) + + indexed_columns = self._find_indexed_columns(root._cols, catalog, operands) + if not indexed_columns: + return None + + primary_col_name, primary_col_arr, _ = indexed_columns[0] + + # Inject every usable table-owned descriptor so plan_query can combine them. + for _col_name, col_arr, descriptor in indexed_columns: + arr_key = _array_key(col_arr) + if _is_persistent_array(col_arr): + store = _PERSISTENT_INDEXES.get(arr_key) or _default_index_store() + store["indexes"][descriptor["token"]] = descriptor + _PERSISTENT_INDEXES[arr_key] = store + else: + store = _IN_MEMORY_INDEXES.get(id(col_arr)) or _default_index_store() + store["indexes"][descriptor["token"]] = descriptor + _IN_MEMORY_INDEXES[id(col_arr)] = store + + where_dict = {"_where_x": primary_col_arr} + merged_operands = {**operands, "_where_x": primary_col_arr} + + plan = plan_query(expression, merged_operands, where_dict) + if not plan.usable: + return None + + if plan.exact_positions is not None: + return np.asarray(plan.exact_positions, dtype=np.int64) + + if plan.bucket_masks is not None: + _, positions = evaluate_bucket_query( + expression, merged_operands, {}, where_dict, plan, return_positions=True + ) + return np.asarray(positions, dtype=np.int64) + + if plan.candidate_units is not None and plan.segment_len is not None: + _, positions = evaluate_segment_query( + expression, merged_operands, {}, where_dict, plan, return_positions=True + ) + return np.asarray(positions, dtype=np.int64) + + return None + + @property + def info_items(self) -> list[tuple[str, object]]: + """Structured summary items used by 
:meth:`info`.""" + storage_type = "persistent" if isinstance(self._storage, FileTableStorage) else "in-memory" + urlpath = self._storage._root if isinstance(self._storage, FileTableStorage) else None + schema_summary = { + name: _InfoLiteral(self._dtype_info_label(self._cols[name].dtype)) for name in self.col_names + } + + index_summary = {} + for idx in self.indexes: + stale = " stale" if idx.stale else "" + label = f" name={idx.name!r}" if idx.name and idx.name != "__self__" else "" + stats = idx.storage_stats() + if stats is None: + suffix = "size=n/a (sidecars not directly addressable)" + else: + _, cbytes, _ = stats + suffix = f"cbytes={format_nbytes_info(cbytes)}" + index_summary[idx.col_name] = f"[{idx.kind}{stale}{label}] {suffix}" + + items = [ + ("type", self.__class__.__name__), + ("storage", storage_type), + ("rows", self.nrows), + ("columns", self.ncols), + ("view", self.base is not None), + ("nbytes", format_nbytes_info(self.nbytes)), + ("cbytes", format_nbytes_info(self.cbytes)), + ("cratio", f"{self.cratio:.1f}x"), + ("schema", schema_summary), + ( + "valid_rows_mask", + f"cbytes={format_nbytes_info(self._valid_rows.cbytes)}", + ), + ("indexes", index_summary if index_summary else "none"), + ] + if urlpath is not None: + items.insert(2, ("urlpath", urlpath)) + open_mode = self._storage.open_mode() + if open_mode is not None: + items.insert(3, ("open_mode", open_mode)) + return items + + @staticmethod + def _dtype_info_label(dtype: np.dtype) -> str: + """Return a compact dtype label for info reports.""" + if dtype.kind == "U": + nchars = dtype.itemsize // 4 + return f"U{nchars} (Unicode, max {nchars} chars)" + if dtype.kind == "S": + return f"S{dtype.itemsize}" + return str(dtype) + + @property + def info(self) -> _CTableInfoReporter: + """Get information about this table. 
def delete(self, ind: int | slice | str | Iterable) -> None:
    """Mark rows as deleted by clearing their entries in the valid-rows mask.

    *ind* addresses rows by their position among the currently valid rows,
    not by physical slot: it is applied to the array of positions where the
    mask is ``True``. Column data is not touched.

    Parameters
    ----------
    ind:
        An integer (Python ``int`` or NumPy integer scalar), a ``slice``, or
        a non-string iterable; an iterable is materialised into a list and
        used as a NumPy index. Strings and bytes are rejected.

    Raises
    ------
    ValueError
        If the table is read-only or is a view.
    TypeError
        If *ind* is not an integer, a slice, or a non-string iterable.
    IndexError
        Propagated from NumPy when a position is out of range.
    """
    if self._read_only:
        raise ValueError("Table is read-only (opened with mode='r').")
    if self.base is not None:
        raise ValueError("Cannot delete rows from a view.")

    # Validate the selector before reading the mask.
    if isinstance(ind, Iterable) and not isinstance(ind, (str, bytes)):
        ind = list(ind)
    elif isinstance(ind, np.integer):
        # NumPy integer scalars (e.g. an element of an index array) are not
        # instances of ``int``; accept them instead of raising TypeError.
        ind = int(ind)
    elif not isinstance(ind, (int, slice)):
        raise TypeError(f"Invalid type '{type(ind)}'")

    valid_rows_np = self._valid_rows[:]
    true_pos = np.flatnonzero(valid_rows_np)

    false_pos = true_pos[ind]
    # Repeated positions in *ind* must only be counted once.
    n_deleted = int(np.unique(false_pos).size)

    valid_rows_np[false_pos] = False
    self._valid_rows[:] = valid_rows_np  # write back in-place; no new array created
    self._n_rows -= n_deleted
    self._last_pos = None  # recalculate on next write
    self._storage.bump_visibility_epoch()
@profile
def where(self, expr_result) -> CTable:
    """Return a view restricted to the rows selected by a boolean predicate.

    Accepted inputs are a boolean NumPy array (converted with
    ``blosc2.asarray``), a ``Column`` (its ``_raw_col`` is used), a boolean
    ``blosc2.NDArray`` or a boolean ``blosc2.LazyExpr``. For a lazy
    expression an index-based resolution is tried first through
    ``_try_index_where``; otherwise the predicate is computed, cut or
    zero-padded to the length of the valid-rows mask, and combined with it.

    Raises
    ------
    TypeError
        If the predicate is not a boolean ``blosc2.NDArray`` or ``LazyExpr``
        after the conversions above.
    """
    if isinstance(expr_result, np.ndarray) and expr_result.dtype == np.bool_:
        expr_result = blosc2.asarray(expr_result)
    if isinstance(expr_result, Column):
        expr_result = expr_result._raw_col

    is_blosc2_operand = isinstance(expr_result, (blosc2.NDArray, blosc2.LazyExpr))
    if not (is_blosc2_operand and getattr(expr_result, "dtype", None) == np.bool_):
        raise TypeError(f"Expected boolean blosc2.NDArray or LazyExpr, got {type(expr_result).__name__}")

    is_lazy = isinstance(expr_result, blosc2.LazyExpr)

    # Attempt index-accelerated filtering before falling back to a full scan.
    if is_lazy:
        positions = self._try_index_where(expr_result)
        if positions is not None:
            n_slots = len(self._valid_rows)
            hits = np.zeros(n_slots, dtype=bool)
            in_range = (positions >= 0) & (positions < n_slots)
            hits[positions[in_range]] = True
            # Index hits may include deleted rows, so intersect with the mask.
            hits &= self._valid_rows[:]
            return self.view(blosc2.asarray(hits))

    row_filter = expr_result.compute() if is_lazy else expr_result

    n_slots = len(self._valid_rows)
    n_filter = len(row_filter)
    if n_filter > n_slots:
        row_filter = row_filter[:n_slots]
    elif n_filter < n_slots:
        padded = blosc2.zeros(n_slots, dtype=np.bool_)
        padded[:n_filter] = row_filter[:]
        row_filter = padded

    return self.view((row_filter & self._valid_rows).compute())


def _run_row_logic(self, ind: int | slice | str | Iterable) -> CTable:
    """Return a view holding only the valid rows selected by *ind*.

    *ind* is applied to the positions where the valid-rows mask is ``True``.
    A non-string iterable is first materialised into a list.
    """
    current_mask = self._valid_rows[:]
    live_positions = np.flatnonzero(current_mask)

    if isinstance(ind, Iterable) and not isinstance(ind, (str, bytes)):
        ind = list(ind)

    selected = np.zeros_like(current_mask, dtype=bool)
    selected[live_positions[ind]] = True
    return self.view(blosc2.asarray(selected))
# Directory inside the table root that holds per-column index sidecar files.
_INDEXES_DIR = "_indexes"


# ---------------------------------------------------------------------------
# Abstract base
# ---------------------------------------------------------------------------


class TableStorage:
    """Interface that CTable uses to create/open its backing arrays.

    Every method except :meth:`discard` raises :exc:`NotImplementedError`
    here; concrete backends override them.
    """

    def create_column(
        self,
        name: str,
        *,
        dtype: np.dtype,
        shape: tuple[int, ...],
        chunks: tuple[int, ...],
        blocks: tuple[int, ...],
        cparams: dict[str, Any] | None,
        dparams: dict[str, Any] | None,
    ) -> blosc2.NDArray:
        """Create and return the backing array for column *name*."""
        raise NotImplementedError

    def open_column(self, name: str) -> blosc2.NDArray:
        """Return the existing backing array for column *name*."""
        raise NotImplementedError

    def create_valid_rows(
        self,
        *,
        shape: tuple[int, ...],
        chunks: tuple[int, ...],
        blocks: tuple[int, ...],
    ) -> blosc2.NDArray:
        """Create and return the boolean valid-rows mask array."""
        raise NotImplementedError

    def open_valid_rows(self) -> blosc2.NDArray:
        """Return the existing valid-rows mask array."""
        raise NotImplementedError

    def save_schema(self, schema_dict: dict[str, Any]) -> None:
        """Persist *schema_dict*."""
        raise NotImplementedError

    def load_schema(self) -> dict[str, Any] | None:
        """Return the stored schema dict, or ``None`` if the backend keeps none."""
        raise NotImplementedError

    def table_exists(self) -> bool:
        """Return whether a stored table already exists for this backend."""
        raise NotImplementedError

    def is_read_only(self) -> bool:
        """Return whether the storage rejects modifications."""
        raise NotImplementedError

    def open_mode(self) -> str | None:
        """Return the mode the storage was opened with, or ``None``."""
        raise NotImplementedError

    def delete_column(self, name: str) -> None:
        """Remove the stored array for column *name*."""
        raise NotImplementedError

    def rename_column(self, old: str, new: str) -> blosc2.NDArray:
        """Move column *old* to *new* and return the array under the new name."""
        raise NotImplementedError

    def close(self) -> None:
        """Release the resources held by the storage."""
        raise NotImplementedError

    def discard(self) -> None:
        """Clean up resources without persisting changes back to the archive.

        The default implementation simply calls :meth:`close`.
        """
        self.close()

    # -- Index catalog and epoch helpers -------------------------------------

    def load_index_catalog(self) -> dict:
        """Return the current index catalog (column_name → descriptor dict)."""
        raise NotImplementedError

    def save_index_catalog(self, catalog: dict) -> None:
        """Persist *catalog* (column_name → descriptor dict)."""
        raise NotImplementedError

    def get_epoch_counters(self) -> tuple[int, int]:
        """Return ``(value_epoch, visibility_epoch)``."""
        raise NotImplementedError

    def bump_value_epoch(self) -> int:
        """Increment and return the value epoch (data values changed)."""
        raise NotImplementedError

    def bump_visibility_epoch(self) -> int:
        """Increment and return the visibility epoch (row set changed by delete)."""
        raise NotImplementedError

    def index_anchor_path(self, col_name: str) -> str | None:
        """Return the urlpath used as the anchor for index sidecar naming.

        Returns *None* for in-memory storage.  For file-backed storage returns
        a path of the form ``<root>/_indexes/<col_name>/_anchor``.
        """
        raise NotImplementedError


# ---------------------------------------------------------------------------
# In-memory backend
# ---------------------------------------------------------------------------


class InMemoryTableStorage(TableStorage):
    """All arrays are plain in-memory blosc2.NDArray objects.

    The index catalog and both epoch counters are kept as instance
    attributes; nothing is written to disk.
    """

    def __init__(self) -> None:
        self._index_catalog: dict = {}
        self._value_epoch: int = 0
        self._visibility_epoch: int = 0

    def create_column(self, name, *, dtype, shape, chunks, blocks, cparams, dparams):
        """Return a new zero-filled array; *name* is not used by this backend."""
        kwargs: dict[str, Any] = {"chunks": chunks, "blocks": blocks}
        # Only forward compression settings the caller actually supplied.
        if cparams is not None:
            kwargs["cparams"] = cparams
        if dparams is not None:
            kwargs["dparams"] = dparams
        return blosc2.zeros(shape, dtype=dtype, **kwargs)

    def open_column(self, name):
        """Always raise :exc:`RuntimeError`: there is nothing on disk to open."""
        raise RuntimeError("In-memory tables have no on-disk representation to open.")

    def create_valid_rows(self, *, shape, chunks, blocks):
        """Return a new all-``False`` boolean mask array."""
        return blosc2.zeros(shape, dtype=np.bool_, chunks=chunks, blocks=blocks)

    def open_valid_rows(self):
        """Always raise :exc:`RuntimeError`: there is nothing on disk to open."""
        raise RuntimeError("In-memory tables have no on-disk representation to open.")

    def save_schema(self, schema_dict):
        """Do nothing; the schema is not persisted by this backend."""
        pass  # nothing to persist

    def load_schema(self):
        """Return ``None``; no schema is stored by this backend."""
        return None

    def table_exists(self):
        """Return ``False``."""
        return False

    def is_read_only(self):
        """Return ``False``."""
        return False

    def open_mode(self) -> str | None:
        """Return ``None``."""
        return None

    def delete_column(self, name):
        """Always raise :exc:`RuntimeError`."""
        raise RuntimeError("In-memory tables have no on-disk representation to mutate.")

    def rename_column(self, old: str, new: str):
        """Always raise :exc:`RuntimeError`."""
        raise RuntimeError("In-memory tables have no on-disk representation to mutate.")

    def close(self):
        """Do nothing."""
        pass

    # -- Index catalog and epoch helpers -------------------------------------

    def load_index_catalog(self) -> dict:
        """Return a deep copy of the catalog, so callers cannot mutate the stored one."""
        return copy.deepcopy(self._index_catalog)

    def save_index_catalog(self, catalog: dict) -> None:
        """Store a deep copy of *catalog*."""
        self._index_catalog = copy.deepcopy(catalog)

    def get_epoch_counters(self) -> tuple[int, int]:
        """Return ``(value_epoch, visibility_epoch)``."""
        return self._value_epoch, self._visibility_epoch

    def bump_value_epoch(self) -> int:
        """Increment and return the value epoch."""
        self._value_epoch += 1
        return self._value_epoch

    def bump_visibility_epoch(self) -> int:
        """Increment and return the visibility epoch."""
        self._visibility_epoch += 1
        return self._visibility_epoch

    def index_anchor_path(self, col_name: str) -> str | None:
        """Return ``None``: in-memory tables have no anchor path."""
        return None


# ---------------------------------------------------------------------------
# File-backed backend
# ---------------------------------------------------------------------------

_META_KEY = "/_meta"
_VALID_ROWS_KEY = "/_valid_rows"
_COLS_DIR = "_cols"


class FileTableStorage(TableStorage):
    """Arrays stored as TreeStore leaves inside *urlpath*.

    Parameters
    ----------
    urlpath:
        Path to the backing TreeStore (typically ``.b2d`` or ``.b2z``).
    mode:
        ``'w'`` — create (overwrite existing files).
        ``'a'`` — open existing or create new.
        ``'r'`` — open existing read-only.

    Raises
    ------
    ValueError
        If *mode* is not one of ``'r'``, ``'a'`` or ``'w'``.
    """

    def __init__(self, urlpath: str, mode: str) -> None:
        if mode not in ("r", "a", "w"):
            raise ValueError(f"mode must be 'r', 'a', or 'w'; got {mode!r}")
        self._root = urlpath
        self._mode = mode
        # Both handles are opened lazily and reset by close()/discard().
        self._meta: blosc2.SChunk | None = None
        self._store: blosc2.TreeStore | None = None

    # ------------------------------------------------------------------
    # Key helpers
    # ------------------------------------------------------------------

    @property
    def _meta_path(self) -> str:
        """Filesystem path derived from the ``/_meta`` key."""
        return self._key_to_path(_META_KEY)

    @property
    def _valid_rows_path(self) -> str:
        """Filesystem path derived from the ``/_valid_rows`` key."""
        return self._key_to_path(_VALID_ROWS_KEY)

    def _col_path(self, name: str) -> str:
        """Filesystem path derived from the key of column *name*."""
        return self._key_to_path(self._col_key(name))

    def _col_key(self, name: str) -> str:
        """Return the store key of column *name* (``/_cols/<name>``)."""
        return f"/{_COLS_DIR}/{name}"

    def _key_to_path(self, key: str) -> str:
        """Map a store key to a path under the table root.

        The leading ``/`` is stripped and a suffix is appended: ``.b2f`` for
        the ``/_meta`` key, ``.b2nd`` for every other key. The result does not
        depend on the extension of the root path.
        """
        suffix = ".b2f" if key == _META_KEY else ".b2nd"
        return os.path.join(self._root, key.lstrip("/") + suffix)

    def _open_store(self) -> blosc2.TreeStore:
        """Open the backing TreeStore on first use and return the cached handle."""
        if self._store is None:
            kwargs: dict[str, Any] = {"mode": self._mode}
            if self._mode != "r":
                # Force table internals to be stored as proper external leaves so
                # reopened arrays stay live and mutable through the TreeStore.
                kwargs["threshold"] = 0
            self._store = blosc2.TreeStore(self._root, **kwargs)
        return self._store

    # ------------------------------------------------------------------
    # TableStorage interface
    # ------------------------------------------------------------------

    def table_exists(self) -> bool:
        """Return whether the root path exists on disk."""
        return os.path.exists(self._root)

    def is_read_only(self) -> bool:
        """Return ``True`` when the storage was opened with mode ``'r'``."""
        return self._mode == "r"

    def open_mode(self) -> str | None:
        """Return the mode passed at construction time."""
        return self._mode

    def create_column(self, name, *, dtype, shape, chunks, blocks, cparams, dparams):
        """Create a zero-filled array, store it under the column key and return the stored leaf."""
        kwargs: dict[str, Any] = {
            "chunks": chunks,
            "blocks": blocks,
        }
        if cparams is not None:
            kwargs["cparams"] = cparams
        if dparams is not None:
            kwargs["dparams"] = dparams
        col = blosc2.zeros(shape, dtype=dtype, **kwargs)
        store = self._open_store()
        store[self._col_key(name)] = col
        # Return the object read back from the store, not the temporary array.
        return store[self._col_key(name)]

    def open_column(self, name: str) -> blosc2.NDArray:
        """Return the stored leaf for column *name*."""
        return self._open_store()[self._col_key(name)]

    def create_valid_rows(self, *, shape, chunks, blocks):
        """Create an all-``False`` mask, store it under ``/_valid_rows`` and return the stored leaf."""
        valid_rows = blosc2.zeros(
            shape,
            dtype=np.bool_,
            chunks=chunks,
            blocks=blocks,
        )
        store = self._open_store()
        store[_VALID_ROWS_KEY] = valid_rows
        return store[_VALID_ROWS_KEY]

    def open_valid_rows(self) -> blosc2.NDArray:
        """Return the stored leaf for ``/_valid_rows``."""
        return self._open_store()[_VALID_ROWS_KEY]

    def save_schema(self, schema_dict: dict[str, Any]) -> None:
        """Write *schema_dict* (plus kind/version markers) to ``/_meta``.

        Raises
        ------
        ValueError
            If the object read back from ``/_meta`` is not an SChunk.
        """
        meta = blosc2.SChunk()
        meta.vlmeta["kind"] = "ctable"
        meta.vlmeta["version"] = 1
        meta.vlmeta["schema"] = json.dumps(schema_dict)
        store = self._open_store()
        store[_META_KEY] = meta
        opened = store[_META_KEY]
        if not isinstance(opened, blosc2.SChunk):
            raise ValueError("CTable manifest '/_meta' must materialize as an SChunk.")
        self._meta = opened

    def _open_meta(self) -> blosc2.SChunk:
        """Open (or return cached) the ``/_meta`` SChunk.

        Raises
        ------
        FileNotFoundError
            If the store has no ``/_meta`` entry.
        ValueError
            If the ``/_meta`` entry is not an SChunk.
        """
        if self._meta is None:
            try:
                opened = self._open_store()[_META_KEY]
            except KeyError as exc:
                raise FileNotFoundError(f"No CTable manifest found at {self._root!r}") from exc
            if not isinstance(opened, blosc2.SChunk):
                raise ValueError(f"CTable manifest at {self._root!r} must be an SChunk.")
            self._meta = opened
        return self._meta

    def load_schema(self) -> dict[str, Any]:
        """Read and return the schema dict stored in ``/_meta``."""
        raw = self._open_meta().vlmeta["schema"]
        if isinstance(raw, bytes):
            raw = raw.decode()
        return json.loads(raw)

    def check_kind(self) -> None:
        """Raise :exc:`ValueError` if ``_meta`` does not identify a CTable."""
        kind = self._open_meta().vlmeta["kind"]
        if isinstance(kind, bytes):
            kind = kind.decode()
        if kind != "ctable":
            raise ValueError(f"Path {self._root!r} does not contain a CTable (kind={kind!r}).")

    def column_names_from_schema(self) -> list[str]:
        """Return the ``name`` of every entry in the stored schema's ``columns`` list."""
        d = self.load_schema()
        return [c["name"] for c in d["columns"]]

    def delete_column(self, name: str) -> None:
        """Delete the store entry of column *name*."""
        del self._open_store()[self._col_key(name)]

    def rename_column(self, old: str, new: str) -> blosc2.NDArray:
        """Store column *old* under the key of *new*, delete the old entry, return the new leaf."""
        store = self._open_store()
        old_key = self._col_key(old)
        new_key = self._col_key(new)
        store[new_key] = store[old_key]
        del store[old_key]
        return store[new_key]

    def close(self) -> None:
        """Close the TreeStore if it was opened and drop the cached handles."""
        if self._store is not None:
            self._store.close()
            self._store = None
            self._meta = None

    def discard(self) -> None:
        """Clean up without repacking the .b2z archive."""
        if self._store is not None:
            self._store.discard()
            self._store = None
            self._meta = None

    # -- Index catalog and epoch helpers -------------------------------------

    def load_index_catalog(self) -> dict:
        """Return a deep copy of the catalog stored in ``/_meta``; ``{}`` if absent or not a dict."""
        meta = self._open_meta()
        raw = meta.vlmeta.get("index_catalog")
        if isinstance(raw, dict):
            return copy.deepcopy(raw)
        return {}

    def save_index_catalog(self, catalog: dict) -> None:
        """Store a deep copy of *catalog* in the ``/_meta`` variable-length metadata."""
        meta = self._open_meta()
        meta.vlmeta["index_catalog"] = copy.deepcopy(catalog)

    def get_epoch_counters(self) -> tuple[int, int]:
        """Return ``(value_epoch, visibility_epoch)``; missing or falsy entries count as 0."""
        meta = self._open_meta()
        ve = int(meta.vlmeta.get("value_epoch", 0) or 0)
        vis_e = int(meta.vlmeta.get("visibility_epoch", 0) or 0)
        return ve, vis_e

    def bump_value_epoch(self) -> int:
        """Increment the stored value epoch and return the new value."""
        meta = self._open_meta()
        ve = int(meta.vlmeta.get("value_epoch", 0) or 0) + 1
        meta.vlmeta["value_epoch"] = ve
        return ve

    def bump_visibility_epoch(self) -> int:
        """Increment the stored visibility epoch and return the new value."""
        meta = self._open_meta()
        vis_e = int(meta.vlmeta.get("visibility_epoch", 0) or 0) + 1
        meta.vlmeta["visibility_epoch"] = vis_e
        return vis_e

    def index_anchor_path(self, col_name: str) -> str | None:
        """Return ``<root>/_indexes/<col_name>/_anchor``."""
        return os.path.join(self._root, _INDEXES_DIR, col_name, "_anchor")
For new extensionless paths, directory-backed + storage is used by default. mode : str, optional File mode ('r', 'w', 'a'). Default is 'a'. mmap_mode : str or None, optional @@ -57,7 +58,8 @@ class DictStore: and only when ``mode="r"``. Default is None. tmpdir : str or None, optional Temporary directory to use when working with ".b2z" files. If None, - a system temporary directory will be managed. Default is None. + a temporary directory is created in the same directory as the ".b2z" + file, so that unpacked data stays on the same filesystem. Default is None. cparams : dict or None, optional Compression parameters for the internal embed store. If None, the default Blosc2 parameters are used. @@ -117,8 +119,6 @@ def __init__( See :class:`DictStore` for full documentation of parameters. """ self.localpath = localpath if isinstance(localpath, str | bytes) else str(localpath) - if not self.localpath.endswith((".b2z", ".b2d")): - raise ValueError(f"localpath must have a .b2z or .b2d extension; you passed: {self.localpath}") if mode not in ("r", "w", "a"): raise ValueError("For DictStore containers, mode must be 'r', 'w', or 'a'") if mmap_mode not in (None, "r"): @@ -142,6 +142,8 @@ def __init__( self.offsets = {} self.map_tree = {} self._temp_dir_obj = None + self._closed = False + self._modified = False self._setup_paths_and_dirs(tmpdir) @@ -152,20 +154,33 @@ def __init__( def _setup_paths_and_dirs(self, tmpdir: str | None): """Set up working directories and paths.""" - self.is_zip_store = self.localpath.endswith(".b2z") + localpath_exists = os.path.exists(self.localpath) + if localpath_exists: + self.is_zip_store = os.path.isfile(self.localpath) + elif self.localpath.endswith(".b2z"): + self.is_zip_store = True + elif self.localpath.endswith(".b2d"): + self.is_zip_store = False + else: + # Default extensionless new stores to directory-backed layout. 
+ self.is_zip_store = False if self.is_zip_store: if tmpdir is None: - self._temp_dir_obj = tempfile.TemporaryDirectory() + b2z_parent = os.path.dirname(os.path.abspath(self.localpath)) + self._temp_dir_obj = tempfile.TemporaryDirectory(dir=b2z_parent) self.working_dir = self._temp_dir_obj.name else: self.working_dir = tmpdir os.makedirs(tmpdir, exist_ok=True) self.b2z_path = self.localpath - else: # .b2d + else: self.working_dir = self.localpath if self.mode in ("w", "a"): os.makedirs(self.working_dir, exist_ok=True) - self.b2z_path = self.localpath[:-4] + ".b2z" + if self.localpath.endswith(".b2d"): + self.b2z_path = self.localpath[:-4] + ".b2z" + else: + self.b2z_path = self.localpath + ".b2z" self.estore_path = os.path.join(self.working_dir, "embed.b2e") @@ -304,6 +319,12 @@ def _init_write_append_mode( """Initialize store in write/append mode.""" if self.mode == "a" and os.path.exists(self.localpath): if self.is_zip_store: + # When using an explicit tmpdir the directory may already contain + # stale files from a previous open that was never closed. Clear + # it before extracting so we always start from a clean slate. 
+ if self._temp_dir_obj is None: + shutil.rmtree(self.working_dir, ignore_errors=True) + os.makedirs(self.working_dir, exist_ok=True) with zipfile.ZipFile(self.localpath, "r") as zf: zf.extractall(self.working_dir) elif not os.path.isdir(self.working_dir): @@ -376,6 +397,7 @@ def __setitem__( self, key: str, value: blosc2.Array | SChunk | blosc2.VLArray | blosc2.BatchArray ) -> None: """Add a node to the DictStore.""" + self._modified = True if isinstance(value, np.ndarray): value = blosc2.asarray(value, cparams=self.cparams, dparams=self.dparams) # C2Array should always go to embed store; let estore handle it directly @@ -477,6 +499,7 @@ def get( def __delitem__(self, key: str) -> None: """Remove a node from the DictStore.""" + self._modified = True if key in self.map_tree: # Remove from map_tree and delete the external file filepath = self.map_tree[key] @@ -661,6 +684,10 @@ def _get_zip_offsets(self) -> dict[str, dict[str, int]]: def close(self) -> None: """Persist changes and cleanup.""" + if self._closed: + return + self._closed = True + # Repack estore # TODO: for some reason this is not working # if self.mode != "r": @@ -669,13 +696,46 @@ def close(self) -> None: # f.write(cframe) if self.is_zip_store and self.mode in ("w", "a"): - # Serialize to b2z file + # Serialize to b2z file. self.to_b2z(overwrite=True) # Clean up temporary directory if we created it if self._temp_dir_obj is not None: self._temp_dir_obj.cleanup() + def discard(self) -> None: + """Clean up resources *without* repacking the .b2z file. + + Use this instead of :meth:`close` when the store was opened only for + inspection and should be thrown away without persisting any changes + back to the archive. + """ + if self._closed: + return + self._closed = True + if self._temp_dir_obj is not None: + self._temp_dir_obj.cleanup() + + def __del__(self): + """Ensure the temporary directory is removed and, if writes were made + through this store's own API, the store is flushed back to the .b2z + file. 
+ + When no Python-level writes went through ``__setitem__`` / ``__delitem__`` + (``_modified`` is False), we skip ``to_b2z()`` to avoid repacking a + potentially partial temp dir during garbage collection. Explicit + ``close()`` / ``__exit__`` always repacks regardless. + """ + try: + if not self._closed and self.is_zip_store and self.mode in ("w", "a") and not self._modified: + # Skip repacking — discard is safe and avoids corrupting the + # archive when the temp dir is torn down during GC. + self.discard() + else: + self.close() + except Exception: + pass + def __enter__(self): """Context manager enter.""" return self diff --git a/src/blosc2/indexing.py b/src/blosc2/indexing.py index 22f86ce6..8beb2d59 100644 --- a/src/blosc2/indexing.py +++ b/src/blosc2/indexing.py @@ -106,6 +106,52 @@ def _cleanup_in_memory_store(key: int) -> None: _hot_cache_clear(scope=("memory", key)) +def _persistent_cache_path_exists(path: str | int) -> bool: + if not isinstance(path, str): + return False + path_obj = Path(path) + return path_obj.exists() or path_obj.parent.exists() + + +def _purge_stale_persistent_caches() -> None: + stale_scopes = { + key + for key in tuple(_PERSISTENT_INDEXES) + if key[0] == "persistent" and not _persistent_cache_path_exists(key[1]) + } + for key in stale_scopes: + _PERSISTENT_INDEXES.pop(key, None) + + stale_data_keys = { + key + for key in tuple(_DATA_CACHE) + if key[0][0] == "persistent" and not _persistent_cache_path_exists(key[0][1]) + } + stale_scopes.update(key[0] for key in stale_data_keys) + for key in stale_data_keys: + _DATA_CACHE.pop(key, None) + + stale_handle_keys = { + key + for key in tuple(_SIDECAR_HANDLE_CACHE) + if key[0][0] == "persistent" and not _persistent_cache_path_exists(key[0][1]) + } + stale_scopes.update(key[0] for key in stale_handle_keys) + for key in stale_handle_keys: + _SIDECAR_HANDLE_CACHE.pop(key, None) + + stale_query_paths = [path for path in tuple(_QUERY_CACHE_STORE_HANDLES) if not Path(path).exists()] + for path 
in stale_query_paths: + _QUERY_CACHE_STORE_HANDLES.pop(path, None) + + stale_gather_paths = [path for path in tuple(_GATHER_MMAP_HANDLES) if not Path(path).exists()] + for path in stale_gather_paths: + _GATHER_MMAP_HANDLES.pop(path, None) + + for scope in stale_scopes: + _hot_cache_clear(scope=scope) + + @dataclass(slots=True) class IndexPlan: usable: bool @@ -256,7 +302,7 @@ def _copy_descriptor_for_token(array: blosc2.NDArray, token: str) -> dict: def _is_persistent_array(array: blosc2.NDArray) -> bool: - return array.urlpath is not None + return getattr(array, "urlpath", None) is not None def _tmpdir_for_array(array: blosc2.NDArray) -> str | None: @@ -267,8 +313,9 @@ def _tmpdir_for_array(array: blosc2.NDArray) -> str | None: size limits on ``/tmp`` (commonly a tmpfs with only a few GB). For in-memory arrays we fall back to the system default (``None``). """ - if array.urlpath is not None: - return str(Path(array.urlpath).resolve().parent) + urlpath = getattr(array, "urlpath", None) + if urlpath is not None: + return str(Path(urlpath).resolve().parent) return None @@ -285,6 +332,7 @@ def _resolve_full_index_tmpdir(array: blosc2.NDArray, tmpdir: str | None) -> str def _load_store(array: blosc2.NDArray) -> dict: + _purge_stale_persistent_caches() if _is_persistent_array(array): key = _array_key(array) cached = _PERSISTENT_INDEXES.get(key) @@ -404,6 +452,7 @@ def _open_query_cache_store(array: blosc2.NDArray, *, create: bool = False): Returns ``None`` if the array is not persistent. When *create* is True the store is created if it does not yet exist. 
""" + _purge_stale_persistent_caches() if not _is_persistent_array(array): return None path = _query_cache_payload_path(array) @@ -919,6 +968,7 @@ def _invalidate_sidecar_cache_entries(array: blosc2.NDArray, token: str, categor def _open_sidecar_handle(array: blosc2.NDArray, token: str, category: str, name: str, path: str | None): + _purge_stale_persistent_caches() cache_key = _sidecar_handle_cache_key(array, token, category, name) cached = _SIDECAR_HANDLE_CACHE.get(cache_key) if cached is not None: @@ -934,7 +984,7 @@ def _open_sidecar_handle(array: blosc2.NDArray, token: str, category: str, name: raise RuntimeError("sidecar handle path is not available") handle = legacy if isinstance(legacy, blosc2.NDArray) else blosc2.asarray(np.asarray(legacy)) else: - handle = blosc2.open(path, mmap_mode=_INDEX_MMAP_MODE) + handle = blosc2.open(path, mode="r", mmap_mode=_INDEX_MMAP_MODE) _SIDECAR_HANDLE_CACHE[cache_key] = handle return handle @@ -1053,7 +1103,7 @@ def _compute_sorted_boundaries_from_sidecar( ) -> np.ndarray: nsegments = math.ceil(length / segment_len) boundaries = np.empty(nsegments, dtype=_boundary_dtype(dtype)) - sidecar = blosc2.open(path, mmap_mode=_INDEX_MMAP_MODE) + sidecar = blosc2.open(path, mode="r", mmap_mode=_INDEX_MMAP_MODE) start_value = np.empty(1, dtype=dtype) end_value = np.empty(1, dtype=dtype) for idx in range(nsegments): @@ -1107,8 +1157,11 @@ def _store_array_sidecar( kwargs["blocks"] = blocks if cparams is not None: kwargs["cparams"] = cparams + # Do not retain writable persistent handles in the process-wide cache. + # They keep native resources alive after index construction and can + # accumulate badly across tests on macOS/Python 3.14. 
handle = blosc2.asarray(data, **kwargs) - _SIDECAR_HANDLE_CACHE[handle_cache_key] = handle + del handle _DATA_CACHE.pop(cache_key, None) else: path = None @@ -1151,10 +1204,9 @@ def _create_persistent_sidecar_handle( kwargs["cparams"] = cparams if length == 0: handle = blosc2.asarray(np.empty(0, dtype=dtype), **kwargs) - _SIDECAR_HANDLE_CACHE[_sidecar_handle_cache_key(array, token, category, name)] = handle + del handle return None, {"path": path, "dtype": dtype.descr if dtype.fields else dtype.str} handle = blosc2.empty((length,), dtype=dtype, **kwargs) - _SIDECAR_HANDLE_CACHE[_sidecar_handle_cache_key(array, token, category, name)] = handle return handle, {"path": path, "dtype": dtype.descr if dtype.fields else dtype.str} @@ -1300,7 +1352,7 @@ def _sidecar_storage_geometry( ) -> tuple[int, int]: if path is None: return fallback_chunk_len, fallback_block_len - sidecar = blosc2.open(path, mmap_mode=_INDEX_MMAP_MODE) + sidecar = blosc2.open(path, mode="r", mmap_mode=_INDEX_MMAP_MODE) return int(sidecar.chunks[0]), int(sidecar.blocks[0]) @@ -1384,7 +1436,7 @@ def _stream_copy_sidecar_array( blocks: tuple[int, ...], cparams: dict | None = None, ) -> None: - source = blosc2.open(str(source_path), mmap_mode=_INDEX_MMAP_MODE) + source = blosc2.open(str(source_path), mode="r", mmap_mode=_INDEX_MMAP_MODE) blosc2.remove_urlpath(str(dest_path)) kwargs = {"chunks": chunks, "blocks": blocks, "urlpath": str(dest_path), "mode": "w"} if cparams is not None: @@ -2753,7 +2805,7 @@ def _copy_sidecar_to_temp_run( cparams: dict | None = None, ) -> Path: out_path = workdir / f"{prefix}.b2nd" - sidecar = blosc2.open(path, mmap_mode=_INDEX_MMAP_MODE) + sidecar = blosc2.open(path, mode="r", mmap_mode=_INDEX_MMAP_MODE) output = _create_blosc2_temp_array(out_path, length, dtype, FULL_OOC_MERGE_BUFFER_ITEMS, cparams) chunk_len = int(sidecar.chunks[0]) for chunk_id, start in enumerate(range(0, length, chunk_len)): @@ -2813,10 +2865,10 @@ def _merge_run_pair( tracker: TempRunTracker | None = 
None, cparams: dict | None = None, ) -> SortedRun: - left_values_mm = blosc2.open(str(left.values_path), mmap_mode=_INDEX_MMAP_MODE) - left_positions_mm = blosc2.open(str(left.positions_path), mmap_mode=_INDEX_MMAP_MODE) - right_values_mm = blosc2.open(str(right.values_path), mmap_mode=_INDEX_MMAP_MODE) - right_positions_mm = blosc2.open(str(right.positions_path), mmap_mode=_INDEX_MMAP_MODE) + left_values_mm = blosc2.open(str(left.values_path), mode="r", mmap_mode=_INDEX_MMAP_MODE) + left_positions_mm = blosc2.open(str(left.positions_path), mode="r", mmap_mode=_INDEX_MMAP_MODE) + right_values_mm = blosc2.open(str(right.values_path), mode="r", mmap_mode=_INDEX_MMAP_MODE) + right_positions_mm = blosc2.open(str(right.positions_path), mode="r", mmap_mode=_INDEX_MMAP_MODE) out_values_path = workdir / f"full_merge_values_{merge_id}.b2nd" out_positions_path = workdir / f"full_merge_positions_{merge_id}.b2nd" @@ -3023,8 +3075,8 @@ def _build_full_descriptor_ooc( array, token, kind, full, final_run, dtype, persistent, tracker, cparams ) else: - sorted_values = blosc2.open(str(final_run.values_path), mmap_mode=_INDEX_MMAP_MODE)[:] - positions = blosc2.open(str(final_run.positions_path), mmap_mode=_INDEX_MMAP_MODE)[:] + sorted_values = blosc2.open(str(final_run.values_path), mode="r", mmap_mode=_INDEX_MMAP_MODE)[:] + positions = blosc2.open(str(final_run.positions_path), mode="r", mmap_mode=_INDEX_MMAP_MODE)[:] values_sidecar = _store_array_sidecar( array, token, kind, "full", "values", sorted_values, persistent, cparams=cparams ) @@ -3256,14 +3308,14 @@ def iter_index_components(array: blosc2.NDArray, descriptor: dict): def _component_nbytes(array: blosc2.NDArray, descriptor: dict, component: IndexComponent) -> int: if component.path is not None: - return int(blosc2.open(component.path, mmap_mode=_INDEX_MMAP_MODE).nbytes) + return int(blosc2.open(component.path, mode="r", mmap_mode=_INDEX_MMAP_MODE).nbytes) token = descriptor["token"] return int(_load_array_sidecar(array, 
token, component.category, component.name).nbytes) def _component_cbytes(array: blosc2.NDArray, descriptor: dict, component: IndexComponent) -> int: if component.path is not None: - return int(blosc2.open(component.path, mmap_mode=_INDEX_MMAP_MODE).cbytes) + return int(blosc2.open(component.path, mode="r", mmap_mode=_INDEX_MMAP_MODE).cbytes) token = descriptor["token"] sidecar = _load_array_sidecar(array, token, component.category, component.name) kwargs = {} @@ -3803,8 +3855,8 @@ def compact_index(array: blosc2.NDArray, field: str | None = None, name: str | N array, descriptor, final_run.values_path, final_run.positions_path, final_run.length ) else: - sorted_values = blosc2.open(str(final_run.values_path), mmap_mode=_INDEX_MMAP_MODE)[:] - positions = blosc2.open(str(final_run.positions_path), mmap_mode=_INDEX_MMAP_MODE)[:] + sorted_values = blosc2.open(str(final_run.values_path), mode="r", mmap_mode=_INDEX_MMAP_MODE)[:] + positions = blosc2.open(str(final_run.positions_path), mode="r", mmap_mode=_INDEX_MMAP_MODE)[:] _replace_full_descriptor(array, descriptor, sorted_values, positions, descriptor["persistent"]) del sorted_values, positions final_run.values_path.unlink(missing_ok=True) @@ -4858,7 +4910,7 @@ def _bucket_batch_result_dtype(where_x) -> np.dtype: def _bucket_worker_source(where_x): if _supports_block_reads(where_x) and getattr(where_x, "urlpath", None) is not None: - return blosc2.open(str(where_x.urlpath), mmap_mode=_INDEX_MMAP_MODE) + return blosc2.open(str(where_x.urlpath), mode="r", mmap_mode=_INDEX_MMAP_MODE) return where_x @@ -4873,10 +4925,11 @@ def _gather_mmap_source(where_x): urlpath = getattr(where_x, "urlpath", None) if not _supports_block_reads(where_x) or urlpath is None: return where_x + _purge_stale_persistent_caches() urlpath = str(urlpath) handle = _GATHER_MMAP_HANDLES.get(urlpath) if handle is None: - handle = blosc2.open(urlpath, mmap_mode=_INDEX_MMAP_MODE) + handle = blosc2.open(urlpath, mode="r", mmap_mode=_INDEX_MMAP_MODE) 
_GATHER_MMAP_HANDLES[urlpath] = handle return handle @@ -5103,17 +5156,17 @@ def process_batch(chunk_ids: np.ndarray) -> tuple[list[tuple[int, np.ndarray]], batch_values = ( values_sidecar if bucket.get("values_path") is None - else blosc2.open(bucket["values_path"], mmap_mode=_INDEX_MMAP_MODE) + else blosc2.open(bucket["values_path"], mode="r", mmap_mode=_INDEX_MMAP_MODE) ) batch_buckets = ( bucket_sidecar if bucket.get("bucket_positions_path") is None - else blosc2.open(bucket["bucket_positions_path"], mmap_mode=_INDEX_MMAP_MODE) + else blosc2.open(bucket["bucket_positions_path"], mode="r", mmap_mode=_INDEX_MMAP_MODE) ) batch_l2 = ( l2_sidecar if bucket.get("l2_path") is None - else blosc2.open(bucket["l2_path"], mmap_mode=_INDEX_MMAP_MODE) + else blosc2.open(bucket["l2_path"], mode="r", mmap_mode=_INDEX_MMAP_MODE) ) batch_results = [] batch_candidate_segments = 0 @@ -5250,9 +5303,9 @@ def _partial_chunk_nav_positions_cython( def process_cython_batch(chunk_ids: np.ndarray) -> tuple[np.ndarray, int]: if len(chunk_ids) == 0: return np.empty(0, dtype=np.int64), 0 - batch_values = blosc2.open(partial["values_path"], mmap_mode=_INDEX_MMAP_MODE) - batch_positions = blosc2.open(partial["positions_path"], mmap_mode=_INDEX_MMAP_MODE) - batch_l2 = blosc2.open(partial["l2_path"], mmap_mode=_INDEX_MMAP_MODE) + batch_values = blosc2.open(partial["values_path"], mode="r", mmap_mode=_INDEX_MMAP_MODE) + batch_positions = blosc2.open(partial["positions_path"], mode="r", mmap_mode=_INDEX_MMAP_MODE) + batch_l2 = blosc2.open(partial["l2_path"], mode="r", mmap_mode=_INDEX_MMAP_MODE) batch_l2_row = np.empty(nsegments_per_chunk, dtype=l2_boundary_dtype) batch_span_values = np.empty(chunk_len, dtype=dtype) batch_local_positions = np.empty(chunk_len, dtype=local_position_dtype) @@ -5299,17 +5352,17 @@ def process_batch(chunk_ids: np.ndarray) -> tuple[list[np.ndarray], int]: batch_values = ( values_sidecar if partial.get("values_path") is None - else blosc2.open(partial["values_path"], 
mmap_mode=_INDEX_MMAP_MODE) + else blosc2.open(partial["values_path"], mode="r", mmap_mode=_INDEX_MMAP_MODE) ) batch_positions = ( positions_sidecar if partial.get("positions_path") is None - else blosc2.open(partial["positions_path"], mmap_mode=_INDEX_MMAP_MODE) + else blosc2.open(partial["positions_path"], mode="r", mmap_mode=_INDEX_MMAP_MODE) ) batch_l2 = ( l2_sidecar if partial.get("l2_path") is None - else blosc2.open(partial["l2_path"], mmap_mode=_INDEX_MMAP_MODE) + else blosc2.open(partial["l2_path"], mode="r", mmap_mode=_INDEX_MMAP_MODE) ) batch_parts = [] batch_candidate_segments = 0 diff --git a/src/blosc2/lazyexpr.py b/src/blosc2/lazyexpr.py index 042926c5..e8772304 100644 --- a/src/blosc2/lazyexpr.py +++ b/src/blosc2/lazyexpr.py @@ -580,7 +580,7 @@ def save(self, **kwargs: Any) -> None: >>> # Save the LazyExpr to disk >>> expr.save(urlpath='lazy_array.b2nd', mode='w') >>> # Open and load the LazyExpr from disk - >>> disk_expr = blosc2.open('lazy_array.b2nd') + >>> disk_expr = blosc2.open('lazy_array.b2nd', mode='r') >>> disk_expr[:2] [[0. 1.25 2.5 ] [3.75 5. 
6.25]] @@ -1241,7 +1241,7 @@ def _run_async_reader(): except Empty: if not thread.is_alive(): if worker_exc is not None: - raise worker_exc + raise worker_exc from None break continue finally: @@ -2869,7 +2869,7 @@ def chunked_eval( # noqa: C901 unit_steps = np.all([s.step == 1 for s in item.raw if isinstance(s, slice)]) # shape of slice, if non-unit steps have to decompress full array into memory shape_operands = item.newshape(shape) if unit_steps else shape - _dtype = kwargs.get("dtype", np.float64) + _dtype = np.dtype(kwargs.get("dtype", np.float64)) size_operands = math.prod(shape_operands) * len(operands) * _dtype.itemsize # Only take the fast path if the size of operands is relatively small if size_operands < blosc2.MAX_FAST_PATH_SIZE: @@ -4707,7 +4707,7 @@ def open_lazyarray(array): if isinstance(v, str): v = parent_path / v try: - op = blosc2.open(v) + op = blosc2.open(v, mode="r") except FileNotFoundError: missing_ops[key] = v else: diff --git a/src/blosc2/ndarray.py b/src/blosc2/ndarray.py index fdcfb2ae..d336178c 100644 --- a/src/blosc2/ndarray.py +++ b/src/blosc2/ndarray.py @@ -505,9 +505,17 @@ def reshape( return dst +def _normalize_expr_operand(value: Any) -> Any: + """Normalize foreign expression operands to the array-like object lazy ops expect.""" + raw_col = getattr(value, "_raw_col", None) + return raw_col if raw_col is not None else value + + def _check_allowed_dtypes( value: bool | int | float | str | blosc2.Array, ): + value = _normalize_expr_operand(value) + def _is_array_like(v: Any) -> bool: try: # Try Protocol runtime check first (works when possible) @@ -3437,6 +3445,7 @@ def __abs__(self) -> blosc2.LazyExpr: @is_documented_by(bitwise_and) def __and__(self, value: int | float | blosc2.Array, /) -> blosc2.LazyExpr: + value = _normalize_expr_operand(value) _check_allowed_dtypes(value) return blosc2.LazyExpr(new_op=(self, "&", value)) @@ -3448,6 +3457,8 @@ def __rand__(self, value: int | float | blosc2.Array, /) -> blosc2.LazyExpr: 
@is_documented_by(bitwise_xor) def __xor__(self, other) -> blosc2.LazyExpr: + other = _normalize_expr_operand(other) + _check_allowed_dtypes(other) return blosc2.LazyExpr(new_op=(self, "^", other)) def __ixor__(self, other) -> blosc2.LazyExpr: @@ -3458,6 +3469,8 @@ def __rxor__(self, other) -> blosc2.LazyExpr: @is_documented_by(bitwise_or) def __or__(self, other) -> blosc2.LazyExpr: + other = _normalize_expr_operand(other) + _check_allowed_dtypes(other) return blosc2.LazyExpr(new_op=(self, "|", other)) def __ior__(self, other) -> blosc2.LazyExpr: diff --git a/src/blosc2/ref.py b/src/blosc2/ref.py index c0acca8e..b1eda6b1 100644 --- a/src/blosc2/ref.py +++ b/src/blosc2/ref.py @@ -118,7 +118,9 @@ def open(self): import blosc2 if self.kind == "urlpath": - return blosc2.open(self.urlpath) + # Structured refs are used to reopen operands for persisted recipes. + # Read-only access avoids allocating unnecessary writable state. + return blosc2.open(self.urlpath, mode="r") if self.kind == "dictstore_key": return blosc2.DictStore(self.urlpath, mode="r")[self.key] if self.kind == "c2array": diff --git a/src/blosc2/schema.py b/src/blosc2/schema.py new file mode 100644 index 00000000..560d7117 --- /dev/null +++ b/src/blosc2/schema.py @@ -0,0 +1,376 @@ +####################################################################### +# Copyright (c) 2019-present, Blosc Development Team +# All rights reserved. +# +# This source code is licensed under a BSD-style license (found in the +# LICENSE file in the root directory of this source tree) +####################################################################### + +"""Schema spec primitives and field helper for CTable.""" + +from __future__ import annotations + +import dataclasses +from dataclasses import MISSING +from typing import Any + +import numpy as np + +BLOSC2_FIELD_METADATA_KEY = "blosc2" + +# Aliases so we can still use the builtins inside this module +# after our spec classes shadow them. 
+_builtin_bool = bool +_builtin_bytes = bytes + + +# --------------------------------------------------------------------------- +# Base spec class +# --------------------------------------------------------------------------- + + +class SchemaSpec: + """Base class for all Blosc2 column schema descriptors. + + Subclasses carry the logical type, storage dtype, and optional + validation constraints for one column. + + Numpy dtype attributes (``itemsize``, ``kind``, ``type``, ``str``, + ``name``) are mirrored at class level so that schema spec classes can + be used anywhere blosc2 internals expect a dtype-like object. + """ + + dtype: np.dtype + python_type: type + + def __init_subclass__(cls, **kwargs): + super().__init_subclass__(**kwargs) + # Mirror numpy dtype attributes at class level for duck-typing. + _np_dtype = cls.__dict__.get("dtype") + if isinstance(_np_dtype, np.dtype): + cls.itemsize = _np_dtype.itemsize + cls.kind = _np_dtype.kind + cls.type = _np_dtype.type + cls.str = _np_dtype.str + cls.name = _np_dtype.name + + def to_pydantic_kwargs(self) -> dict[str, Any]: + """Return kwargs for building a Pydantic field annotation.""" + raise NotImplementedError + + def to_metadata_dict(self) -> dict[str, Any]: + """Return a JSON-compatible dict for schema serialization.""" + raise NotImplementedError + + +# --------------------------------------------------------------------------- +# Numeric spec classes +# --------------------------------------------------------------------------- + +# Internal helper to avoid repeating the constraint boilerplate for every +# integer and float spec. Subclasses only need to set `dtype`, `python_type`, +# and `_kind` as class attributes. 
+ + +class _NumericSpec(SchemaSpec): + """Mixin for numeric specs that support ge / gt / le / lt constraints.""" + + _kind: str # set by each concrete subclass + + def __init__(self, *, ge=None, gt=None, le=None, lt=None, null_value=None): + self.ge = ge + self.gt = gt + self.le = le + self.lt = lt + self.null_value = null_value + + def to_pydantic_kwargs(self) -> dict[str, Any]: + # null_value is not a Pydantic constraint — exclude it from Pydantic kwargs. + return { + k: v + for k, v in {"ge": self.ge, "gt": self.gt, "le": self.le, "lt": self.lt}.items() + if v is not None + } + + def to_metadata_dict(self) -> dict[str, Any]: + d: dict[str, Any] = {"kind": self._kind, **self.to_pydantic_kwargs()} + if self.null_value is not None: + d["null_value"] = self.null_value + return d + + +# ── Signed integers ────────────────────────────────────────────────────────── + + +class int8(_NumericSpec): + """8-bit signed integer column (−128 … 127).""" + + dtype = np.dtype(np.int8) + python_type = int + _kind = "int8" + + +class int16(_NumericSpec): + """16-bit signed integer column (−32 768 … 32 767).""" + + dtype = np.dtype(np.int16) + python_type = int + _kind = "int16" + + +class int32(_NumericSpec): + """32-bit signed integer column (−2 147 483 648 … 2 147 483 647).""" + + dtype = np.dtype(np.int32) + python_type = int + _kind = "int32" + + +class int64(_NumericSpec): + """64-bit signed integer column.""" + + dtype = np.dtype(np.int64) + python_type = int + _kind = "int64" + + +# ── Unsigned integers ──────────────────────────────────────────────────────── + + +class uint8(_NumericSpec): + """8-bit unsigned integer column (0 … 255).""" + + dtype = np.dtype(np.uint8) + python_type = int + _kind = "uint8" + + +class uint16(_NumericSpec): + """16-bit unsigned integer column (0 … 65 535).""" + + dtype = np.dtype(np.uint16) + python_type = int + _kind = "uint16" + + +class uint32(_NumericSpec): + """32-bit unsigned integer column (0 … 4 294 967 295).""" + + dtype = 
np.dtype(np.uint32) + python_type = int + _kind = "uint32" + + +class uint64(_NumericSpec): + """64-bit unsigned integer column.""" + + dtype = np.dtype(np.uint64) + python_type = int + _kind = "uint64" + + +# ── Floating point ─────────────────────────────────────────────────────────── + + +class float32(_NumericSpec): + """32-bit floating-point column (single precision).""" + + dtype = np.dtype(np.float32) + python_type = float + _kind = "float32" + + +class float64(_NumericSpec): + """64-bit floating-point column (double precision).""" + + dtype = np.dtype(np.float64) + python_type = float + _kind = "float64" + + +class complex64(SchemaSpec): + """64-bit complex number column (two 32-bit floats).""" + + dtype = np.dtype(np.complex64) + python_type = complex + + def __init__(self): + pass + + def to_pydantic_kwargs(self) -> dict[str, Any]: + return {} + + def to_metadata_dict(self) -> dict[str, Any]: + return {"kind": "complex64"} + + +class complex128(SchemaSpec): + """128-bit complex number column (two 64-bit floats).""" + + dtype = np.dtype(np.complex128) + python_type = complex + + def __init__(self): + pass + + def to_pydantic_kwargs(self) -> dict[str, Any]: + return {} + + def to_metadata_dict(self) -> dict[str, Any]: + return {"kind": "complex128"} + + +class bool(SchemaSpec): + """Boolean column.""" + + dtype = np.dtype(np.bool_) + python_type = _builtin_bool + + def __init__(self): + pass + + def to_pydantic_kwargs(self) -> dict[str, Any]: + return {} + + def to_metadata_dict(self) -> dict[str, Any]: + return {"kind": "bool"} + + +# --------------------------------------------------------------------------- +# String / bytes spec classes +# --------------------------------------------------------------------------- + + +class string(SchemaSpec): + """Fixed-width Unicode string column. + + Parameters + ---------- + max_length: + Maximum number of characters. Determines the NumPy ``U`` dtype. + Defaults to 32 if not specified. 
+ min_length: + Minimum number of characters (validation only, no effect on dtype). + pattern: + Regex pattern the value must match (validation only). + """ + + python_type = str + _DEFAULT_MAX_LENGTH = 32 + + def __init__(self, *, min_length=None, max_length=None, pattern=None, null_value=None): + self.min_length = min_length + self.max_length = max_length if max_length is not None else self._DEFAULT_MAX_LENGTH + self.pattern = pattern + self.null_value = null_value + self.dtype = np.dtype(f"U{self.max_length}") + + def to_pydantic_kwargs(self) -> dict[str, Any]: + d = {} + if self.min_length is not None: + d["min_length"] = self.min_length + if self.max_length is not None: + d["max_length"] = self.max_length + if self.pattern is not None: + d["pattern"] = self.pattern + return d + + def to_metadata_dict(self) -> dict[str, Any]: + d: dict[str, Any] = {"kind": "string", **self.to_pydantic_kwargs()} + if self.null_value is not None: + d["null_value"] = self.null_value + return d + + +class bytes(SchemaSpec): + """Fixed-width bytes column. + + Parameters + ---------- + max_length: + Maximum number of bytes. Determines the NumPy ``S`` dtype. + Defaults to 32 if not specified. + min_length: + Minimum number of bytes (validation only, no effect on dtype). 
+ """ + + python_type = _builtin_bytes + _DEFAULT_MAX_LENGTH = 32 + + def __init__(self, *, min_length=None, max_length=None, null_value=None): + self.min_length = min_length + self.max_length = max_length if max_length is not None else self._DEFAULT_MAX_LENGTH + self.null_value = null_value + self.dtype = np.dtype(f"S{self.max_length}") + + def to_pydantic_kwargs(self) -> dict[str, Any]: + d = {} + if self.min_length is not None: + d["min_length"] = self.min_length + if self.max_length is not None: + d["max_length"] = self.max_length + return d + + def to_metadata_dict(self) -> dict[str, Any]: + d: dict[str, Any] = {"kind": "bytes", **self.to_pydantic_kwargs()} + if self.null_value is not None: + d["null_value"] = self.null_value + return d + + +# --------------------------------------------------------------------------- +# Field helper +# --------------------------------------------------------------------------- + + +def field( + spec: SchemaSpec, + *, + default=MISSING, + cparams: dict[str, Any] | None = None, + dparams: dict[str, Any] | None = None, + chunks: tuple[int, ...] | None = None, + blocks: tuple[int, ...] | None = None, +) -> dataclasses.Field: + """Attach a Blosc2 schema spec and per-column storage options to a dataclass field. + + Parameters + ---------- + spec: + A schema descriptor such as ``b2.int64(ge=0)`` or ``b2.float64()``. + default: + Default value for the field. Omit for required fields. + cparams: + Compression parameters for this column's NDArray. + dparams: + Decompression parameters for this column's NDArray. + chunks: + Chunk shape for this column's NDArray. + blocks: + Block shape for this column's NDArray. + + Examples + -------- + >>> from dataclasses import dataclass + >>> import blosc2 as b2 + >>> @dataclass + ... class Row: + ... id: int = b2.field(b2.int64(ge=0)) + ... score: float = b2.field(b2.float64(ge=0, le=100)) + ... 
active: bool = b2.field(b2.bool(), default=True) + """ + if not isinstance(spec, SchemaSpec): + raise TypeError(f"field() requires a SchemaSpec as its first argument, got {type(spec)!r}.") + + metadata = { + BLOSC2_FIELD_METADATA_KEY: { + "spec": spec, + "cparams": cparams, + "dparams": dparams, + "chunks": chunks, + "blocks": blocks, + } + } + if default is MISSING: + return dataclasses.field(metadata=metadata) + return dataclasses.field(default=default, metadata=metadata) diff --git a/src/blosc2/schema_compiler.py b/src/blosc2/schema_compiler.py new file mode 100644 index 00000000..19a3d0c1 --- /dev/null +++ b/src/blosc2/schema_compiler.py @@ -0,0 +1,436 @@ +####################################################################### +# Copyright (c) 2019-present, Blosc Development Team +# All rights reserved. +# +# This source code is licensed under a BSD-style license (found in the +# LICENSE file in the root directory of this source tree) +####################################################################### + +"""Schema compiler: turns a dataclass row definition into a CompiledSchema.""" + +from __future__ import annotations + +import dataclasses +import typing +from dataclasses import MISSING +from typing import Any + +import numpy as np # noqa: TC002 + +from blosc2.schema import ( + BLOSC2_FIELD_METADATA_KEY, + SchemaSpec, + complex64, + complex128, + float32, + float64, + int8, + int16, + int32, + int64, + string, + uint8, + uint16, + uint32, + uint64, +) +from blosc2.schema import ( + bool as b2_bool, +) +from blosc2.schema import ( + bytes as b2_bytes, +) + +# Maps the "kind" string used in serialized dicts back to spec constructors. 
+_KIND_TO_SPEC: dict[str, type[SchemaSpec]] = { + # signed integers + "int8": int8, + "int16": int16, + "int32": int32, + "int64": int64, + # unsigned integers + "uint8": uint8, + "uint16": uint16, + "uint32": uint32, + "uint64": uint64, + # floats + "float32": float32, + "float64": float64, + # complex + "complex64": complex64, + "complex128": complex128, + # bool / string / bytes + "bool": b2_bool, + "string": string, + "bytes": b2_bytes, +} + +# --------------------------------------------------------------------------- +# Display-width helper (used by CTable.__str__ / info()) +# --------------------------------------------------------------------------- + +_DTYPE_DISPLAY_WIDTH: dict[str, int] = { + "int8": 6, + "int16": 8, + "int32": 10, + "int64": 12, + "uint8": 6, + "uint16": 8, + "uint32": 10, + "uint64": 12, + "float32": 12, + "float64": 15, + "bool": 6, + "complex64": 20, + "complex128": 25, +} + + +def compute_display_width(spec: SchemaSpec) -> int: + """Return a reasonable terminal display width for *spec*'s column.""" + dtype = spec.dtype + if dtype.kind == "U": # fixed-width unicode (string spec) + return max(10, min(dtype.itemsize // 4, 50)) + if dtype.kind == "S": # fixed-width bytes + return max(10, min(dtype.itemsize, 50)) + return _DTYPE_DISPLAY_WIDTH.get(dtype.name, 20) + + +# --------------------------------------------------------------------------- +# Mapping from Python primitive annotations to default spec constructors. +# Keys are the actual builtin types (bool before int because bool <: int). 
# ---------------------------------------------------------------------------
# Default spec class for each plain Python annotation.
# The table is consulted with an exact-type ``dict.get``, so ``bool`` and
# ``int`` are independent keys and the order of the entries does not affect
# which spec is chosen.
# ---------------------------------------------------------------------------
_ANNOTATION_TO_SPEC: dict[type, type[SchemaSpec]] = {
    bool: b2_bool,
    int: int64,
    float: float64,
    complex: complex128,
    str: string,
    bytes: b2_bytes,
}


# ---------------------------------------------------------------------------
# Compiled representations
# ---------------------------------------------------------------------------


@dataclasses.dataclass(slots=True)
class ColumnConfig:
    """Per-column NDArray storage options (``None`` means "not specified")."""

    cparams: dict[str, Any] | None  # compression parameters
    dparams: dict[str, Any] | None  # decompression parameters
    chunks: tuple[int, ...] | None  # chunk shape
    blocks: tuple[int, ...] | None  # block shape


@dataclasses.dataclass(slots=True)
class CompiledColumn:
    """All compile-time information about a single CTable column."""

    name: str
    py_type: Any  # resolved Python annotation of the field
    spec: SchemaSpec
    dtype: np.dtype  # taken from ``spec.dtype`` by the compiler
    default: Any  # MISSING means required (no default)
    config: ColumnConfig
    display_width: int = 20  # terminal column width for __str__ / info()


@dataclasses.dataclass(slots=True)
class CompiledSchema:
    """Compiled representation of a CTable row schema.

    Built once per row class by :func:`compile_schema` and cached on the
    ``CTable`` instance.  Drives NDArray creation, row validation, and
    future schema serialization.

    ``row_cls`` is ``None`` when the schema was rebuilt by
    :func:`schema_from_dict`, which does not need the original dataclass.
    """

    # Originating dataclass; None for schemas reconstructed from a dict.
    row_cls: type[Any] | None
    # Columns in declaration order.
    columns: list[CompiledColumn]
    # The same columns indexed by name.
    columns_by_name: dict[str, CompiledColumn]
    # Cached validator model class; filled in lazily by schema_validation.
    validator_model: type[Any] | None = None
# ---------------------------------------------------------------------------
# Internal helpers
# ---------------------------------------------------------------------------


def get_blosc2_field_metadata(dc_field: dataclasses.Field) -> dict[str, Any] | None:
    """Return the ``blosc2`` metadata dict stored on a dataclass field, or ``None``.

    ``None`` means the field carries no entry under
    ``BLOSC2_FIELD_METADATA_KEY``.
    """
    return dc_field.metadata.get(BLOSC2_FIELD_METADATA_KEY)


def infer_spec_from_annotation(annotation: Any) -> SchemaSpec:
    """Build a default :class:`SchemaSpec` from a plain Python type annotation.

    Supports ``bool``, ``int``, ``float``, ``complex``, ``str`` and ``bytes``
    (the keys of ``_ANNOTATION_TO_SPEC``).

    Raises
    ------
    TypeError
        If the annotation is not one of the supported primitive types.
    """
    spec_cls = _ANNOTATION_TO_SPEC.get(annotation)
    if spec_cls is None:
        # The hint names a placeholder spec so the user can see the shape of
        # the explicit declaration expected here.
        raise TypeError(
            f"Cannot infer a Blosc2 schema spec from annotation {annotation!r}. "
            f"Use b2.field(b2.<spec>(...)) to declare this column explicitly."
        )
    return spec_cls()


def validate_annotation_matches_spec(name: str, annotation: Any, spec: SchemaSpec) -> None:
    """Raise :exc:`TypeError` if *annotation* is incompatible with *spec*.

    The check is by identity: the annotation must be exactly
    ``spec.python_type``.

    Parameters
    ----------
    name:
        Column name, used only in the error message.
    annotation:
        The resolved Python type from the dataclass field.
    spec:
        The :class:`SchemaSpec` attached via ``b2.field(...)``.
    """
    expected = spec.python_type
    if annotation is expected:
        return
    raise TypeError(
        f"Column {name!r}: annotation {annotation!r} is incompatible with "
        f"spec {type(spec).__name__!r} (expected Python type {expected.__name__!r})."
    )
# ---------------------------------------------------------------------------
# Public compiler entry point
# ---------------------------------------------------------------------------


_RESERVED_COLUMN_NAMES: frozenset[str] = frozenset({"_meta", "_valid_rows", "_cols", "_indexes"})


def _validate_column_name(name: str) -> None:
    """Raise :exc:`ValueError` if *name* is not a legal CTable column name.

    Rules (enforced for both in-memory and persistent tables so that an
    in-memory schema can always be persisted without surprises):

    * must be a non-empty string
    * must not be one of the reserved internal names
    * must not start with ``_`` (reserved for internal table layout)
    * must not contain ``/`` (used as path separator in persistent layout)
    """
    if not name:
        raise ValueError("Column name cannot be empty.")
    # Checked before the generic underscore rule: every reserved name starts
    # with '_', so testing the prefix first would make this branch unreachable
    # and hide the more specific message.
    if name in _RESERVED_COLUMN_NAMES:
        raise ValueError(f"Column name {name!r} is reserved for internal CTable use.")
    if name.startswith("_"):
        raise ValueError(f"Column name cannot start with '_' (reserved for internal use): {name!r}")
    if "/" in name:
        raise ValueError(f"Column name cannot contain '/': {name!r}")


def compile_schema(row_cls: type[Any]) -> CompiledSchema:
    """Compile *row_cls* (a dataclass) into a :class:`CompiledSchema`.

    Parameters
    ----------
    row_cls:
        A class decorated with ``@dataclass``.  Each field must either carry a
        ``b2.field(...)`` default or use a supported plain annotation.

    Returns
    -------
    CompiledSchema

    Raises
    ------
    TypeError
        If *row_cls* is not a dataclass, if its type hints cannot be resolved,
        if a field spec is incompatible with its annotation, or if an
        unsupported annotation is encountered.
    ValueError
        If any column name violates the naming rules.
    """
    if not (dataclasses.is_dataclass(row_cls) and isinstance(row_cls, type)):
        raise TypeError(
            f"{row_cls!r} is not a dataclass type. CTable row schemas must be defined with @dataclass."
        )

    # get_type_hints() turns string annotations (e.g. under
    # `from __future__ import annotations`) into real type objects.
    try:
        hints = typing.get_type_hints(row_cls)
    except Exception as exc:
        raise TypeError(f"Could not resolve type hints for {row_cls!r}: {exc}") from exc

    columns: list[CompiledColumn] = []
    for dc_field in dataclasses.fields(row_cls):
        name = dc_field.name
        _validate_column_name(name)
        annotation = hints.get(name, dc_field.type)
        meta = get_blosc2_field_metadata(dc_field)

        if meta is None:
            # Shorthand: a plain annotation without b2.field().
            spec = infer_spec_from_annotation(annotation)
            config = ColumnConfig(cparams=None, dparams=None, chunks=None, blocks=None)
        else:
            # Explicit b2.field(...) declaration.
            spec = meta["spec"]
            if not isinstance(spec, SchemaSpec):
                raise TypeError(
                    f"Column {name!r}: b2.field() requires a SchemaSpec as its first "
                    f"argument, got {type(spec)!r}."
                )
            validate_annotation_matches_spec(name, annotation, spec)
            config = ColumnConfig(
                cparams=meta.get("cparams"),
                dparams=meta.get("dparams"),
                chunks=meta.get("chunks"),
                blocks=meta.get("blocks"),
            )

        # Resolve the column default; MISSING marks a required column.
        if dc_field.default is not MISSING:
            default = dc_field.default
        elif dc_field.default_factory is not MISSING:  # type: ignore[misc]
            # NOTE(review): this stores the factory callable itself, not the
            # value it produces -- confirm that consumers of
            # CompiledColumn.default expect a callable here.
            default = dc_field.default_factory
        else:
            default = MISSING

        columns.append(
            CompiledColumn(
                name=name,
                py_type=annotation,
                spec=spec,
                dtype=spec.dtype,
                default=default,
                config=config,
                display_width=compute_display_width(spec),
            )
        )

    return CompiledSchema(
        row_cls=row_cls,
        columns=columns,
        columns_by_name={col.name: col for col in columns},
    )
# ---------------------------------------------------------------------------
# Schema serialization helpers (Step 12 — persistence groundwork)
# ---------------------------------------------------------------------------


def _default_to_json(value: Any) -> Any:
    """Encode a column default for storage in a schema dict.

    ``MISSING`` becomes ``None``, complex numbers become a tagged dict, and
    every other value is returned unchanged.
    """
    if isinstance(value, complex):
        return {"__complex__": True, "real": value.real, "imag": value.imag}
    return None if value is MISSING else value


def _default_from_json(value: Any) -> Any:
    """Decode a value produced by :func:`_default_to_json`."""
    if value is None:
        return MISSING
    is_tagged_complex = isinstance(value, dict) and value.get("__complex__")
    if is_tagged_complex:
        return complex(value["real"], value["imag"])
    return value


def schema_to_dict(schema: CompiledSchema) -> dict[str, Any]:
    """Serialize *schema* to a plain dict.

    Each column contributes one entry holding its name, whatever
    ``spec.to_metadata_dict()`` returns (the ``kind`` tag plus constraints),
    its encoded default, and any storage option that is not ``None``.
    The result can be handed to :func:`schema_from_dict` to rebuild the
    schema without the original dataclass.

    Example output::

        {
            "version": 1,
            "row_cls": "Row",
            "columns": [
                {"name": "id", "kind": "int64", "ge": 0, "default": null},
                {"name": "active", "kind": "bool", "default": true},
            ]
        }
    """
    serialized: list[dict[str, Any]] = []
    for column in schema.columns:
        record: dict[str, Any] = {"name": column.name, **column.spec.to_metadata_dict()}
        record["default"] = _default_to_json(column.default)

        storage = column.config
        for option in ("cparams", "dparams"):
            setting = getattr(storage, option)
            if setting is not None:
                record[option] = setting
        # Shapes are stored as lists rather than tuples.
        for option in ("chunks", "blocks"):
            shape = getattr(storage, option)
            if shape is not None:
                record[option] = list(shape)
        serialized.append(record)

    row_cls = schema.row_cls
    return {
        "version": 1,
        "row_cls": None if row_cls is None else row_cls.__name__,
        "columns": serialized,
    }


def schema_from_dict(data: dict[str, Any]) -> CompiledSchema:
    """Rebuild a :class:`CompiledSchema` from a :func:`schema_to_dict` dict.

    The original Python dataclass is *not* required; ``row_cls`` on the
    returned schema is ``None``.

    Raises
    ------
    ValueError
        If *data* uses an unknown schema version or an unknown column kind.
    """
    version = data.get("version", 1)
    if version != 1:
        raise ValueError(f"Unsupported schema version {version!r}")

    rebuilt: list[CompiledColumn] = []
    for raw in data["columns"]:
        fields = dict(raw)  # work on a copy; the caller's dict stays intact
        name = fields.pop("name")
        kind = fields.pop("kind")
        default = _default_from_json(fields.pop("default", None))
        cparams = fields.pop("cparams", None)
        dparams = fields.pop("dparams", None)
        chunks = tuple(fields.pop("chunks")) if "chunks" in fields else None
        blocks = tuple(fields.pop("blocks")) if "blocks" in fields else None

        spec_cls = _KIND_TO_SPEC.get(kind)
        if spec_cls is None:
            raise ValueError(f"Unknown column kind {kind!r}")

        # Whatever is left are the spec's constraint keyword arguments.
        spec = spec_cls(**fields)
        storage = ColumnConfig(cparams=cparams, dparams=dparams, chunks=chunks, blocks=blocks)
        rebuilt.append(
            CompiledColumn(
                name=name,
                py_type=spec.python_type,
                spec=spec,
                dtype=spec.dtype,
                default=default,
                config=storage,
                display_width=compute_display_width(spec),
            )
        )

    return CompiledSchema(
        row_cls=None,
        columns=rebuilt,
        columns_by_name={col.name: col for col in rebuilt},
    )
+ + Raises + ------ + ValueError + If *data* uses an unknown schema version or an unknown column kind. + """ + version = data.get("version", 1) + if version != 1: + raise ValueError(f"Unsupported schema version {version!r}") + + columns: list[CompiledColumn] = [] + for entry in data["columns"]: + entry = dict(entry) # don't mutate caller's data + name = entry.pop("name") + kind = entry.pop("kind") + default = _default_from_json(entry.pop("default", None)) + cparams = entry.pop("cparams", None) + dparams = entry.pop("dparams", None) + chunks = tuple(entry.pop("chunks")) if "chunks" in entry else None + blocks = tuple(entry.pop("blocks")) if "blocks" in entry else None + + spec_cls = _KIND_TO_SPEC.get(kind) + if spec_cls is None: + raise ValueError(f"Unknown column kind {kind!r}") + + # Remaining keys in entry are constraint kwargs (ge, le, max_length, …) + spec = spec_cls(**entry) + + columns.append( + CompiledColumn( + name=name, + py_type=spec.python_type, + spec=spec, + dtype=spec.dtype, + default=default, + config=ColumnConfig(cparams=cparams, dparams=dparams, chunks=chunks, blocks=blocks), + display_width=compute_display_width(spec), + ) + ) + + return CompiledSchema( + row_cls=None, + columns=columns, + columns_by_name={col.name: col for col in columns}, + ) diff --git a/src/blosc2/schema_validation.py b/src/blosc2/schema_validation.py new file mode 100644 index 00000000..91f157f7 --- /dev/null +++ b/src/blosc2/schema_validation.py @@ -0,0 +1,159 @@ +####################################################################### +# Copyright (c) 2019-present, Blosc Development Team +# All rights reserved. +# +# This source code is licensed under a BSD-style license (found in the +# LICENSE file in the root directory of this source tree) +####################################################################### + +"""Row-level validation via an internally-generated Pydantic model. + +All Pydantic-specific logic is isolated here. 
def build_validator_model(schema: CompiledSchema) -> type[BaseModel]:
    """Return (and cache) a Pydantic model class for *schema*.

    The model is built on the first call and stored on
    ``schema.validator_model``; later calls return that cached class.
    Each field's constraints come from the column spec's
    ``to_pydantic_kwargs()``.

    Nullable columns (those whose spec has a non-``None`` ``null_value``) are
    typed as ``T | None``.  When such a column has no default of its own it
    defaults to ``None``, so a masked null sentinel can be passed as ``None``.
    """
    cached = schema.validator_model
    if cached is not None:
        return cached

    field_definitions: dict[str, Any] = {}
    for col in schema.columns:
        constraints = col.spec.to_pydantic_kwargs()
        nullable = getattr(col.spec, "null_value", None) is not None
        py_type = col.py_type | None if nullable else col.py_type

        if col.default is not MISSING:
            field = Field(default=col.default, **constraints)
        elif nullable:
            field = Field(default=None, **constraints)
        else:
            # No default at all: the field is required.
            field = Field(**constraints)
        field_definitions[col.name] = (py_type, field)

    cls_name = schema.row_cls.__name__ if schema.row_cls is not None else "Unknown"
    model_cls = create_model(f"_Validator_{cls_name}", **field_definitions)
    schema.validator_model = model_cls
    return model_cls


def _is_null_value(val, null_value) -> bool:
    """Return True if *val* equals the null sentinel, handling NaN correctly.

    A NaN sentinel matches any NaN value.  NaN-ness is decided with
    ``math.isnan`` rather than an ``isinstance(..., float)`` test, so NumPy
    float scalars that are not ``float`` subclasses (e.g. ``np.float32``) are
    recognised on either side.  Any other sentinel is compared with ``==``.

    Parameters
    ----------
    val:
        Candidate value taken from a row.
    null_value:
        The column's null sentinel; ``None`` means the column is not nullable.
    """
    import math

    def _is_nan(candidate) -> bool:
        # math.isnan rejects non-numeric input (TypeError/ValueError) and
        # ints too large for a float (OverflowError); none of those are NaN.
        try:
            return math.isnan(candidate)
        except (TypeError, ValueError, OverflowError):
            return False

    if null_value is None:
        return False
    if _is_nan(null_value):
        # NaN never compares equal to itself, so it needs its own test.
        return _is_nan(val)
    return bool(val == null_value)
def _mask_nulls(schema: CompiledSchema, row: dict[str, Any]) -> tuple[dict[str, Any], dict[str, Any]]:
    """Swap null sentinels for ``None`` ahead of Pydantic validation.

    For every column whose spec defines a ``null_value``, the row's value is
    tested with ``_is_null_value``.  Matching values are replaced by ``None``
    in a copy of the row, and the original sentinel is remembered so the
    caller can put it back after validation.

    Returns ``(masked_row, nulled)`` where *nulled* maps column name to the
    sentinel value that was masked.
    """
    sentinels: dict[str, Any] = {}
    for column in schema.columns:
        sentinel = getattr(column.spec, "null_value", None)
        if sentinel is not None:
            candidate = row.get(column.name)
            if _is_null_value(candidate, sentinel):
                sentinels[column.name] = candidate

    masked = dict(row)
    masked.update(dict.fromkeys(sentinels))  # every masked column -> None
    return masked, sentinels


def validate_row(schema: CompiledSchema, row: dict[str, Any]) -> dict[str, Any]:
    """Validate a single row dict and return the coerced values.

    Parameters
    ----------
    schema:
        Compiled schema for the table.
    row:
        ``{column_name: value}`` mapping for one row.

    Returns
    -------
    dict
        The validator model's dump of the row, with any masked null
        sentinels restored to their original values.

    Raises
    ------
    ValueError
        If validation fails; the message is the text of the Pydantic error.
    """
    validator = build_validator_model(schema)
    masked, sentinels = _mask_nulls(schema, row)
    try:
        validated = validator(**masked)
    except ValidationError as exc:
        # Surface a plain ValueError so callers need not import Pydantic.
        raise ValueError(str(exc)) from exc
    return {**validated.model_dump(), **sentinels}
def validate_rows_rowwise(schema: CompiledSchema, rows: list[dict[str, Any]]) -> list[dict[str, Any]]:
    """Validate a list of row dicts one at a time.

    Parameters
    ----------
    schema:
        Compiled schema for the table.
    rows:
        List of ``{column_name: value}`` mappings.

    Returns
    -------
    list of dict
        One validated dict per input row, in the same order, with masked
        null sentinels restored.

    Raises
    ------
    ValueError
        On the first row that fails validation; the message carries the row
        index followed by the Pydantic error details.
    """
    validator = build_validator_model(schema)
    validated_rows: list[dict[str, Any]] = []
    for index, raw_row in enumerate(rows):
        masked, sentinels = _mask_nulls(schema, raw_row)
        try:
            checked = validator(**masked)
        except ValidationError as exc:
            raise ValueError(f"Row {index}: {exc}") from exc
        validated_rows.append({**checked.model_dump(), **sentinels})
    return validated_rows
def _validate_string_lengths(col: CompiledColumn, arr: Any) -> None:
    """Check min_length / max_length constraints on a string/bytes column.

    Raises
    ------
    ValueError
        Naming the first value that is too long or too short.
    """
    if arr.dtype.kind in ("U", "S"):
        lengths = np.char.str_len(arr)
    else:
        # Fallback for arrays that are not fixed-width text (e.g. object dtype).
        lengths = np.vectorize(len)(arr.astype(object))

    spec = col.spec
    max_length = getattr(spec, "max_length", None)
    if max_length is not None:
        too_long = lengths > max_length
        if np.any(too_long):
            first = arr.astype(object)[too_long][0]
            raise ValueError(f"Column '{col.name}': value {first!r} exceeds max_length={max_length}")
    min_length = getattr(spec, "min_length", None)
    if min_length is not None:
        too_short = lengths < min_length
        if np.any(too_short):
            first = arr.astype(object)[too_short][0]
            raise ValueError(
                f"Column '{col.name}': value {first!r} is shorter than min_length={min_length}"
            )


def _null_mask_for_spec(arr: np.ndarray, spec) -> np.ndarray | None:
    """Return a boolean mask that is True where *arr* holds the null sentinel.

    Returns ``None`` when *spec* has no ``null_value``.  A NaN sentinel is
    detected with ``np.isnan`` so NumPy float scalars such as ``np.float32``
    count, not only Python ``float``.  The returned mask always has the shape
    of *arr*, so it is safe to use for boolean indexing.
    """
    null_value = getattr(spec, "null_value", None)
    if null_value is None:
        return None

    try:
        sentinel_is_nan = bool(np.isnan(null_value))
    except (TypeError, ValueError):
        # Non-numeric sentinel (str, bytes, ...): cannot be NaN.
        sentinel_is_nan = False

    if sentinel_is_nan:
        try:
            return np.isnan(arr)
        except TypeError:
            # isnan is undefined for this dtype; fall through to equality,
            # which never matches a NaN sentinel.
            pass

    mask = np.asarray(arr == null_value, dtype=bool)
    if mask.shape != arr.shape:
        # A scalar comparison result would turn `arr[~mask]` into integer
        # indexing, so expand it to a full-size mask.
        mask = np.broadcast_to(mask, arr.shape)
    return mask


def validate_column_values(col: CompiledColumn, values: Any) -> None:
    """Check all constraint attributes of *col*'s spec against *values*.

    Values equal to the spec's null sentinel are excluded before any
    constraint is evaluated.

    Parameters
    ----------
    col:
        Compiled column descriptor (carries the spec with constraints).
    values:
        Array-like of values for this column.

    Raises
    ------
    ValueError
        If any value violates a constraint declared on the column's spec.
    """
    spec = col.spec
    arr = np.asarray(values)

    null_mask = _null_mask_for_spec(arr, spec)
    check = arr if null_mask is None else arr[~null_mask]

    # Numeric bounds: each entry pairs a constraint name with the predicate
    # that flags a violating value.
    bound_checks = (
        ("ge", lambda data, limit: data < limit),
        ("gt", lambda data, limit: data <= limit),
        ("le", lambda data, limit: data > limit),
        ("lt", lambda data, limit: data >= limit),
    )
    for constraint, violates in bound_checks:
        limit = getattr(spec, constraint, None)
        if limit is None:
            continue
        bad = violates(check, limit)
        if np.any(bad):
            first = check[bad][0]
            raise ValueError(
                f"Column '{col.name}': value {first!r} violates constraint {constraint}={limit}"
            )

    # String / bytes length bounds
    has_length_bound = (
        getattr(spec, "max_length", None) is not None or getattr(spec, "min_length", None) is not None
    )
    if has_length_bound:
        _validate_string_lengths(col, check)
def validate_column_batch(schema: CompiledSchema, columns: dict[str, Any]) -> None:
    """Validate a dict of column arrays against all constraints in *schema*.

    Schema columns that are absent from *columns* are skipped; entries in
    *columns* that are not part of the schema are ignored.

    Parameters
    ----------
    schema:
        Compiled schema for the table.
    columns:
        ``{column_name: array_like}`` mapping of the batch being inserted.

    Raises
    ------
    ValueError
        On the first constraint violation found, naming the column and
        the violated constraint.
    """
    supplied = (col for col in schema.columns if col.name in columns)
    for col in supplied:
        validate_column_values(col, columns[col.name])
def _resolve_store_alias(urlpath):
    """Map a suffix-less store path to an existing store file or directory.

    *urlpath* is returned unchanged when it already exists or already ends
    in ``.b2d``, ``.b2z`` or ``.b2e``.  Otherwise each of those suffixes is
    appended in that order and the first candidate that exists is returned;
    if none exists, *urlpath* is returned as given.
    """
    store_suffixes = (".b2d", ".b2z", ".b2e")
    if os.path.exists(urlpath) or urlpath.endswith(store_suffixes):
        return urlpath
    candidates = (urlpath + suffix for suffix in store_suffixes)
    return next((path for path in candidates if os.path.exists(path)), urlpath)
def _read_treestore_root_manifest(store):
    """Read the root manifest stored under ``/_meta`` in *store*.

    Returns ``None`` when the store has no ``/_meta`` entry.  Otherwise
    returns a dict with the manifest's ``kind`` (decoded to ``str`` if it was
    stored as bytes), its ``version``, and the ``/_meta`` object itself under
    ``meta``.

    Raises
    ------
    ValueError
        If ``/_meta`` is not an SChunk, if ``kind`` or ``version`` is
        missing, or if either has the wrong type.
    """
    try:
        meta_obj = store["/_meta"]
    except KeyError:
        return None

    if not isinstance(meta_obj, blosc2.SChunk):
        raise ValueError("TreeStore root manifest '/_meta' must be an SChunk.")

    vlmeta = meta_obj.vlmeta
    required = {}
    for field in ("kind", "version"):
        try:
            required[field] = vlmeta[field]
        except KeyError as exc:
            raise ValueError(f"TreeStore root manifest is missing required field '{field}'.") from exc

    kind = required["kind"]
    version = required["version"]
    if isinstance(kind, bytes):
        kind = kind.decode()
    if not isinstance(kind, str):
        raise ValueError("TreeStore root manifest field 'kind' must be a string.")
    if not isinstance(version, int):
        raise ValueError("TreeStore root manifest field 'version' must be an integer.")

    return {"kind": kind, "version": version, "meta": meta_obj}


def _open_treestore_root_object(store, urlpath, mode):
    """Return the object a TreeStore root manifest says *store* represents.

    The store itself is returned when it has no manifest, when the manifest
    kind is not ``"ctable"``, or when *mode* is neither ``"r"`` nor ``"a"``.
    For a ``"ctable"`` manifest the probe store is discarded and the table is
    opened with ``blosc2.CTable.open``.
    """
    manifest = _read_treestore_root_manifest(store)
    is_ctable = manifest is not None and manifest["kind"] == "ctable"
    if not is_ctable or mode not in {"r", "a"}:
        return store

    # The probe store was opened only to peek at the manifest, so it is
    # discarded rather than closed: a full close() would trigger to_b2z()
    # even though nothing was modified, and CTable.open() creates its own
    # store anyway.
    store.discard()
    return blosc2.CTable.open(urlpath, mode=mode)
def _finalize_special_open(special, urlpath, mode):
    """Post-process the result of a special-store open attempt.

    ``None`` (nothing was opened) is passed through.  A ``TreeStore`` is
    handed to ``_open_treestore_root_object`` so a root manifest can redirect
    it; any other object is returned unchanged.
    """
    if special is not None and isinstance(special, blosc2.TreeStore):
        return _open_treestore_root_object(special, urlpath, mode)
    return special


# Unique default for open()'s ``mode`` argument, so a call that omits the
# mode can be told apart from one that passes it explicitly.
_OPEN_MODE_SENTINEL = object()
schunk.decompress_chunk(i, dest) @@ -1756,12 +1835,25 @@ def open( To open the same schunk memory-mapped, we simply need to pass the `mmap_mode` parameter: - >>> sc_open_mmap = blosc2.open(urlpath=urlpath, mmap_mode="r") + >>> sc_open_mmap = blosc2.open(urlpath=urlpath, mode="r", mmap_mode="r") >>> sc_open.nchunks == sc_open_mmap.nchunks True >>> all(sc_open.decompress_chunk(i, dest1) == sc_open_mmap.decompress_chunk(i, dest1) for i in range(nchunks)) True """ + # Resolve the sentinel before URLPath check so we can raise the correct + # error without also triggering the deprecation warning for invalid calls. + if mode is _OPEN_MODE_SENTINEL: + # TODO: remove the sentinel/FutureWarning path once blosc2.open() defaults to mode="r". + warnings.warn( + "blosc2.open() currently defaults to mode='a', but this will change " + "to mode='r' in a future release. Pass mode='a' explicitly to keep " + "writable behavior, or mode='r' for read-only access.", + FutureWarning, + stacklevel=2, + ) + mode = "a" + if isinstance(urlpath, blosc2.URLPath): if mode != "r" or offset != 0 or kwargs != {}: raise NotImplementedError( @@ -1772,14 +1864,40 @@ def open( if isinstance(urlpath, pathlib.PurePath): urlpath = str(urlpath) - special = _open_special_store(urlpath, mode, offset, **kwargs) + # Keep explicit store paths on the direct dispatch path. For regular + # Blosc containers, try the standard open first and only fall back to the + # more expensive store probing when that fails. 
+ if urlpath.endswith((".b2d", ".b2z", ".b2e")): + special = _open_special_store(urlpath, mode, offset, **kwargs) + special = _finalize_special_open(special, urlpath, mode) + if special is not None: + return special + + regular_exc = None + if os.path.exists(urlpath): + _set_default_dparams(kwargs) + try: + res = blosc2_ext.open(urlpath, mode, offset, **kwargs) + except Exception as exc: + regular_exc = exc + else: + return process_opened_object(res) + + resolved_urlpath = _resolve_store_alias(urlpath) + special_path = ( + resolved_urlpath if resolved_urlpath != urlpath or not os.path.exists(urlpath) else urlpath + ) + special = _open_special_store(special_path, mode, offset, **kwargs) + special = _finalize_special_open(special, special_path, mode) if special is not None: return special - if not os.path.exists(urlpath): - raise FileNotFoundError(f"No such file or directory: {urlpath}") + if regular_exc is not None: + raise regular_exc + if not os.path.exists(special_path): + raise FileNotFoundError(f"No such file or directory: {special_path}") _set_default_dparams(kwargs) - res = blosc2_ext.open(urlpath, mode, offset, **kwargs) + res = blosc2_ext.open(special_path, mode, offset, **kwargs) return process_opened_object(res) diff --git a/src/blosc2/tree_store.py b/src/blosc2/tree_store.py index a378fa49..6fd02e9a 100644 --- a/src/blosc2/tree_store.py +++ b/src/blosc2/tree_store.py @@ -105,7 +105,8 @@ class TreeStore(DictStore): File mode ('r', 'w', 'a'). Default is 'a'. tmpdir : str or None, optional Temporary directory to use when working with `.b2z` files. If None, - a system temporary directory will be managed. Default is None. + a temporary directory is created in the same directory as the `.b2z` + file, so that unpacked data stays on the same filesystem. Default is None. cparams : dict or None, optional Compression parameters for the internal embed store. If None, the default Blosc2 parameters are used. 
@@ -154,8 +155,12 @@ def __init__(self, *args, _from_parent_store=None, **kwargs): It supports the same arguments as :class:`blosc2.DictStore`. """ if _from_parent_store is not None: - # This is a subtree view, copy state from parent + # This is a subtree view, copy state from parent. + # Mark it as closed so DictStore.__del__ does not attempt to pack + # or clean up the shared backing store when this ephemeral view + # is garbage-collected. self.__dict__.update(_from_parent_store.__dict__) + self._closed = True else: # Call initialization and mark this storage as a b2tree object super().__init__(*args, **kwargs, _storage_meta={"b2tree": {"version": 1}}) diff --git a/tests/conftest.py b/tests/conftest.py index 35db4afe..b089496c 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -14,6 +14,16 @@ import blosc2 +# Each SChunk allocates C-level thread pools (pthreads) for its compression +# and decompression contexts. Python 3.14 changed the GC gen-2 threshold +# to 0, so long-lived objects are never collected automatically; they +# accumulate until an explicit gc.collect() (e.g. pytest session cleanup). +# Joining thousands of idle pthreads at once can hit the macOS thread-count +# ceiling (6 144) and hang. Periodically forcing a full collection keeps +# the thread count bounded. +_GC_COLLECT_INTERVAL = 50 # collect every N tests +_test_counter = 0 + # Each SChunk allocates C-level thread pools (pthreads) for its compression # and decompression contexts. Python 3.14 changed the GC gen-2 threshold diff --git a/tests/ctable/test_arrow_interop.py b/tests/ctable/test_arrow_interop.py new file mode 100644 index 00000000..9d997030 --- /dev/null +++ b/tests/ctable/test_arrow_interop.py @@ -0,0 +1,225 @@ +####################################################################### +# Copyright (c) 2019-present, Blosc Development Team +# All rights reserved. 
+# +# SPDX-License-Identifier: BSD-3-Clause +####################################################################### + +"""Tests for CTable.to_arrow() and CTable.from_arrow().""" + +from dataclasses import dataclass + +import numpy as np +import pytest + +import blosc2 +from blosc2 import CTable + +pa = pytest.importorskip("pyarrow") + + +@dataclass +class Row: + id: int = blosc2.field(blosc2.int64(ge=0)) + score: float = blosc2.field(blosc2.float64(ge=0, le=100), default=0.0) + active: bool = blosc2.field(blosc2.bool(), default=True) + label: str = blosc2.field(blosc2.string(max_length=16), default="") + + +DATA10 = [(i, float(i * 10 % 100), i % 2 == 0, f"r{i}") for i in range(10)] + + +# =========================================================================== +# to_arrow() +# =========================================================================== + + +def test_to_arrow_returns_pyarrow_table(): + t = CTable(Row, new_data=DATA10) + at = t.to_arrow() + assert isinstance(at, pa.Table) + + +def test_to_arrow_column_names(): + t = CTable(Row, new_data=DATA10) + at = t.to_arrow() + assert at.column_names == ["id", "score", "active", "label"] + + +def test_to_arrow_row_count(): + t = CTable(Row, new_data=DATA10) + at = t.to_arrow() + assert len(at) == 10 + + +def test_to_arrow_int_values(): + t = CTable(Row, new_data=DATA10) + at = t.to_arrow() + np.testing.assert_array_equal(at["id"].to_pylist(), [r[0] for r in DATA10]) + + +def test_to_arrow_float_values(): + t = CTable(Row, new_data=DATA10) + at = t.to_arrow() + np.testing.assert_allclose(at["score"].to_pylist(), [r[1] for r in DATA10]) + + +def test_to_arrow_bool_values(): + t = CTable(Row, new_data=DATA10) + at = t.to_arrow() + assert at["active"].to_pylist() == [r[2] for r in DATA10] + + +def test_to_arrow_string_values(): + t = CTable(Row, new_data=DATA10) + at = t.to_arrow() + assert at["label"].to_pylist() == [r[3] for r in DATA10] + + +def test_to_arrow_string_type(): + t = CTable(Row, new_data=DATA10) + 
at = t.to_arrow() + assert at.schema.field("label").type == pa.string() + + +def test_to_arrow_skips_deleted_rows(): + t = CTable(Row, new_data=DATA10) + t.delete([0, 1]) + at = t.to_arrow() + assert len(at) == 8 + assert at["id"].to_pylist() == list(range(2, 10)) + + +def test_to_arrow_empty_table(): + t = CTable(Row) + at = t.to_arrow() + assert len(at) == 0 + assert at.column_names == ["id", "score", "active", "label"] + + +def test_to_arrow_select_view(): + t = CTable(Row, new_data=DATA10) + at = t.select(["id", "score"]).to_arrow() + assert at.column_names == ["id", "score"] + assert len(at) == 10 + + +def test_to_arrow_where_view(): + t = CTable(Row, new_data=DATA10) + at = t.where(t["id"] > 4).to_arrow() + assert len(at) == 5 + + +# =========================================================================== +# from_arrow() +# =========================================================================== + + +def test_from_arrow_returns_ctable(): + t = CTable(Row, new_data=DATA10) + at = t.to_arrow() + t2 = CTable.from_arrow(at) + assert isinstance(t2, CTable) + + +def test_from_arrow_row_count(): + t = CTable(Row, new_data=DATA10) + at = t.to_arrow() + t2 = CTable.from_arrow(at) + assert len(t2) == 10 + + +def test_from_arrow_column_names(): + t = CTable(Row, new_data=DATA10) + at = t.to_arrow() + t2 = CTable.from_arrow(at) + assert t2.col_names == ["id", "score", "active", "label"] + + +def test_from_arrow_int_values(): + t = CTable(Row, new_data=DATA10) + at = t.to_arrow() + t2 = CTable.from_arrow(at) + np.testing.assert_array_equal(t2["id"].to_numpy(), t["id"].to_numpy()) + + +def test_from_arrow_float_values(): + t = CTable(Row, new_data=DATA10) + at = t.to_arrow() + t2 = CTable.from_arrow(at) + np.testing.assert_allclose(t2["score"].to_numpy(), t["score"].to_numpy()) + + +def test_from_arrow_bool_values(): + t = CTable(Row, new_data=DATA10) + at = t.to_arrow() + t2 = CTable.from_arrow(at) + np.testing.assert_array_equal(t2["active"].to_numpy(), 
t["active"].to_numpy()) + + +def test_from_arrow_string_values(): + t = CTable(Row, new_data=DATA10) + at = t.to_arrow() + t2 = CTable.from_arrow(at) + assert t2["label"].to_numpy().tolist() == t["label"].to_numpy().tolist() + + +def test_from_arrow_empty_table(): + schema = pa.schema( + [ + pa.field("id", pa.int64()), + pa.field("val", pa.float64()), + ] + ) + at = pa.table({"id": pa.array([], type=pa.int64()), "val": pa.array([], type=pa.float64())}) + t = CTable.from_arrow(at) + assert len(t) == 0 + assert t.col_names == ["id", "val"] + + +def test_from_arrow_roundtrip(): + """to_arrow then from_arrow preserves all values.""" + t = CTable(Row, new_data=DATA10) + t2 = CTable.from_arrow(t.to_arrow()) + for name in ["id", "score", "active"]: + np.testing.assert_array_equal(t2[name].to_numpy(), t[name].to_numpy()) + assert t2["label"].to_numpy().tolist() == t["label"].to_numpy().tolist() + + +def test_from_arrow_all_numeric_types(): + """All integer and float Arrow types map to correct blosc2 specs.""" + at = pa.table( + { + "i8": pa.array([1, 2, 3], type=pa.int8()), + "i16": pa.array([1, 2, 3], type=pa.int16()), + "i32": pa.array([1, 2, 3], type=pa.int32()), + "i64": pa.array([1, 2, 3], type=pa.int64()), + "u8": pa.array([1, 2, 3], type=pa.uint8()), + "u16": pa.array([1, 2, 3], type=pa.uint16()), + "u32": pa.array([1, 2, 3], type=pa.uint32()), + "u64": pa.array([1, 2, 3], type=pa.uint64()), + "f32": pa.array([1.0, 2.0, 3.0], type=pa.float32()), + "f64": pa.array([1.0, 2.0, 3.0], type=pa.float64()), + } + ) + t = CTable.from_arrow(at) + assert len(t) == 3 + assert t.col_names == list(at.column_names) + + +def test_from_arrow_string_max_length(): + """String max_length is set from the longest value in the data.""" + at = pa.table({"name": pa.array(["hi", "hello world", "!"], type=pa.string())}) + t = CTable.from_arrow(at) + # "hello world" is 11 chars — stored dtype must accommodate it + assert t["name"].dtype.itemsize // 4 >= 11 + + +def 
test_from_arrow_unsupported_type_raises(): + at = pa.table({"ts": pa.array([1, 2, 3], type=pa.timestamp("s"))}) + with pytest.raises(TypeError, match="No blosc2 spec"): + CTable.from_arrow(at) + + +if __name__ == "__main__": + pytest.main(["-v", __file__]) diff --git a/tests/ctable/test_column.py b/tests/ctable/test_column.py new file mode 100644 index 00000000..838d4c75 --- /dev/null +++ b/tests/ctable/test_column.py @@ -0,0 +1,748 @@ +####################################################################### +# Copyright (c) 2019-present, Blosc Development Team +# All rights reserved. +# +# SPDX-License-Identifier: BSD-3-Clause +####################################################################### + +from dataclasses import dataclass + +import numpy as np +import pytest + +import blosc2 +from blosc2 import CTable + + +@dataclass +class Row: + id: int = blosc2.field(blosc2.int64(ge=0)) + score: float = blosc2.field(blosc2.float64(ge=0)) + active: bool = blosc2.field(blosc2.bool(), default=True) + + +@dataclass +class StrRow: + label: str = blosc2.field(blosc2.string(max_length=16)) + + +DATA20 = [(i, float(i * 10), True) for i in range(20)] + + +# ------------------------------------------------------------------- +# Tests +# ------------------------------------------------------------------- + + +def test_column_metadata(): + """dtype correctness, internal reference consistency, and mask defaults.""" + tabla = CTable(Row, new_data=DATA20) + + assert tabla.id.dtype == np.int64 + assert tabla.score.dtype == np.float64 + assert tabla.active.dtype == np.bool_ + + assert tabla.id._raw_col is tabla._cols["id"] + assert tabla.id._valid_rows is tabla._valid_rows + + # mask is None by default + assert tabla.id._mask is None + assert tabla.score._mask is None + + +def test_column_getitem_no_holes(): + """int, slice, and list indexing on a full table.""" + tabla = CTable(Row, new_data=DATA20) + col = tabla.id + + # int + assert col[0] == 0 + assert col[5] == 5 + assert 
col[19] == 19 + assert col[-1] == 19 + assert col[-5] == 15 + + # slice returns a Column view + assert isinstance(col[0:5], blosc2.Column) + assert isinstance(col[10:15], blosc2.Column) + + # list + assert list(col[[0, 5, 10, 15]]) == [0, 5, 10, 15] + assert list(col[[19, 0, 10]]) == [19, 0, 10] + + +def test_column_getitem_with_holes(): + """int, slice, and list indexing after deletions.""" + tabla = CTable(Row, new_data=DATA20) + tabla.delete([1, 3, 5, 7, 9]) + col = tabla.id + + assert col[0] == 0 + assert col[1] == 2 + assert col[2] == 4 + assert col[3] == 6 + assert col[4] == 8 + assert col[-1] == 19 + assert col[-2] == 18 + + assert list(col[[0, 2, 4]]) == [0, 4, 8] + assert list(col[[5, 3, 1]]) == [10, 6, 2] + + tabla2 = CTable(Row, new_data=DATA20) + tabla2.delete([1, 3, 5, 7, 9, 11, 13, 15, 17, 19]) + col2 = tabla2.id + + assert list(col2[0:5].to_numpy()) == [0, 2, 4, 6, 8] + assert list(col2[5:10].to_numpy()) == [10, 12, 14, 16, 18] + assert list(col2[::2].to_numpy()) == [0, 4, 8, 12, 16] + + +def test_column_getitem_out_of_range(): + """int and list indexing raise IndexError when out of bounds.""" + tabla = CTable(Row, new_data=DATA20) + tabla.delete([1, 3, 5, 7, 9]) + col = tabla.id + + with pytest.raises(IndexError): + _ = col[100] + with pytest.raises(IndexError): + _ = col[-100] + with pytest.raises(IndexError): + _ = col[[0, 1, 100]] + + +def test_column_setitem_no_holes(): + """int, slice, and list assignment on a full table.""" + tabla = CTable(Row, new_data=DATA20) + col = tabla.id + + col[0] = 999 + assert col[0] == 999 + col[10] = 888 + assert col[10] == 888 + col[-1] = 777 + assert col[-1] == 777 + + col[0:5] = [100, 101, 102, 103, 104] + assert list(col[0:5].to_numpy()) == [100, 101, 102, 103, 104] + + col[[0, 5, 10]] = [10, 50, 100] + assert col[0] == 10 + assert col[5] == 50 + assert col[10] == 100 + + +def test_column_setitem_with_holes(): + """int, slice, and list assignment after deletions.""" + tabla = CTable(Row, new_data=DATA20) + 
tabla.delete([1, 3, 5, 7, 9]) + col = tabla.id + + col[0] = 999 + assert col[0] == 999 + assert tabla._cols["id"][0] == 999 + + col[2] = 888 + assert col[2] == 888 + assert tabla._cols["id"][4] == 888 + + col[-1] = 777 + assert col[-1] == 777 + + col[0:3] = [100, 200, 300] + assert col[0] == 100 + assert col[1] == 200 + assert col[2] == 300 + + col[[0, 2, 4]] = [11, 22, 33] + assert col[0] == 11 + assert col[2] == 22 + assert col[4] == 33 + + +def test_column_iter(): + """Iteration over full table, with odd-index holes, and on score column.""" + tabla = CTable(Row, new_data=DATA20) + assert list(tabla.id) == list(range(20)) + + tabla2 = CTable(Row, new_data=DATA20) + tabla2.delete([1, 3, 5, 7, 9, 11, 13, 15, 17, 19]) + assert list(tabla2.id) == [0, 2, 4, 6, 8, 10, 12, 14, 16, 18] + + tabla3 = CTable(Row, new_data=DATA20) + tabla3.delete([0, 5, 10, 15]) + # fmt: off + expected_score = [ + 10.0, 20.0, 30.0, 40.0, + 60.0, 70.0, 80.0, 90.0, + 110.0, 120.0, 130.0, 140.0, + 160.0, 170.0, 180.0, 190.0, + ] + # fmt: on + assert list(tabla3.score) == expected_score + + +def test_column_len(): + """len() after no deletions, partial deletions, cumulative deletions, and cross-column.""" + tabla = CTable(Row, new_data=DATA20) + col = tabla.id + assert len(col) == 20 + + tabla.delete([1, 3, 5, 7, 9]) + assert len(col) == 15 + + tabla2 = CTable(Row, new_data=DATA20) + col2 = tabla2.id + tabla2.delete([0, 1, 2]) + assert len(col2) == 17 + tabla2.delete([0, 1, 2, 3, 4]) + assert len(col2) == 12 + + data = [(i, float(i * 10), i % 2 == 0) for i in range(10)] + tabla3 = CTable(Row, new_data=data, expected_size=10) + tabla3.delete([0, 1, 5, 6, 9]) + assert len(tabla3.id) == len(tabla3.score) == len(tabla3.active) == 5 + for i in range(len(tabla3.id)): + assert tabla3.score[i] == float(tabla3.id[i] * 10) + + +def test_column_edge_cases(): + """Empty table and fully-deleted table both behave as zero-length columns.""" + tabla = CTable(Row) + assert len(tabla.id) == 0 + assert 
list(tabla.id) == [] + + data = [(i, float(i * 10), True) for i in range(10)] + tabla2 = CTable(Row, new_data=data) + tabla2.delete(list(range(10))) + assert len(tabla2.id) == 0 + assert list(tabla2.id) == [] + + +# ------------------------------------------------------------------- +# New tests for Column view (mask) and to_array() +# ------------------------------------------------------------------- + + +def test_column_slice_returns_view(): + """Column[slice] returns a Column instance with a non-None mask.""" + tabla = CTable(Row, new_data=DATA20) + col = tabla.id + + view = col[0:5] + assert isinstance(view, blosc2.Column) + assert view._mask is not None + assert view._table is tabla + assert view._col_name == "id" + + +def test_to_array_slices(): + """to_array() on slice views: full table and with holes.""" + # No holes + tabla = CTable(Row, new_data=DATA20) + col = tabla.id + np.testing.assert_array_equal(col[0:5].to_numpy(), np.array([0, 1, 2, 3, 4], dtype=np.int64)) + np.testing.assert_array_equal(col[5:10].to_numpy(), np.array([5, 6, 7, 8, 9], dtype=np.int64)) + np.testing.assert_array_equal(col[15:20].to_numpy(), np.array([15, 16, 17, 18, 19], dtype=np.int64)) + np.testing.assert_array_equal(col[0:20].to_numpy(), np.arange(20, dtype=np.int64)) + + # With holes: delete odd indices → keep evens 0,2,4,...,18 + tabla.delete([1, 3, 5, 7, 9, 11, 13, 15, 17, 19]) + col = tabla.id + np.testing.assert_array_equal(col[0:5].to_numpy(), np.array([0, 2, 4, 6, 8], dtype=np.int64)) + np.testing.assert_array_equal(col[5:10].to_numpy(), np.array([10, 12, 14, 16, 18], dtype=np.int64)) + + +def test_to_array_full_column(): + """to_array() with no slice (full column) returns all valid rows.""" + tabla = CTable(Row, new_data=DATA20) + tabla.delete([0, 10, 19]) + col = tabla.id + + expected = np.array([i for i in range(20) if i not in {0, 10, 19}], dtype=np.int64) + np.testing.assert_array_equal(col[0 : len(col)].to_numpy(), expected) + + +def 
test_to_array_mask_does_not_include_deleted(): + """Mask & valid_rows intersection excludes deleted rows inside the slice range.""" + tabla = CTable(Row, new_data=DATA20) + # delete rows 2 and 3, which fall inside slice [0:5] + tabla.delete([2, 3]) + col = tabla.id + + # logical [0:5] should now map to physical rows 0,1,4,5,6 + result = col[0:5].to_numpy() + np.testing.assert_array_equal(result, np.array([0, 1, 4, 5, 6], dtype=np.int64)) + + +def test_column_view_mask_is_independent(): + """Two slice views on the same column have independent masks.""" + tabla = CTable(Row, new_data=DATA20) + col = tabla.id + + view_a = col[0:5] + + np.testing.assert_array_equal(view_a.to_numpy(), np.arange(0, 5, dtype=np.int64)) + + +# ------------------------------------------------------------------- +# iter_chunks +# ------------------------------------------------------------------- + + +def test_iter_chunks_full_table(): + """iter_chunks reassembles to the same values as to_numpy().""" + tabla = CTable(Row, new_data=DATA20) + expected = tabla["id"].to_numpy() + got = np.concatenate(list(tabla["id"].iter_chunks(size=7))) + np.testing.assert_array_equal(got, expected) + + +def test_iter_chunks_chunk_sizes(): + """Each yielded chunk has at most *size* elements; last may be smaller.""" + tabla = CTable(Row, new_data=DATA20) + chunks = list(tabla["score"].iter_chunks(size=6)) + for c in chunks[:-1]: + assert len(c) == 6 + assert len(chunks[-1]) <= 6 + assert sum(len(c) for c in chunks) == 20 + + +def test_iter_chunks_skips_deleted_rows(): + """Deleted rows are not included in any chunk.""" + tabla = CTable(Row, new_data=DATA20) + tabla.delete([0, 1, 2]) # delete id 0, 1, 2 + chunks = list(tabla["id"].iter_chunks(size=5)) + all_vals = np.concatenate(chunks) + assert 0 not in all_vals + assert 1 not in all_vals + assert 2 not in all_vals + assert len(all_vals) == 17 + + +def test_iter_chunks_size_larger_than_table(): + """A size larger than the table yields a single chunk with all 
rows.""" + tabla = CTable(Row, new_data=DATA20) + chunks = list(tabla["id"].iter_chunks(size=1000)) + assert len(chunks) == 1 + np.testing.assert_array_equal(chunks[0], np.arange(20, dtype=np.int64)) + + +def test_iter_chunks_empty_table(): + """iter_chunks on an empty table yields nothing.""" + tabla = CTable(Row) + chunks = list(tabla["id"].iter_chunks()) + assert chunks == [] + + +# ------------------------------------------------------------------- +# Aggregates: sum +# ------------------------------------------------------------------- + + +def test_sum_int(): + t = CTable(Row, new_data=DATA20) + assert t["id"].sum() == sum(range(20)) + + +def test_sum_float(): + t = CTable(Row, new_data=DATA20) + assert t["score"].sum() == pytest.approx(sum(i * 10.0 for i in range(20))) + + +def test_sum_bool_counts_trues(): + t = CTable(Row, new_data=DATA20) # all active=True + assert t["active"].sum() == 20 + + +def test_sum_skips_deleted_rows(): + t = CTable(Row, new_data=DATA20) + t.delete([0]) # remove id=0 + assert t["id"].sum() == sum(range(1, 20)) + + +def test_sum_empty_raises(): + t = CTable(Row) + with pytest.raises(ValueError, match="empty"): + t["id"].sum() + + +def test_sum_wrong_type_raises(): + t = CTable(StrRow, new_data=[("hello",)]) + with pytest.raises(TypeError): + t["label"].sum() + + +# ------------------------------------------------------------------- +# Aggregates: min / max +# ------------------------------------------------------------------- + + +def test_min_int(): + t = CTable(Row, new_data=DATA20) + assert t["id"].min() == 0 + + +def test_max_int(): + t = CTable(Row, new_data=DATA20) + assert t["id"].max() == 19 + + +def test_min_float(): + t = CTable(Row, new_data=DATA20) + assert t["score"].min() == pytest.approx(0.0) + + +def test_max_float(): + t = CTable(Row, new_data=DATA20) + assert t["score"].max() == pytest.approx(190.0) + + +def test_min_max_string(): + t = CTable(StrRow, new_data=[("banana",), ("apple",), ("cherry",)]) + assert 
t["label"].min() == "apple" + assert t["label"].max() == "cherry" + + +def test_min_skips_deleted(): + t = CTable(Row, new_data=DATA20) + t.delete([0]) # remove id=0, next min is 1 + assert t["id"].min() == 1 + + +def test_min_empty_raises(): + t = CTable(Row) + with pytest.raises(ValueError, match="empty"): + t["id"].min() + + +def test_max_complex_raises(): + @dataclass + class CRow: + val: complex = blosc2.field(blosc2.complex128()) + + t = CTable(CRow, new_data=[(1 + 2j,)]) + with pytest.raises(TypeError): + t["val"].max() + + +# ------------------------------------------------------------------- +# Aggregates: mean +# ------------------------------------------------------------------- + + +def test_mean_int(): + t = CTable(Row, new_data=DATA20) + assert t["id"].mean() == pytest.approx(9.5) + + +def test_mean_float(): + t = CTable(Row, new_data=DATA20) + assert t["score"].mean() == pytest.approx(95.0) + + +def test_mean_skips_deleted(): + t = CTable(Row, new_data=[(0, 0.0, True), (10, 100.0, True)]) + t.delete([0]) # remove id=0; only id=10 remains + assert t["id"].mean() == pytest.approx(10.0) + + +def test_mean_empty_raises(): + t = CTable(Row) + with pytest.raises(ValueError, match="empty"): + t["id"].mean() + + +# ------------------------------------------------------------------- +# Aggregates: std +# ------------------------------------------------------------------- + + +def test_std_population(): + t = CTable(Row, new_data=DATA20) + ids = np.arange(20, dtype=np.float64) + assert t["id"].std() == pytest.approx(float(ids.std(ddof=0))) + + +def test_std_sample(): + t = CTable(Row, new_data=DATA20) + ids = np.arange(20, dtype=np.float64) + assert t["id"].std(ddof=1) == pytest.approx(float(ids.std(ddof=1))) + + +def test_std_single_element(): + t = CTable(Row, new_data=[(5, 50.0, True)]) + assert t["id"].std() == pytest.approx(0.0) + + +def test_std_single_element_ddof1_is_nan(): + t = CTable(Row, new_data=[(5, 50.0, True)]) + assert 
np.isnan(t["id"].std(ddof=1)) + + +def test_std_empty_raises(): + t = CTable(Row) + with pytest.raises(ValueError, match="empty"): + t["id"].std() + + +# ------------------------------------------------------------------- +# Aggregates: any / all +# ------------------------------------------------------------------- + + +def test_any_all_true(): + t = CTable(Row, new_data=DATA20) # all active=True + assert t["active"].any() is True + assert t["active"].all() is True + + +def test_any_some_false(): + data = [(i, float(i), i % 2 == 0) for i in range(10)] + t = CTable(Row, new_data=data) + assert t["active"].any() is True + assert t["active"].all() is False + + +def test_all_false(): + data = [(i, float(i), False) for i in range(5)] + t = CTable(Row, new_data=data) + assert t["active"].any() is False + assert t["active"].all() is False + + +def test_any_empty_is_false(): + t = CTable(Row) + assert t["active"].any() is False + + +def test_all_empty_is_true(): + # vacuous truth: all() over nothing is True (same as Python's built-in) + t = CTable(Row) + assert t["active"].all() is True + + +def test_any_wrong_type_raises(): + t = CTable(Row, new_data=DATA20) + with pytest.raises(TypeError): + t["id"].any() + + +# ------------------------------------------------------------------- +# unique +# ------------------------------------------------------------------- + + +def test_unique_int(): + t = CTable(Row, new_data=[(i % 5, float(i), True) for i in range(20)]) + result = t["id"].unique() + np.testing.assert_array_equal(result, np.array([0, 1, 2, 3, 4], dtype=np.int64)) + + +def test_unique_bool(): + data = [(i, float(i), i % 3 != 0) for i in range(10)] + t = CTable(Row, new_data=data) + result = t["active"].unique() + assert set(result.tolist()) == {True, False} + + +def test_unique_skips_deleted(): + t = CTable(Row, new_data=[(i % 3, float(i), True) for i in range(9)]) + # ids are [0,1,2,0,1,2,0,1,2]; logical rows with id==0 are at positions 0,3,6 + t.delete([0, 3, 6]) + 
result = t["id"].unique() + assert 0 not in result.tolist() + assert set(result.tolist()) == {1, 2} + + +def test_unique_empty(): + t = CTable(Row) + result = t["id"].unique() + assert len(result) == 0 + + +# ------------------------------------------------------------------- +# value_counts +# ------------------------------------------------------------------- + + +def test_value_counts_basic(): + data = [(i % 3, float(i), True) for i in range(9)] # ids: 0,1,2,0,1,2,0,1,2 + t = CTable(Row, new_data=data) + vc = t["id"].value_counts() + assert vc[0] == 3 + assert vc[1] == 3 + assert vc[2] == 3 + + +def test_value_counts_sorted_by_count(): + data = [(0, 0.0, True)] * 5 + [(1, 1.0, True)] * 2 + [(2, 2.0, True)] * 8 + t = CTable(Row, new_data=data) + vc = t["id"].value_counts() + counts = list(vc.values()) + assert counts == sorted(counts, reverse=True) + + +def test_value_counts_bool(): + data = [(i, float(i), i % 4 != 0) for i in range(20)] # 5 False, 15 True + t = CTable(Row, new_data=data) + vc = t["active"].value_counts() + assert vc[True] == 15 + assert vc[False] == 5 + assert list(vc.keys())[0] is True # True comes first (higher count) + + +def test_value_counts_empty(): + t = CTable(Row) + assert t["id"].value_counts() == {} + + +# ------------------------------------------------------------------- +# sample (on CTable) +# ------------------------------------------------------------------- + + +def test_sample_returns_correct_count(): + t = CTable(Row, new_data=DATA20) + s = t.sample(5, seed=0) + assert len(s) == 5 + + +def test_sample_rows_are_subset(): + t = CTable(Row, new_data=DATA20) + s = t.sample(7, seed=42) + all_ids = set(t["id"].to_numpy().tolist()) + sample_ids = set(s["id"].to_numpy().tolist()) + assert sample_ids.issubset(all_ids) + + +def test_sample_is_read_only(): + t = CTable(Row, new_data=DATA20) + s = t.sample(5, seed=0) + with pytest.raises((ValueError, TypeError)): + s.append((99, 9.0, True)) + + +def test_sample_seed_reproducible(): + t = 
CTable(Row, new_data=DATA20) + s1 = t.sample(5, seed=7) + s2 = t.sample(5, seed=7) + np.testing.assert_array_equal(s1["id"].to_numpy(), s2["id"].to_numpy()) + + +def test_sample_n_larger_than_table(): + t = CTable(Row, new_data=DATA20) + s = t.sample(1000, seed=0) + assert len(s) == 20 + + +def test_sample_zero(): + t = CTable(Row, new_data=DATA20) + assert len(t.sample(0)) == 0 + + +# ------------------------------------------------------------------- +# cbytes / nbytes / __repr__ +# ------------------------------------------------------------------- + + +def test_cbytes_nbytes_positive(): + t = CTable(Row, new_data=DATA20) + assert t.cbytes > 0 + assert t.nbytes > 0 + assert t.nbytes >= t.cbytes # compressed is never larger than raw + + +def test_cbytes_nbytes_consistent_with_info(): + t = CTable(Row, new_data=DATA20) + expected_cb = sum(col.cbytes for col in t._cols.values()) + t._valid_rows.cbytes + expected_nb = sum(col.nbytes for col in t._cols.values()) + t._valid_rows.nbytes + assert t.cbytes == expected_cb + assert t.nbytes == expected_nb + + +def test_repr_contains_col_names_and_row_count(): + t = CTable(Row, new_data=DATA20) + r = repr(t) + assert "id" in r + assert "score" in r + assert "active" in r + assert "20" in r + + +def test_repr_is_single_line(): + t = CTable(Row, new_data=DATA20) + assert "\n" not in repr(t) + + +def test_column_repr_shows_preview_values(): + t = CTable(Row, new_data=DATA20) + r = repr(t["id"][:]) + assert "Column('id'" in r + assert "dtype=int64" in r + assert "len=20" in r + assert "values=[0, 1, 2" in r + assert "..." 
in r + + +def test_info_omits_capacity_and_read_only_for_in_memory_table(): + t = CTable(Row, new_data=DATA20) + info = repr(t.info) + assert "capacity" not in info + assert "read_only" not in info + assert "open_mode" not in info + + +def test_info_shows_open_mode_for_persistent_table(tmp_path): + path = str(tmp_path / "table.b2d") + t = CTable(Row, new_data=DATA20, urlpath=path, mode="w") + t.close() + + opened = CTable.open(path) + info = repr(opened.info) + assert "capacity" not in info + assert "read_only" not in info + assert "open_mode : r" in info + opened.close() + + +def test_info_schema_expands_unicode_dtype_labels(): + t = CTable(StrRow, new_data=[("alpha",), ("beta",)]) + info = repr(t.info) + assert "U16 (Unicode, max 16 chars)" in info + + +def test_info_valid_rows_mask_only_reports_cbytes(): + t = CTable(Row, new_data=DATA20) + info = repr(t.info) + assert "valid_rows_mask : cbytes=" in info + assert "valid_rows_mask : nbytes=" not in info + + +def test_info_indexes_only_report_cbytes(tmp_path): + @dataclass + class IndexedRow: + id: int = blosc2.field(blosc2.int32()) + active: bool = blosc2.field(blosc2.bool(), default=True) + + data = [(i, i % 2 == 0) for i in range(32)] + path = str(tmp_path / "indexed.b2d") + t = CTable(IndexedRow, new_data=data, urlpath=path, mode="w") + t.create_index("id", kind=blosc2.IndexKind.FULL) + + info = repr(t.info) + index_block = info.split("indexes :", 1)[1] + assert "cbytes=" in index_block + assert "nbytes=" not in index_block + assert "cratio=" not in index_block + + +def test_info_cratio_uses_one_decimal_with_suffix(): + t = CTable(Row, new_data=DATA20) + info = repr(t.info) + assert "cratio :" in info + assert "x" in next(line for line in info.splitlines() if line.startswith("cratio")) + + +if __name__ == "__main__": + pytest.main(["-v", __file__]) diff --git a/tests/ctable/test_compact.py b/tests/ctable/test_compact.py new file mode 100644 index 00000000..f67688d7 --- /dev/null +++ 
b/tests/ctable/test_compact.py @@ -0,0 +1,152 @@ +####################################################################### +# Copyright (c) 2019-present, Blosc Development Team +# All rights reserved. +# +# SPDX-License-Identifier: BSD-3-Clause +####################################################################### + +from dataclasses import dataclass + +import numpy as np + +import blosc2 +from blosc2 import CTable + + +@dataclass +class Row: + id: int = blosc2.field(blosc2.int64(ge=0)) + score: float = blosc2.field(blosc2.float64(ge=0, le=100)) + + +def generate_test_data(n_rows: int) -> list: + return [(i, float(i)) for i in range(n_rows)] + + +def test_compact_empty_table(): + """Test compact() on a completely empty table (no data).""" + table = CTable(Row, expected_size=100) + + assert len(table) == 0 + + # Should not raise any error + table.compact() + + # Capacity might have drastically reduced, but the logical table must remain empty + assert len(table) == 0 + # Verify that if data is added later, it works correctly + table.append((1, 10.0)) + assert len(table) == 1 + assert table.id[0] == 1 + + +def test_compact_full_table(): + """Test compact() on a completely full table (no holes or free space).""" + data = generate_test_data(50) + table = CTable(Row, new_data=data, expected_size=50) + + assert len(table) == 50 + initial_capacity = len(table._valid_rows) + + # Should not raise any error or change the logical state + table.compact() + + assert len(table) == 50 + # Capacity should not have changed because it was already full + assert len(table._valid_rows) == initial_capacity + + # Verify data integrity + assert table.id[0] == 0 + assert table.id[-1] == 49 + + +def test_compact_already_compacted_table(): + """Test compact() on a table that has free space but no holes (contiguous data).""" + data = generate_test_data(20) + # Large expected_size to ensure free space at the end + table = CTable(Row, new_data=data, expected_size=100) + + assert len(table) == 
20 + + # Execute compact. Since data is already contiguous, the table might reduce + # its size due to the < len//2 while loop, but it shouldn't fail. + table.compact() + + assert len(table) == 20 + + # Verify that data remains in place + for i in range(20): + assert table.id[i] == i + + # Validate that all True values are consecutive at the beginning + mask = table._valid_rows[: len(table._valid_rows)] + assert np.all(mask[:20]) + if len(mask) > 20: + assert not np.any(mask[20:]) + + +def test_compact_with_holes(): + """Test compact() on a table with high fragmentation (holes).""" + data = generate_test_data(30) + table = CTable(Row, new_data=data, expected_size=50) + + # Delete sparsely: leave only [0, 5, 10, 15, 20, 25] + to_delete = [i for i in range(30) if i % 5 != 0] + table.delete(to_delete) + + assert len(table) == 6 + + # Execute compact + table.compact() + + assert len(table) == 6 + + # Verify that the correct data survived and moved to the beginning + expected_ids = [0, 5, 10, 15, 20, 25] + for i, exp_id in enumerate(expected_ids): + # Through the logical view (Column wrapper) + assert table.id[i] == exp_id + # Through the physical blosc2 array (to ensure compact worked) + assert table._cols["id"][i] == exp_id + + # Verify physical mask: first 6 must be True, the rest False + mask = table._valid_rows[: len(table._valid_rows)] + assert np.all(mask[:6]) + if len(mask) > 6: + assert not np.any(mask[6:]) + + +def test_compact_all_deleted(): + """Test compact() on a table where absolutely all rows have been deleted.""" + data = generate_test_data(20) + table = CTable(Row, new_data=data, expected_size=20) + + # Delete everything + table.delete(list(range(20))) + assert len(table) == 0 + + # Should handle empty arrays correctly + table.compact() + + assert len(table) == 0 + + # Check that we can write to it again + table.append((99, 99.0)) + assert len(table) == 1 + assert table.id[0] == 99 + + +def test_compact_multiple_times(): + """Calling compact() multiple 
times in a row must not corrupt data or crash.""" + data = generate_test_data(10) + table = CTable(Row, new_data=data, expected_size=20) + + table.delete([1, 3, 5, 7, 9]) # 5 elements remaining + + # Compact 3 times in a row + table.compact() + table.compact() + table.compact() + + assert len(table) == 5 + assert list(table.id) == [0, 2, 4, 6, 8] diff --git a/tests/ctable/test_construct.py b/tests/ctable/test_construct.py new file mode 100644 index 00000000..4a091997 --- /dev/null +++ b/tests/ctable/test_construct.py @@ -0,0 +1,201 @@ +####################################################################### +# Copyright (c) 2019-present, Blosc Development Team +# All rights reserved. +# +# SPDX-License-Identifier: BSD-3-Clause +####################################################################### + +from dataclasses import dataclass + +import numpy as np +import pytest + +import blosc2 +from blosc2 import CTable + + +@dataclass +class Row: + id: int = blosc2.field(blosc2.int64(ge=0)) + c_val: complex = blosc2.field(blosc2.complex128(), default=0j) + score: float = blosc2.field(blosc2.float64(ge=0, le=100), default=0.0) + active: bool = blosc2.field(blosc2.bool(), default=True) + + +# ------------------------------------------------------------------- +# Predefined Test Data +# ------------------------------------------------------------------- +SMALL_DATA = [ + (1, 1 + 2j, 95.5, True), + (2, 3 - 4j, 80.0, False), + (3, 0j, 50.2, True), + (4, -1 + 1j, 12.3, False), + (5, 5j, 99.9, True), +] +SMALLEST_DATA = SMALL_DATA[:2] + +dtype_struct = [("id", "i8"), ("c_val", "c16"), ("score", "f8"), ("active", "?")] +SMALL_STRUCT = np.array(SMALL_DATA, dtype=dtype_struct) + + +# ------------------------------------------------------------------- +# Validation Utility +# ------------------------------------------------------------------- +def assert_table_equals_data(table: CTable, expected_data: list): + assert len(table) == len(expected_data), f"Expected length 
{len(expected_data)}, got {len(table)}" + if not expected_data: + return + col_names = table.col_names + # Transpose: expected_data is list-of-rows → list-of-columns + expected_cols = list(zip(*expected_data, strict=False)) + for col_idx, col_name in enumerate(col_names): + actual = table[col_name].to_numpy() + expected = expected_cols[col_idx] + if isinstance(expected[0], (float, complex)): + np.testing.assert_allclose(actual, expected, err_msg=f"col {col_name}") + else: + np.testing.assert_array_equal(actual, expected, err_msg=f"col {col_name}") + + +# ------------------------------------------------------------------- +# Tests +# ------------------------------------------------------------------- + + +def test_empty_table_variants(): + """Empty table: default, with expected_size, and with compact=True.""" + table = CTable(Row) + assert len(table) == 0 + assert table.nrows == 0 + assert table.ncols == 4 + for col_name in ["id", "c_val", "score", "active"]: + assert col_name in table._cols + assert isinstance(table._cols[col_name], blosc2.NDArray) + + table_sized = CTable(Row, expected_size=5000) + assert len(table_sized) == 0 + assert all(len(col) == 5000 for col in table_sized._cols.values()) + + table_compact = CTable(Row, compact=True) + assert len(table_compact) == 0 + assert table_compact.auto_compact is True + + +def test_empty_data_lifecycle(): + """Create from [], extend with [], then extend with real data.""" + table = CTable(Row, new_data=[]) + assert len(table) == 0 + + table.extend([]) + assert len(table) == 0 + + table.extend(SMALL_DATA) + assert_table_equals_data(table, SMALL_DATA) + + +def test_construction_variants(): + """Sources (list, structured array), expected_size, and compact flag.""" + # list of tuples and structured array produce identical tables + assert_table_equals_data(CTable(Row, new_data=SMALL_DATA), SMALL_DATA) + assert_table_equals_data(CTable(Row, new_data=SMALL_STRUCT), SMALL_DATA) + + # expected_size smaller than data → resize; 
larger → preallocated + for es in [1, 5]: + assert_table_equals_data(CTable(Row, new_data=SMALL_DATA, expected_size=es), SMALL_DATA) + table_large = CTable(Row, new_data=SMALL_DATA, expected_size=1000) + assert_table_equals_data(table_large, SMALL_DATA) + assert all(len(col) == 1000 for col in table_large._cols.values()) + + # compact flag is stored and data is intact + table_false = CTable(Row, new_data=SMALL_DATA, compact=False) + assert table_false.auto_compact is False + assert_table_equals_data(table_false, SMALL_DATA) + + table_true = CTable(Row, new_data=SMALL_DATA, compact=True) + assert table_true.auto_compact is True + assert_table_equals_data(table_true, SMALL_DATA) + + +def test_append_and_clone(): + """Build table row by row, then clone it into a new CTable.""" + table = CTable(Row) + for row in SMALLEST_DATA: + table.append(row) + assert_table_equals_data(table, SMALLEST_DATA) + + cloned = CTable(Row, new_data=table) + assert_table_equals_data(cloned, SMALLEST_DATA) + assert table is not cloned + + +def test_invalid_append(): + """Constraint violation and incompatible type both raise errors.""" + table = CTable(Row, expected_size=1) + + # Constraint violation: id must be >= 0 + with pytest.raises(ValueError): + table.append((-1, 1 + 2j, 95.5, True)) + + # Constraint violation: score must be <= 100 + with pytest.raises(ValueError): + table.append((1, 1 + 2j, 150.0, True)) + + # Incompatible type for id: string cannot be coerced to int + with pytest.raises((TypeError, ValueError)): + table.append(["invalid_text", 1 + 2j, 95.5, True]) + + +def test_extreme_values(): + """Extreme complex, float boundary, and large integer values in one table.""" + # Combine all extremes into one table to avoid repeated CTable construction + extreme_data = [ + (1, complex(1e308, -1e308), 0.0, True), + (2**32, 0j, 100.0, False), + (2**60, complex(-1e308, 1e308), 0.0001, True), + (4, 0j, 99.9999, False), + ] + assert_table_equals_data(CTable(Row, new_data=extreme_data), 
extreme_data) + + +def test_extend_append_and_resize(): + """Auto-resize via append one-by-one, then extend+append beyond initial size.""" + # Append beyond expected_size triggers resize + table = CTable(Row, expected_size=2) + for row in SMALL_DATA: + table.append(row) + assert_table_equals_data(table, SMALL_DATA) + assert all(len(col) >= 5 for col in table._cols.values()) + + # Extend beyond expected_size, then append the last row + table2 = CTable(Row, expected_size=2) + table2.extend(SMALL_DATA[:4]) + assert len(table2) == 4 + table2.append(SMALL_DATA[4]) + assert_table_equals_data(table2, SMALL_DATA) + + +def test_column_integrity(): + """Column access via [] and getattr, and correct dtypes.""" + table = CTable(Row, new_data=SMALL_DATA) + + assert isinstance(table["id"], blosc2.ctable.Column) + assert isinstance(table.score, blosc2.ctable.Column) + + assert table._cols["id"].dtype == np.int64 + assert table._cols["c_val"].dtype == np.complex128 + assert table._cols["score"].dtype == np.float64 + assert table._cols["active"].dtype == np.bool_ + + +def test_valid_rows(): + """_valid_rows has exactly 5 True entries after creation and after extend.""" + table_direct = CTable(Row, new_data=SMALL_DATA) + assert blosc2.count_nonzero(table_direct._valid_rows) == 5 + + table_extended = CTable(Row) + table_extended.extend(SMALL_DATA) + assert blosc2.count_nonzero(table_extended._valid_rows) == 5 + + +if __name__ == "__main__": + pytest.main(["-v", __file__]) diff --git a/tests/ctable/test_csv_interop.py b/tests/ctable/test_csv_interop.py new file mode 100644 index 00000000..7b07277a --- /dev/null +++ b/tests/ctable/test_csv_interop.py @@ -0,0 +1,235 @@ +####################################################################### +# Copyright (c) 2019-present, Blosc Development Team +# All rights reserved. 
+# +# SPDX-License-Identifier: BSD-3-Clause +####################################################################### + +"""Tests for CTable.to_csv() and CTable.from_csv().""" + +import csv +import os +from dataclasses import dataclass + +import numpy as np +import pytest + +import blosc2 +from blosc2 import CTable + + +@dataclass +class Row: + id: int = blosc2.field(blosc2.int64(ge=0)) + score: float = blosc2.field(blosc2.float64(ge=0, le=100), default=0.0) + active: bool = blosc2.field(blosc2.bool(), default=True) + label: str = blosc2.field(blosc2.string(max_length=16), default="") + + +DATA10 = [(i, float(i * 10 % 100), i % 2 == 0, f"r{i}") for i in range(10)] + + +@pytest.fixture +def tmp_csv(tmp_path): + return str(tmp_path / "table.csv") + + +@pytest.fixture +def table10(): + return CTable(Row, new_data=DATA10) + + +# =========================================================================== +# to_csv() +# =========================================================================== + + +def test_to_csv_creates_file(table10, tmp_csv): + table10.to_csv(tmp_csv) + assert os.path.exists(tmp_csv) + + +def test_to_csv_header_row(table10, tmp_csv): + table10.to_csv(tmp_csv) + with open(tmp_csv) as f: + first = f.readline().strip() + assert first == "id,score,active,label" + + +def test_to_csv_row_count(table10, tmp_csv): + table10.to_csv(tmp_csv) + with open(tmp_csv) as f: + rows = list(csv.reader(f)) + assert len(rows) == 11 # 1 header + 10 data + + +def test_to_csv_no_header(table10, tmp_csv): + table10.to_csv(tmp_csv, header=False) + with open(tmp_csv) as f: + rows = list(csv.reader(f)) + assert len(rows) == 10 + + +def test_to_csv_int_values(table10, tmp_csv): + table10.to_csv(tmp_csv) + with open(tmp_csv) as f: + reader = csv.DictReader(f) + ids = [int(row["id"]) for row in reader] + assert ids == list(range(10)) + + +def test_to_csv_float_values(table10, tmp_csv): + table10.to_csv(tmp_csv) + with open(tmp_csv) as f: + reader = csv.DictReader(f) + scores = 
[float(row["score"]) for row in reader] + assert scores == [r[1] for r in DATA10] + + +def test_to_csv_bool_values(table10, tmp_csv): + table10.to_csv(tmp_csv) + with open(tmp_csv) as f: + reader = csv.DictReader(f) + actives = [row["active"] for row in reader] + # numpy bool serialises as "True"/"False" + assert actives == [str(r[2]) for r in DATA10] + + +def test_to_csv_string_values(table10, tmp_csv): + table10.to_csv(tmp_csv) + with open(tmp_csv) as f: + reader = csv.DictReader(f) + labels = [row["label"] for row in reader] + assert labels == [r[3] for r in DATA10] + + +def test_to_csv_custom_separator(table10, tmp_csv): + table10.to_csv(tmp_csv, sep="\t") + with open(tmp_csv) as f: + first = f.readline().strip() + assert "\t" in first + assert "," not in first + + +def test_to_csv_skips_deleted_rows(table10, tmp_csv): + table10.delete([0, 1]) + table10.to_csv(tmp_csv) + with open(tmp_csv) as f: + rows = list(csv.reader(f)) + assert len(rows) == 9 # 1 header + 8 live rows + assert rows[1][0] == "2" # first live id + + +def test_to_csv_empty_table(tmp_csv): + t = CTable(Row) + t.to_csv(tmp_csv) + with open(tmp_csv) as f: + rows = list(csv.reader(f)) + assert rows == [["id", "score", "active", "label"]] + + +def test_to_csv_select_view(table10, tmp_csv): + table10.select(["id", "label"]).to_csv(tmp_csv) + with open(tmp_csv) as f: + reader = csv.DictReader(f) + rows = list(reader) + assert list(rows[0].keys()) == ["id", "label"] + assert len(rows) == 10 + + +# =========================================================================== +# from_csv() +# =========================================================================== + + +def test_from_csv_returns_ctable(table10, tmp_csv): + table10.to_csv(tmp_csv) + t2 = CTable.from_csv(tmp_csv, Row) + assert isinstance(t2, CTable) + + +def test_from_csv_row_count(table10, tmp_csv): + table10.to_csv(tmp_csv) + t2 = CTable.from_csv(tmp_csv, Row) + assert len(t2) == 10 + + +def test_from_csv_column_names(table10, tmp_csv): 
+ table10.to_csv(tmp_csv) + t2 = CTable.from_csv(tmp_csv, Row) + assert t2.col_names == ["id", "score", "active", "label"] + + +def test_from_csv_int_values(table10, tmp_csv): + table10.to_csv(tmp_csv) + t2 = CTable.from_csv(tmp_csv, Row) + np.testing.assert_array_equal(t2["id"].to_numpy(), table10["id"].to_numpy()) + + +def test_from_csv_float_values(table10, tmp_csv): + table10.to_csv(tmp_csv) + t2 = CTable.from_csv(tmp_csv, Row) + np.testing.assert_allclose(t2["score"].to_numpy(), table10["score"].to_numpy()) + + +def test_from_csv_bool_values(table10, tmp_csv): + table10.to_csv(tmp_csv) + t2 = CTable.from_csv(tmp_csv, Row) + # bool is serialised as "True"/"False"; np.array(..., dtype=bool) parses that + np.testing.assert_array_equal(t2["active"].to_numpy(), table10["active"].to_numpy()) + + +def test_from_csv_string_values(table10, tmp_csv): + table10.to_csv(tmp_csv) + t2 = CTable.from_csv(tmp_csv, Row) + assert t2["label"].to_numpy().tolist() == table10["label"].to_numpy().tolist() + + +def test_from_csv_no_header(table10, tmp_csv): + table10.to_csv(tmp_csv, header=False) + t2 = CTable.from_csv(tmp_csv, Row, header=False) + assert len(t2) == 10 + np.testing.assert_array_equal(t2["id"].to_numpy(), table10["id"].to_numpy()) + + +def test_from_csv_custom_separator(table10, tmp_csv): + table10.to_csv(tmp_csv, sep="\t") + t2 = CTable.from_csv(tmp_csv, Row, sep="\t") + assert len(t2) == 10 + + +def test_from_csv_empty_file(tmp_csv): + with open(tmp_csv, "w") as f: + f.write("id,score,active,label\n") + t = CTable.from_csv(tmp_csv, Row) + assert len(t) == 0 + assert t.col_names == ["id", "score", "active", "label"] + + +def test_from_csv_roundtrip(table10, tmp_csv): + """to_csv then from_csv preserves all values.""" + table10.to_csv(tmp_csv) + t2 = CTable.from_csv(tmp_csv, Row) + for name in ["id", "score"]: + np.testing.assert_array_equal(t2[name].to_numpy(), table10[name].to_numpy()) + np.testing.assert_array_equal(t2["active"].to_numpy(), 
table10["active"].to_numpy()) + assert t2["label"].to_numpy().tolist() == table10["label"].to_numpy().tolist() + + +def test_from_csv_wrong_field_count_raises(tmp_csv): + with open(tmp_csv, "w") as f: + f.write("id,score,active,label\n") + f.write("1,2.0\n") # only 2 fields instead of 4 + with pytest.raises(ValueError, match="expected 4 fields"): + CTable.from_csv(tmp_csv, Row) + + +def test_from_csv_not_dataclass_raises(tmp_csv): + with open(tmp_csv, "w") as f: + f.write("id\n1\n") + with pytest.raises(TypeError): + CTable.from_csv(tmp_csv, int) + + +if __name__ == "__main__": + pytest.main(["-v", __file__]) diff --git a/tests/ctable/test_ctable_dataclass_schema.py b/tests/ctable/test_ctable_dataclass_schema.py new file mode 100644 index 00000000..90283269 --- /dev/null +++ b/tests/ctable/test_ctable_dataclass_schema.py @@ -0,0 +1,289 @@ +####################################################################### +# Copyright (c) 2019-present, Blosc Development Team +# All rights reserved. 
+# +# SPDX-License-Identifier: BSD-3-Clause +####################################################################### + +"""End-to-end CTable tests using the dataclass schema API.""" + +from dataclasses import dataclass + +import numpy as np +import pytest + +import blosc2 +from blosc2 import CTable +from blosc2.schema_compiler import schema_from_dict + + +@dataclass +class Row: + id: int = blosc2.field(blosc2.int64(ge=0)) + score: float = blosc2.field(blosc2.float64(ge=0, le=100), default=0.0) + active: bool = blosc2.field(blosc2.bool(), default=True) + + +@dataclass +class RowComplex: + id: int = blosc2.field(blosc2.int64(ge=0)) + c_val: complex = blosc2.field(blosc2.complex128(), default=0j) + score: float = blosc2.field(blosc2.float64(ge=0, le=100), default=0.0) + active: bool = blosc2.field(blosc2.bool(), default=True) + + +# ------------------------------------------------------------------- +# Construction +# ------------------------------------------------------------------- + + +def test_construction_empty(): + t = CTable(Row) + assert len(t) == 0 + assert t.ncols == 3 + assert t.col_names == ["id", "score", "active"] + + +def test_construction_with_data(): + data = [(i, float(i), True) for i in range(10)] + t = CTable(Row, new_data=data) + assert len(t) == 10 + + +def test_construction_expected_size(): + t = CTable(Row, expected_size=500) + assert all(len(col) == 500 for col in t._cols.values()) + + +# ------------------------------------------------------------------- +# append — different input shapes +# ------------------------------------------------------------------- + + +def test_append_tuple(): + t = CTable(Row) + t.append((1, 50.0, True)) + assert len(t) == 1 + assert t.row[0].id[0] == 1 + assert t.row[0].score[0] == 50.0 + assert t.row[0].active[0] + + +def test_append_list(): + t = CTable(Row) + t.append([2, 75.0, False]) + assert len(t) == 1 + assert t.row[0].id[0] == 2 + + +def test_append_dict(): + t = CTable(Row) + t.append({"id": 3, 
"score": 25.0, "active": True}) + assert len(t) == 1 + assert t.row[0].id[0] == 3 + + +def test_append_dataclass_instance(): + t = CTable(Row) + + @dataclass + class Row2: + id: int = blosc2.field(blosc2.int64(ge=0)) + score: float = blosc2.field(blosc2.float64(ge=0, le=100), default=0.0) + active: bool = blosc2.field(blosc2.bool(), default=True) + + t2 = CTable(Row2) + # Simulate appending a dict (dataclass instance path) + t2.append({"id": 4, "score": 10.0, "active": False}) + assert t2.row[0].id[0] == 4 + + +def test_append_defaults_filled(): + """Omitting optional fields fills them from defaults.""" + t = CTable(Row) + t.append((5,)) # only id; score=0.0 and active=True filled in + assert t.row[0].score[0] == 0.0 + assert t.row[0].active[0] + + +# ------------------------------------------------------------------- +# extend — iterable of rows +# ------------------------------------------------------------------- + + +def test_extend_list_of_tuples(): + t = CTable(Row, expected_size=10) + t.extend([(i, float(i), i % 2 == 0) for i in range(10)]) + assert len(t) == 10 + + +def test_extend_list_of_dicts(): + """extend() also accepts list of dicts via zip(*data) → positional path.""" + # This goes through the zip(*data) path so dicts aren't directly supported + # in extend; test that the common tuple path works correctly. 
+ t = CTable(Row, expected_size=5) + data = [(i, float(i * 10), True) for i in range(5)] + t.extend(data) + for i in range(5): + assert t.row[i].id[0] == i + + +def test_extend_numpy_structured(): + dtype = np.dtype([("id", np.int64), ("score", np.float64), ("active", np.bool_)]) + arr = np.array([(1, 50.0, True), (2, 75.0, False)], dtype=dtype) + t = CTable(Row, expected_size=5) + t.extend(arr) + assert len(t) == 2 + assert t.row[0].id[0] == 1 + assert t.row[1].score[0] == 75.0 + + +# ------------------------------------------------------------------- +# extend — per-call validate override +# ------------------------------------------------------------------- + + +def test_extend_validate_override_false(): + """validate=False on a per-call basis skips checks even for a table with validate=True.""" + t = CTable(Row, expected_size=5, validate=True) + # Would fail if validate were applied + t.extend([(-1, 200.0, True)], validate=False) + assert len(t) == 1 + + +def test_extend_validate_override_true(): + """validate=True on a per-call basis enforces checks even for a table with validate=False.""" + t = CTable(Row, expected_size=5, validate=False) + with pytest.raises(ValueError): + t.extend([(-1, 50.0, True)], validate=True) + + +def test_extend_validate_none_uses_table_default(): + t_on = CTable(Row, expected_size=5, validate=True) + with pytest.raises(ValueError): + t_on.extend([(-1, 50.0, True)], validate=None) + + t_off = CTable(Row, expected_size=5, validate=False) + t_off.extend([(-1, 50.0, True)], validate=None) # no error + assert len(t_off) == 1 + + +# ------------------------------------------------------------------- +# Schema introspection (Step 9) +# ------------------------------------------------------------------- + + +def test_schema_property(): + from blosc2.schema_compiler import CompiledSchema + + t = CTable(Row) + assert isinstance(t.schema, CompiledSchema) + assert t.schema.row_cls is Row + + +def test_column_schema(): + from 
blosc2.schema_compiler import CompiledColumn + + t = CTable(Row) + col = t.column_schema("id") + assert isinstance(col, CompiledColumn) + assert col.name == "id" + assert col.spec.ge == 0 + + +def test_column_schema_unknown(): + t = CTable(Row) + with pytest.raises(KeyError, match="no_such_col"): + t.column_schema("no_such_col") + + +def test_schema_dict(): + t = CTable(Row) + d = t.schema_dict() + assert d["version"] == 1 + assert d["row_cls"] == "Row" + col_names = [c["name"] for c in d["columns"]] + assert col_names == ["id", "score", "active"] + + +def test_schema_dict_roundtrip(): + """schema_from_dict on a CTable's schema_dict restores column structure.""" + t = CTable(Row) + d = t.schema_dict() + restored = schema_from_dict(d) + assert len(restored.columns) == 3 + assert restored.columns_by_name["id"].spec.ge == 0 + assert restored.columns_by_name["score"].spec.le == 100 + + +# ------------------------------------------------------------------- +# Per-column cparams plumbing +# ------------------------------------------------------------------- + + +def test_per_column_cparams(): + """Columns with custom cparams get their own NDArray settings.""" + + @dataclass + class CustomRow: + id: int = blosc2.field(blosc2.int64(), cparams={"clevel": 9}) + score: float = blosc2.field(blosc2.float64(), default=0.0) + + t = CTable(CustomRow, expected_size=10) + # The column schema reflects the cparams + assert t.column_schema("id").config.cparams == {"clevel": 9} + assert t.column_schema("score").config.cparams is None + + +# ------------------------------------------------------------------- +# New integer / float spec types used in CTable +# ------------------------------------------------------------------- + + +def test_new_spec_types_in_ctable(): + """int8, uint16, float32 and friends work correctly end-to-end in CTable.""" + + @dataclass + class Compact: + flags: int = blosc2.field(blosc2.uint8(le=255)) + level: int = blosc2.field(blosc2.int8(ge=-128, le=127), 
default=0) + ratio: float = blosc2.field(blosc2.float32(ge=0.0, le=1.0), default=0.0) + + t = CTable(Compact, expected_size=10) + t.extend([(0, -1, 0.0), (255, 127, 1.0), (128, 0, 0.5)]) + assert len(t) == 3 + assert t._cols["flags"].dtype == np.dtype(np.uint8) + assert t._cols["level"].dtype == np.dtype(np.int8) + assert t._cols["ratio"].dtype == np.dtype(np.float32) + + +def test_new_spec_constraints_enforced(): + """Constraints on new spec types are enforced by both append and extend.""" + + # uint8 with explicit ge=0: negative value rejected by Pydantic + @dataclass + class R: + x: int = blosc2.field(blosc2.uint8(ge=0, le=200)) + + t = CTable(R, expected_size=5) + with pytest.raises(ValueError): + t.append((-1,)) # violates ge=0 + with pytest.raises(ValueError): + t.append((201,)) # violates le=200 + + # int8 with ge/le: vectorized extend checks + @dataclass + class R2: + x: int = blosc2.field(blosc2.int8(ge=0, le=100)) + + t2 = CTable(R2, expected_size=5) + with pytest.raises(ValueError): + t2.extend([(101,)]) # violates le=100 + with pytest.raises(ValueError): + t2.extend([(-1,)]) # violates ge=0 + + +if __name__ == "__main__": + import pytest + + pytest.main(["-v", __file__]) diff --git a/tests/ctable/test_ctable_indexing.py b/tests/ctable/test_ctable_indexing.py new file mode 100644 index 00000000..d39fd6fd --- /dev/null +++ b/tests/ctable/test_ctable_indexing.py @@ -0,0 +1,568 @@ +####################################################################### +# Copyright (c) 2019-present, Blosc Development Team +# All rights reserved. 
+# +# SPDX-License-Identifier: BSD-3-Clause +####################################################################### + +"""Tests for CTable persistent and in-memory indexing.""" + +import dataclasses +import shutil +import tempfile +import weakref +from pathlib import Path + +import numpy as np +import pytest + +import blosc2 + +# --------------------------------------------------------------------------- +# Helpers +# --------------------------------------------------------------------------- + + +@dataclasses.dataclass +class Row: + id: int = blosc2.field(blosc2.int32()) + value: float = blosc2.field(blosc2.float64()) + category: int = blosc2.field(blosc2.int32()) + + +def _make_table(n=100, persistent_path=None): + """Return a CTable with *n* rows, optionally persistent at *persistent_path*.""" + if persistent_path is not None: + t = blosc2.CTable(Row, urlpath=persistent_path, mode="w") + else: + t = blosc2.CTable(Row) + for i in range(n): + t.append([i, float(i) * 1.5, i % 5]) + return t + + +# --------------------------------------------------------------------------- +# In-memory table tests +# --------------------------------------------------------------------------- + + +def test_create_index_in_memory(): + t = _make_table(50) + idx = t.create_index("id") + assert idx is not None + assert idx.col_name == "id" + assert not idx.stale + assert len(t.indexes) == 1 + assert t.indexes[0].col_name == "id" + + +def test_create_index_in_memory_duplicate_raises(): + t = _make_table(20) + t.create_index("id") + with pytest.raises(ValueError, match="Index already exists"): + t.create_index("id") + + +def test_drop_index_in_memory(): + t = _make_table(20) + t.create_index("id") + t.drop_index("id") + assert len(t.indexes) == 0 + with pytest.raises(KeyError): + t.index("id") + + +def test_drop_nonexistent_index_raises(): + t = _make_table(20) + with pytest.raises(KeyError, match="No index found"): + t.drop_index("id") + + +def test_drop_indexed_column_clears_catalog(): 
+ t = _make_table(20) + t.create_index("id") + t.drop_column("id") + assert [idx.col_name for idx in t.indexes] == [] + with pytest.raises(KeyError, match="No index found"): + t.index("id") + + +def test_where_with_index_matches_scan_in_memory(): + t = _make_table(200) + t.create_index("id") + result_idx = t.where(t["id"] > 100) + # Drop index to force scan + t.drop_index("id") + result_scan = t.where(t["id"] > 100) + ids_idx = sorted(int(v) for v in result_idx["id"].to_numpy()) + ids_scan = sorted(int(v) for v in result_scan["id"].to_numpy()) + assert ids_idx == ids_scan + + +def test_bool_column_composes_naturally_in_where(): + @dataclasses.dataclass + class BoolRow: + sensor_id: int = blosc2.field(blosc2.int32()) + region: str = blosc2.field(blosc2.string(max_length=8), default="") + active: bool = blosc2.field(blosc2.bool(), default=True) + + t = blosc2.CTable(BoolRow) + for i in range(20): + t.append([i, "north" if i % 4 == 0 else "south", i % 2 == 0]) + + result = t.where((t["sensor_id"] >= 8) & t["active"] & (t["region"] == "north")) + assert sorted(int(v) for v in result["sensor_id"].to_numpy()) == [8, 12, 16] + + result_bare = t.where(t["active"]) + assert sorted(int(v) for v in result_bare["sensor_id"].to_numpy()) == list(range(0, 20, 2)) + + +def test_rebuild_index_in_memory(): + t = _make_table(30) + t.create_index("id") + t.append([999, 999.0, 4]) # marks stale + assert t.index("id").stale + idx2 = t.rebuild_index("id") + assert not idx2.stale + result = t.where(t["id"] == 999) + assert len(result) == 1 + + +def test_stale_on_append_in_memory(): + t = _make_table(20) + t.create_index("id") + t.append([100, 100.0, 0]) + assert t.index("id").stale + + +def test_stale_on_extend_in_memory(): + t = _make_table(20) + t.create_index("id") + t.extend([[101, 101.0, 0], [102, 102.0, 1]]) + assert t.index("id").stale + + +def test_stale_on_column_setitem_in_memory(): + t = _make_table(20) + t.create_index("id") + t["id"][0] = 999 + assert t.index("id").stale + + 
+def test_stale_on_column_assign_in_memory(): + t = _make_table(20) + t.create_index("id") + t["id"].assign(np.arange(20, dtype=np.int32)) + assert t.index("id").stale + + +def test_delete_bumps_visibility_epoch_not_stale_in_memory(): + t = _make_table(20) + t.create_index("id") + t.delete(0) + idx = t.index("id") + # delete should NOT mark stale (only bumps visibility_epoch) + assert not idx.stale + _, vis_e = t._storage.get_epoch_counters() + assert vis_e >= 1 + + +def test_stale_fallback_to_scan_in_memory(): + t = _make_table(50) + t.create_index("id") + t.append([200, 200.0, 0]) # marks stale + # Query should still work (falls back to scan) + result = t.where(t["id"] > 40) + ids = sorted(int(v) for v in result["id"].to_numpy()) + assert 200 in ids + assert 41 in ids + + +def test_compact_index_in_memory(): + t = _make_table(50, persistent_path=None) + t.create_index("id", kind=blosc2.IndexKind.FULL) + # compact_index should not raise for full indexes + t.compact_index("id") + + +def test_multi_column_conjunction_uses_multiple_indexes_in_memory(): + t = _make_table(200) + t.create_index("id", kind=blosc2.IndexKind.FULL) + t.create_index("category", kind=blosc2.IndexKind.FULL) + expr = (t["id"] >= 50) & (t["id"] < 120) & (t["category"] == 3) + result_idx = t.where(expr) + t.drop_index("id") + t.drop_index("category") + result_scan = t.where(expr) + ids_idx = sorted(int(v) for v in result_idx["id"].to_numpy()) + ids_scan = sorted(int(v) for v in result_scan["id"].to_numpy()) + assert ids_idx == ids_scan + + +# --------------------------------------------------------------------------- +# Persistent table tests +# --------------------------------------------------------------------------- + + +@pytest.fixture +def tmpdir(): + d = tempfile.mkdtemp() + yield Path(d) + shutil.rmtree(d, ignore_errors=True) + + +def test_create_index_persistent(tmpdir): + path = str(tmpdir / "table.b2d") + t = _make_table(50, persistent_path=path) + idx = t.create_index("id") + assert 
not idx.stale + # Sidecar directory must exist + index_dir = Path(path) / "_indexes" / "id" + assert index_dir.exists() + # At least one .b2nd sidecar file + sidecars = list(index_dir.glob("**/*.b2nd")) + assert sidecars, "No sidecar .b2nd files found" + + +def test_create_index_persistent_does_not_cache_sidecar_handles(tmpdir): + import blosc2.indexing as indexing + + path = str(tmpdir / "table.b2d") + t = _make_table(50, persistent_path=path) + t.create_index("id", kind=blosc2.IndexKind.FULL) + + cached = [ + key + for key in indexing._SIDECAR_HANDLE_CACHE + if key[0][0] == "persistent" and str(tmpdir) in key[0][1] + ] + assert cached == [] + + +def test_persistent_ctable_releases_immediately_without_gc(tmpdir): + path = str(tmpdir / "table.b2d") + + def build_table(): + t = _make_table(50, persistent_path=path) + t.create_index("id", kind=blosc2.IndexKind.FULL) + return weakref.ref(t) + + table_ref = build_table() + assert table_ref() is None + + +def test_catalog_survives_reopen(tmpdir): + path = str(tmpdir / "table.b2d") + t = _make_table(30, persistent_path=path) + t.create_index("id") + del t # close + + t2 = blosc2.open(path, mode="r") + idxs = t2.indexes + assert len(idxs) == 1 + assert idxs[0].col_name == "id" + assert not idxs[0].stale + + +def test_where_with_index_matches_scan_persistent(tmpdir): + path = str(tmpdir / "table.b2d") + t = _make_table(200, persistent_path=path) + t.create_index("id") + result_idx = t.where(t["id"] > 150) + + t.drop_index("id") + result_scan = t.where(t["id"] > 150) + + ids_idx = sorted(int(v) for v in result_idx["id"].to_numpy()) + ids_scan = sorted(int(v) for v in result_scan["id"].to_numpy()) + assert ids_idx == ids_scan + + +def test_persistent_index_drop_releases_sidecars_without_gc(tmpdir): + import gc + + def run_query_and_drop(): + path = str(tmpdir / "table.b2d") + t = _make_table(200, persistent_path=path) + t.create_index("id") + result = t.where(t["id"] > 150) + ids = sorted(int(v) for v in 
result["id"].to_numpy()) + assert ids == list(range(151, 200)) + t.drop_index("id") + + run_query_and_drop() + + sidecars = [ + obj + for obj in gc.get_objects() + if isinstance(obj, blosc2.NDArray) and obj.urlpath and str(tmpdir) in obj.urlpath and "__index__" in obj.urlpath + ] + assert sidecars == [] + + +def test_drop_index_persistent(tmpdir): + path = str(tmpdir / "table.b2d") + t = _make_table(30, persistent_path=path) + t.create_index("id") + t.drop_index("id") + assert len(t.indexes) == 0 + index_dir = Path(path) / "_indexes" / "id" + # After drop, index dir should be gone (or empty) + sidecars = list(index_dir.glob("**/*.b2nd")) if index_dir.exists() else [] + assert sidecars == [] + + +def test_drop_index_persistent_catalog_cleared(tmpdir): + path = str(tmpdir / "table.b2d") + t = _make_table(30, persistent_path=path) + t.create_index("id") + t.drop_index("id") + del t + + t2 = blosc2.open(path, mode="r") + assert len(t2.indexes) == 0 + + +def test_drop_indexed_column_removes_persistent_sidecars(tmpdir): + path = str(tmpdir / "table.b2d") + t = _make_table(30, persistent_path=path) + t.create_index("id") + t.drop_column("id") + assert len(t.indexes) == 0 + assert not (Path(path) / "_indexes" / "id").exists() + + +def test_rebuild_index_persistent(tmpdir): + path = str(tmpdir / "table.b2d") + t = _make_table(50, persistent_path=path) + t.create_index("id") + t.append([500, 750.0, 2]) # marks stale + assert t.index("id").stale + idx2 = t.rebuild_index("id") + assert not idx2.stale + result = t.where(t["id"] == 500) + assert len(result) == 1 + + +def test_compact_index_persistent(tmpdir): + path = str(tmpdir / "table.b2d") + t = _make_table(50, persistent_path=path) + t.create_index("id", kind=blosc2.IndexKind.FULL) + t.compact_index("id") + # Query should still work after compact + result = t.where(t["id"] > 40) + ids = sorted(int(v) for v in result["id"].to_numpy()) + expected = list(range(41, 50)) + assert ids == expected + + +def 
test_stale_on_append_persistent(tmpdir): + path = str(tmpdir / "table.b2d") + t = _make_table(20, persistent_path=path) + t.create_index("id") + t.append([200, 300.0, 1]) + assert t.index("id").stale + + +def test_stale_persists_after_reopen(tmpdir): + path = str(tmpdir / "table.b2d") + t = _make_table(20, persistent_path=path) + t.create_index("id") + t.append([200, 300.0, 1]) # marks stale + del t + + t2 = blosc2.open(path, mode="r") + assert t2.index("id").stale + + +def test_delete_bumps_visibility_epoch_persistent(tmpdir): + path = str(tmpdir / "table.b2d") + t = _make_table(20, persistent_path=path) + t.create_index("id") + t.delete(0) + idx = t.index("id") + # delete should NOT mark index stale + assert not idx.stale + _, vis_e = t._storage.get_epoch_counters() + assert vis_e >= 1 + + +def test_query_after_reopen_persistent(tmpdir): + path = str(tmpdir / "table.b2d") + t = _make_table(100, persistent_path=path) + t.create_index("id") + del t + + t2 = blosc2.open(path, mode="r") + result = t2.where(t2["id"] > 90) + ids = sorted(int(v) for v in result["id"].to_numpy()) + assert ids == list(range(91, 100)) + + +def test_rename_indexed_column_rebuilds_catalog_persistent(tmpdir): + path = str(tmpdir / "table.b2d") + t = _make_table(40, persistent_path=path) + t.create_index("id") + t.rename_column("id", "newid") + assert [idx.col_name for idx in t.indexes] == ["newid"] + assert not (Path(path) / "_indexes" / "id").exists() + assert (Path(path) / "_indexes" / "newid").exists() + result = t.where(t["newid"] > 35) + assert sorted(int(v) for v in result["newid"].to_numpy()) == [36, 37, 38, 39] + + +# --------------------------------------------------------------------------- +# View tests +# --------------------------------------------------------------------------- + + +def test_view_cannot_create_index(): + t = _make_table(20) + view = t.where(t["id"] > 5) + with pytest.raises(ValueError, match="view"): + view.create_index("id") + + +def 
test_view_cannot_drop_index(): + t = _make_table(20) + t.create_index("id") + view = t.where(t["id"] > 5) + with pytest.raises(ValueError, match="view"): + view.drop_index("id") + + +def test_view_cannot_rebuild_index(): + t = _make_table(20) + t.create_index("id") + view = t.where(t["id"] > 5) + with pytest.raises(ValueError, match="view"): + view.rebuild_index("id") + + +def test_view_cannot_compact_index(): + t = _make_table(20) + t.create_index("id") + view = t.where(t["id"] > 5) + with pytest.raises(ValueError, match="view"): + view.compact_index("id") + + +def test_view_query_uses_root_index(): + t = _make_table(200) + t.create_index("id") + # Query on the original table + result_direct = t.where(t["id"] > 180) + ids_direct = sorted(int(v) for v in result_direct["id"].to_numpy()) + assert ids_direct == list(range(181, 200)) + + +def test_malformed_catalog_entry_raises_clear_error(): + t = _make_table(20) + t._storage.save_index_catalog({"id": {"kind": "bucket"}}) + with pytest.raises(ValueError, match="Malformed index metadata"): + t.where(t["id"] > 5) + + +# --------------------------------------------------------------------------- +# index() and indexes property +# --------------------------------------------------------------------------- + + +def test_index_lookup_missing_raises(): + t = _make_table(10) + with pytest.raises(KeyError): + t.index("nonexistent") + + +def test_indexes_empty_on_new_table(): + t = _make_table(10) + assert t.indexes == [] + + +def test_indexes_multiple_columns(): + t = _make_table(30) + t.create_index("id") + t.create_index("category") + assert len(t.indexes) == 2 + col_names = {idx.col_name for idx in t.indexes} + assert col_names == {"id", "category"} + + +def test_indexed_ctable_b2z_double_open_append_no_corruption(tmp_path): + """Opening an indexed CTable .b2z in append mode twice must not corrupt it. 
+ + Regression test: GC of a CTable opened from .b2z was calling close() → + to_b2z() even when nothing was modified, overwriting the archive with a + near-empty ZIP that broke subsequent opens. + """ + path = str(tmp_path / "indexed.b2z") + b2d_path = str(tmp_path / "indexed.b2d") + + t = _make_table(50, persistent_path=b2d_path) + t.create_index("id") + t._storage._store.to_b2z(filename=path, overwrite=True) + t._storage._store.close() + shutil.rmtree(b2d_path) + + # First open without explicit close — GC must not corrupt the archive + t1 = blosc2.open(path, mode="a") + assert t1.nrows == 50 + assert len(t1.indexes) == 1 + del t1 # triggers __del__; must NOT repack/corrupt + + # Second open must succeed and see correct data + t2 = blosc2.open(path, mode="a") + assert t2.nrows == 50 + assert len(t2.indexes) == 1 + del t2 + + +def test_indexing_purges_stale_persistent_caches(): + import blosc2.indexing as indexing + + with tempfile.TemporaryDirectory() as tmpdir: + path = str(Path(tmpdir) / "table.b2d") + t = _make_table(50, persistent_path=path) + t.create_index("id") + _ = t.where(t["id"] > 10) + t.close() + + persistent_scope = ("persistent", str(Path(path).resolve())) + indexing._PERSISTENT_INDEXES[persistent_scope] = {"version": 1, "indexes": {}} + indexing._DATA_CACHE[(persistent_scope, "token", "partial", "offsets")] = np.arange(3, dtype=np.int64) + indexing._SIDECAR_HANDLE_CACHE[(persistent_scope, "token", "partial_handle", "offsets")] = object() + indexing._QUERY_CACHE_STORE_HANDLES[str(Path(tmpdir) / "query-cache.b2frame")] = object() + indexing._GATHER_MMAP_HANDLES[str(Path(tmpdir) / "gather-cache.b2nd")] = object() + + indexing._purge_stale_persistent_caches() + + assert all(tmpdir not in key[1] for key in indexing._PERSISTENT_INDEXES if key[0] == "persistent") + assert all(tmpdir not in key[0][1] for key in indexing._DATA_CACHE if key[0][0] == "persistent") + assert all(tmpdir not in key[0][1] for key in indexing._SIDECAR_HANDLE_CACHE if key[0][0] == 
"persistent") + assert all(tmpdir not in path for path in indexing._QUERY_CACHE_STORE_HANDLES) + assert all(tmpdir not in path for path in indexing._GATHER_MMAP_HANDLES) + + +def test_indexing_purge_tolerates_reentrant_sidecar_handle_cache_mutation(monkeypatch): + import blosc2.indexing as indexing + + stale_scope = ("persistent", "/tmp/stale-index.b2nd") + stale_key = (stale_scope, "token", "partial_handle", "offsets") + injected_key = (("memory", 12345), "token", "partial_handle", "offsets") + sentinel = object() + original_exists = indexing._persistent_cache_path_exists + + indexing._SIDECAR_HANDLE_CACHE[stale_key] = sentinel + + def mutating_exists(path): + indexing._SIDECAR_HANDLE_CACHE[injected_key] = sentinel + if path == stale_scope[1]: + return False + return original_exists(path) + + monkeypatch.setattr(indexing, "_persistent_cache_path_exists", mutating_exists) + + indexing._purge_stale_persistent_caches() + + assert stale_key not in indexing._SIDECAR_HANDLE_CACHE + indexing._SIDECAR_HANDLE_CACHE.pop(injected_key, None) diff --git a/tests/ctable/test_delete_rows.py b/tests/ctable/test_delete_rows.py new file mode 100644 index 00000000..2f8e9013 --- /dev/null +++ b/tests/ctable/test_delete_rows.py @@ -0,0 +1,203 @@ +####################################################################### +# Copyright (c) 2019-present, Blosc Development Team +# All rights reserved. 
+# +# SPDX-License-Identifier: BSD-3-Clause +####################################################################### + +from dataclasses import dataclass + +import pytest + +import blosc2 +from blosc2 import CTable + + +@dataclass +class Row: + id: int = blosc2.field(blosc2.int64(ge=0)) + c_val: complex = blosc2.field(blosc2.complex128(), default=0j) + score: float = blosc2.field(blosc2.float64(ge=0, le=100), default=0.0) + active: bool = blosc2.field(blosc2.bool(), default=True) + + +def generate_test_data(n_rows: int) -> list: + return [(i, complex(i, -i), float((i * 7) % 100), bool(i % 2)) for i in range(1, n_rows + 1)] + + +# ------------------------------------------------------------------- +# Tests +# ------------------------------------------------------------------- + + +def test_delete_single_element(): + """First, last, middle deletion once; and repeated deletion from front/back.""" + data = generate_test_data(50) + + # Delete first + t = CTable(Row, new_data=data, expected_size=50) + t.delete(0) + assert len(t) == 49 + assert not t._valid_rows[0] + + # Delete last + t2 = CTable(Row, new_data=data, expected_size=50) + t2.delete(-1) + assert len(t2) == 49 + + # Delete middle + t3 = CTable(Row, new_data=data, expected_size=50) + t3.delete(25) + assert len(t3) == 49 + + # Delete first 10 times in a row + t4 = CTable(Row, new_data=data, expected_size=50) + for i in range(10): + t4.delete(0) + assert len(t4) == 50 - (i + 1) + assert len(t4) == 40 + + # Delete last 10 times in a row + t5 = CTable(Row, new_data=data, expected_size=50) + for i in range(10): + t5.delete(-1) + assert len(t5) == 50 - (i + 1) + assert len(t5) == 40 + + +def test_delete_list_of_positions(): + """Scattered, consecutive, even, odd, and slice-equivalent list deletions.""" + data = generate_test_data(50) + + # Scattered + t = CTable(Row, new_data=data, expected_size=50) + t.delete([0, 10, 20, 30, 40]) + assert len(t) == 45 + + # Consecutive block + t2 = CTable(Row, new_data=data, 
expected_size=50) + t2.delete([5, 6, 7, 8, 9]) + assert len(t2) == 45 + + # All even positions + t3 = CTable(Row, new_data=data, expected_size=50) + t3.delete(list(range(0, 50, 2))) + assert len(t3) == 25 + + # All odd positions + t4 = CTable(Row, new_data=data, expected_size=50) + t4.delete(list(range(1, 50, 2))) + assert len(t4) == 25 + + # Slice-equivalent: range(10, 20) + t5 = CTable(Row, new_data=data, expected_size=50) + t5.delete(list(range(10, 20))) + assert len(t5) == 40 + + # Slice with step: range(0, 20, 2) + t6 = CTable(Row, new_data=data, expected_size=50) + t6.delete(list(range(0, 20, 2))) + assert len(t6) == 40 + + # First 10 rows + t7 = CTable(Row, new_data=data, expected_size=50) + t7.delete(list(range(0, 10))) + assert len(t7) == 40 + + # Last 10 rows + t8 = CTable(Row, new_data=data, expected_size=50) + t8.delete(list(range(40, 50))) + assert len(t8) == 40 + + +def test_delete_out_of_bounds(): + """All IndexError scenarios: full table, partial table, empty table, negative.""" + data = generate_test_data(50) + + # Beyond length on full table + t = CTable(Row, new_data=data, expected_size=50) + with pytest.raises(IndexError): + t.delete(60) + with pytest.raises(IndexError): + t.delete(-60) + + # Beyond nrows on partial table (capacity 50, only 25 rows) + t2 = CTable(Row, new_data=generate_test_data(25), expected_size=50) + assert len(t2) == 25 + with pytest.raises(IndexError): + t2.delete(35) + + # Empty table: positions 0, 25, -1 all raise + for pos in [0, 25, -1]: + empty = CTable(Row, expected_size=50) + assert len(empty) == 0 + with pytest.raises(IndexError): + empty.delete(pos) + + +def test_delete_edge_cases(): + """Same position twice, all rows front/back, negative and mixed indices.""" + data = generate_test_data(50) + + # Same logical position twice: second delete hits what was position 11 + t = CTable(Row, new_data=data, expected_size=50) + t.delete(10) + assert len(t) == 49 + t.delete(10) + assert len(t) == 48 + + # Delete all rows from 
the front one by one + t2 = CTable(Row, new_data=data, expected_size=50) + for _ in range(50): + t2.delete(0) + assert len(t2) == 0 + + # Delete all rows from the back one by one + t3 = CTable(Row, new_data=data, expected_size=50) + for _ in range(50): + t3.delete(-1) + assert len(t3) == 0 + + # Negative indices list + t4 = CTable(Row, new_data=data, expected_size=50) + t4.delete([-1, -5, -10]) + assert len(t4) == 47 + + # Mixed positive and negative indices + t5 = CTable(Row, new_data=data, expected_size=50) + t5.delete([0, -1, 25]) + assert len(t5) == 47 + + +def test_delete_invalid_types(): + """string, float, and list-with-strings all raise errors.""" + data = generate_test_data(50) + + t = CTable(Row, new_data=data, expected_size=50) + with pytest.raises(TypeError): + t.delete("invalid") + with pytest.raises(TypeError): + t.delete(10.5) + with pytest.raises(IndexError): + t.delete([0, "invalid", 10]) + + +def test_delete_stress(): + """Large batch deletion and alternating multi-pass pattern.""" + data = generate_test_data(50) + + # Delete 40 out of 50 at once + t = CTable(Row, new_data=data, expected_size=50) + t.delete(list(range(0, 40))) + assert len(t) == 10 + + # Alternating two-pass deletion + t2 = CTable(Row, new_data=data, expected_size=50) + t2.delete(list(range(0, 50, 2))) # delete all even -> 25 remain + assert len(t2) == 25 + t2.delete(list(range(0, 25, 2))) # delete every other of remaining -> ~12 + assert len(t2) == 12 + + +if __name__ == "__main__": + pytest.main(["-v", __file__]) diff --git a/tests/ctable/test_extend_delete.py b/tests/ctable/test_extend_delete.py new file mode 100644 index 00000000..41e82e4a --- /dev/null +++ b/tests/ctable/test_extend_delete.py @@ -0,0 +1,220 @@ +####################################################################### +# Copyright (c) 2019-present, Blosc Development Team +# All rights reserved. 
+# +# SPDX-License-Identifier: BSD-3-Clause +####################################################################### + +from dataclasses import dataclass + +import numpy as np +import pytest + +import blosc2 +from blosc2 import CTable + + +@dataclass +class Row: + id: int = blosc2.field(blosc2.int64(ge=0)) + c_val: complex = blosc2.field(blosc2.complex128(), default=0j) + score: float = blosc2.field(blosc2.float64(ge=0, le=100), default=0.0) + active: bool = blosc2.field(blosc2.bool(), default=True) + + +def generate_test_data(n_rows: int, start_id: int = 1) -> list: + return [(start_id + i, complex(i, -i), float((i * 7) % 100), bool(i % 2)) for i in range(n_rows)] + + +def get_valid_mask(table: CTable) -> np.ndarray: + return np.array(table._valid_rows[: len(table._valid_rows)], dtype=bool) + + +def assert_mask_matches(table: CTable, expected_mask: list): + actual = get_valid_mask(table)[: len(expected_mask)] + np.testing.assert_array_equal( + actual, + np.array(expected_mask, dtype=bool), + err_msg=f"Mask mismatch.\nExpected: {expected_mask}\nGot: {actual}", + ) + + +def assert_data_at_positions(table: CTable, positions: list, expected_ids: list): + for pos, expected_id in zip(positions, expected_ids, strict=False): + actual_id = int(table._cols["id"][pos]) + assert actual_id == expected_id, f"Position {pos}: expected ID {expected_id}, got {actual_id}" + + +# ------------------------------------------------------------------- +# Tests +# ------------------------------------------------------------------- + + +def test_gap_fill_mask_and_positions(): + """extend and append fill from last valid position; mask is updated correctly.""" + # extend after deletions: mask and physical positions + t = CTable(Row, new_data=generate_test_data(7, 1), expected_size=10) + t.delete([0, 2, 4, 6]) + assert_mask_matches(t, [False, True, False, True, False, True, False]) + assert len(t) == 3 + t.extend(generate_test_data(3, 8)) + assert_mask_matches(t, [False, True, False, True, 
False, True, True, True, True]) + assert len(t) == 6 + assert_data_at_positions(t, [6, 7, 8], [8, 9, 10]) + + # append fills from last valid position, not into holes + t2 = CTable(Row, new_data=generate_test_data(5, 1), expected_size=10) + t2.delete([1, 3]) + assert_mask_matches(t2, [True, False, True, False, True]) + t2.append((6, 1j, 50.0, True)) + assert_mask_matches(t2, [True, False, True, False, True, True]) + t2.append((7, 2j, 60.0, False)) + assert_mask_matches(t2, [True, False, True, False, True, True, True]) + + # extend fills from last valid position when there's enough capacity + t3 = CTable(Row, new_data=generate_test_data(10, 1), expected_size=15) + t3.delete([2, 4, 6]) + t3.extend(generate_test_data(3, 20)) + assert_data_at_positions(t3, [10, 11, 12], [20, 21, 22]) + + +def test_resize_behavior(): + """Resize triggered when capacity is full; compact=True avoids massive resize.""" + # compact=False: append beyond capacity must resize + t = CTable(Row, new_data=generate_test_data(10, 1), expected_size=10, compact=False) + t.delete(list(range(9))) + assert len(t) == 1 + initial_cap = len(t._valid_rows) + t.append((11, 5j, 75.0, True)) + assert len(t._valid_rows) > initial_cap + + # compact=True: no massive resize after deletions + extend + t2 = CTable(Row, new_data=generate_test_data(10, 1), expected_size=10, compact=True) + t2.delete(list(range(9))) + assert len(t2) == 1 + initial_cap2 = len(t2._valid_rows) + t2.extend(generate_test_data(3, 11)) + assert len(t2._valid_rows) <= initial_cap2 * 2 + + # extend exceeding capacity always resizes regardless of compact + t3 = CTable(Row, new_data=generate_test_data(5, 1), expected_size=10, compact=False) + t3.delete([0, 2, 4]) + initial_cap3 = len(t3._valid_rows) + t3.extend(generate_test_data(20, 100)) + assert len(t3._valid_rows) > initial_cap3 + + +def test_mixed_append_extend_with_gaps(): + """Multiple extends, appends, and deletes interleaved; lengths stay correct.""" + # Multiple extends with intermediate 
deletions + t = CTable(Row, expected_size=20) + t.extend(generate_test_data(5, 1)) + t.extend(generate_test_data(3, 10)) + assert len(t) == 8 + t.delete([2, 4, 6]) + assert len(t) == 5 + t.extend(generate_test_data(2, 20)) + assert len(t) == 7 + t.delete([0, 1]) + assert len(t) == 5 + t.extend(generate_test_data(4, 30)) + assert len(t) == 9 + + # append + extend mixed, delete all then re-extend + t2 = CTable(Row, expected_size=20) + for i in range(5): + t2.append((i + 1, complex(i), float(i * 10), True)) + assert len(t2) == 5 + t2.extend(generate_test_data(5, 10)) + assert len(t2) == 10 + t2.delete([1, 3, 5, 7, 9]) + assert len(t2) == 5 + t2.append((100, 0j, 50.0, False)) + assert len(t2) == 6 + t2.extend(generate_test_data(3, 200)) + assert len(t2) == 9 + + # Fill all gaps then extend; delete all then extend from scratch + t3 = CTable(Row, new_data=generate_test_data(10, 1), expected_size=15) + t3.delete(list(range(0, 10, 2))) + assert len(t3) == 5 + t3.extend(generate_test_data(5, 20)) + assert len(t3) == 10 + + t4 = CTable(Row, new_data=generate_test_data(10, 1), expected_size=15) + t4.delete(list(range(10))) + assert len(t4) == 0 + t4.extend(generate_test_data(5, 100)) + assert len(t4) == 5 + + +def test_compact_behavior(): + """Manual compact consolidates mask; auto-compact keeps data correct after extend.""" + # Manual compact: valid rows packed to front, extend fills after them + t = CTable(Row, new_data=generate_test_data(10, 1), expected_size=15, compact=False) + t.delete([1, 3, 5, 7, 9]) + assert len(t) == 5 + t.compact() + assert_mask_matches(t, [True] * 5 + [False] * 10) + t.extend(generate_test_data(3, 20)) + assert len(t) == 8 + + # Auto-compact: table stays consistent after heavy deletions + extend + t2 = CTable(Row, new_data=generate_test_data(10, 1), expected_size=15, compact=True) + t2.delete(list(range(0, 8))) + assert len(t2) == 2 + t2.extend(generate_test_data(10, 100)) + assert len(t2) == 12 + + +def test_complex_scenarios(): + """Sparse gaps, 
alternating cycles, data integrity, and full workflow.""" + # Sparse table: many scattered deletions then bulk extend + t = CTable(Row, new_data=generate_test_data(20, 1), expected_size=30) + t.delete([0, 1, 2, 4, 5, 6, 8, 9, 10, 12, 13, 14, 16, 17, 18]) + assert len(t) == 5 + t.extend(generate_test_data(10, 100)) + assert len(t) == 15 + + # Alternating extend/delete cycles + t2 = CTable(Row, expected_size=50) + for cycle in range(5): + t2.extend(generate_test_data(10, cycle * 100)) + current_len = len(t2) + if current_len >= 5: + t2.delete(list(range(0, min(5, current_len)))) + + # Data integrity: correct row values survive delete + extend + t3 = CTable( + Row, new_data=[(1, 1j, 10.0, True), (2, 2j, 20.0, False), (3, 3j, 30.0, True)], expected_size=10 + ) + t3.delete(1) + assert t3.row[0].id[0] == 1 + assert t3.row[1].id[0] == 3 + t3.extend([(10, 10j, 100.0, True), (11, 11j, 100.0, False)]) + assert t3.row[0].id[0] == 1 + assert t3.row[1].id[0] == 3 + assert t3.row[2].id[0] == 10 + assert t3.row[3].id[0] == 11 + + # Full workflow + t4 = CTable(Row, expected_size=20, compact=False) + t4.extend(generate_test_data(10, 1)) + assert len(t4) == 10 + t4.delete([0, 2, 4, 6, 8]) + assert len(t4) == 5 + t4.append((100, 0j, 50.0, True)) + t4.append((101, 1j, 60.0, False)) + assert len(t4) == 7 + t4.extend(generate_test_data(5, 200)) + assert len(t4) == 12 + t4.delete([3, 7, 10]) + assert len(t4) == 9 + t4.extend(generate_test_data(3, 300)) + assert len(t4) == 12 + assert t4.nrows == 12 + assert t4.ncols == 4 + + +if __name__ == "__main__": + pytest.main(["-v", __file__]) diff --git a/tests/ctable/test_nullable.py b/tests/ctable/test_nullable.py new file mode 100644 index 00000000..1a13df62 --- /dev/null +++ b/tests/ctable/test_nullable.py @@ -0,0 +1,549 @@ +####################################################################### +# Copyright (c) 2019-present, Blosc Development Team +# All rights reserved. 
+# +# SPDX-License-Identifier: BSD-3-Clause +####################################################################### + +"""Tests for nullable column support (null_value sentinel).""" + +from __future__ import annotations + +import math +import os +import pathlib +import shutil +import tempfile +from dataclasses import dataclass + +import numpy as np +import pytest + +import blosc2 +from blosc2 import CTable + +# --------------------------------------------------------------------------- +# Schemas used across tests +# --------------------------------------------------------------------------- + + +@dataclass +class IntRow: + id: int = blosc2.field(blosc2.int64()) + score: int = blosc2.field(blosc2.int64(ge=0, le=1000, null_value=-1)) + + +@dataclass +class FloatRow: + name: str = blosc2.field(blosc2.string(max_length=16)) + value: float = blosc2.field(blosc2.float64(null_value=float("nan"))) + + +@dataclass +class StrRow: + label: str = blosc2.field(blosc2.string(max_length=16, null_value="")) + rank: int = blosc2.field(blosc2.int64()) + + +@dataclass +class BoolRow: + code: int = blosc2.field(blosc2.int64(null_value=-999)) + flag: bool = blosc2.field(blosc2.bool(), default=False) + + +TABLE_ROOT = pathlib.Path(__file__).parent / "saved_ctable" / "test_nullable" + + +@pytest.fixture(autouse=True) +def clean_dir(): + if TABLE_ROOT.exists(): + shutil.rmtree(TABLE_ROOT) + TABLE_ROOT.mkdir(parents=True, exist_ok=True) + yield + if TABLE_ROOT.exists(): + shutil.rmtree(TABLE_ROOT) + + +def table_path(name: str) -> str: + return str(TABLE_ROOT / name) + + +# =========================================================================== +# null_value property +# =========================================================================== + + +def test_null_value_property_set(): + t = CTable(IntRow, new_data=[(1, 10), (2, -1)]) + assert t["score"].null_value == -1 + + +def test_null_value_property_not_set(): + t = CTable(IntRow, new_data=[(1, 10)]) + assert t["id"].null_value 
is None + + +def test_null_value_nan(): + t = CTable(FloatRow, new_data=[("a", 1.0), ("b", float("nan"))]) + nv = t["value"].null_value + assert isinstance(nv, float) + assert math.isnan(nv) + + +def test_null_value_string(): + t = CTable(StrRow, new_data=[("hello", 1), ("", 2)]) + assert t["label"].null_value == "" + + +# =========================================================================== +# is_null / notnull / null_count +# =========================================================================== + + +def test_is_null_basic(): + t = CTable(IntRow, new_data=[(1, 10), (2, -1), (3, 20), (4, -1)]) + mask = t["score"].is_null() + assert list(mask) == [False, True, False, True] + + +def test_notnull_basic(): + t = CTable(IntRow, new_data=[(1, 10), (2, -1), (3, 20)]) + mask = t["score"].notnull() + assert list(mask) == [True, False, True] + + +def test_null_count(): + t = CTable(IntRow, new_data=[(1, 10), (2, -1), (3, -1), (4, 5)]) + assert t["score"].null_count() == 2 + + +def test_null_count_zero(): + t = CTable(IntRow, new_data=[(1, 10), (2, 20)]) + assert t["score"].null_count() == 0 + + +def test_is_null_nan(): + t = CTable(FloatRow, new_data=[("a", 1.0), ("b", float("nan")), ("c", 3.0)]) + mask = t["value"].is_null() + assert list(mask) == [False, True, False] + + +def test_is_null_string_sentinel(): + t = CTable(StrRow, new_data=[("hello", 1), ("", 2), ("world", 3)]) + mask = t["label"].is_null() + assert list(mask) == [False, True, False] + + +def test_is_null_no_null_value(): + t = CTable(IntRow, new_data=[(1, 10), (2, 20)]) + # id has no null_value — is_null always returns all False + mask = t["id"].is_null() + assert list(mask) == [False, False] + + +def test_null_count_after_delete(): + t = CTable(IntRow, new_data=[(1, 10), (2, -1), (3, -1), (4, 5)]) + t.delete([1]) # delete the row with score=-1 at physical index 1 + # Remaining: (1,10), (3,-1), (4,5) + assert t["score"].null_count() == 1 + + +# 
=========================================================================== +# Aggregates skip nulls +# =========================================================================== + + +def test_sum_skips_null(): + t = CTable(IntRow, new_data=[(1, 10), (2, -1), (3, 20), (4, -1)]) + assert t["score"].sum() == 30 + + +def test_mean_skips_null(): + t = CTable(IntRow, new_data=[(1, 10), (2, -1), (3, 30), (4, -1)]) + assert t["score"].mean() == pytest.approx(20.0) + + +def test_std_skips_null(): + t = CTable(IntRow, new_data=[(1, 10), (2, -1), (3, 10), (4, -1)]) + # population std of [10, 10] = 0 + assert t["score"].std() == pytest.approx(0.0) + + +def test_min_skips_null(): + t = CTable(IntRow, new_data=[(1, 10), (2, -1), (3, 5)]) + assert t["score"].min() == 5 + + +def test_max_skips_null(): + t = CTable(IntRow, new_data=[(1, 10), (2, -1), (3, 5)]) + assert t["score"].max() == 10 + + +def test_min_nan_skips(): + t = CTable(FloatRow, new_data=[("a", float("nan")), ("b", 3.0), ("c", 1.0)]) + assert t["value"].min() == pytest.approx(1.0) + + +def test_max_nan_skips(): + t = CTable(FloatRow, new_data=[("a", float("nan")), ("b", 3.0), ("c", 1.0)]) + assert t["value"].max() == pytest.approx(3.0) + + +def test_mean_nan_returns_nan_when_all_null(): + t = CTable(FloatRow, new_data=[("a", float("nan")), ("b", float("nan"))]) + result = t["value"].mean() + assert math.isnan(result) + + +def test_min_all_null_raises(): + t = CTable(IntRow, new_data=[(1, -1), (2, -1)]) + with pytest.raises(ValueError, match="null"): + t["score"].min() + + +def test_max_all_null_raises(): + t = CTable(IntRow, new_data=[(1, -1), (2, -1)]) + with pytest.raises(ValueError, match="null"): + t["score"].max() + + +def test_any_skips_null(): + """any() on bool column with null_value — null rows are skipped.""" + + @dataclass + class BoolNull: + flag: bool = blosc2.field(blosc2.bool()) + active: bool = blosc2.field(blosc2.bool()) + + # bool doesn't support null_value directly in this test — just verify 
_nonnull_chunks + # behaves like iter_chunks when no null_value is set (already covered by existing tests). + t = CTable(BoolNull, new_data=[(True, False), (False, True)]) + assert t["flag"].any() is True + assert t["active"].any() is True + + +# =========================================================================== +# unique / value_counts exclude nulls +# =========================================================================== + + +def test_unique_excludes_null(): + t = CTable(IntRow, new_data=[(1, 10), (2, -1), (3, 10), (4, -1), (5, 20)]) + u = t["score"].unique() + assert list(u) == [10, 20] + assert -1 not in u + + +def test_value_counts_excludes_null(): + t = CTable(IntRow, new_data=[(1, 10), (2, -1), (3, 10), (4, -1), (5, 20)]) + vc = t["score"].value_counts() + assert -1 not in vc + assert vc[10] == 2 + assert vc[20] == 1 + + +def test_unique_string_excludes_null(): + t = CTable(StrRow, new_data=[("hello", 1), ("", 2), ("hello", 3), ("world", 4)]) + u = t["label"].unique() + assert "" not in list(u) + assert "hello" in list(u) + assert "world" in list(u) + + +# =========================================================================== +# Append / extend with null sentinel bypass Pydantic +# =========================================================================== + + +def test_append_null_bypasses_constraint(): + """Appending a null sentinel that violates ge/le should succeed.""" + t = CTable(IntRow) + # score has ge=0, le=1000 but null_value=-1; appending -1 should not raise + t.append((1, -1)) + assert t["score"][0] == -1 + + +def test_append_normal_value_still_validated(): + t = CTable(IntRow) + with pytest.raises(ValueError): + t.append((1, 9999)) # violates le=1000 + + +def test_extend_null_bypasses_constraint(): + """extend() with null sentinel should not raise a constraint error.""" + t = CTable(IntRow) + t.extend([(1, 10), (2, -1), (3, 20)]) + scores = t["score"].to_numpy() + assert scores[1] == -1 + + +def 
test_extend_normal_value_still_validated(): + t = CTable(IntRow) + with pytest.raises(ValueError): + t.extend([(1, 10), (2, 9999)]) # 9999 violates le=1000 + + +# =========================================================================== +# sort_by: nulls last +# =========================================================================== + + +def test_sort_nulls_last_ascending(): + t = CTable(IntRow, new_data=[(1, 5), (2, -1), (3, 2), (4, -1), (5, 8)]) + s = t.sort_by("score") + scores = list(s["score"].to_numpy()) + # Non-null values sorted first, nulls (-1) at end + assert scores[:3] == [2, 5, 8] + assert scores[3] == -1 + assert scores[4] == -1 + + +def test_sort_nulls_last_descending(): + t = CTable(IntRow, new_data=[(1, 5), (2, -1), (3, 2), (4, -1), (5, 8)]) + s = t.sort_by("score", ascending=False) + scores = list(s["score"].to_numpy()) + # Non-null values sorted descending first, nulls last + assert scores[:3] == [8, 5, 2] + assert scores[3] == -1 + assert scores[4] == -1 + + +def test_sort_nulls_last_nan(): + t = CTable(FloatRow, new_data=[("a", 3.0), ("b", float("nan")), ("c", 1.0)]) + s = t.sort_by("value") + values = list(s["value"].to_numpy()) + assert values[0] == pytest.approx(1.0) + assert values[1] == pytest.approx(3.0) + assert math.isnan(values[2]) + + +def test_sort_multi_nulls_last(): + t = CTable(IntRow, new_data=[(1, -1), (2, 5), (3, -1), (4, 5)]) + s = t.sort_by(["score", "id"]) + scores = list(s["score"].to_numpy()) + ids = list(s["id"].to_numpy()) + # score 5 rows first (id 2, then id 4), then score -1 rows + assert scores[:2] == [5, 5] + assert ids[:2] == [2, 4] + assert scores[2] == -1 + assert scores[3] == -1 + + +def test_sort_no_nulls_unchanged(): + """Columns without null_value still sort normally.""" + t = CTable(IntRow, new_data=[(3, 30), (1, 10), (2, 20)]) + s = t.sort_by("id") + np.testing.assert_array_equal(s["id"].to_numpy(), [1, 2, 3]) + + +# =========================================================================== +# 
describe() shows null count +# =========================================================================== + + +def test_describe_shows_null_count(capsys): + t = CTable(IntRow, new_data=[(1, 10), (2, -1), (3, 20)]) + t.describe() + out = capsys.readouterr().out + assert "null" in out.lower() + assert "1" in out # 1 null + + +def test_describe_no_null_line_when_zero_nulls(capsys): + t = CTable(IntRow, new_data=[(1, 10), (2, 20)]) + t.describe() + out = capsys.readouterr().out + # No null line when null_count == 0 + # The word "null" should not appear in the score section + # (the column has null_value=-1 but no actual null values) + assert "null" not in out.lower() + + +# =========================================================================== +# to_arrow: null masking +# =========================================================================== + + +def test_to_arrow_null_masking(): + pytest.importorskip("pyarrow") + + t = CTable(IntRow, new_data=[(1, 10), (2, -1), (3, 20)]) + arrow = t.to_arrow() + score_col = arrow.column("score") + assert score_col[0].as_py() == 10 + assert score_col[1].as_py() is None # null sentinel → Arrow null + assert score_col[2].as_py() == 20 + + +def test_to_arrow_nan_masking(): + pytest.importorskip("pyarrow") + + t = CTable(FloatRow, new_data=[("a", 1.0), ("b", float("nan")), ("c", 3.0)]) + arrow = t.to_arrow() + val_col = arrow.column("value") + assert val_col[0].as_py() == pytest.approx(1.0) + assert val_col[1].as_py() is None # NaN sentinel → Arrow null + assert val_col[2].as_py() == pytest.approx(3.0) + + +def test_to_arrow_string_null_masking(): + pytest.importorskip("pyarrow") + + t = CTable(StrRow, new_data=[("hello", 1), ("", 2), ("world", 3)]) + arrow = t.to_arrow() + label_col = arrow.column("label") + assert label_col[0].as_py() == "hello" + assert label_col[1].as_py() is None # empty string → Arrow null + assert label_col[2].as_py() == "world" + + +def test_to_arrow_no_null_value_no_masking(): + 
pytest.importorskip("pyarrow") + + t = CTable(IntRow, new_data=[(1, 10), (2, 20)]) + arrow = t.to_arrow() + # id column has no null_value → all values present + id_col = arrow.column("id") + assert id_col.null_count == 0 + + +# =========================================================================== +# from_csv: empty cells → sentinel +# =========================================================================== + + +def test_from_csv_empty_cell_to_null(): + with tempfile.NamedTemporaryFile(mode="w", suffix=".csv", delete=False) as f: + f.write("id,score\n") + f.write("1,10\n") + f.write("2,\n") # empty score → null sentinel + f.write("3,20\n") + fname = f.name + try: + t = CTable.from_csv(fname, IntRow) + scores = t["score"].to_numpy() + assert scores[0] == 10 + assert scores[1] == -1 # null sentinel + assert scores[2] == 20 + assert t["score"].null_count() == 1 + finally: + os.unlink(fname) + + +def test_from_csv_empty_string_cell_to_null(): + with tempfile.NamedTemporaryFile(mode="w", suffix=".csv", delete=False) as f: + f.write("label,rank\n") + f.write("hello,1\n") + f.write(",2\n") # empty label → null sentinel "" + f.write("world,3\n") + fname = f.name + try: + t = CTable.from_csv(fname, StrRow) + labels = t["label"].to_numpy() + assert labels[0] == "hello" + assert labels[1] == "" # null sentinel + assert labels[2] == "world" + assert t["label"].null_count() == 1 + finally: + os.unlink(fname) + + +def test_from_csv_no_null_value_non_empty_cells(): + """Without null_value, normal values are read and stored correctly.""" + + @dataclass + class SimpleRow: + x: int = blosc2.field(blosc2.int64()) + + with tempfile.NamedTemporaryFile(mode="w", suffix=".csv", delete=False) as f: + f.write("x\n") + f.write("5\n") + f.write("7\n") + f.write("10\n") + fname = f.name + try: + t = CTable.from_csv(fname, SimpleRow) + arr = t["x"].to_numpy() + assert list(arr) == [5, 7, 10] + assert t["x"].null_count() == 0 + finally: + os.unlink(fname) + + +# 
=========================================================================== +# Persistence: null_value round-trips through schema serialization +# =========================================================================== + + +def test_null_value_persists_to_disk(): + path = table_path("null_persist") + t = CTable(IntRow, urlpath=path, mode="w", new_data=[(1, 10), (2, -1), (3, 20)]) + del t + t2 = CTable.open(path) + assert t2["score"].null_value == -1 + assert t2["score"].null_count() == 1 + + +def test_null_value_nan_persists(): + path = table_path("null_nan_persist") + t = CTable(FloatRow, urlpath=path, mode="w", new_data=[("a", 1.0), ("b", float("nan"))]) + del t + t2 = CTable.open(path) + nv = t2["value"].null_value + assert isinstance(nv, float) + assert math.isnan(nv) + assert t2["value"].null_count() == 1 + + +def test_null_value_string_persists(): + path = table_path("null_str_persist") + t = CTable(StrRow, urlpath=path, mode="w", new_data=[("hello", 1), ("", 2)]) + del t + t2 = CTable.open(path) + assert t2["label"].null_value == "" + assert t2["label"].null_count() == 1 + + +# =========================================================================== +# Edge cases +# =========================================================================== + + +def test_all_nulls_unique_empty(): + t = CTable(IntRow, new_data=[(1, -1), (2, -1)]) + u = t["score"].unique() + assert len(u) == 0 + + +def test_all_nulls_value_counts_empty(): + t = CTable(IntRow, new_data=[(1, -1), (2, -1)]) + vc = t["score"].value_counts() + assert len(vc) == 0 + + +def test_null_value_does_not_affect_non_nullable_column(): + t = CTable(IntRow, new_data=[(1, 10), (2, 20)]) + # id column has no null_value — aggregates work normally + assert t["id"].sum() == 3 + assert t["id"].min() == 1 + assert t["id"].max() == 2 + + +def test_schema_null_value_in_metadata(): + """null_value appears in schema_to_dict output for persistence.""" + from blosc2.schema_compiler import schema_to_dict + + 
@dataclass + class SomeRow: + x: int = blosc2.field(blosc2.int64(null_value=-999)) + label: str = blosc2.field(blosc2.string(max_length=8, null_value="N/A")) + + t = CTable(SomeRow, new_data=[(1, "hello"), (-999, "N/A")]) + d = schema_to_dict(t._schema) + cols = {c["name"]: c for c in d["columns"]} + assert cols["x"]["null_value"] == -999 + assert cols["label"]["null_value"] == "N/A" + + +if __name__ == "__main__": + pytest.main(["-v", __file__]) diff --git a/tests/ctable/test_row_logic.py b/tests/ctable/test_row_logic.py new file mode 100644 index 00000000..bece75c0 --- /dev/null +++ b/tests/ctable/test_row_logic.py @@ -0,0 +1,217 @@ +####################################################################### +# Copyright (c) 2019-present, Blosc Development Team +# All rights reserved. +# +# SPDX-License-Identifier: BSD-3-Clause +####################################################################### + +from dataclasses import dataclass + +import numpy as np +import pytest + +import blosc2 +from blosc2 import CTable +from blosc2.ctable import Column + + +@dataclass +class Row: + id: int = blosc2.field(blosc2.int64(ge=0)) + score: float = blosc2.field(blosc2.float64(ge=0)) + active: bool = blosc2.field(blosc2.bool(), default=True) + + +def generate_test_data(n_rows: int, start_id: int = 0) -> list: + return [(start_id + i, float(i * 10), i % 2 == 0) for i in range(n_rows)] + + +# ------------------------------------------------------------------- +# Tests +# ------------------------------------------------------------------- + + +def test_row_int_indexing(): + """int indexing: no holes, with holes, negative indices, and out-of-range.""" + data = generate_test_data(20) + + # No holes: spot checks + t = CTable(Row, new_data=data) + r = t.row[0] + assert isinstance(r, CTable) + assert len(r) == 1 + assert r.id[0] == 0 + assert r.score[0] == 0.0 + assert r.active[0] + assert t.row[10].id[0] == 10 + assert t.row[10].score[0] == 100.0 + + # Negative indices + assert 
t.row[-1].id[0] == 19 + assert t.row[-5].id[0] == 15 + + # With holes: delete odd positions -> valid: 0,2,4,6,8,10... + t.delete([1, 3, 5, 7, 9]) + assert t.row[0].id[0] == 0 + assert t.row[1].id[0] == 2 + assert t.row[5].id[0] == 10 + + # Out of range + t2 = CTable(Row, new_data=generate_test_data(10)) + for idx in [10, 100, -11]: + with pytest.raises(IndexError): + _ = t2.row[idx] + + +def test_row_slice_indexing(): + """Slice indexing: no holes, with holes, step, negative, beyond bounds, empty/full.""" + data = generate_test_data(20) + + # No holes + t = CTable(Row, new_data=data) + assert isinstance(t.row[0:5], CTable) + assert list(t.row[0:5].id) == [0, 1, 2, 3, 4] + assert list(t.row[10:15].id) == [10, 11, 12, 13, 14] + assert list(t.row[::2].id) == [0, 2, 4, 6, 8, 10, 12, 14, 16, 18] + + # With step + assert list(t.row[0:10:2].id) == [0, 2, 4, 6, 8] + assert list(t.row[1:10:3].id) == [1, 4, 7] + + # Negative indices + assert list(t.row[-5:].id) == [15, 16, 17, 18, 19] + assert list(t.row[-10:-5].id) == [10, 11, 12, 13, 14] + + # With holes: delete odd positions + t.delete([1, 3, 5, 7, 9]) + assert list(t.row[0:5].id) == [0, 2, 4, 6, 8] + assert list(t.row[5:10].id) == [10, 11, 12, 13, 14] + + # Beyond bounds + t2 = CTable(Row, new_data=generate_test_data(10)) + assert len(t2.row[11:20]) == 0 + assert list(t2.row[5:100].id) == [5, 6, 7, 8, 9] + assert len(t2.row[100:]) == 0 + + # Empty and full slices + assert len(t2.row[5:5]) == 0 + assert len(t2.row[0:0]) == 0 + result = t2.row[:] + assert len(result) == 10 + assert list(result.id) == list(range(10)) + + +def test_row_list_indexing(): + """List indexing: no holes, with holes, out-of-range, edge cases.""" + data = generate_test_data(20) + + # No holes + t = CTable(Row, new_data=data) + r = t.row[[0, 5, 10, 15]] + assert isinstance(r, CTable) + assert len(r) == 4 + assert set(r.id) == {0, 5, 10, 15} + assert set(t.row[[19, 0, 10]].id) == {0, 10, 19} + + # With holes: delete [1,3,5,7,9] -> logical 0->id0, 
1->id2, 2->id4... + t.delete([1, 3, 5, 7, 9]) + assert set(t.row[[0, 2, 4]].id) == {0, 4, 8} + assert set(t.row[[5, 3, 1]].id) == {2, 6, 10} + + # Negative indices in list + t2 = CTable(Row, new_data=generate_test_data(10)) + assert set(t2.row[[0, -1, 5]].id) == {0, 5, 9} + + # Single element + assert t2.row[[5]].id[0] == 5 + + # Duplicate indices -> deduplicated + r_dup = t2.row[[5, 5, 5]] + assert len(r_dup) == 1 + assert r_dup.id[0] == 5 + + # Empty list + assert len(t2.row[[]]) == 0 + + # Out of range + for bad in [[0, 5, 100], [0, 1, -11]]: + with pytest.raises(IndexError): + _ = t2.row[bad] + + +def test_row_view_properties(): + """View metadata, base chain, mask integrity, column liveness, and chained views.""" + data = generate_test_data(100) + tabla0 = CTable(Row, new_data=data) + + # Base is None on root table + assert tabla0.base is None + + # View properties are shared with parent + v = tabla0.row[0:10] + assert v.base is tabla0 + assert v._row_type == tabla0._row_type + assert v._cols is tabla0._cols + assert v._col_widths == tabla0._col_widths + assert v.col_names == tabla0.col_names + + # Read ops on view + view = tabla0.row[5:15] + assert view.id[0] == 5 + assert view.score[0] == 50.0 + assert not view.active[0] + assert list(view.id) == list(range(5, 15)) + + # Mask integrity + assert np.count_nonzero(view._valid_rows[:]) == 10 + + # Column is live (points back to its view) + col = view.id + assert isinstance(col, Column) + assert col._table is view + + # Chained views: base always points to immediate parent + tabla1 = tabla0.row[:50] + assert tabla1.base is tabla0 + assert len(tabla1) == 50 + + tabla2 = tabla1.row[:10] + assert tabla2.base is tabla1 + assert len(tabla2) == 10 + assert list(tabla2.id) == list(range(10)) + + tabla3 = tabla2.row[5:] + assert tabla3.base is tabla2 + assert len(tabla3) == 5 + assert list(tabla3.id) == [5, 6, 7, 8, 9] + + # Chained view with holes on parent + tabla0.delete([5, 10, 15, 20, 25]) + tv1 = tabla0.row[:30] + 
assert tv1.base is tabla0 + assert len(tv1) == 30 + tv2 = tv1.row[10:20] + assert tv2.base is tv1 + assert len(tv2) == 10 + + +def test_row_edge_cases(): + """Empty table, fully-deleted table: int raises IndexError, slice returns empty.""" + # Empty table + empty = CTable(Row) + with pytest.raises(IndexError): + _ = empty.row[0] + assert len(empty.row[:]) == 0 + assert len(empty.row[0:10]) == 0 + + # All rows deleted + data = generate_test_data(10) + t = CTable(Row, new_data=data) + t.delete(list(range(10))) + with pytest.raises(IndexError): + _ = t.row[0] + assert len(t.row[:]) == 0 + + +if __name__ == "__main__": + pytest.main(["-v", __file__]) diff --git a/tests/ctable/test_schema_compiler.py b/tests/ctable/test_schema_compiler.py new file mode 100644 index 00000000..812a8c63 --- /dev/null +++ b/tests/ctable/test_schema_compiler.py @@ -0,0 +1,258 @@ +####################################################################### +# Copyright (c) 2019-present, Blosc Development Team +# All rights reserved. 
+# +# SPDX-License-Identifier: BSD-3-Clause +####################################################################### + +"""Tests for compile_schema(), schema_to_dict(), and schema_from_dict().""" + +from dataclasses import MISSING, dataclass + +import numpy as np +import pytest + +import blosc2 +from blosc2.schema import bool as b2_bool +from blosc2.schema import complex128, float64, int64, string +from blosc2.schema_compiler import ( + CompiledSchema, + compile_schema, + schema_from_dict, + schema_to_dict, +) + +# ------------------------------------------------------------------- +# Fixtures +# ------------------------------------------------------------------- + + +@dataclass +class Simple: + id: int = blosc2.field(blosc2.int64(ge=0)) + score: float = blosc2.field(blosc2.float64(ge=0, le=100), default=0.0) + active: bool = blosc2.field(blosc2.bool(), default=True) + + +@dataclass +class WithString: + name: str = blosc2.field(blosc2.string(max_length=16)) + value: float = blosc2.field(blosc2.float64(), default=0.0) + + +@dataclass +class WithComplex: + id: int = blosc2.field(blosc2.int64()) + c_val: complex = blosc2.field(blosc2.complex128(), default=0j) + + +# ------------------------------------------------------------------- +# compile_schema — explicit b2.field() +# ------------------------------------------------------------------- + + +def test_compile_returns_compiled_schema(): + s = compile_schema(Simple) + assert isinstance(s, CompiledSchema) + assert s.row_cls is Simple + + +def test_compile_column_count(): + s = compile_schema(Simple) + assert len(s.columns) == 3 + + +def test_compile_column_names_order(): + s = compile_schema(Simple) + assert [c.name for c in s.columns] == ["id", "score", "active"] + + +def test_compile_column_dtypes(): + s = compile_schema(Simple) + assert s.columns_by_name["id"].dtype == np.dtype(np.int64) + assert s.columns_by_name["score"].dtype == np.dtype(np.float64) + assert s.columns_by_name["active"].dtype == 
np.dtype(np.bool_) + + +def test_compile_column_specs(): + s = compile_schema(Simple) + assert isinstance(s.columns_by_name["id"].spec, int64) + assert s.columns_by_name["id"].spec.ge == 0 + assert isinstance(s.columns_by_name["score"].spec, float64) + assert s.columns_by_name["score"].spec.le == 100 + + +def test_compile_defaults(): + s = compile_schema(Simple) + assert s.columns_by_name["id"].default is MISSING # required + assert s.columns_by_name["score"].default == 0.0 + assert s.columns_by_name["active"].default is True + + +def test_compile_py_types(): + s = compile_schema(Simple) + assert s.columns_by_name["id"].py_type is int + assert s.columns_by_name["score"].py_type is float + assert s.columns_by_name["active"].py_type is bool + + +def test_compile_string_column(): + s = compile_schema(WithString) + col = s.columns_by_name["name"] + assert isinstance(col.spec, string) + assert col.spec.max_length == 16 + assert col.dtype == np.dtype("U16") + + +def test_compile_complex_column(): + s = compile_schema(WithComplex) + col = s.columns_by_name["c_val"] + assert isinstance(col.spec, complex128) + assert col.dtype == np.dtype(np.complex128) + assert col.default == 0j + + +# ------------------------------------------------------------------- +# compile_schema — inferred shorthand (plain annotations) +# ------------------------------------------------------------------- + + +@dataclass +class Inferred: + count: int + ratio: float + flag: bool + + +def test_inferred_shorthand(): + s = compile_schema(Inferred) + assert len(s.columns) == 3 + assert isinstance(s.columns_by_name["count"].spec, int64) + assert isinstance(s.columns_by_name["ratio"].spec, float64) + assert isinstance(s.columns_by_name["flag"].spec, b2_bool) + + +def test_inferred_no_constraints(): + s = compile_schema(Inferred) + for col in s.columns: + assert col.spec.to_pydantic_kwargs() == {} + + +# ------------------------------------------------------------------- +# compile_schema — annotation / 
spec mismatch rejection +# ------------------------------------------------------------------- + + +def test_annotation_spec_mismatch(): + @dataclass + class Bad: + x: str = blosc2.field(blosc2.int64()) # str annotation but int64 spec + + with pytest.raises(TypeError, match="incompatible"): + compile_schema(Bad) + + +def test_non_dataclass_rejected(): + class NotADataclass: + pass + + with pytest.raises(TypeError, match="dataclass"): + compile_schema(NotADataclass) + + +# ------------------------------------------------------------------- +# compile_schema — per-column cparams config +# ------------------------------------------------------------------- + + +def test_column_cparams_stored(): + @dataclass + class WithCparams: + id: int = blosc2.field(blosc2.int64(), cparams={"clevel": 9}) + score: float = blosc2.field(blosc2.float64(), default=0.0) + + s = compile_schema(WithCparams) + assert s.columns_by_name["id"].config.cparams == {"clevel": 9} + assert s.columns_by_name["score"].config.cparams is None + + +# ------------------------------------------------------------------- +# schema_to_dict / schema_from_dict (Step 12) +# ------------------------------------------------------------------- + + +def test_schema_to_dict_structure(): + d = schema_to_dict(compile_schema(Simple)) + assert d["version"] == 1 + assert d["row_cls"] == "Simple" + assert len(d["columns"]) == 3 + + +def test_schema_to_dict_column_fields(): + d = schema_to_dict(compile_schema(Simple)) + id_col = next(c for c in d["columns"] if c["name"] == "id") + assert id_col["kind"] == "int64" + assert id_col["ge"] == 0 + assert id_col["default"] is None # MISSING → None + + +def test_schema_to_dict_default_values(): + d = schema_to_dict(compile_schema(Simple)) + score_col = next(c for c in d["columns"] if c["name"] == "score") + assert score_col["default"] == 0.0 + + active_col = next(c for c in d["columns"] if c["name"] == "active") + assert active_col["default"] is True + + +def 
test_schema_to_dict_complex_default(): + d = schema_to_dict(compile_schema(WithComplex)) + c_col = next(c for c in d["columns"] if c["name"] == "c_val") + assert c_col["default"]["__complex__"] is True + assert c_col["default"]["real"] == 0.0 + assert c_col["default"]["imag"] == 0.0 + + +def test_schema_roundtrip(): + """schema_from_dict(schema_to_dict(s)) reproduces the same column structure.""" + original = compile_schema(Simple) + d = schema_to_dict(original) + restored = schema_from_dict(d) + + assert len(restored.columns) == len(original.columns) + for orig_col, rest_col in zip(original.columns, restored.columns, strict=True): + assert orig_col.name == rest_col.name + assert orig_col.dtype == rest_col.dtype + assert type(orig_col.spec) is type(rest_col.spec) + if orig_col.default is MISSING: + assert rest_col.default is MISSING + else: + assert orig_col.default == rest_col.default + + +def test_schema_from_dict_no_row_cls(): + """Reconstructed schema has row_cls=None (original class not available).""" + d = schema_to_dict(compile_schema(Simple)) + restored = schema_from_dict(d) + assert restored.row_cls is None + + +def test_schema_from_dict_preserves_constraints(): + d = schema_to_dict(compile_schema(Simple)) + restored = schema_from_dict(d) + id_col = restored.columns_by_name["id"] + assert id_col.spec.ge == 0 + score_col = restored.columns_by_name["score"] + assert score_col.spec.le == 100 + + +def test_schema_from_dict_unknown_kind(): + d = {"version": 1, "row_cls": "X", "columns": [{"name": "x", "kind": "unknown", "default": None}]} + with pytest.raises(ValueError, match="Unknown column kind"): + schema_from_dict(d) + + +def test_schema_from_dict_unsupported_version(): + d = {"version": 99, "row_cls": "X", "columns": []} + with pytest.raises(ValueError, match="Unsupported schema version"): + schema_from_dict(d) diff --git a/tests/ctable/test_schema_mutations.py b/tests/ctable/test_schema_mutations.py new file mode 100644 index 00000000..9ede690f --- 
/dev/null +++ b/tests/ctable/test_schema_mutations.py @@ -0,0 +1,440 @@ +####################################################################### +# Copyright (c) 2019-present, Blosc Development Team +# All rights reserved. +# +# SPDX-License-Identifier: BSD-3-Clause +####################################################################### + +"""Tests for add_column, drop_column, rename_column, Column.assign, +and the corrected view mutability model.""" + +import os +import pathlib +import shutil +from dataclasses import dataclass + +import numpy as np +import pytest + +import blosc2 +from blosc2 import CTable + +TABLE_ROOT = str(pathlib.Path(__file__).parent / "saved_ctable" / "test_schema_mutations") + + +@dataclass +class Row: + id: int = blosc2.field(blosc2.int64(ge=0)) + score: float = blosc2.field(blosc2.float64(ge=0, le=100), default=0.0) + active: bool = blosc2.field(blosc2.bool(), default=True) + + +DATA10 = [(i, float(i * 10), True) for i in range(10)] + + +@pytest.fixture(autouse=True) +def clean_dir(): + if os.path.exists(TABLE_ROOT): + shutil.rmtree(TABLE_ROOT) + os.makedirs(TABLE_ROOT, exist_ok=True) + yield + if os.path.exists(TABLE_ROOT): + shutil.rmtree(TABLE_ROOT) + + +def table_path(name): + return os.path.join(TABLE_ROOT, f"{name}.b2d") + + +# =========================================================================== +# View mutability — value writes allowed, structural changes blocked +# =========================================================================== + + +def test_view_allows_column_setitem(): + """Writing values through a view modifies the parent table.""" + t = CTable(Row, new_data=DATA10) + view = t.where(t["id"] > 4) # rows 5-9 + # double scores of those rows using __setitem__ + indices = list(range(len(view))) + new_scores = view["score"].to_numpy() * 2 + view["score"][indices] = new_scores + # check parent was modified + assert t["score"][5] == pytest.approx(100.0) # was 50.0 + + +def test_view_allows_assign(): + """assign() 
through a view modifies the parent table.""" + t = CTable(Row, new_data=DATA10) + view = t.where(t["id"] > 4) + view["score"].assign(np.zeros(len(view))) + assert t["score"][5] == pytest.approx(0.0) + assert t["score"][4] == pytest.approx(40.0) # untouched + + +def test_view_blocks_append(): + t = CTable(Row, new_data=DATA10) + view = t.where(t["id"] > 4) + with pytest.raises(TypeError): + view.append((99, 10.0, True)) + + +def test_view_blocks_delete(): + t = CTable(Row, new_data=DATA10) + view = t.where(t["id"] > 4) + with pytest.raises(ValueError, match="view"): + view.delete(0) + + +def test_view_blocks_compact(): + t = CTable(Row, new_data=DATA10) + view = t.where(t["id"] > 4) + with pytest.raises(ValueError, match="view"): + view.compact() + + +def test_readonly_disk_table_blocks_assign(): + path = table_path("ro") + t = CTable(Row, urlpath=path, mode="w", new_data=DATA10) + t.close() + t_ro = CTable.open(path, mode="r") + with pytest.raises(ValueError, match="read-only"): + t_ro["score"].assign(np.ones(len(t_ro))) + + +def test_readonly_disk_table_blocks_setitem(): + path = table_path("ro_setitem") + t = CTable(Row, urlpath=path, mode="w", new_data=DATA10) + t.close() + t_ro = CTable.open(path, mode="r") + with pytest.raises(ValueError, match="read-only"): + t_ro["score"][0] = 99.0 + + +def test_blosc2_open_materializes_ctable(): + path = table_path("open_ctable") + t = CTable(Row, urlpath=path, mode="w", new_data=DATA10) + t.close() + opened = blosc2.open(path, mode="r") + assert isinstance(opened, CTable) + assert opened.col_names == ["id", "score", "active"] + np.testing.assert_array_equal(opened["id"].to_numpy(), np.arange(10)) + + +def test_blosc2_open_raw_treestore_without_manifest(): + path = table_path("raw_store") + with blosc2.TreeStore(path, mode="w", threshold=0) as tstore: + tstore["/group/node"] = np.arange(5) + + opened = blosc2.open(path, mode="r") + assert isinstance(opened, blosc2.TreeStore) + assert np.array_equal(opened["/group/node"][:], 
np.arange(5)) + + +def test_blosc2_open_raw_treestore_for_unknown_manifest_kind(): + path = table_path("unknown_manifest") + with blosc2.TreeStore(path, mode="w", threshold=0) as tstore: + meta = blosc2.SChunk() + meta.vlmeta["kind"] = "mystery" + meta.vlmeta["version"] = 1 + tstore["/_meta"] = meta + tstore["/payload"] = np.arange(3) + + opened = blosc2.open(path, mode="r") + assert isinstance(opened, blosc2.TreeStore) + assert np.array_equal(opened["/payload"][:], np.arange(3)) + + +def test_extensionless_ctable_path_uses_extensionless_store(): + path = os.path.join(TABLE_ROOT, "alias_ctable") + t = CTable(Row, urlpath=path, mode="w", new_data=DATA10) + t.close() + assert os.path.isdir(path) + opened = blosc2.open(path, mode="r") + assert isinstance(opened, CTable) + np.testing.assert_array_equal(opened["id"].to_numpy(), np.arange(10)) + + +# =========================================================================== +# Column.assign +# =========================================================================== + + +def test_assign_replaces_all_values(): + t = CTable(Row, new_data=DATA10) + t["score"].assign([99.0] * 10) + assert list(t["score"].to_numpy()) == [99.0] * 10 + + +def test_assign_coerces_python_ints_to_float(): + t = CTable(Row, new_data=DATA10) + t["score"].assign(list(range(10))) # Python ints → float64 + np.testing.assert_array_equal(t["score"].to_numpy(), np.arange(10, dtype=np.float64)) + + +def test_assign_wrong_length_raises(): + t = CTable(Row, new_data=DATA10) + with pytest.raises(ValueError, match="10"): + t["score"].assign([1.0, 2.0]) + + +def test_assign_through_view_touches_only_matching_rows(): + t = CTable(Row, new_data=DATA10) + view = t.where(t["id"] < 5) # rows 0-4 + view["score"].assign([0.0] * 5) + # rows 0-4 → 0, rows 5-9 unchanged + scores = t["score"].to_numpy() + np.testing.assert_array_equal(scores[:5], np.zeros(5)) + np.testing.assert_array_equal(scores[5:], np.arange(5, 10, dtype=np.float64) * 10) + + +def 
test_assign_respects_deleted_rows(): + t = CTable(Row, new_data=DATA10) + t.delete([0]) # delete id=0; 9 live rows remain + t["score"].assign([1.0] * 9) + assert len(t["score"].to_numpy()) == 9 + assert all(v == 1.0 for v in t["score"].to_numpy()) + + +# =========================================================================== +# add_column +# =========================================================================== + + +def test_add_column_appears_in_col_names(): + t = CTable(Row, new_data=DATA10) + t.add_column("weight", blosc2.float64(), 0.0) + assert "weight" in t.col_names + + +def test_add_column_fills_default_for_existing_rows(): + t = CTable(Row, new_data=DATA10) + t.add_column("weight", blosc2.float64(), 5.5) + np.testing.assert_array_equal(t["weight"].to_numpy(), np.full(10, 5.5)) + + +def test_add_column_new_rows_can_use_it(): + t = CTable(Row, new_data=DATA10) + t.add_column("weight", blosc2.float64(), 0.0) + # After adding, extend doesn't know about weight — add manually + t["weight"].assign(np.ones(10) * 2.0) + assert t["weight"].mean() == pytest.approx(2.0) + + +def test_add_column_schema_updated(): + t = CTable(Row, new_data=DATA10) + t.add_column("weight", blosc2.float64(), 0.0) + assert "weight" in t.schema.columns_by_name + + +def test_add_column_persists_on_disk(): + path = table_path("add_col") + t = CTable(Row, urlpath=path, mode="w", new_data=DATA10) + t.add_column("weight", blosc2.float64(), 7.0) + t.close() + t2 = CTable.open(path) + assert "weight" in t2.col_names + np.testing.assert_array_equal(t2["weight"].to_numpy(), np.full(10, 7.0)) + + +def test_add_column_view_raises(): + t = CTable(Row, new_data=DATA10) + view = t.where(t["id"] > 4) + with pytest.raises(ValueError, match="view"): + view.add_column("weight", blosc2.float64(), 0.0) + + +def test_add_column_duplicate_raises(): + t = CTable(Row, new_data=DATA10) + with pytest.raises(ValueError, match="already exists"): + t.add_column("score", blosc2.float64(), 0.0) + + +def 
test_add_column_bad_default_raises(): + t = CTable(Row, new_data=DATA10) + with pytest.raises(TypeError): + t.add_column("flag", blosc2.int8(), "not_a_number") + + +def test_add_column_skips_deleted_rows(): + t = CTable(Row, new_data=DATA10) + t.delete([0, 1]) # 8 live rows + t.add_column("weight", blosc2.float64(), 3.0) + vals = t["weight"].to_numpy() + assert len(vals) == 8 + assert all(v == 3.0 for v in vals) + + +# =========================================================================== +# drop_column +# =========================================================================== + + +def test_drop_column_removes_from_col_names(): + t = CTable(Row, new_data=DATA10) + t.drop_column("active") + assert "active" not in t.col_names + + +def test_drop_column_schema_updated(): + t = CTable(Row, new_data=DATA10) + t.drop_column("active") + assert "active" not in t.schema.columns_by_name + + +def test_drop_column_last_raises(): + @dataclass + class OneCol: + id: int = blosc2.field(blosc2.int64()) + + t = CTable(OneCol, new_data=[(i,) for i in range(5)]) + with pytest.raises(ValueError, match="last"): + t.drop_column("id") + + +def test_drop_column_missing_raises(): + t = CTable(Row, new_data=DATA10) + with pytest.raises(KeyError): + t.drop_column("nonexistent") + + +def test_drop_column_view_raises(): + t = CTable(Row, new_data=DATA10) + view = t.where(t["id"] > 4) + with pytest.raises(ValueError, match="view"): + view.drop_column("active") + + +def test_drop_column_deletes_file_on_disk(): + path = table_path("drop_col") + t = CTable(Row, urlpath=path, mode="w", new_data=DATA10) + t.drop_column("active") + assert not os.path.exists(os.path.join(path, "_cols", "active.b2nd")) + + +def test_drop_column_persists_schema_on_disk(): + path = table_path("drop_schema") + t = CTable(Row, urlpath=path, mode="w", new_data=DATA10) + t.drop_column("active") + t.close() + t2 = CTable.open(path) + assert "active" not in t2.col_names + assert t2.ncols == 2 + + +# 
=========================================================================== +# rename_column +# =========================================================================== + + +def test_rename_column_updates_col_names(): + t = CTable(Row, new_data=DATA10) + t.rename_column("score", "points") + assert "points" in t.col_names + assert "score" not in t.col_names + + +def test_rename_column_data_intact(): + t = CTable(Row, new_data=DATA10) + original = t["score"].to_numpy().copy() + t.rename_column("score", "points") + np.testing.assert_array_equal(t["points"].to_numpy(), original) + + +def test_rename_column_schema_updated(): + t = CTable(Row, new_data=DATA10) + t.rename_column("score", "points") + assert "points" in t.schema.columns_by_name + assert "score" not in t.schema.columns_by_name + + +def test_rename_column_order_preserved(): + t = CTable(Row, new_data=DATA10) + t.rename_column("score", "points") + assert t.col_names == ["id", "points", "active"] + + +def test_rename_column_missing_raises(): + t = CTable(Row, new_data=DATA10) + with pytest.raises(KeyError): + t.rename_column("nonexistent", "foo") + + +def test_rename_column_conflict_raises(): + t = CTable(Row, new_data=DATA10) + with pytest.raises(ValueError, match="already exists"): + t.rename_column("score", "active") + + +def test_rename_column_view_raises(): + t = CTable(Row, new_data=DATA10) + view = t.where(t["id"] > 4) + with pytest.raises(ValueError, match="view"): + view.rename_column("score", "points") + + +def test_rename_column_persists_on_disk(): + path = table_path("rename_col") + t = CTable(Row, urlpath=path, mode="w", new_data=DATA10) + t.rename_column("score", "points") + t.close() + t2 = CTable.open(path) + assert "points" in t2.col_names + assert "score" not in t2.col_names + assert os.path.exists(os.path.join(path, "_cols", "points.b2nd")) + assert not os.path.exists(os.path.join(path, "_cols", "score.b2nd")) + + +# 
=========================================================================== +# Boolean mask indexing (pandas-style) +# =========================================================================== + + +def test_bool_mask_getitem(): + t = CTable(Row, new_data=DATA10) + mask = t["id"].to_numpy() % 2 == 0 # even ids + result = t["score"][mask] + np.testing.assert_array_equal(result, np.array([0.0, 20.0, 40.0, 60.0, 80.0])) + + +def test_bool_mask_setitem(): + t = CTable(Row, new_data=DATA10) + mask = t["id"].to_numpy() % 2 == 0 + t["score"][mask] = 0.0 + scores = t["score"].to_numpy() + np.testing.assert_array_equal(scores[0::2], np.zeros(5)) # evens zeroed + np.testing.assert_array_equal(scores[1::2], np.array([10.0, 30.0, 50.0, 70.0, 90.0])) + + +def test_bool_mask_inplace_multiply(): + """The pandas idiom: col[mask] *= scalar.""" + t = CTable(Row, new_data=DATA10) + mask = t["id"].to_numpy() % 2 == 0 + t["score"][mask] *= 2 + scores = t["score"].to_numpy() + np.testing.assert_array_equal(scores[0::2], np.array([0.0, 40.0, 80.0, 120.0, 160.0])) + np.testing.assert_array_equal(scores[1::2], np.array([10.0, 30.0, 50.0, 70.0, 90.0])) + + +def test_bool_mask_wrong_length_raises(): + t = CTable(Row, new_data=DATA10) + bad_mask = np.array([True, False, True], dtype=np.bool_) + with pytest.raises(IndexError, match="length"): + _ = t["score"][bad_mask] + + +def test_bool_mask_through_view(): + """Boolean mask indexing works on views too.""" + t = CTable(Row, new_data=DATA10) + view = t.where(t["id"] < 6) # rows 0-5 + mask = view["id"].to_numpy() % 2 == 0 + view["score"][mask] *= 10 + # rows 0,2,4 in view → ids 0,2,4 in parent → scores 0,20,40 * 10 + assert t["score"][0] == pytest.approx(0.0) + assert t["score"][2] == pytest.approx(200.0) + assert t["score"][4] == pytest.approx(400.0) + assert t["score"][1] == pytest.approx(10.0) # untouched + + +if __name__ == "__main__": + pytest.main(["-v", __file__]) diff --git a/tests/ctable/test_schema_specs.py 
# b/tests/ctable/test_schema_specs.py
# new file mode 100644
# index 00000000..645087d7
# --- /dev/null
# +++ b/tests/ctable/test_schema_specs.py
# @@ -0,0 +1,343 @@
#######################################################################
# Copyright (c) 2019-present, Blosc Development Team
# All rights reserved.
#
# SPDX-License-Identifier: BSD-3-Clause
#######################################################################

"""Tests for schema spec objects (blosc2.schema)."""

import numpy as np
import pytest

import blosc2
from blosc2.schema import (
    SchemaSpec,
    complex64,
    complex128,
    float32,
    float64,
    int8,
    int16,
    int32,
    int64,
    string,
    uint8,
    uint16,
    uint32,
    uint64,
)
from blosc2.schema import (
    bool as b2_bool,
)
from blosc2.schema import (
    bytes as b2_bytes,
)

# -------------------------------------------------------------------
# dtype mapping
# -------------------------------------------------------------------


def test_int_dtypes():
    """Signed integer specs report the NumPy dtype of the same width."""
    cases = [
        (int8(), np.int8),
        (int16(), np.int16),
        (int32(), np.int32),
        (int64(), np.int64),
        (int64(ge=0), np.int64),  # a constraint does not alter the dtype
    ]
    for spec, numpy_type in cases:
        assert spec.dtype == np.dtype(numpy_type)


def test_uint_dtypes():
    """Unsigned integer specs report the NumPy dtype of the same width."""
    cases = [
        (uint8, np.uint8),
        (uint16, np.uint16),
        (uint32, np.uint32),
        (uint64, np.uint64),
    ]
    for spec_cls, numpy_type in cases:
        assert spec_cls().dtype == np.dtype(numpy_type)


def test_float_dtypes():
    """Float specs report the NumPy dtype of the same width."""
    for spec_cls, numpy_type in [(float32, np.float32), (float64, np.float64)]:
        assert spec_cls().dtype == np.dtype(numpy_type)


def test_bool_dtype():
    """The bool spec reports ``np.bool_``."""
    expected = np.dtype(np.bool_)
    assert b2_bool().dtype == expected


def test_complex_dtypes():
    """Complex specs report the NumPy dtype of the same width."""
    for spec_cls, numpy_type in [(complex64, np.complex64), (complex128, np.complex128)]:
        assert spec_cls().dtype == np.dtype(numpy_type)


def test_string_dtype():
    """String specs report a ``U<max_length>`` dtype."""
    for width in (16, 32):
        assert string(max_length=width).dtype == np.dtype(f"U{width}")
    # default max_length=32
    assert string().dtype == np.dtype("U32")


def test_bytes_dtype():
    """Bytes specs report an ``S<max_length>`` dtype."""
    assert b2_bytes(max_length=8).dtype == np.dtype("S8")
    assert b2_bytes().dtype == np.dtype("S32")  # default max_length=32


# -------------------------------------------------------------------
# python_type mapping
# -------------------------------------------------------------------


def test_python_types():
    """Each spec family reports the expected builtin ``python_type``."""
    for cls in [int8, int16, int32, int64, uint8, uint16, uint32, uint64]:
        assert cls().python_type is int
    for cls in [float32, float64]:
        assert cls().python_type is float
    for cls in [complex64, complex128]:
        assert cls().python_type is complex
    assert b2_bool().python_type is bool
    assert string().python_type is str
    assert b2_bytes().python_type is bytes


# -------------------------------------------------------------------
# constraint storage
# -------------------------------------------------------------------


def test_int64_constraints():
    """Given bounds are stored; bounds not given read back as ``None``."""
    s = int64(ge=0, lt=100)
    assert s.ge == 0
    assert s.gt is None
    assert s.le is None
    assert s.lt == 100


def test_float64_constraints():
    """Given bounds are stored; bounds not given read back as ``None``."""
    s = float64(gt=0.0, le=1.0)
    assert s.gt == 0.0
    assert s.le == 1.0
    assert s.ge is None
    assert s.lt is None


def test_string_constraints():
    """Length bounds and the pattern are stored as passed."""
    s = string(min_length=2, max_length=10, pattern=r"^\w+$")
    assert s.min_length == 2
    assert s.max_length == 10
    assert s.pattern == r"^\w+$"


def test_bytes_constraints():
    """Length bounds are stored as passed."""
    s = b2_bytes(min_length=1, max_length=8)
    assert s.min_length == 1
    assert s.max_length == 8


# -------------------------------------------------------------------
# to_pydantic_kwargs
# -------------------------------------------------------------------


def test_int64_pydantic_kwargs_partial():
    """Only non-None constraints appear in pydantic kwargs."""
    assert int64(ge=0).to_pydantic_kwargs() == {"ge": 0}
    assert int64(ge=0, le=100).to_pydantic_kwargs() == {"ge": 0, "le": 100}
    assert int64().to_pydantic_kwargs() == {}


def test_float64_pydantic_kwargs():
    """Float bounds are passed through under their own names."""
    assert float64(gt=0.0, lt=1.0).to_pydantic_kwargs() == {"gt": 0.0, "lt": 1.0}


def test_bool_pydantic_kwargs():
    """A bool spec contributes no pydantic kwargs."""
    assert b2_bool().to_pydantic_kwargs() == {}


def test_string_pydantic_kwargs():
    """String length bounds appear in the pydantic kwargs."""
    s = string(min_length=1, max_length=5)
    kw = s.to_pydantic_kwargs()
    assert kw["min_length"] == 1
    assert kw["max_length"] == 5


# -------------------------------------------------------------------
# to_metadata_dict
# -------------------------------------------------------------------


def test_int64_metadata_dict():
    """Metadata carries the kind and only the constraints that were set."""
    d = int64(ge=0, le=100).to_metadata_dict()
    assert d["kind"] == "int64"
    assert d["ge"] == 0
    assert d["le"] == 100
    assert "gt" not in d
    assert "lt" not in d


def test_float64_metadata_dict():
    """An unconstrained spec serialises to a single ``kind`` entry."""
    d = float64().to_metadata_dict()
    assert d["kind"] == "float64"
    assert len(d) == 1  # no constraints


def test_bool_metadata_dict():
    """The bool spec serialises to its kind only."""
    assert b2_bool().to_metadata_dict() == {"kind": "bool"}


def test_string_metadata_dict():
    """String metadata carries the kind and ``max_length``."""
    d = string(max_length=9).to_metadata_dict()
    assert d["kind"] == "string"
    assert d["max_length"] == 9


def test_complex128_metadata_dict():
    """The complex128 spec serialises to its kind only."""
    assert complex128().to_metadata_dict() == {"kind": "complex128"}


# -------------------------------------------------------------------
# All specs are SchemaSpec subclasses
# -------------------------------------------------------------------


def test_all_are_schema_spec():
    """Every imported spec class derives from ``SchemaSpec``."""
    all_specs = [
        int8,
        int16,
        int32,
        int64,
        uint8,
        uint16,
        uint32,
        uint64,
        float32,
        float64,
        b2_bool,
        complex64,
        complex128,
        string,
        b2_bytes,
    ]
    for cls in all_specs:
        assert issubclass(cls, SchemaSpec)


# -------------------------------------------------------------------
# New integer / float metadata dicts
# -------------------------------------------------------------------


def test_int8_metadata_dict():
    """int8 metadata carries the kind and the bounds that were set."""
    d = int8(ge=0, lt=128).to_metadata_dict()
    assert d["kind"] == "int8"
    assert d["ge"] == 0
    assert d["lt"] == 128


def test_uint8_metadata_dict():
    """uint8 metadata carries the kind and the bound that was set."""
    d = uint8(le=200).to_metadata_dict()
    assert d["kind"] == "uint8"
    assert d["le"] == 200


def test_float32_metadata_dict():
    """float32 metadata carries the kind and the bounds that were set."""
    d = float32(ge=0.0, le=1.0).to_metadata_dict()
    assert d["kind"] == "float32"
    assert d["ge"] == 0.0
    assert d["le"] == 1.0


def test_new_kinds_roundtrip():
    """Every new kind serialises and deserialises correctly."""
    from dataclasses import dataclass

    from blosc2.schema_compiler import compile_schema, schema_from_dict, schema_to_dict

    @dataclass
    class R:
        a: int = blosc2.field(blosc2.int8(ge=0))
        b: int = blosc2.field(blosc2.uint16(), default=0)
        c: float = blosc2.field(blosc2.float32(ge=0.0, le=1.0), default=0.0)

    # compile → dict → schema again; only the restored kinds are checked below
    schema = compile_schema(R)
    d = schema_to_dict(schema)
    restored = schema_from_dict(d)

    assert restored.columns_by_name["a"].spec.to_metadata_dict()["kind"] == "int8"
    assert restored.columns_by_name["b"].spec.to_metadata_dict()["kind"] == "uint16"
    assert restored.columns_by_name["c"].spec.to_metadata_dict()["kind"] == "float32"


# -------------------------------------------------------------------
# blosc2 namespace exports
# -------------------------------------------------------------------


def test_blosc2_namespace():
    """All spec classes are reachable via the blosc2 namespace."""
    # NOTE(review): b2_bytes is imported above but not asserted here — confirm
    # whether blosc2.bytes is meant to be exported.
    assert blosc2.int8 is int8
    assert blosc2.int16 is int16
    assert blosc2.int32 is int32
    assert blosc2.int64 is int64
    assert blosc2.uint8 is uint8
    assert blosc2.uint16 is uint16
    assert blosc2.uint32 is uint32
    assert blosc2.uint64 is uint64
    assert blosc2.float32 is float32
    assert blosc2.float64 is float64
    assert blosc2.bool is b2_bool
    assert blosc2.complex64 is complex64
    assert blosc2.complex128 is complex128
    assert blosc2.string is string


# -------------------------------------------------------------------
# String vectorized validation — np.char.str_len path
# -------------------------------------------------------------------


def test_string_validation_vectorized():
    """max_length / min_length use the np.char.str_len path, not np.vectorize."""
    from dataclasses import dataclass

    from blosc2 import CTable

    @dataclass
    class Row:
        name: str = blosc2.field(blosc2.string(min_length=2, max_length=5))

    t = CTable(Row, expected_size=10)
    t.extend([("hi",), ("hello",)])  # 2 and 5 chars — both valid
    assert len(t) == 2

    with pytest.raises(ValueError, match="max_length"):
        t.extend([("toolong",)])  # 7 chars > 5

    with pytest.raises(ValueError, match="min_length"):
        t.extend([("x",)])  # 1 char < 2


def test_string_validation_numpy_array():
    """Vectorized length check catches violations when the array dtype is wider
    than the schema's max_length (e.g. dtype U8 with max_length=4)."""
    from dataclasses import dataclass

    from blosc2 import CTable

    # Schema says max 4 chars, but the numpy dtype is U8 (wider).
    # Strings of 5+ chars survive in the array and are caught by validation.
    @dataclass
    class Row:
        tag: str = blosc2.field(blosc2.string(max_length=4))

    dtype = np.dtype([("tag", "U8")])
    good = np.array([("ab",), ("cd",)], dtype=dtype)
    bad = np.array([("ab",), ("toolong",)], dtype=dtype)  # 7 chars > 4

    t = CTable(Row, expected_size=5)
    t.extend(good)
    assert len(t) == 2

    t2 = CTable(Row, expected_size=5)
    with pytest.raises(ValueError, match="max_length"):
        t2.extend(bad)

    # Note: when the array dtype matches the schema max_length (e.g. U4 with
    # max_length=4), NumPy already truncates values to fit the dtype before
    # validation runs — so no violation can be detected in that case.
# diff --git a/tests/ctable/test_schema_validation.py b/tests/ctable/test_schema_validation.py
# new file mode 100644
# index 00000000..2d51d29f
# --- /dev/null
# +++ b/tests/ctable/test_schema_validation.py
# @@ -0,0 +1,164 @@
#######################################################################
# Copyright (c) 2019-present, Blosc Development Team
# All rights reserved.
#
# SPDX-License-Identifier: BSD-3-Clause
#######################################################################

from dataclasses import dataclass

import numpy as np
import pytest

import blosc2
from blosc2 import CTable


# Row schema shared by the append()/extend() tests below.
@dataclass
class Row:
    id: int = blosc2.field(blosc2.int64(ge=0))  # must be >= 0
    score: float = blosc2.field(blosc2.float64(ge=0, le=100), default=0.0)  # within [0, 100]
    active: bool = blosc2.field(blosc2.bool(), default=True)


# -------------------------------------------------------------------
# append() validation
# -------------------------------------------------------------------


def test_append_valid_row():
    """Rows within constraints are accepted."""
    t = CTable(Row, expected_size=5)
    t.append((0, 0.0, True))
    t.append((1, 100.0, False))
    t.append((99, 50.5, True))
    assert len(t) == 3


def test_append_ge_violation():
    """id < 0 raises ValueError (ge=0)."""
    t = CTable(Row, expected_size=5)
    with pytest.raises(ValueError):
        t.append((-1, 50.0, True))


def test_append_le_violation():
    """score > 100 raises ValueError (le=100)."""
    t = CTable(Row, expected_size=5)
    with pytest.raises(ValueError):
        t.append((1, 100.1, True))


def test_append_boundary_values():
    """Exact boundary values (ge=0 and le=100) are accepted."""
    t = CTable(Row, expected_size=5)
    t.append((0, 0.0, True))  # id=0 (ge boundary), score=0.0 (ge boundary)
    t.append((1, 100.0, False))  # score=100.0 (le boundary)
    assert len(t) == 2


def test_append_default_fill():
    """Fields with defaults can be omitted from a tuple — Pydantic fills them in."""
    t = CTable(Row, expected_size=5)
    # Only id is required; score and active have defaults
    t.append((5,))  # score=0.0, active=True filled by defaults
    assert len(t) == 1
    assert t.row[0].id[0] == 5


def test_append_validate_false():
    """validate=False skips constraint checks — invalid data is stored silently."""
    t = CTable(Row, expected_size=5, validate=False)
    t.append((-5, 200.0, True))  # would fail with validate=True
    assert len(t) == 1
    assert int(t._cols["id"][0]) == -5


# -------------------------------------------------------------------
# extend() validation (vectorized)
# -------------------------------------------------------------------


def test_extend_valid_rows():
    """Bulk insert within constraints succeeds."""
    t = CTable(Row, expected_size=10)
    data = [(i, float(i), True) for i in range(10)]
    t.extend(data)
    assert len(t) == 10


def test_extend_ge_violation():
    """A negative id anywhere in the batch raises ValueError."""
    t = CTable(Row, expected_size=10)
    data = [(i, float(i), True) for i in range(5)] + [(-1, 50.0, False)]
    with pytest.raises(ValueError, match="ge=0"):
        t.extend(data)


def test_extend_le_violation():
    """A score > 100 anywhere in the batch raises ValueError."""
    t = CTable(Row, expected_size=10)
    data = [(i, float(i), True) for i in range(5)] + [(5, 101.0, False)]
    with pytest.raises(ValueError, match="le=100"):
        t.extend(data)


def test_extend_validate_false():
    """validate=False on the table skips bulk constraint checks."""
    t = CTable(Row, expected_size=10, validate=False)
    data = [(-1, 200.0, True), (-2, 300.0, False)]
    t.extend(data)  # no error
    assert len(t) == 2


def test_extend_numpy_structured_array():
    """Constraint enforcement also works when extending with a structured NumPy array."""
    dtype = np.dtype([("id", np.int64), ("score", np.float64), ("active", np.bool_)])
    good = np.array([(1, 50.0, True), (2, 75.0, False)], dtype=dtype)
    bad = np.array([(1, 50.0, True), (2, 150.0, False)], dtype=dtype)  # score > 100

    t = CTable(Row, expected_size=5)
    t.extend(good)
    assert len(t) == 2

    t2 = CTable(Row, expected_size=5)
    with pytest.raises(ValueError, match="le=100"):
        t2.extend(bad)


# -------------------------------------------------------------------
# gt / lt constraints
# -------------------------------------------------------------------


# Single-column schema with exclusive bounds: 0 < x < 10.
@dataclass
class Strict:
    x: int = blosc2.field(blosc2.int64(gt=0, lt=10))


def test_gt_lt_append():
    """gt and lt are exclusive bounds."""
    t = CTable(Strict, expected_size=5)

    t.append((5,))  # valid
    with pytest.raises(ValueError):
        t.append((0,))  # violates gt=0
    with pytest.raises(ValueError):
        t.append((10,))  # violates lt=10


def test_gt_lt_extend():
    """Vectorized gt/lt checks work on batches."""
    t = CTable(Strict, expected_size=10)
    t.extend([(i,) for i in range(1, 10)])  # 1..9 all valid
    assert len(t) == 9

    t2 = CTable(Strict, expected_size=5)
    with pytest.raises(ValueError, match="gt=0"):
        t2.extend([(0,)])
    with pytest.raises(ValueError, match="lt=10"):
        t2.extend([(10,)])


if __name__ == "__main__":
    pytest.main(["-v", __file__])
# diff --git a/tests/ctable/test_select_describe_cov.py b/tests/ctable/test_select_describe_cov.py
# new file mode 100644
# index 00000000..5e51d6d3
# --- /dev/null
# +++ b/tests/ctable/test_select_describe_cov.py
# @@ -0,0 +1,276 @@
#######################################################################
# Copyright (c) 2019-present, Blosc Development Team
# All rights reserved.
#
# SPDX-License-Identifier: BSD-3-Clause
#######################################################################

"""Tests for select(), describe(), and cov()."""

from dataclasses import dataclass

import numpy as np
import pytest

import blosc2
from blosc2 import CTable


# Four-column schema: two numeric columns, one bool and one string.
@dataclass
class Row:
    id: int = blosc2.field(blosc2.int64(ge=0))
    score: float = blosc2.field(blosc2.float64(ge=0, le=100), default=0.0)
    active: bool = blosc2.field(blosc2.bool(), default=True)
    label: str = blosc2.field(blosc2.string(max_length=16), default="")


# Ten rows: id 0..9, score = id*10 mod 100, active on even ids, label "r<id>".
DATA10 = [(i, float(i * 10 % 100), i % 2 == 0, f"r{i}") for i in range(10)]


# ===========================================================================
# select()
# ===========================================================================


def test_select_returns_subset_of_columns():
    """select() exposes only the requested columns."""
    t = CTable(Row, new_data=DATA10)
    v = t.select(["id", "score"])
    assert v.col_names == ["id", "score"]
    assert v.ncols == 2


def test_select_preserves_caller_order():
    """Columns appear in the order the caller listed them."""
    t = CTable(Row, new_data=DATA10)
    v = t.select(["score", "id"])
    assert v.col_names == ["score", "id"]


def test_select_shares_data_no_copy():
    """The selection references the parent's column objects."""
    t = CTable(Row, new_data=DATA10)
    v = t.select(["id", "score"])
    # Same NDArray objects — no copy
    assert v._cols["id"] is t._cols["id"]
    assert v._cols["score"] is t._cols["score"]


def test_select_row_count_unchanged():
    """Selecting columns does not change the number of rows."""
    t = CTable(Row, new_data=DATA10)
    v = t.select(["id", "score"])
    assert len(v) == 10


def test_select_data_correct():
    """Selected columns hold the same values as in the parent."""
    t = CTable(Row, new_data=DATA10)
    v = t.select(["id", "score"])
    np.testing.assert_array_equal(v["id"].to_numpy(), t["id"].to_numpy())
    np.testing.assert_array_equal(v["score"].to_numpy(), t["score"].to_numpy())


def test_select_base_is_parent():
    """``base`` of the selection is the table it was taken from."""
    t = CTable(Row, new_data=DATA10)
    v = t.select(["id"])
    assert v.base is t


def test_select_combined_with_where():
    """select() can be chained with where()."""
    t = CTable(Row, new_data=DATA10)
    v = t.select(["id", "score"]).where(t["id"] > 4)
    assert len(v) == 5
    assert v.col_names == ["id", "score"]


def test_select_combined_with_deletions():
    """Rows deleted from the parent are absent from the selection."""
    t = CTable(Row, new_data=DATA10)
    t.delete([0, 1])
    v = t.select(["id", "score"])
    assert len(v) == 8
    np.testing.assert_array_equal(v["id"].to_numpy(), t["id"].to_numpy())


def test_select_schema_updated():
    """The selection's schema lists only the selected columns."""
    t = CTable(Row, new_data=DATA10)
    v = t.select(["id", "score"])
    assert list(v.schema.columns_by_name.keys()) == ["id", "score"]
    assert "active" not in v.schema.columns_by_name
    assert "label" not in v.schema.columns_by_name


def test_select_blocks_structural_mutations():
    """append() on a selection raises ``TypeError``."""
    t = CTable(Row, new_data=DATA10)
    v = t.select(["id", "score"])
    with pytest.raises(TypeError):
        v.append((99, 50.0, True, "x"))


def test_select_empty_raises():
    """An empty column list raises ``ValueError``."""
    t = CTable(Row, new_data=DATA10)
    with pytest.raises(ValueError, match="at least one"):
        t.select([])


def test_select_unknown_column_raises():
    """An unknown column name raises ``KeyError``."""
    t = CTable(Row, new_data=DATA10)
    with pytest.raises(KeyError):
        t.select(["id", "nonexistent"])


def test_select_single_column():
    """Selecting a single column works and keeps every row."""
    t = CTable(Row, new_data=DATA10)
    v = t.select(["score"])
    assert v.col_names == ["score"]
    assert len(v) == 10


# ===========================================================================
# describe()
# ===========================================================================


def test_describe_runs_without_error(capsys):
    """describe() prints output that mentions every column name."""
    t = CTable(Row, new_data=DATA10)
    t.describe()
    out = capsys.readouterr().out
    assert "id" in out
    assert "score" in out
    assert "active" in out
    assert "label" in out


def test_describe_shows_row_count(capsys):
    """The printed output contains the row count."""
    t = CTable(Row, new_data=DATA10)
    t.describe()
    out = capsys.readouterr().out
    assert "10" in out


def test_describe_numeric_stats(capsys):
    """The output contains the mean/std/min/max labels."""
    t = CTable(Row, new_data=DATA10)
    t.describe()
    out = capsys.readouterr().out
    assert "mean" in out
    assert "std" in out
    assert "min" in out
    assert "max" in out


def test_describe_bool_stats(capsys):
    """The output contains the true/false labels."""
    t = CTable(Row, new_data=DATA10)
    t.describe()
    out = capsys.readouterr().out
    assert "true" in out
    assert "false" in out


def test_describe_string_stats(capsys):
    """The output contains the ``unique`` label."""
    t = CTable(Row, new_data=DATA10)
    t.describe()
    out = capsys.readouterr().out
    assert "unique" in out


def test_describe_empty_table(capsys):
    """An empty table is reported as having 0 rows and as empty."""
    t = CTable(Row)
    t.describe()
    out = capsys.readouterr().out
    assert "0 rows" in out
    assert "empty" in out


def test_describe_on_select(capsys):
    """describe() on a selection omits the unselected columns."""
    t = CTable(Row, new_data=DATA10)
    t.select(["id", "score"]).describe()
    out = capsys.readouterr().out
    assert "id" in out
    assert "score" in out
    assert "active" not in out


# ===========================================================================
# cov()
# ===========================================================================


def test_cov_shape():
    """Two columns give a 2x2 result."""
    t = CTable(Row, new_data=DATA10)
    c = t.select(["id", "score"]).cov()
    assert c.shape == (2, 2)


def test_cov_symmetric():
    """The result equals its own transpose."""
    t = CTable(Row, new_data=DATA10)
    c = t.select(["id", "score"]).cov()
    np.testing.assert_allclose(c, c.T)


def test_cov_diagonal_equals_variance():
    """Diagonal entries match ``np.var(..., ddof=1)`` of each column."""
    t = CTable(Row, new_data=DATA10)
    ids = t["id"].to_numpy().astype(np.float64)
    scores = t["score"].to_numpy().astype(np.float64)
    c = t.select(["id", "score"]).cov()
    assert c[0, 0] == pytest.approx(np.var(ids, ddof=1))
    assert c[1, 1] == pytest.approx(np.var(scores, ddof=1))


def test_cov_single_column_is_scalar():
    """One column gives a 1x1 result holding that column's variance."""
    t = CTable(Row, new_data=DATA10)
    c = t.select(["id"]).cov()
    assert c.shape == (1, 1)
    ids = t["id"].to_numpy().astype(np.float64)
    assert c[0, 0] == pytest.approx(np.var(ids, ddof=1))


def test_cov_bool_column_cast_to_int():
    """A bool column is accepted alongside a numeric one."""
    t = CTable(Row, new_data=DATA10)
    # bool is treated as 0/1 int — should not raise
    c = t.select(["id", "active"]).cov()
    assert c.shape == (2, 2)


def test_cov_skips_deleted_rows():
    """The variance is computed over the remaining rows only."""
    t = CTable(Row, new_data=DATA10)
    t.delete([0])  # remove id=0
    ids = t["id"].to_numpy().astype(np.float64)
    c = t.select(["id"]).cov()
    assert c[0, 0] == pytest.approx(np.var(ids, ddof=1))


def test_cov_string_column_raises():
    """cov() over a table containing a string column raises ``TypeError``."""
    t = CTable(Row, new_data=DATA10)
    with pytest.raises(TypeError, match="not supported"):
        t.cov()  # 'label' is a string column


def test_cov_complex_column_raises():
    """cov() over a complex column raises ``TypeError``."""

    @dataclass
    class CRow:
        val: complex = blosc2.field(blosc2.complex128())

    t = CTable(CRow, new_data=[(1 + 2j,), (3 + 4j,)])
    with pytest.raises(TypeError, match="not supported"):
        t.cov()


def test_cov_too_few_rows_raises():
    """A single-row table raises ``ValueError``."""
    t = CTable(Row, new_data=[(0, 0.0, True, "a")])
    with pytest.raises(ValueError, match="2 live rows"):
        t.select(["id", "score"]).cov()


def test_cov_after_all_deleted_raises():
    """cov() raises ``ValueError`` once every row has been deleted."""
    t = CTable(Row, new_data=DATA10)
    t.delete(list(range(10)))
    with pytest.raises(ValueError):
        t.select(["id", "score"]).cov()


def test_cov_three_columns():
    """Three columns give a symmetric 3x3 result."""
    # identity-ish: if columns are linearly independent, diagonal dominates
    data = [(i, float(i), i % 2 == 0, "") for i in range(20)]
    t = CTable(Row, new_data=data)
    c = t.select(["id", "score", "active"]).cov()
    assert c.shape == (3, 3)
    np.testing.assert_allclose(c, c.T, atol=1e-10)


if __name__ == "__main__":
    pytest.main(["-v", __file__])
# diff --git a/tests/ctable/test_sort_by.py b/tests/ctable/test_sort_by.py
# new file mode 100644
# index 00000000..6363b2a8
# --- /dev/null
# +++ b/tests/ctable/test_sort_by.py
# @@ -0,0 +1,272 @@
#######################################################################
# Copyright (c) 2019-present, Blosc Development Team
# All rights reserved.
#
# SPDX-License-Identifier: BSD-3-Clause
#######################################################################

"""Tests for CTable.sort_by()."""

from dataclasses import dataclass

import numpy as np
import pytest

import blosc2
from blosc2 import CTable


# Numeric/bool schema used by most tests in this module.
@dataclass
class Row:
    id: int = blosc2.field(blosc2.int64(ge=0))
    score: float = blosc2.field(blosc2.float64(ge=0, le=100), default=0.0)
    active: bool = blosc2.field(blosc2.bool(), default=True)


# Schema with a string key, used by the string-sort tests.
@dataclass
class StrRow:
    label: str = blosc2.field(blosc2.string(max_length=16))
    rank: int = blosc2.field(blosc2.int64(ge=0), default=0)


# Five unsorted rows; ids 1 and 2 share score 50.0 so tie-breaking is observable.
DATA = [
    (3, 80.0, True),
    (1, 50.0, False),
    (4, 90.0, True),
    (2, 50.0, True),
    (0, 70.0, False),
]


# ===========================================================================
# Single-column sort
# ===========================================================================


def test_sort_single_col_ascending():
    """Sorting by ``id`` yields ids in increasing order."""
    t = CTable(Row, new_data=DATA)
    s = t.sort_by("id")
    np.testing.assert_array_equal(s["id"].to_numpy(), [0, 1, 2, 3, 4])


def test_sort_single_col_descending():
    """``ascending=False`` yields scores in decreasing order."""
    t = CTable(Row, new_data=DATA)
    s = t.sort_by("score", ascending=False)
    np.testing.assert_array_equal(s["score"].to_numpy(), [90.0, 80.0, 70.0, 50.0, 50.0])


def test_sort_bool_column():
    """Sorting by a bool column places ``False`` rows first."""
    t = CTable(Row, new_data=DATA)
    s = t.sort_by("active")
    # False < True → False rows first
    assert list(s["active"].to_numpy()) == [False, False, True, True, True]


def test_sort_string_column():
    """Sorting by a string column yields alphabetical order for these labels."""
    t = CTable(StrRow, new_data=[("charlie", 3), ("alice", 1), ("dave", 4), ("bob", 2)])
    s = t.sort_by("label")
    assert list(s["label"].to_numpy()) == ["alice", "bob", "charlie", "dave"]


def test_sort_string_column_descending():
    """Descending string sort yields reverse alphabetical order for these labels."""
    t = CTable(StrRow, new_data=[("charlie", 3), ("alice", 1), ("dave", 4), ("bob", 2)])
    s = t.sort_by("label", ascending=False)
    assert list(s["label"].to_numpy()) == ["dave", "charlie", "bob", "alice"]


# ===========================================================================
# Multi-column sort
# ===========================================================================


def test_sort_multi_col_both_asc():
    """With both keys ascending, the score tie is broken by increasing id."""
    t = CTable(Row, new_data=DATA)
    s = t.sort_by(["score", "id"], ascending=[True, True])
    scores = s["score"].to_numpy()
    ids = s["id"].to_numpy()
    # score asc; tiebreak: id asc (both 50.0 rows → id 1 before id 2)
    assert scores[0] == pytest.approx(50.0)
    assert ids[0] == 1
    assert scores[1] == pytest.approx(50.0)
    assert ids[1] == 2


def test_sort_multi_col_mixed():
    """With the second key descending, the score tie is broken by decreasing id."""
    t = CTable(Row, new_data=DATA)
    s = t.sort_by(["score", "id"], ascending=[True, False])
    scores = s["score"].to_numpy()
    ids = s["id"].to_numpy()
    # score asc; tiebreak: id desc (both 50.0 rows → id 2 before id 1)
    assert scores[0] == pytest.approx(50.0)
    assert ids[0] == 2
    assert scores[1] == pytest.approx(50.0)
    assert ids[1] == 1


def test_sort_multi_col_ascending_list_notation():
    """Passing ascending=True (single bool) applies to all keys."""
    t = CTable(Row, new_data=DATA)
    s = t.sort_by(["score", "id"], ascending=True)
    np.testing.assert_array_equal(s["id"].to_numpy()[:2], [1, 2])


# ===========================================================================
# Non-destructive: original table is unchanged
# ===========================================================================


def test_sort_does_not_modify_original():
    """Without ``inplace`` the source table keeps its original order."""
    t = CTable(Row, new_data=DATA)
    original_ids = t["id"].to_numpy().copy()
    _ = t.sort_by("id")
    np.testing.assert_array_equal(t["id"].to_numpy(), original_ids)


def test_sort_returns_new_table():
    """Without ``inplace`` the result is a different object."""
    t = CTable(Row, new_data=DATA)
    s = t.sort_by("id")
    assert s is not t


# ===========================================================================
# inplace=True
# ===========================================================================


def test_sort_inplace_returns_self():
    """``inplace=True`` returns the same table object."""
    t = CTable(Row, new_data=DATA)
    result = t.sort_by("id", inplace=True)
    assert result is t


def test_sort_inplace_modifies_table():
    """``inplace=True`` reorders the table itself."""
    t = CTable(Row, new_data=DATA)
    t.sort_by("id", inplace=True)
    np.testing.assert_array_equal(t["id"].to_numpy(), [0, 1, 2, 3, 4])


def test_sort_inplace_descending():
    """In-place descending sort puts the largest score first, the smallest last."""
    t = CTable(Row, new_data=DATA)
    t.sort_by("score", ascending=False, inplace=True)
    assert t["score"][0] == pytest.approx(90.0)
    assert t["score"][-1] == pytest.approx(50.0)


# ===========================================================================
# Interaction with deletions
# ===========================================================================


def test_sort_skips_deleted_rows():
    """Deleted rows do not appear in the sorted result."""
    t = CTable(Row, new_data=DATA)
    t.delete([0])  # delete id=3 (first row)
    s = t.sort_by("id")
    np.testing.assert_array_equal(s["id"].to_numpy(), [0, 1, 2, 4])
    assert len(s) == 4


def test_sort_inplace_skips_deleted_rows():
    """Deleted rows do not reappear after an in-place sort."""
    t = CTable(Row, new_data=DATA)
    t.delete([0, 2])  # delete id=3 and id=4
    t.sort_by("id", inplace=True)
    np.testing.assert_array_equal(t["id"].to_numpy(), [0, 1, 2])
    assert len(t) == 3


def test_sort_all_columns_consistent():
    """All columns move together when sorted."""
    t = CTable(Row, new_data=DATA)
    s = t.sort_by("id")
    ids = s["id"].to_numpy()
    scores = s["score"].to_numpy()
    # Original DATA: id→score mapping: 0→70, 1→50, 2→50, 3→80, 4→90
    expected = {0: 70.0, 1: 50.0, 2: 50.0, 3: 80.0, 4: 90.0}
    for i, v in zip(ids, scores, strict=True):
        assert v == pytest.approx(expected[int(i)])


# ===========================================================================
# Edge cases
# ===========================================================================


def test_sort_empty_table():
    """Sorting an empty table gives an empty result."""
    t = CTable(Row)
    s = t.sort_by("id")
    assert len(s) == 0


def test_sort_single_row():
    """Sorting a one-row table keeps that row."""
    t = CTable(Row, new_data=[(7, 42.0, True)])
    s = t.sort_by("id")
    assert s["id"][0] == 7


def test_sort_already_sorted():
    """Already-sorted input comes back in the same order."""
    data = [(i, float(i * 10), True) for i in range(5)]
    t = CTable(Row, new_data=data)
    s = t.sort_by("id")
    np.testing.assert_array_equal(s["id"].to_numpy(), list(range(5)))


def test_sort_reverse_sorted():
    """Reverse-sorted input comes back in increasing order."""
    data = [(i, float(i * 10), True) for i in range(5, 0, -1)]
    t = CTable(Row, new_data=data)
    s = t.sort_by("id")
    np.testing.assert_array_equal(s["id"].to_numpy(), [1, 2, 3, 4, 5])


# ===========================================================================
# Error cases
# ===========================================================================


def test_sort_view_raises():
    """sort_by() on a view obtained through ``where`` raises ``ValueError``."""
    t = CTable(Row, new_data=DATA)
    view = t.where(t["id"] > 2)
    with pytest.raises(ValueError, match="view"):
        view.sort_by("id")


def test_sort_unknown_column_raises():
    """An unknown sort key raises ``KeyError``."""
    t = CTable(Row, new_data=DATA)
    with pytest.raises(KeyError):
        t.sort_by("nonexistent")


def test_sort_complex_column_raises():
    """Sorting by a complex column raises ``TypeError``."""

    @dataclass
    class CRow:
        val: complex = blosc2.field(blosc2.complex128())

    t = CTable(CRow, new_data=[(1 + 2j,), (3 + 4j,)])
    with pytest.raises(TypeError, match="complex"):
        t.sort_by("val")


def test_sort_ascending_length_mismatch_raises():
    """An ``ascending`` list shorter than the key list raises ``ValueError``."""
    t = CTable(Row, new_data=DATA)
    with pytest.raises(ValueError, match="ascending"):
        t.sort_by(["id", "score"], ascending=[True])


def test_sort_readonly_inplace_raises():
    """In-place sort on a table opened with ``mode="r"`` raises ``ValueError``."""
    import os
    import pathlib
    import shutil

    path_obj = pathlib.Path(__file__).parent / "saved_ctable" / "_sort_ro_test.b2d"
    path = str(path_obj)
    os.makedirs(path_obj.parent, exist_ok=True)
    try:
        t = CTable(Row, urlpath=path, mode="w", new_data=DATA)
        t.close()
        t_ro = CTable.open(path, mode="r")
        with pytest.raises(ValueError, match="read-only"):
            t_ro.sort_by("id", inplace=True)
    finally:
        # remove the on-disk table whether or not the assertion passed
        shutil.rmtree(path, ignore_errors=True)


if __name__ == "__main__":
    pytest.main(["-v", __file__])
# diff --git a/tests/ctable/test_table_persistency.py b/tests/ctable/test_table_persistency.py
# new file mode 100644
# index
# 00000000..3e9045db
# --- /dev/null
# +++ b/tests/ctable/test_table_persistency.py
# @@ -0,0 +1,521 @@
#######################################################################
# Copyright (c) 2019-present, Blosc Development Team
# All rights reserved.
#
# SPDX-License-Identifier: BSD-3-Clause
#######################################################################

"""Persistency tests for CTable: create → close → reopen round-trips."""

import json
import os
import pathlib
import shutil
from dataclasses import dataclass

import pytest

import blosc2
from blosc2 import CTable

# ---------------------------------------------------------------------------
# Helpers
# ---------------------------------------------------------------------------


# Row schema used by the persistent tables in this module.
@dataclass
class Row:
    id: int = blosc2.field(blosc2.int64(ge=0))
    score: float = blosc2.field(blosc2.float64(ge=0, le=100), default=0.0)
    active: bool = blosc2.field(blosc2.bool(), default=True)


# Directory (next to this test file) under which every test table is created.
TABLE_ROOT = str(pathlib.Path(__file__).parent / "saved_ctable" / "test_tables")


@pytest.fixture(autouse=True)
def clean_table_dir():
    """Remove test directory before each test and clean up after."""
    if os.path.exists(TABLE_ROOT):
        shutil.rmtree(TABLE_ROOT)
    os.makedirs(TABLE_ROOT, exist_ok=True)
    yield
    if os.path.exists(TABLE_ROOT):
        shutil.rmtree(TABLE_ROOT)


def table_path(name: str) -> str:
    """Return the path of the table called *name* under ``TABLE_ROOT``."""
    return os.path.join(TABLE_ROOT, name)


# ---------------------------------------------------------------------------
# Layout: disk structure
# ---------------------------------------------------------------------------


def test_create_layout_files_exist():
    """Creating a persistent CTable writes the expected files."""
    path = table_path("people")
    t = CTable(Row, urlpath=path, mode="w", expected_size=16)
    t.append((1, 50.0, True))

    assert os.path.exists(os.path.join(path, "_meta.b2f"))
    assert os.path.exists(os.path.join(path, "_valid_rows.b2nd"))
    assert os.path.exists(os.path.join(path, "_cols", "id.b2nd"))
    assert os.path.exists(os.path.join(path, "_cols", "score.b2nd"))
    assert os.path.exists(os.path.join(path, "_cols", "active.b2nd"))


def test_schema_saved_in_meta_vlmeta():
    """Schema JSON and kind marker are present in _meta.b2f."""
    path = table_path("people")
    CTable(Row, urlpath=path, mode="w", expected_size=16)

    meta = blosc2.open(os.path.join(path, "_meta.b2f"), mode="r")
    assert meta.vlmeta["kind"] == "ctable"
    assert meta.vlmeta["version"] == 1
    schema = json.loads(meta.vlmeta["schema"])
    assert schema["version"] == 1
    col_names = [c["name"] for c in schema["columns"]]
    assert col_names == ["id", "score", "active"]


# ---------------------------------------------------------------------------
# Round-trip: data survives reopen
# ---------------------------------------------------------------------------


def test_reopen_with_ctable_constructor():
    """Data written before close is readable after reopening via CTable(...)."""
    path = table_path("rt")
    t = CTable(Row, urlpath=path, mode="w", expected_size=16)
    t.extend([(1, 10.0, True), (2, 20.0, False), (3, 30.0, True)])

    t2 = CTable(Row, urlpath=path, mode="a")
    assert len(t2) == 3
    assert list(t2["id"].to_numpy()) == [1, 2, 3]
    assert list(t2["score"].to_numpy()) == [10.0, 20.0, 30.0]


def test_reopen_with_open_classmethod():
    """CTable.open() returns a read-only table with correct data."""
    path = table_path("ro")
    t = CTable(Row, urlpath=path, mode="w", expected_size=16)
    t.extend([(10, 50.0, True), (20, 60.0, False)])

    t2 = CTable.open(path)
    assert len(t2) == 2
    assert list(t2["id"].to_numpy()) == [10, 20]


def test_column_order_preserved_after_reopen():
    """Column order from the schema JSON is respected on reopen."""
    path = table_path("order")

    # Field names are declared out of alphabetical order on purpose.
    @dataclass
    class MultiCol:
        z: int = blosc2.field(blosc2.int64())
        a: float = blosc2.field(blosc2.float64(), default=0.0)
        m: bool = blosc2.field(blosc2.bool(), default=True)

    t = CTable(MultiCol, urlpath=path, mode="w", expected_size=16)
    t2 = CTable(MultiCol, urlpath=path, mode="a")
    assert t2.col_names == ["z", "a", "m"]


def test_schema_constraints_preserved():
    """Reopening re-enables constraint validation from the stored schema."""
    path = table_path("constraints")
    t = CTable(Row, urlpath=path, mode="w", expected_size=16)
    t.append((1, 50.0, True))

    t2 = CTable(Row, urlpath=path, mode="a")
    with pytest.raises(ValueError):
        t2.append((-1, 50.0, True))  # id violates ge=0


# ---------------------------------------------------------------------------
# Append after reopen
# ---------------------------------------------------------------------------


def test_append_after_reopen():
    """Appending to a reopened table grows the row count correctly."""
    path = table_path("append")
    t = CTable(Row, urlpath=path, mode="w", expected_size=16)
    t.extend([(1, 10.0, True), (2, 20.0, False)])

    t2 = CTable(Row, urlpath=path, mode="a")
    t2.append((3, 30.0, True))
    assert len(t2) == 3
    assert t2.row[2].id[0] == 3

    # Verify it's visible in a third open
    t3 = CTable(Row, urlpath=path, mode="a")
    assert len(t3) == 3
    assert list(t3["id"].to_numpy()) == [1, 2, 3]


def test_extend_after_reopen():
    """extend() after reopen persists all new rows."""
    path = table_path("extend")
    t = CTable(Row, urlpath=path, mode="w", expected_size=64)
    t.extend([(i, float(i), True) for i in range(5)])

    t2 = CTable(Row, urlpath=path, mode="a")
    t2.extend([(i, float(i), i % 2 == 0) for i in range(5, 10)])
    assert len(t2) == 10

    t3 = CTable(Row, urlpath=path, mode="a")
    assert len(t3) == 10
    assert list(t3["id"].to_numpy()) == list(range(10))


# ---------------------------------------------------------------------------
# Delete after reopen
# ---------------------------------------------------------------------------


def test_delete_after_reopen():
"""Deletions after reopen are reflected in subsequent opens.""" + path = table_path("delete") + t = CTable(Row, urlpath=path, mode="w", expected_size=16) + t.extend([(1, 10.0, True), (2, 20.0, False), (3, 30.0, True)]) + + t2 = CTable(Row, urlpath=path, mode="a") + t2.delete(1) # remove row with id=2 + assert len(t2) == 2 + + t3 = CTable(Row, urlpath=path, mode="a") + assert len(t3) == 2 + assert list(t3["id"].to_numpy()) == [1, 3] + + +# --------------------------------------------------------------------------- +# valid_rows persistence +# --------------------------------------------------------------------------- + + +def test_valid_rows_persisted(): + """The tombstone mask (_valid_rows) is correctly stored and loaded.""" + path = table_path("vr") + t = CTable(Row, urlpath=path, mode="w", expected_size=16) + t.extend([(1, 10.0, True), (2, 20.0, False), (3, 30.0, True)]) + t.delete(1) # mark row 1 (id=2) as invalid + + # _valid_rows on disk: slots 0 and 2 are True, slot 1 is False + vr = blosc2.open(os.path.join(path, "_valid_rows.b2nd"), mode="r") + raw = vr[:3] + assert raw[0] + assert not raw[1] + assert raw[2] + + +# --------------------------------------------------------------------------- +# mode="w" overwrites existing table +# --------------------------------------------------------------------------- + + +def test_mode_w_overwrites_existing(): + """mode='w' on an existing path creates a fresh table.""" + path = table_path("overwrite") + t = CTable(Row, urlpath=path, mode="w", expected_size=16) + t.extend([(1, 10.0, True), (2, 20.0, False)]) + + t2 = CTable(Row, urlpath=path, mode="w", expected_size=16) + assert len(t2) == 0 + + t3 = CTable(Row, urlpath=path, mode="a") + assert len(t3) == 0 + + +# --------------------------------------------------------------------------- +# Read-only mode +# --------------------------------------------------------------------------- + + +def test_read_only_mode_rejects_append(): + path = table_path("ro_append") + 
CTable(Row, urlpath=path, mode="w", expected_size=16) + + t = CTable.open(path, mode="r") + with pytest.raises(ValueError, match="read-only"): + t.append((1, 50.0, True)) + + +def test_read_only_mode_rejects_extend(): + path = table_path("ro_extend") + CTable(Row, urlpath=path, mode="w", expected_size=16) + + t = CTable.open(path, mode="r") + with pytest.raises(ValueError, match="read-only"): + t.extend([(1, 50.0, True)]) + + +def test_read_only_mode_rejects_delete(): + path = table_path("ro_delete") + t = CTable(Row, urlpath=path, mode="w", expected_size=16) + t.append((1, 50.0, True)) + + t2 = CTable.open(path, mode="r") + with pytest.raises(ValueError, match="read-only"): + t2.delete(0) + + +def test_read_only_mode_rejects_compact(): + path = table_path("ro_compact") + t = CTable(Row, urlpath=path, mode="w", expected_size=16) + t.append((1, 50.0, True)) + + t2 = CTable.open(path, mode="r") + with pytest.raises(ValueError, match="read-only"): + t2.compact() + + +def test_read_only_allows_reads(): + """Read-only table: row access, column access, head/tail, where all work.""" + path = table_path("ro_reads") + t = CTable(Row, urlpath=path, mode="w", expected_size=16) + t.extend([(1, 10.0, True), (2, 20.0, False), (3, 30.0, True)]) + + t2 = CTable.open(path, mode="r") + assert len(t2) == 3 + assert t2.row[0].id[0] == 1 + assert list(t2["score"].to_numpy()) == [10.0, 20.0, 30.0] + assert len(t2.head(2)) == 2 + assert len(t2.tail(1)) == 1 + view = t2.where(t2["id"] > 1) + assert len(view) == 2 + + +# --------------------------------------------------------------------------- +# open() error cases +# --------------------------------------------------------------------------- + + +def test_open_nonexistent_raises(): + with pytest.raises(FileNotFoundError): + CTable.open(table_path("does_not_exist")) + + +def test_open_wrong_kind_raises(tmp_path): + """A path with a _meta.b2f that is not a ctable raises ValueError.""" + path = str(tmp_path / "fake_table") + store = 
blosc2.TreeStore(path, mode="w", threshold=0) + sc = blosc2.SChunk() + sc.vlmeta["kind"] = "something_else" + store["/_meta"] = sc + store.close() + + with pytest.raises(ValueError, match="CTable"): + CTable.open(path) + + +# --------------------------------------------------------------------------- +# Column name validation +# --------------------------------------------------------------------------- + + +def test_column_name_cannot_start_with_underscore(): + @dataclass + class Bad: + _id: int = blosc2.field(blosc2.int64()) + + with pytest.raises(ValueError, match="_"): + CTable(Bad) + + +def test_column_name_cannot_contain_slash(): + @dataclass + class Bad: + pass + + from blosc2.schema_compiler import _validate_column_name + + with pytest.raises(ValueError, match="/"): + _validate_column_name("a/b") + + +def test_column_name_cannot_be_empty(): + from blosc2.schema_compiler import _validate_column_name + + with pytest.raises(ValueError): + _validate_column_name("") + + +# --------------------------------------------------------------------------- +# new_data= guard when opening existing +# --------------------------------------------------------------------------- + + +def test_new_data_rejected_when_opening_existing(): + path = table_path("newdata") + CTable(Row, urlpath=path, mode="w", expected_size=16) + + with pytest.raises(ValueError, match="new_data"): + CTable(Row, new_data=[(1, 50.0, True)], urlpath=path, mode="a") + + +# --------------------------------------------------------------------------- +# Capacity growth (resize) persists +# --------------------------------------------------------------------------- + + +def test_grow_persists(): + """Filling past the initial capacity triggers resize; data still survives.""" + path = table_path("grow") + t = CTable(Row, urlpath=path, mode="w", expected_size=4) + for i in range(10): + t.append((i, float(i), True)) + assert len(t) == 10 + + t2 = CTable(Row, urlpath=path, mode="a") + assert len(t2) == 10 + 
assert list(t2["id"].to_numpy()) == list(range(10)) + + +# --------------------------------------------------------------------------- +# save() / load() +# --------------------------------------------------------------------------- + + +def test_save_creates_disk_layout(): + """save() writes the expected directory structure.""" + t = blosc2.CTable(Row, expected_size=16) + t.extend([(1, 10.0, True), (2, 20.0, False)]) + + path = table_path("saved") + t.save(path) + + assert os.path.exists(os.path.join(path, "_meta.b2f")) + assert os.path.exists(os.path.join(path, "_valid_rows.b2nd")) + assert os.path.exists(os.path.join(path, "_cols", "id.b2nd")) + assert os.path.exists(os.path.join(path, "_cols", "score.b2nd")) + assert os.path.exists(os.path.join(path, "_cols", "active.b2nd")) + + +def test_save_then_open_round_trip(): + """Data written by save() can be read back via CTable.open().""" + t = blosc2.CTable(Row, expected_size=16) + t.extend([(i, float(i * 10), i % 2 == 0) for i in range(5)]) + + path = table_path("saved_rt") + t.save(path) + + t2 = CTable.open(path) + assert len(t2) == 5 + assert list(t2["id"].to_numpy()) == list(range(5)) + assert list(t2["score"].to_numpy()) == [float(i * 10) for i in range(5)] + + +def test_save_compacts_deleted_rows(): + """save() writes only live rows — deleted rows are not included.""" + t = blosc2.CTable(Row, expected_size=16) + t.extend([(i, float(i), True) for i in range(6)]) + t.delete([0, 2, 4]) # delete rows with id 0, 2, 4 + assert len(t) == 3 + + path = table_path("saved_compact") + t.save(path) + + t2 = CTable.open(path) + assert len(t2) == 3 + assert list(t2["id"].to_numpy()) == [1, 3, 5] + + +def test_save_existing_path_raises_by_default(): + """save() raises ValueError if the path already exists unless overwrite=True.""" + t = blosc2.CTable(Row, expected_size=4) + t.append((1, 10.0, True)) + + path = table_path("save_conflict") + t.save(path) + + with pytest.raises(ValueError, match="overwrite"): + t.save(path) + + 
+def test_save_overwrite_replaces_table(): + """save(overwrite=True) replaces an existing table.""" + t1 = blosc2.CTable(Row, expected_size=4) + t1.extend([(1, 10.0, True), (2, 20.0, True)]) + + path = table_path("overwrite") + t1.save(path) + + t2 = blosc2.CTable(Row, expected_size=4) + t2.append((99, 50.0, False)) + t2.save(path, overwrite=True) + + t3 = CTable.open(path) + assert len(t3) == 1 + assert t3["id"][0] == 99 + + +def test_save_view_raises(): + """save() on a view raises ValueError.""" + t = blosc2.CTable(Row, expected_size=8) + t.extend([(i, float(i), True) for i in range(4)]) + view = t.where(t["id"] > 1) + + with pytest.raises(ValueError, match="view"): + view.save(table_path("view_save")) + + +def test_load_returns_in_memory_table(): + """load() returns a writable in-memory CTable.""" + t = blosc2.CTable(Row, expected_size=8) + t.extend([(i, float(i * 5), True) for i in range(4)]) + + path = table_path("loadme") + t.save(path) + + loaded = CTable.load(path) + assert len(loaded) == 4 + assert list(loaded["id"].to_numpy()) == [0, 1, 2, 3] + # Must be writable + loaded.append((100, 50.0, True)) + assert len(loaded) == 5 + + +def test_load_does_not_modify_disk(): + """Mutations on a loaded table do not affect the on-disk table.""" + t = blosc2.CTable(Row, expected_size=8) + t.extend([(i, float(i), True) for i in range(3)]) + + path = table_path("load_isolation") + t.save(path) + + loaded = CTable.load(path) + loaded.append((999, 99.0, False)) + loaded.delete(0) + + # Re-open the original persistent table — should be unchanged + t2 = CTable.open(path) + assert len(t2) == 3 + assert list(t2["id"].to_numpy()) == [0, 1, 2] + + +def test_load_nonexistent_raises(): + with pytest.raises(FileNotFoundError): + CTable.load(table_path("does_not_exist")) + + +def test_save_empty_table(): + """save() and load() work correctly on an empty table.""" + t = blosc2.CTable(Row, expected_size=4) + + path = table_path("empty") + t.save(path) + + t2 = CTable.load(path) + 
assert len(t2) == 0 + # Can still append after load + t2.append((1, 10.0, True)) + assert len(t2) == 1 + + +if __name__ == "__main__": + import pytest + + pytest.main(["-v", __file__]) diff --git a/tests/ndarray/test_c2array_expr.py b/tests/ndarray/test_c2array_expr.py index e65d1805..c4b778f1 100644 --- a/tests/ndarray/test_c2array_expr.py +++ b/tests/ndarray/test_c2array_expr.py @@ -186,7 +186,7 @@ def test_save(cat2_context): for op in ops: del op del expr - expr = blosc2.open(urlpath) + expr = blosc2.open(urlpath, mode="r") res = expr.compute() assert res.dtype == np.float64 np.testing.assert_allclose(res[:], nres, rtol=tol, atol=tol) diff --git a/tests/ndarray/test_concat.py b/tests/ndarray/test_concat.py index c9e42067..0d18cfa2 100644 --- a/tests/ndarray/test_concat.py +++ b/tests/ndarray/test_concat.py @@ -31,7 +31,7 @@ ([21, 121, 101, 10], [2, 121, 101, 10], "f4", 0), ([121, 21, 101, 10], [121, 12, 101, 10], "i8", 1), ([121, 121, 10, 10], [121, 121, 1, 10], "i8", 2), - ([121, 121, 101, 2], [121, 121, 101, 10], "i8", -1), + ([121, 121, 101, 2], [121, 121, 101, 5], "i8", -1), ], ) def test_concat2(shape1, shape2, dtype, axis): @@ -56,7 +56,7 @@ def test_concat2(shape1, shape2, dtype, axis): ([21, 121, 101, 10], [2, 121, 101, 10], [1, 121, 101, 10], "f4", 0), ([121, 21, 101, 10], [121, 12, 101, 10], [121, 1, 101, 10], "i8", 1), ([121, 121, 10, 10], [121, 121, 1, 10], [121, 121, 3, 10], "i8", 2), - ([121, 121, 101, 2], [121, 121, 101, 10], [121, 121, 101, 1], "i8", -1), + ([121, 121, 101, 2], [121, 121, 101, 5], [121, 121, 101, 1], "i8", -1), ], ) def test_concat3(shape1, shape2, shape3, dtype, axis): diff --git a/tests/ndarray/test_dsl_kernels.py b/tests/ndarray/test_dsl_kernels.py index ac37beeb..8261536f 100644 --- a/tests/ndarray/test_dsl_kernels.py +++ b/tests/ndarray/test_dsl_kernels.py @@ -872,7 +872,7 @@ def _save_reload_compute(kernel, inputs_np, inputs_b2, dtype, urlpaths, extra_kw """Save a LazyUDF backed by *kernel*, reload it, and return 
(reloaded_expr, result).""" lazy = blosc2.lazyudf(kernel, inputs_b2, dtype=dtype, **(extra_kwargs or {})) lazy.save(urlpath=urlpaths["lazy"]) - reloaded = blosc2.open(urlpaths["lazy"]) + reloaded = blosc2.open(urlpaths["lazy"], mode="r") return reloaded, reloaded.compute() @@ -951,7 +951,7 @@ def test_dsl_save_getitem(tmp_path): lazy = blosc2.lazyudf(kernel_save_simple, (a, b), dtype=np.float64) lazy.save(urlpath=str(tmp_path / "lazy.b2nd")) - reloaded = blosc2.open(str(tmp_path / "lazy.b2nd")) + reloaded = blosc2.open(str(tmp_path / "lazy.b2nd"), mode="r") assert isinstance(reloaded.func, DSLKernel) expected = (na + nb) ** 2 @@ -970,7 +970,7 @@ def test_dsl_save_input_names_match(tmp_path): lazy = blosc2.lazyudf(kernel_save_simple, (a, b), dtype=np.float64) lazy.save(urlpath=str(tmp_path / "lazy.b2nd")) - reloaded = blosc2.open(str(tmp_path / "lazy.b2nd")) + reloaded = blosc2.open(str(tmp_path / "lazy.b2nd"), mode="r") assert isinstance(reloaded.func, DSLKernel) assert reloaded.func.input_names == ["x", "y"] diff --git a/tests/ndarray/test_full.py b/tests/ndarray/test_full.py index 8a52441c..2670835c 100644 --- a/tests/ndarray/test_full.py +++ b/tests/ndarray/test_full.py @@ -223,7 +223,7 @@ def test_complex_datatype(): ("f_022", "= 1) & (a < 4)", reopened.fields).where(reopened) explained = expr.explain() @@ -1150,8 +1150,8 @@ def test_forced_ooc_full_index_merge_preserves_sorted_sidecars(monkeypatch, tmp_ descriptor = arr.create_index(kind=blosc2.IndexKind.FULL) meta = descriptor["full"] - values_sidecar = blosc2.open(meta["values_path"]) - positions_sidecar = blosc2.open(meta["positions_path"]) + values_sidecar = blosc2.open(meta["values_path"], mode="r") + positions_sidecar = blosc2.open(meta["positions_path"], mode="r") np.testing.assert_array_equal(values_sidecar[:], np.sort(data, kind="stable")) np.testing.assert_array_equal(values_sidecar[:], data[positions_sidecar[:]]) diff --git a/tests/ndarray/test_lazyexpr.py b/tests/ndarray/test_lazyexpr.py index 
20dba823..6525a07d 100644 --- a/tests/ndarray/test_lazyexpr.py +++ b/tests/ndarray/test_lazyexpr.py @@ -466,7 +466,7 @@ def test_arctan2_pow(urlpath, shape_fixture, dtype_fixture, function, value1, va expr = blosc2.LazyExpr(new_op=(a1, function, a2)) if urlpath is not None: expr.save(urlpath=urlpath_save) - expr = blosc2.open(urlpath_save) + expr = blosc2.open(urlpath_save, mode="r") res_lazyexpr = expr.compute() # Evaluate using NumExpr if function == "**": @@ -480,7 +480,7 @@ def test_arctan2_pow(urlpath, shape_fixture, dtype_fixture, function, value1, va expr = blosc2.LazyExpr(new_op=(a1, function, value2)) if urlpath is not None: expr.save(urlpath=urlpath_save) - expr = blosc2.open(urlpath_save) + expr = blosc2.open(urlpath_save, mode="r") res_lazyexpr = expr.compute() # Evaluate using NumExpr if function == "**": @@ -496,7 +496,7 @@ def test_arctan2_pow(urlpath, shape_fixture, dtype_fixture, function, value1, va expr = blosc2.LazyExpr(new_op=(value1, function, a2)) if urlpath is not None: expr.save(urlpath=urlpath_save) - expr = blosc2.open(urlpath_save) + expr = blosc2.open(urlpath_save, mode="r") res_lazyexpr = expr.compute() # Evaluate using NumExpr if function == "**": @@ -716,7 +716,7 @@ def test_save(): ) np.testing.assert_allclose(res[:], nres, rtol=tol, atol=tol) - expr = blosc2.open(urlpath_save) + expr = blosc2.open(urlpath_save, mode="r") # After opening, check that a lazy expression does have an array # and schunk attributes. This is to allow the .info() method to work. 
assert hasattr(expr, "array") is True @@ -735,7 +735,7 @@ def test_save(): var_dict = {"a1": ops[0], "a2": ops[1], "a3": ops[2], "a4": ops[3], "x": x} lazy_expr = eval(expr, var_dict) lazy_expr.save(urlpath=urlpath_save2) - expr = blosc2.open(urlpath_save2) + expr = blosc2.open(urlpath_save2, mode="r") assert expr.array.dtype == np.float64 res = expr.compute() nres = ne_evaluate("na1 / na2 + na2 - na3 * na4**3") @@ -759,7 +759,7 @@ def test_save_unsafe(): expr.save(urlpath=urlpath) disk_arrays.append(urlpath) - expr = blosc2.open(urlpath) + expr = blosc2.open(urlpath, mode="r") # Replace expression by a (potentially) unsafe expression expr.expression = "import os; os.system('touch /tmp/unsafe')" with pytest.raises(ValueError) as excinfo: @@ -807,7 +807,7 @@ def test_save_functions(function, dtype_fixture, shape_fixture): expr = blosc2.LazyExpr(new_op=(a1, function, None)) expr.save(urlpath=urlpath_save) del expr - expr = blosc2.open(urlpath_save) + expr = blosc2.open(urlpath_save, mode="r") res_lazyexpr = expr.compute() # Evaluate using NumExpr @@ -823,7 +823,7 @@ def test_save_functions(function, dtype_fixture, shape_fixture): res_lazyexpr = expr.compute() np.testing.assert_allclose(res_lazyexpr[:], res_numexpr, rtol=rtol) - expr = blosc2.open(urlpath_save) + expr = blosc2.open(urlpath_save, mode="r") res_lazyexpr = expr.compute() np.testing.assert_allclose(res_lazyexpr[:], res_numexpr, rtol=rtol) @@ -847,7 +847,7 @@ def test_save_contains(values): # Construct the lazy expression expr_lazy = blosc2.LazyExpr(new_op=(a1_blosc, "contains", value2)) expr_lazy.save(urlpath=urlpath_save) - expr_lazy = blosc2.open(urlpath_save) + expr_lazy = blosc2.open(urlpath_save, mode="r") # Evaluate using NumExpr expr_numexpr = f"{'contains'}(a1, value2)" res_numexpr = ne_evaluate(expr_numexpr) @@ -857,7 +857,7 @@ def test_save_contains(values): # Construct the lazy expression expr_lazy = blosc2.LazyExpr(new_op=(a1_blosc, "contains", a2_blosc)) expr_lazy.save(urlpath=urlpath_save) - 
expr_lazy = blosc2.open(urlpath_save) + expr_lazy = blosc2.open(urlpath_save, mode="r") # Evaluate using NumExpr res_numexpr = ne_evaluate("contains(a2, a1)") else: # ("str", "NDArray") @@ -867,7 +867,7 @@ def test_save_contains(values): # Construct the lazy expression expr_lazy = blosc2.LazyExpr(new_op=(value1, "contains", a2_blosc)) expr_lazy.save(urlpath=urlpath_save) - expr_lazy = blosc2.open(urlpath_save) + expr_lazy = blosc2.open(urlpath_save, mode="r") # Evaluate using NumExpr res_numexpr = ne_evaluate("contains(value1, a2)") res_lazyexpr = expr_lazy.compute() @@ -901,7 +901,7 @@ def test_save_many_functions(dtype_fixture, shape_fixture): res_lazyexpr = expr.compute() np.testing.assert_allclose(res_lazyexpr[:], res_numexpr, rtol=rtol, atol=atol) - expr = blosc2.open(urlpath_save) + expr = blosc2.open(urlpath_save, mode="r") res_lazyexpr = expr.compute() np.testing.assert_allclose(res_lazyexpr[:], res_numexpr, rtol=rtol, atol=atol) @@ -946,7 +946,7 @@ def test_save_constructor(disk, shape, dtype, constructor): a = b2func(lshape, dtype=dtype, shape=shape, urlpath=urlpath, mode="w") expr = f"a + {constructor}({lshape}, dtype={dtype}, shape={shape}) + 1" if disk: - a = blosc2.open(urlpath) + a = blosc2.open(urlpath, mode="r") npfunc = getattr(np, constructor) if constructor == "linspace": na = npfunc(0, 10, lshape, dtype=dtype).reshape(shape) @@ -964,7 +964,7 @@ def test_save_constructor(disk, shape, dtype, constructor): assert lexpr.shape == a.shape if disk: lexpr.save("out.b2nd") - lexpr = blosc2.open("out.b2nd") + lexpr = blosc2.open("out.b2nd", mode="r") res = lexpr.compute() nres = na + na + 1 assert np.allclose(res[()], nres) @@ -986,7 +986,7 @@ def test_save_2_constructors(shape, disk): lexpr = blosc2.lazyexpr(expr) if disk: lexpr.save("out.b2nd") - lexpr = blosc2.open("out.b2nd") + lexpr = blosc2.open("out.b2nd", mode="r") res = lexpr.compute() na = np.arange(lshape).reshape(shape) nb = np.ones(shape) @@ -1013,7 +1013,7 @@ def 
test_save_constructor_reshape(shape, disk): lexpr = blosc2.lazyexpr(expr) if disk: lexpr.save("out.b2nd") - lexpr = blosc2.open("out.b2nd") + lexpr = blosc2.open("out.b2nd", mode="r") res = lexpr.compute() na = np.arange(lshape).reshape(shape) nb = np.ones(shape) @@ -1037,7 +1037,7 @@ def test_save_2equal_constructors(shape, disk): lexpr = blosc2.lazyexpr(expr) if disk: lexpr.save("out.b2nd") - lexpr = blosc2.open("out.b2nd") + lexpr = blosc2.open("out.b2nd", mode="r") res = lexpr.compute() na = np.ones(shape, dtype=np.int8) nb = np.ones(shape) @@ -1360,9 +1360,9 @@ def test_fill_disk_operands(chunks, blocks, disk, fill_value): b = blosc2.zeros((N, N), urlpath=bpath, mode="w", chunks=chunks, blocks=blocks) c = blosc2.zeros((N, N), urlpath=cpath, mode="w", chunks=chunks, blocks=blocks) if disk: - a = blosc2.open("a.b2nd") - b = blosc2.open("b.b2nd") - c = blosc2.open("c.b2nd") + a = blosc2.open("a.b2nd", mode="r") + b = blosc2.open("b.b2nd", mode="r") + c = blosc2.open("c.b2nd", mode="r") expr = ((a**3 + blosc2.sin(c * 2)) < b) & ~(c > 0) @@ -1709,7 +1709,7 @@ def test_missing_operator(): blosc2.remove_urlpath("b.b2nd") # Re-open the lazy expression with pytest.raises(blosc2.exceptions.MissingOperands) as excinfo: - blosc2.open("expr.b2nd") + blosc2.open("expr.b2nd", mode="r") # Check that some operand is missing assert "a" not in excinfo.value.missing_ops @@ -1746,7 +1746,7 @@ def test_save_dictstore_operands(tmp_path): "b": {"kind": "dictstore_key", "version": 1, "urlpath": str(store_path), "key": "/b"}, } - restored = blosc2.open(expr_path) + restored = blosc2.open(expr_path, mode="r") assert isinstance(restored, blosc2.LazyExpr) np.testing.assert_array_equal(restored[:], expected) @@ -1762,7 +1762,7 @@ def test_save_proxy_operands_reopen_default_mode(tmp_path): expr = proxy + proxy expr.save(str(expr_path)) - restored = blosc2.open(str(expr_path)) + restored = blosc2.open(str(expr_path), mode="r") assert isinstance(restored, blosc2.LazyExpr) 
np.testing.assert_array_equal(restored[:], np.arange(10, dtype=np.int64) * 2) @@ -1780,13 +1780,13 @@ def test_lazyexpr_vlmeta_in_memory_and_persisted(tmp_path): expr_path = tmp_path / "expr_vlmeta.b2nd" expr.save(str(expr_path)) - restored = blosc2.open(str(expr_path)) + restored = blosc2.open(str(expr_path), mode="r") assert restored.vlmeta["name"] == "sum" assert restored.vlmeta["config"] == {"scale": 1} restored.vlmeta["note"] = "persisted" - reopened = blosc2.open(str(expr_path)) + reopened = blosc2.open(str(expr_path), mode="r") assert reopened.vlmeta["note"] == "persisted" np.testing.assert_array_equal(reopened[:], np.arange(5, dtype=np.int64) * 2) @@ -1862,19 +1862,19 @@ def test_chain_persistentexpressions(): le1_ = blosc2.lazyexpr("a ** 3 + sin(a ** 2)", {"a": a}) le1_.save("expr1.b2nd", mode="w") - myle1 = blosc2.open("expr1.b2nd") + myle1 = blosc2.open("expr1.b2nd", mode="r") le2_ = blosc2.lazyexpr("(le1 < c)", {"le1": myle1, "c": c}) le2_.save("expr2.b2nd", mode="w") - myle2 = blosc2.open("expr2.b2nd") + myle2 = blosc2.open("expr2.b2nd", mode="r") le3_ = blosc2.lazyexpr("(b < 0)", {"b": b}) le3_.save("expr3.b2nd", mode="w") - myle3 = blosc2.open("expr3.b2nd") + myle3 = blosc2.open("expr3.b2nd", mode="r") le4_ = blosc2.lazyexpr("(le2 & le3)", {"le2": myle2, "le3": myle3}) le4_.save("expr4.b2nd", mode="w") - myle4 = blosc2.open("expr4.b2nd") + myle4 = blosc2.open("expr4.b2nd", mode="r") assert (myle4[:] == le4[:]).all() # Remove files diff --git a/tests/ndarray/test_lazyudf.py b/tests/ndarray/test_lazyudf.py index 3aa76cb6..ab0cd814 100644 --- a/tests/ndarray/test_lazyudf.py +++ b/tests/ndarray/test_lazyudf.py @@ -483,7 +483,7 @@ def test_save_ludf(): expr.save(urlpath=urlpath) del expr - expr = blosc2.open(urlpath) + expr = blosc2.open(urlpath, mode="r") assert isinstance(expr, blosc2.LazyUDF) res_lazyexpr = expr.compute() np.testing.assert_array_equal(res_lazyexpr[:], npc) @@ -493,7 +493,7 @@ def test_save_ludf(): expr = blosc2.lazyudf(udf1p_numba, 
(array,), np.float64) expr.save(urlpath=urlpath) del expr - expr = blosc2.open(urlpath) + expr = blosc2.open(urlpath, mode="r") assert isinstance(expr, blosc2.LazyUDF) res_lazyexpr = expr.compute() np.testing.assert_array_equal(res_lazyexpr[:], npc) @@ -511,7 +511,7 @@ def test_lazyudf_vlmeta_roundtrip(tmp_path): expr.vlmeta["attrs"] = {"version": 1} expr.save(urlpath=str(expr_path)) - restored = blosc2.open(str(expr_path)) + restored = blosc2.open(str(expr_path), mode="r") assert isinstance(restored, blosc2.LazyUDF) assert restored.vlmeta["name"] == "increment" diff --git a/tests/ndarray/test_ndarray.py b/tests/ndarray/test_ndarray.py index 5a4f376d..c21c774a 100644 --- a/tests/ndarray/test_ndarray.py +++ b/tests/ndarray/test_ndarray.py @@ -77,7 +77,7 @@ def test_shape_with_zeros(shape, urlpath): data = np.zeros(shape, dtype="int32") ndarray = blosc2.asarray(data, urlpath=urlpath, mode="w") if urlpath is not None: - ndarray = blosc2.open(urlpath) + ndarray = blosc2.open(urlpath, mode="r") assert isinstance(ndarray, blosc2.NDArray) assert ndarray.shape == shape assert ndarray.size == 0 @@ -502,11 +502,11 @@ def test_argsort_scalar(): def test_save(): a = blosc2.arange(0, 10, 1, dtype="i4", shape=(10,)) blosc2.save(a, "test.b2nd") - c = blosc2.open("test.b2nd") + c = blosc2.open("test.b2nd", mode="r") assert np.array_equal(a[:], c[:]) blosc2.remove_urlpath("test.b2nd") with pytest.raises(FileNotFoundError): - blosc2.open("test.b2nd") + blosc2.open("test.b2nd", mode="r") def test_oindex(): diff --git a/tests/ndarray/test_persistency.py b/tests/ndarray/test_persistency.py index ee79c680..939cd5a8 100644 --- a/tests/ndarray/test_persistency.py +++ b/tests/ndarray/test_persistency.py @@ -32,7 +32,7 @@ def test_persistency(shape, chunks, blocks, urlpath, contiguous, dtype): size = int(np.prod(shape)) nparray = np.arange(size, dtype=dtype).reshape(shape) _ = blosc2.asarray(nparray, chunks=chunks, blocks=blocks, urlpath=urlpath, contiguous=contiguous) - b = 
blosc2.open(urlpath) + b = blosc2.open(urlpath, mode="r") bc = b[:] diff --git a/tests/ndarray/test_proxy.py b/tests/ndarray/test_proxy.py index 15648e99..fc4577d9 100644 --- a/tests/ndarray/test_proxy.py +++ b/tests/ndarray/test_proxy.py @@ -90,10 +90,10 @@ def test_open(urlpath, shape, chunks, blocks, slices, dtype): del b if urlpath is None: with pytest.raises(RuntimeError): - _ = blosc2.open(proxy_urlpath) + _ = blosc2.open(proxy_urlpath, mode="a") else: - b = blosc2.open(proxy_urlpath) - a = blosc2.open(urlpath) + b = blosc2.open(proxy_urlpath, mode="a") + a = blosc2.open(urlpath, mode="r") if not struct_dtype: np.testing.assert_almost_equal(b[...], a[...]) else: diff --git a/tests/ndarray/test_proxy_c2array.py b/tests/ndarray/test_proxy_c2array.py index fea92163..1f7d427f 100644 --- a/tests/ndarray/test_proxy_c2array.py +++ b/tests/ndarray/test_proxy_c2array.py @@ -86,7 +86,7 @@ def test_open(cat2_context): del a del b - b = blosc2.open(urlpath) + b = blosc2.open(urlpath, mode="r") a = get_array(shape, chunks_blocks) np.testing.assert_allclose(b[...], a[...]) diff --git a/tests/ndarray/test_proxy_expr.py b/tests/ndarray/test_proxy_expr.py index e4aea5a5..17b9f6b0 100644 --- a/tests/ndarray/test_proxy_expr.py +++ b/tests/ndarray/test_proxy_expr.py @@ -76,7 +76,7 @@ def test_expr_proxy_operands(chunks_blocks, cat2_context): urlpath = "expr_proxies.b2nd" expr.save(urlpath=urlpath, mode="w") del expr - expr_opened = blosc2.open("expr_proxies.b2nd") + expr_opened = blosc2.open("expr_proxies.b2nd", mode="r") assert isinstance(expr_opened, blosc2.LazyExpr) # All diff --git a/tests/ndarray/test_reductions.py b/tests/ndarray/test_reductions.py index 91bd17e6..c5ece156 100644 --- a/tests/ndarray/test_reductions.py +++ b/tests/ndarray/test_reductions.py @@ -427,7 +427,7 @@ def test_fast_path(chunks, blocks, disk, fill_value, reduce_op, axis): else: a = blosc2.zeros(shape, dtype=np.float64, chunks=chunks, blocks=blocks, urlpath=urlpath, mode="w") if disk: - a = 
blosc2.open(urlpath) + a = blosc2.open(urlpath, mode="r") na = a[:] if reduce_op in {"cumulative_sum", "cumulative_prod"}: axis = 0 if axis is None else axis @@ -473,7 +473,7 @@ def test_miniexpr_slice(chunks, blocks, disk, fill_value, reduce_op): else: a = blosc2.zeros(shape, dtype=np.float64, chunks=chunks, blocks=blocks, urlpath=urlpath, mode="w") if disk: - a = blosc2.open(urlpath) + a = blosc2.open(urlpath, mode="r") na = a[:] # Test slice # TODO: Make this work with miniexpr (currently just skips to normal reduction eval) @@ -520,8 +520,8 @@ def test_save_version1(disk, fill_value, reduce_op, axis): a = blosc2.zeros(shape, dtype=np.float64, urlpath=urlpath, mode="w") b = blosc2.zeros(shape, dtype=np.float64, urlpath="b.b2nd", mode="w") - 0.1 if disk: - a = blosc2.open(urlpath) - b = blosc2.open("b.b2nd") + a = blosc2.open(urlpath, mode="r") + b = blosc2.open("b.b2nd", mode="r") na = a[:] nb = b[:] @@ -531,7 +531,7 @@ def test_save_version1(disk, fill_value, reduce_op, axis): assert lexpr.shape == a.shape if disk: lexpr.save("out.b2nd") - lexpr = blosc2.open("out.b2nd") + lexpr = blosc2.open("out.b2nd", mode="r") res = lexpr.compute() if reduce_op in {"cumulative_sum", "cumulative_prod"}: oploc = "npcumsum" if reduce_op == "cumulative_sum" else "npcumprod" @@ -581,8 +581,8 @@ def test_save_version2(disk, fill_value, reduce_op, axis): a = blosc2.zeros(shape, dtype=np.float64, urlpath=urlpath, mode="w") b = blosc2.zeros(shape, dtype=np.float64, urlpath="b.b2nd", mode="w") - 0.1 if disk: - a = blosc2.open(urlpath) - b = blosc2.open("b.b2nd") + a = blosc2.open(urlpath, mode="r") + b = blosc2.open("b.b2nd", mode="r") na = a[:] nb = b[:] @@ -591,7 +591,7 @@ def test_save_version2(disk, fill_value, reduce_op, axis): lexpr = blosc2.lazyexpr(expr, operands={"a": a, "b": b}) if disk: lexpr.save("out.b2nd") - lexpr = blosc2.open("out.b2nd") + lexpr = blosc2.open("out.b2nd", mode="r") res = lexpr.compute() if reduce_op in {"cumulative_sum", "cumulative_prod"}: oploc = 
"npcumsum" if reduce_op == "cumulative_sum" else "npcumprod" @@ -641,8 +641,8 @@ def test_save_version3(disk, fill_value, reduce_op, axis): a = blosc2.zeros(shape, dtype=np.float64, urlpath=urlpath, mode="w") b = blosc2.zeros(shape, dtype=np.float64, urlpath="b.b2nd", mode="w") - 0.1 if disk: - a = blosc2.open(urlpath) - b = blosc2.open("b.b2nd") + a = blosc2.open(urlpath, mode="r") + b = blosc2.open("b.b2nd", mode="r") na = a[:] nb = b[:] @@ -651,7 +651,7 @@ def test_save_version3(disk, fill_value, reduce_op, axis): lexpr = blosc2.lazyexpr(expr, operands={"a": a, "b": b}) if disk: lexpr.save("out.b2nd") - lexpr = blosc2.open("out.b2nd") + lexpr = blosc2.open("out.b2nd", mode="r") res = lexpr.compute() if reduce_op in {"cumulative_sum", "cumulative_prod"}: oploc = "npcumsum" if reduce_op == "cumulative_sum" else "npcumprod" @@ -701,8 +701,8 @@ def test_save_version4(disk, fill_value, reduce_op, axis): a = blosc2.zeros(shape, dtype=np.float64, urlpath=urlpath, mode="w") b = blosc2.zeros(shape, dtype=np.float64, urlpath="b.b2nd", mode="w") - 0.1 if disk: - a = blosc2.open(urlpath) - b = blosc2.open("b.b2nd") + a = blosc2.open(urlpath, mode="r") + b = blosc2.open("b.b2nd", mode="r") na = a[:] # Just a single reduction @@ -710,7 +710,7 @@ def test_save_version4(disk, fill_value, reduce_op, axis): lexpr = blosc2.lazyexpr(expr, operands={"a": a}) if disk: lexpr.save("out.b2nd") - lexpr = blosc2.open("out.b2nd") + lexpr = blosc2.open("out.b2nd", mode="r") res = lexpr.compute() if reduce_op in {"cumulative_sum", "cumulative_prod"}: oploc = "npcumsum" if reduce_op == "cumulative_sum" else "npcumprod" @@ -738,7 +738,7 @@ def test_save_constructor_reduce(shape, disk, compute): lexpr = blosc2.lazyexpr(expr) if disk: lexpr.save("out.b2nd") - lexpr = blosc2.open("out.b2nd") + lexpr = blosc2.open("out.b2nd", mode="r") if compute: res = lexpr.compute() res = res[()] # for later comparison with nres @@ -767,7 +767,7 @@ def test_save_constructor_reduce2(shape, disk, compute): lexpr 
= blosc2.lazyexpr(expr) if disk: lexpr.save("out.b2nd") - lexpr = blosc2.open("out.b2nd") + lexpr = blosc2.open("out.b2nd", mode="r") if compute: res = lexpr.compute() res = res[()] # for later comparison with nres diff --git a/tests/ndarray/test_stringarrays.py b/tests/ndarray/test_stringarrays.py index a512622a..e6f37114 100644 --- a/tests/ndarray/test_stringarrays.py +++ b/tests/ndarray/test_stringarrays.py @@ -124,7 +124,7 @@ def test_unicode_roundtrip_on_disk(tmp_path, shape): ) # Re-open from disk - out = blosc2.open(path) + out = blosc2.open(path, mode="r") assert out.dtype == arr.dtype assert np.array_equal(out, arr) @@ -153,7 +153,7 @@ def test_unicode_on_disk_partial_io(tmp_path): b2[6:10] = replacement arr[6:10] = replacement - reopened = blosc2.open(path) + reopened = blosc2.open(path, mode="r") assert np.array_equal(reopened, arr) @@ -167,7 +167,7 @@ def test_unicode_on_disk_persistence(tmp_path): b2 = blosc2.open(path, mode="a") b2[:] = arr2 - reopened = blosc2.open(path) + reopened = blosc2.open(path, mode="r") assert np.array_equal(reopened, arr2) diff --git a/tests/ndarray/test_struct_dtype.py b/tests/ndarray/test_struct_dtype.py index 9f2c3055..dc2b736e 100644 --- a/tests/ndarray/test_struct_dtype.py +++ b/tests/ndarray/test_struct_dtype.py @@ -36,7 +36,7 @@ def test_scalar(shape, dtype, urlpath): assert a.dtype == b.dtype if urlpath is not None: - c = blosc2.open(urlpath) + c = blosc2.open(urlpath, mode="r") assert np.array_equal(c[:], b) assert c.shape == a.shape assert c.dtype == a.dtype diff --git a/tests/test_dict_store.py b/tests/test_dict_store.py index 4347eaf4..7c9c9dd8 100644 --- a/tests/test_dict_store.py +++ b/tests/test_dict_store.py @@ -114,6 +114,23 @@ def test_to_b2z_and_reopen(populated_dict_store): assert np.all(dstore_read["/nodeB"][:] == np.arange(6)) +def test_extensionless_dict_store_defaults_to_directory(tmp_path): + path = tmp_path / "test_dstore_extless" + + with DictStore(str(path), mode="w") as dstore: + dstore["/node1"] 
= np.arange(4) + + assert path.is_dir() + assert (path / "embed.b2e").exists() + + with DictStore(str(path), mode="r") as dstore: + assert np.array_equal(dstore["/node1"][:], np.arange(4)) + + opened = blosc2.open(str(path), mode="r") + assert isinstance(opened, DictStore) + assert np.array_equal(opened["/node1"][:], np.arange(4)) + + def test_to_b2z_from_readonly_b2d(): b2d_path = "test_to_b2z_from_readonly.b2d" b2z_path = "test_to_b2z_from_readonly.b2z" @@ -676,3 +693,25 @@ def test_mmap_mode_validation(tmp_path): with pytest.raises(ValueError, match="mmap_mode='r' requires mode='r'"): DictStore(str(path), mode="a", mmap_mode="r") + + +def test_b2z_double_open_append_no_corruption(tmp_path): + """Opening a .b2z store twice in append mode must not corrupt the archive. + + Regression test: previously, GC of the first open's DictStore triggered + ``to_b2z()`` which overwrote the archive with a near-empty ZIP, causing the + second open to fail with ``blosc2_schunk_open_offset`` returning NULL. 
+ """ + path = str(tmp_path / "double_open.b2z") + + with DictStore(path, mode="w") as ds: + ds["/arr"] = blosc2.arange(20) + + # First open — no explicit close (simulates the GC-triggered path) + ds1 = DictStore(path, mode="a") + assert np.array_equal(ds1["/arr"][:], np.arange(20)) + del ds1 # GC; must NOT corrupt the archive + + # Second open — must succeed and see correct data + with DictStore(path, mode="a") as ds2: + assert np.array_equal(ds2["/arr"][:], np.arange(20)) diff --git a/tests/test_mmap.py b/tests/test_mmap.py index 53dfc27c..c0870d1f 100644 --- a/tests/test_mmap.py +++ b/tests/test_mmap.py @@ -53,7 +53,9 @@ def test_initial_mapping_size(tmp_path, monkeypatch, capfd, initial_mapping_size # Reading via open for mmap_mode in ["r", "r+", "c"]: open_mapping_size = None if mmap_mode == "r" else initial_mapping_size - schunk_open = blosc2.open(urlpath, mmap_mode=mmap_mode, initial_mapping_size=open_mapping_size) + schunk_open = blosc2.open( + urlpath, mode="r", mmap_mode=mmap_mode, initial_mapping_size=open_mapping_size + ) for i in range(nchunks): buffer = i * np.arange(chunk_nitems, dtype=dtype) bytes_obj = buffer.tobytes() @@ -95,13 +97,13 @@ def test_initial_mapping_size(tmp_path, monkeypatch, capfd, initial_mapping_size # Error handling with pytest.raises(ValueError, match=r"w\+ mmap_mode cannot be used to open an existing file"): - blosc2.open(urlpath, mmap_mode="w+") + blosc2.open(urlpath, mode="a", mmap_mode="w+") with pytest.raises(ValueError, match="initial_mapping_size can only be used with writing modes"): - blosc2.open(urlpath, mmap_mode="r", initial_mapping_size=100) + blosc2.open(urlpath, mode="a", mmap_mode="r", initial_mapping_size=100) with pytest.raises(ValueError, match="initial_mapping_size can only be used with mmap_mode"): - blosc2.open(urlpath, mmap_mode=None, initial_mapping_size=100) + blosc2.open(urlpath, mode="a", mmap_mode=None, initial_mapping_size=100) with pytest.raises(ValueError, match="initial_mapping_size can only be 
used with writing modes"): blosc2.SChunk(mmap_mode="r", initial_mapping_size=100, **storage) diff --git a/tests/test_open.py b/tests/test_open.py index c7f78c88..a8610fac 100644 --- a/tests/test_open.py +++ b/tests/test_open.py @@ -110,7 +110,7 @@ def test_open(contiguous, urlpath, cparams, dparams, nchunks, chunk_nitems, dtyp def test_open_fake(): with pytest.raises(FileNotFoundError): - _ = blosc2.open("none.b2nd") + _ = blosc2.open("none.b2nd", mode="r") @pytest.mark.parametrize("offset", [0, 42]) @@ -148,3 +148,34 @@ def test_open_offset(offset, urlpath, mode, mmap_mode): blosc2.open(urlpath, mode, mmap_mode=mmap_mode) blosc2.remove_urlpath(urlpath) + + +def test_open_no_mode_warns(tmp_path): + """FutureWarning is emitted when mode is omitted.""" + urlpath = str(tmp_path / "test.b2nd") + blosc2.asarray(np.arange(10), urlpath=urlpath, mode="w") + with pytest.warns(FutureWarning, match="mode='a'"): + _ = blosc2.open(urlpath) + + +def test_open_explicit_mode_no_warn(tmp_path): + """No FutureWarning is emitted when mode is explicitly given.""" + import warnings + + urlpath = str(tmp_path / "test.b2nd") + blosc2.asarray(np.arange(10), urlpath=urlpath, mode="w") + with warnings.catch_warnings(): + warnings.simplefilter("error", FutureWarning) + _ = blosc2.open(urlpath, mode="r") + _ = blosc2.open(urlpath, mode="a") + + +def test_open_mmap_without_mode_warns(tmp_path): + """FutureWarning is emitted when mode is omitted, even with mmap_mode.""" + if blosc2.IS_WASM: + pytest.skip("mmap_mode is not supported reliably on wasm32") + + urlpath = str(tmp_path / "test.b2nd") + blosc2.asarray(np.arange(10), urlpath=urlpath, mode="w") + with pytest.warns(FutureWarning, match="mode='a'"): + _ = blosc2.open(urlpath, mmap_mode="r") diff --git a/tests/test_open_c2array.py b/tests/test_open_c2array.py index 14bb6710..8d4458ac 100644 --- a/tests/test_open_c2array.py +++ b/tests/test_open_c2array.py @@ -43,7 +43,7 @@ def test_open_c2array(cat2_context): assert a1.cratio == 
a_open.cratio with pytest.raises(NotImplementedError): - _ = blosc2.open(urlpath) + _ = blosc2.open(urlpath, mode="a") with pytest.raises(NotImplementedError): _ = blosc2.open(urlpath, mode="r", offset=0, cparams={}) diff --git a/tests/test_proxy_schunk.py b/tests/test_proxy_schunk.py index 4245b1aa..7155834c 100644 --- a/tests/test_proxy_schunk.py +++ b/tests/test_proxy_schunk.py @@ -68,9 +68,9 @@ def test_open(urlpath, chunksize, nchunks): del schunk if urlpath is None: with pytest.raises(RuntimeError): - _ = blosc2.open(proxy_urlpath) + _ = blosc2.open(proxy_urlpath, mode="a") else: - proxy = blosc2.open(proxy_urlpath) + proxy = blosc2.open(proxy_urlpath, mode="a") assert proxy[0 : len(data) * 4] == bytes_obj blosc2.remove_urlpath(urlpath) diff --git a/tests/test_tree_store.py b/tests/test_tree_store.py index c7e83972..40382311 100644 --- a/tests/test_tree_store.py +++ b/tests/test_tree_store.py @@ -1057,6 +1057,23 @@ def test_open_context_manager(populated_tree_store): assert np.array_equal(tstore["/child0/data"][:], np.array([1, 2, 3])) +def test_extensionless_tree_store_defaults_to_directory(tmp_path): + path = tmp_path / "test_tstore_extless" + + with TreeStore(str(path), mode="w") as tstore: + tstore["/group/node"] = np.arange(6) + + assert path.is_dir() + assert (path / "embed.b2e").exists() + + with TreeStore(str(path), mode="r") as tstore: + assert np.array_equal(tstore["/group/node"][:], np.arange(6)) + + opened = blosc2.open(str(path), mode="r") + assert isinstance(opened, TreeStore) + assert np.array_equal(opened["/group/node"][:], np.arange(6)) + + @pytest.mark.parametrize("storage_type", ["b2d", "b2z"]) def test_mmap_mode_read_access(storage_type, tmp_path): path = tmp_path / f"test_tstore_mmap.{storage_type}"