From f427898747494e8d734d3ba12cf32ef544d88de5 Mon Sep 17 00:00:00 2001 From: Davis Vann Bennett Date: Wed, 18 Feb 2026 21:48:40 +0100 Subject: [PATCH 01/23] sketch out sync codecs + threadpool --- src/zarr/__init__.py | 1 + src/zarr/abc/codec.py | 17 + src/zarr/api/asynchronous.py | 9 +- src/zarr/codecs/blosc.py | 23 +- src/zarr/codecs/bytes.py | 18 +- src/zarr/codecs/crc32c_.py | 26 +- src/zarr/codecs/gzip.py | 14 +- src/zarr/codecs/transpose.py | 12 +- src/zarr/codecs/vlen_utf8.py | 49 ++- src/zarr/codecs/zstd.py | 14 +- src/zarr/core/config.py | 2 +- src/zarr/experimental/sync_codecs.py | 558 +++++++++++++++++++++++++++ tests/test_config.py | 8 +- tests/test_sync_codec_pipeline.py | 306 +++++++++++++++ 14 files changed, 995 insertions(+), 62 deletions(-) create mode 100644 src/zarr/experimental/sync_codecs.py create mode 100644 tests/test_sync_codec_pipeline.py diff --git a/src/zarr/__init__.py b/src/zarr/__init__.py index 3c6195c28f..e206892fb6 100644 --- a/src/zarr/__init__.py +++ b/src/zarr/__init__.py @@ -37,6 +37,7 @@ from zarr.core.array import Array, AsyncArray from zarr.core.config import config from zarr.core.group import AsyncGroup, Group +from zarr.experimental.sync_codecs import SyncCodecPipeline # noqa: F401 (registers pipeline) # in case setuptools scm screw up and find version to be 0.0.0 assert not __version__.startswith("0.0.0") diff --git a/src/zarr/abc/codec.py b/src/zarr/abc/codec.py index d41c457b4e..8b0401d6bd 100644 --- a/src/zarr/abc/codec.py +++ b/src/zarr/abc/codec.py @@ -137,6 +137,23 @@ def validate( The array chunk grid """ + def _decode_sync(self, chunk_data: CodecOutput, chunk_spec: ArraySpec) -> CodecInput: + """Synchronously decode a single chunk. Override in subclasses to enable + SyncCodecPipeline support.""" + raise NotImplementedError # pragma: no cover + + def _encode_sync( + self, chunk_data: CodecInput, chunk_spec: ArraySpec + ) -> CodecOutput | None: + """Synchronously encode a single chunk. 
Override in subclasses to enable + SyncCodecPipeline support.""" + raise NotImplementedError # pragma: no cover + + @property + def supports_sync(self) -> bool: + """Whether this codec has synchronous encode/decode implementations.""" + return type(self)._decode_sync is not BaseCodec._decode_sync + async def _decode_single(self, chunk_data: CodecOutput, chunk_spec: ArraySpec) -> CodecInput: raise NotImplementedError # pragma: no cover diff --git a/src/zarr/api/asynchronous.py b/src/zarr/api/asynchronous.py index 6164cda957..6ad92025ac 100644 --- a/src/zarr/api/asynchronous.py +++ b/src/zarr/api/asynchronous.py @@ -386,7 +386,9 @@ async def open( is_v3_array = zarr_format == 3 and _metadata_dict.get("node_type") == "array" if is_v3_array or zarr_format == 2: return AsyncArray( - store_path=store_path, metadata=_metadata_dict, config=kwargs.get("config") + store_path=store_path, + metadata=_metadata_dict, + config=kwargs.get("config"), ) except (AssertionError, FileNotFoundError, NodeTypeValidationError): pass @@ -1279,7 +1281,10 @@ async def open_array( _warn_write_empty_chunks_kwarg() try: - return await AsyncArray.open(store_path, zarr_format=zarr_format) + return await AsyncArray.open( + store_path, + zarr_format=zarr_format, + ) except FileNotFoundError as err: if not store_path.read_only and mode in _CREATE_MODES: overwrite = _infer_overwrite(mode) diff --git a/src/zarr/codecs/blosc.py b/src/zarr/codecs/blosc.py index 5b91cfa005..fd1e3d449b 100644 --- a/src/zarr/codecs/blosc.py +++ b/src/zarr/codecs/blosc.py @@ -299,28 +299,29 @@ def _blosc_codec(self) -> Blosc: config_dict["typesize"] = self.typesize return Blosc.from_config(config_dict) + def _decode_sync(self, chunk_bytes: Buffer, chunk_spec: ArraySpec) -> Buffer: + return as_numpy_array_wrapper(self._blosc_codec.decode, chunk_bytes, chunk_spec.prototype) + + def _encode_sync(self, chunk_bytes: Buffer, chunk_spec: ArraySpec) -> Buffer | None: + # Since blosc only support host memory, we convert the input and output of the encoding + # between numpy array and buffer + return chunk_spec.prototype.buffer.from_bytes( + self._blosc_codec.encode(chunk_bytes.as_numpy_array()) + ) + async def _decode_single( self, chunk_bytes: Buffer, chunk_spec: ArraySpec, ) -> Buffer: - return await asyncio.to_thread( - as_numpy_array_wrapper, self._blosc_codec.decode, chunk_bytes, chunk_spec.prototype - ) + return await asyncio.to_thread(self._decode_sync, chunk_bytes, chunk_spec) async def _encode_single( self, chunk_bytes: Buffer, chunk_spec: ArraySpec, ) -> Buffer | None: - # Since blosc only support host memory, we convert the input and output of the encoding - # between numpy array and buffer - return await asyncio.to_thread( - lambda chunk: chunk_spec.prototype.buffer.from_bytes( - self._blosc_codec.encode(chunk.as_numpy_array()) - ), - chunk_bytes, - ) + return await asyncio.to_thread(self._encode_sync, chunk_bytes, chunk_spec) def compute_encoded_size(self, _input_byte_length: int, _chunk_spec: ArraySpec) -> int: raise NotImplementedError diff --git a/src/zarr/codecs/bytes.py b/src/zarr/codecs/bytes.py index 39c26bd4a8..87ca2427b1 100644 --- a/src/zarr/codecs/bytes.py +++ b/src/zarr/codecs/bytes.py @@ -67,7 +67,7 @@ def evolve_from_array_spec(self, array_spec: ArraySpec) -> Self: ) return self - async def _decode_single( + def _decode_sync( self, chunk_bytes: Buffer, chunk_spec: ArraySpec, @@ -95,7 +95,7 @@ async def _decode_single( ) return chunk_array - async def _encode_single( + def _encode_sync( self, chunk_array: NDBuffer, chunk_spec: 
ArraySpec, @@ -116,5 +116,19 @@ async def _encode_single( nd_array = nd_array.ravel().view(dtype="B") return chunk_spec.prototype.buffer.from_array_like(nd_array) + async def _decode_single( + self, + chunk_bytes: Buffer, + chunk_spec: ArraySpec, + ) -> NDBuffer: + return self._decode_sync(chunk_bytes, chunk_spec) + + async def _encode_single( + self, + chunk_array: NDBuffer, + chunk_spec: ArraySpec, + ) -> Buffer | None: + return self._encode_sync(chunk_array, chunk_spec) + def compute_encoded_size(self, input_byte_length: int, _chunk_spec: ArraySpec) -> int: return input_byte_length diff --git a/src/zarr/codecs/crc32c_.py b/src/zarr/codecs/crc32c_.py index 9536d0d558..3cd3aef873 100644 --- a/src/zarr/codecs/crc32c_.py +++ b/src/zarr/codecs/crc32c_.py @@ -31,11 +31,7 @@ def from_dict(cls, data: dict[str, JSON]) -> Self: def to_dict(self) -> dict[str, JSON]: return {"name": "crc32c"} - async def _decode_single( - self, - chunk_bytes: Buffer, - chunk_spec: ArraySpec, - ) -> Buffer: + def _decode_sync(self, chunk_bytes: Buffer, chunk_spec: ArraySpec) -> Buffer: data = chunk_bytes.as_numpy_array() crc32_bytes = data[-4:] inner_bytes = data[:-4] @@ -51,11 +47,7 @@ async def _decode_single( ) return chunk_spec.prototype.buffer.from_array_like(inner_bytes) - async def _encode_single( - self, - chunk_bytes: Buffer, - chunk_spec: ArraySpec, - ) -> Buffer | None: + def _encode_sync(self, chunk_bytes: Buffer, chunk_spec: ArraySpec) -> Buffer | None: data = chunk_bytes.as_numpy_array() # Calculate the checksum and "cast" it to a numpy array checksum = np.array( @@ -64,5 +56,19 @@ async def _encode_single( # Append the checksum (as bytes) to the data return chunk_spec.prototype.buffer.from_array_like(np.append(data, checksum.view("B"))) + async def _decode_single( + self, + chunk_bytes: Buffer, + chunk_spec: ArraySpec, + ) -> Buffer: + return self._decode_sync(chunk_bytes, chunk_spec) + + async def _encode_single( + self, + chunk_bytes: Buffer, + chunk_spec: ArraySpec, + ) -> Buffer | None: + return self._encode_sync(chunk_bytes, chunk_spec) + def compute_encoded_size(self, input_byte_length: int, _chunk_spec: ArraySpec) -> int: return input_byte_length + 4 diff --git a/src/zarr/codecs/gzip.py b/src/zarr/codecs/gzip.py index 610ca9dadd..2b165c108c 100644 --- a/src/zarr/codecs/gzip.py +++ b/src/zarr/codecs/gzip.py @@ -48,23 +48,25 @@ def from_dict(cls, data: dict[str, JSON]) -> Self: def to_dict(self) -> dict[str, JSON]: return {"name": "gzip", "configuration": {"level": self.level}} + def _decode_sync(self, chunk_bytes: Buffer, chunk_spec: ArraySpec) -> Buffer: + return as_numpy_array_wrapper(GZip(self.level).decode, chunk_bytes, chunk_spec.prototype) + + def _encode_sync(self, chunk_bytes: Buffer, chunk_spec: ArraySpec) -> Buffer | None: + return as_numpy_array_wrapper(GZip(self.level).encode, chunk_bytes, chunk_spec.prototype) + async def _decode_single( self, chunk_bytes: Buffer, chunk_spec: ArraySpec, ) -> Buffer: - return await asyncio.to_thread( - as_numpy_array_wrapper, GZip(self.level).decode, chunk_bytes, chunk_spec.prototype - ) + return await asyncio.to_thread(self._decode_sync, chunk_bytes, chunk_spec) async def _encode_single( self, chunk_bytes: Buffer, chunk_spec: ArraySpec, ) -> Buffer | None: - return await asyncio.to_thread( - as_numpy_array_wrapper, GZip(self.level).encode, chunk_bytes, chunk_spec.prototype - ) + return await asyncio.to_thread(self._encode_sync, chunk_bytes, chunk_spec) def compute_encoded_size( self, diff --git a/src/zarr/codecs/transpose.py 
b/src/zarr/codecs/transpose.py index a8570b6e8f..abbebfd090 100644 --- a/src/zarr/codecs/transpose.py +++ b/src/zarr/codecs/transpose.py @@ -95,20 +95,26 @@ def resolve_metadata(self, chunk_spec: ArraySpec) -> ArraySpec: prototype=chunk_spec.prototype, ) + def _decode_sync(self, chunk_array: NDBuffer, chunk_spec: ArraySpec) -> NDBuffer: + inverse_order = np.argsort(self.order) + return chunk_array.transpose(inverse_order) + + def _encode_sync(self, chunk_array: NDBuffer, _chunk_spec: ArraySpec) -> NDBuffer | None: + return chunk_array.transpose(self.order) + async def _decode_single( self, chunk_array: NDBuffer, chunk_spec: ArraySpec, ) -> NDBuffer: - inverse_order = np.argsort(self.order) - return chunk_array.transpose(inverse_order) + return self._decode_sync(chunk_array, chunk_spec) async def _encode_single( self, chunk_array: NDBuffer, _chunk_spec: ArraySpec, ) -> NDBuffer | None: - return chunk_array.transpose(self.order) + return self._encode_sync(chunk_array, _chunk_spec) def compute_encoded_size(self, input_byte_length: int, _chunk_spec: ArraySpec) -> int: return input_byte_length diff --git a/src/zarr/codecs/vlen_utf8.py b/src/zarr/codecs/vlen_utf8.py index fb1fb76126..16de25001c 100644 --- a/src/zarr/codecs/vlen_utf8.py +++ b/src/zarr/codecs/vlen_utf8.py @@ -40,12 +40,7 @@ def to_dict(self) -> dict[str, JSON]: def evolve_from_array_spec(self, array_spec: ArraySpec) -> Self: return self - # TODO: expand the tests for this function - async def _decode_single( - self, - chunk_bytes: Buffer, - chunk_spec: ArraySpec, - ) -> NDBuffer: + def _decode_sync(self, chunk_bytes: Buffer, chunk_spec: ArraySpec) -> NDBuffer: assert isinstance(chunk_bytes, Buffer) raw_bytes = chunk_bytes.as_array_like() @@ -55,15 +50,25 @@ async def _decode_single( as_string_dtype = decoded.astype(chunk_spec.dtype.to_native_dtype(), copy=False) return chunk_spec.prototype.nd_buffer.from_numpy_array(as_string_dtype) + def _encode_sync(self, chunk_array: NDBuffer, chunk_spec: ArraySpec) -> Buffer | None: + assert isinstance(chunk_array, NDBuffer) + return chunk_spec.prototype.buffer.from_bytes( + _vlen_utf8_codec.encode(chunk_array.as_numpy_array()) + ) + + async def _decode_single( + self, + chunk_bytes: Buffer, + chunk_spec: ArraySpec, + ) -> NDBuffer: + return self._decode_sync(chunk_bytes, chunk_spec) + async def _encode_single( self, chunk_array: NDBuffer, chunk_spec: ArraySpec, ) -> Buffer | None: - assert isinstance(chunk_array, NDBuffer) - return chunk_spec.prototype.buffer.from_bytes( - _vlen_utf8_codec.encode(chunk_array.as_numpy_array()) - ) + return self._encode_sync(chunk_array, chunk_spec) def compute_encoded_size(self, input_byte_length: int, _chunk_spec: ArraySpec) -> int: # what is input_byte_length for an object dtype? 
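        # (Presumably this is the in-memory byte length of the object array,
        # which does not bound the encoded size of variable-length strings --
        # hence the open question above.)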
@@ -86,11 +91,7 @@ def to_dict(self) -> dict[str, JSON]: def evolve_from_array_spec(self, array_spec: ArraySpec) -> Self: return self - async def _decode_single( - self, - chunk_bytes: Buffer, - chunk_spec: ArraySpec, - ) -> NDBuffer: + def _decode_sync(self, chunk_bytes: Buffer, chunk_spec: ArraySpec) -> NDBuffer: assert isinstance(chunk_bytes, Buffer) raw_bytes = chunk_bytes.as_array_like() @@ -99,15 +100,25 @@ async def _decode_single( decoded = _reshape_view(decoded, chunk_spec.shape) return chunk_spec.prototype.nd_buffer.from_numpy_array(decoded) + def _encode_sync(self, chunk_array: NDBuffer, chunk_spec: ArraySpec) -> Buffer | None: + assert isinstance(chunk_array, NDBuffer) + return chunk_spec.prototype.buffer.from_bytes( + _vlen_bytes_codec.encode(chunk_array.as_numpy_array()) + ) + + async def _decode_single( + self, + chunk_bytes: Buffer, + chunk_spec: ArraySpec, + ) -> NDBuffer: + return self._decode_sync(chunk_bytes, chunk_spec) + async def _encode_single( self, chunk_array: NDBuffer, chunk_spec: ArraySpec, ) -> Buffer | None: - assert isinstance(chunk_array, NDBuffer) - return chunk_spec.prototype.buffer.from_bytes( - _vlen_bytes_codec.encode(chunk_array.as_numpy_array()) - ) + return self._encode_sync(chunk_array, chunk_spec) def compute_encoded_size(self, input_byte_length: int, _chunk_spec: ArraySpec) -> int: # what is input_byte_length for an object dtype? diff --git a/src/zarr/codecs/zstd.py b/src/zarr/codecs/zstd.py index 27cc9a7777..fab4fd573e 100644 --- a/src/zarr/codecs/zstd.py +++ b/src/zarr/codecs/zstd.py @@ -71,23 +71,25 @@ def _zstd_codec(self) -> Zstd: config_dict = {"level": self.level, "checksum": self.checksum} return Zstd.from_config(config_dict) + def _decode_sync(self, chunk_bytes: Buffer, chunk_spec: ArraySpec) -> Buffer: + return as_numpy_array_wrapper(self._zstd_codec.decode, chunk_bytes, chunk_spec.prototype) + + def _encode_sync(self, chunk_bytes: Buffer, chunk_spec: ArraySpec) -> Buffer | None: + return as_numpy_array_wrapper(self._zstd_codec.encode, chunk_bytes, chunk_spec.prototype) + async def _decode_single( self, chunk_bytes: Buffer, chunk_spec: ArraySpec, ) -> Buffer: - return await asyncio.to_thread( - as_numpy_array_wrapper, self._zstd_codec.decode, chunk_bytes, chunk_spec.prototype - ) + return await asyncio.to_thread(self._decode_sync, chunk_bytes, chunk_spec) async def _encode_single( self, chunk_bytes: Buffer, chunk_spec: ArraySpec, ) -> Buffer | None: - return await asyncio.to_thread( - as_numpy_array_wrapper, self._zstd_codec.encode, chunk_bytes, chunk_spec.prototype - ) + return await asyncio.to_thread(self._encode_sync, chunk_bytes, chunk_spec) def compute_encoded_size(self, _input_byte_length: int, _chunk_spec: ArraySpec) -> int: raise NotImplementedError diff --git a/src/zarr/core/config.py b/src/zarr/core/config.py index f8f8ea4f5f..ddf38eaf25 100644 --- a/src/zarr/core/config.py +++ b/src/zarr/core/config.py @@ -102,7 +102,7 @@ def enable_gpu(self) -> ConfigSet: "threading": {"max_workers": None}, "json_indent": 2, "codec_pipeline": { - "path": "zarr.core.codec_pipeline.BatchedCodecPipeline", + "path": "zarr.experimental.sync_codecs.SyncCodecPipeline", "batch_size": 1, }, "codecs": { diff --git a/src/zarr/experimental/sync_codecs.py b/src/zarr/experimental/sync_codecs.py new file mode 100644 index 0000000000..a1cb6b76ee --- /dev/null +++ b/src/zarr/experimental/sync_codecs.py @@ -0,0 +1,558 @@ +"""Experimental synchronous codec pipeline. 
+ +The standard zarr codec pipeline (``BatchedCodecPipeline``) wraps fundamentally +synchronous operations (e.g. gzip compress/decompress) in ``asyncio.to_thread``. +The ``SyncCodecPipeline`` in this module eliminates that overhead by dispatching +the full codec chain for each chunk via ``ThreadPoolExecutor.map``, achieving +2-11x throughput improvements. + +Usage:: + + import zarr + from zarr.experimental.sync_codecs import SyncCodecPipeline + + arr = zarr.create_array( + store, + shape=(100, 100), + chunks=(32, 32), + dtype="float64", + codec_pipeline_class=SyncCodecPipeline, + ) +""" + +from __future__ import annotations + +import asyncio +import os +from concurrent.futures import ThreadPoolExecutor +from dataclasses import dataclass +from itertools import islice +from typing import TYPE_CHECKING, TypeVar + +from zarr.abc.codec import ( + ArrayArrayCodec, + ArrayBytesCodec, + ArrayBytesCodecPartialDecodeMixin, + ArrayBytesCodecPartialEncodeMixin, + BytesBytesCodec, + Codec, + CodecPipeline, +) +from zarr.core.buffer import Buffer, NDBuffer +from zarr.core.codec_pipeline import _unzip2, codecs_from_list, resolve_batched +from zarr.core.common import concurrent_map +from zarr.core.config import config +from zarr.core.indexing import SelectorTuple, is_scalar +from zarr.registry import register_pipeline + +if TYPE_CHECKING: + from collections.abc import Iterable, Iterator + from typing import Any, Self + + from zarr.abc.store import ByteGetter, ByteSetter + from zarr.core.array_spec import ArraySpec + from zarr.core.buffer import BufferPrototype + from zarr.core.chunk_grids import ChunkGrid + from zarr.core.dtype.wrapper import TBaseDType, TBaseScalar, ZDType + +__all__ = ["SyncCodecPipeline"] + +T = TypeVar("T") + + +# --------------------------------------------------------------------------- +# Pipeline helpers +# --------------------------------------------------------------------------- + +def _batched(iterable: Iterable[T], n: int) -> Iterable[tuple[T, ...]]: + if n < 1: + raise ValueError("n must be at least one") + it = iter(iterable) + while batch := tuple(islice(it, n)): + yield batch + + +def _fill_value_or_default(chunk_spec: ArraySpec) -> Any: + fill_value = chunk_spec.fill_value + if fill_value is None: + return chunk_spec.dtype.default_scalar() + return fill_value + + +def _get_pool() -> ThreadPoolExecutor: + """Lazily get or create the module-level thread pool.""" + global _POOL + if _POOL is None: + _POOL = ThreadPoolExecutor(max_workers=os.cpu_count()) + return _POOL + + +_POOL: ThreadPoolExecutor | None = None + + +# --------------------------------------------------------------------------- +# SyncCodecPipeline +# --------------------------------------------------------------------------- + +@dataclass(frozen=True) +class SyncCodecPipeline(CodecPipeline): + """A codec pipeline that runs full per-chunk codec chains in a thread pool. + + When all codecs implement ``_decode_sync`` / ``_encode_sync`` (i.e. + ``supports_sync`` is ``True``), the entire per-chunk codec chain is + dispatched as a single work item via ``ThreadPoolExecutor.map``. + + When a codec does *not* support sync (e.g. ``ShardingCodec``), the pipeline + falls back to the standard async ``decode`` / ``encode`` path from the base + class for that batch, preserving correctness while still benefiting from + sync dispatch for the inner pipeline. + """ + + array_array_codecs: tuple[ArrayArrayCodec, ...] + array_bytes_codec: ArrayBytesCodec + bytes_bytes_codecs: tuple[BytesBytesCodec, ...] 
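+    # (Field order mirrors the zarr v3 codec chain: array-array transforms,
+    # exactly one array-bytes serializer, then bytes-bytes codecs; __iter__
+    # below yields codecs in this same order.)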
+ batch_size: int + + @property + def _all_sync(self) -> bool: + """True when every codec in the chain supports synchronous dispatch.""" + return all(c.supports_sync for c in self) + + def evolve_from_array_spec(self, array_spec: ArraySpec) -> Self: + return type(self).from_codecs(c.evolve_from_array_spec(array_spec=array_spec) for c in self) + + @classmethod + def from_codecs(cls, codecs: Iterable[Codec], *, batch_size: int | None = None) -> Self: + array_array, array_bytes, bytes_bytes = codecs_from_list(list(codecs)) + return cls( + array_array_codecs=array_array, + array_bytes_codec=array_bytes, + bytes_bytes_codecs=bytes_bytes, + batch_size=batch_size or config.get("codec_pipeline.batch_size"), + ) + + @property + def supports_partial_decode(self) -> bool: + return (len(self.array_array_codecs) + len(self.bytes_bytes_codecs)) == 0 and isinstance( + self.array_bytes_codec, ArrayBytesCodecPartialDecodeMixin + ) + + @property + def supports_partial_encode(self) -> bool: + return (len(self.array_array_codecs) + len(self.bytes_bytes_codecs)) == 0 and isinstance( + self.array_bytes_codec, ArrayBytesCodecPartialEncodeMixin + ) + + def __iter__(self) -> Iterator[Codec]: + yield from self.array_array_codecs + yield self.array_bytes_codec + yield from self.bytes_bytes_codecs + + def validate( + self, + *, + shape: tuple[int, ...], + dtype: ZDType[TBaseDType, TBaseScalar], + chunk_grid: ChunkGrid, + ) -> None: + for codec in self: + codec.validate(shape=shape, dtype=dtype, chunk_grid=chunk_grid) + + def compute_encoded_size(self, byte_length: int, array_spec: ArraySpec) -> int: + for codec in self: + byte_length = codec.compute_encoded_size(byte_length, array_spec) + array_spec = codec.resolve_metadata(array_spec) + return byte_length + + # ------------------------------------------------------------------- + # Per-chunk codec chain (for pool.map dispatch) + # ------------------------------------------------------------------- + + def _resolve_metadata_chain(self, chunk_spec: ArraySpec) -> tuple[ + list[tuple[ArrayArrayCodec, ArraySpec]], + tuple[ArrayBytesCodec, ArraySpec], + list[tuple[BytesBytesCodec, ArraySpec]], + ]: + """Resolve metadata through the codec chain for a single chunk_spec.""" + aa_codecs_with_spec: list[tuple[ArrayArrayCodec, ArraySpec]] = [] + spec = chunk_spec + for aa_codec in self.array_array_codecs: + aa_codecs_with_spec.append((aa_codec, spec)) + spec = aa_codec.resolve_metadata(spec) + + ab_codec_with_spec = (self.array_bytes_codec, spec) + spec = self.array_bytes_codec.resolve_metadata(spec) + + bb_codecs_with_spec: list[tuple[BytesBytesCodec, ArraySpec]] = [] + for bb_codec in self.bytes_bytes_codecs: + bb_codecs_with_spec.append((bb_codec, spec)) + spec = bb_codec.resolve_metadata(spec) + + return (aa_codecs_with_spec, ab_codec_with_spec, bb_codecs_with_spec) + + def _decode_one( + self, + chunk_bytes: Buffer | None, + chunk_spec: ArraySpec, + aa_chain: list[tuple[ArrayArrayCodec, ArraySpec]], + ab_pair: tuple[ArrayBytesCodec, ArraySpec], + bb_chain: list[tuple[BytesBytesCodec, ArraySpec]], + ) -> NDBuffer | None: + """Decode a single chunk through the full codec chain, synchronously.""" + if chunk_bytes is None: + return None + + # bytes-bytes decode (reverse order) + for bb_codec, spec in reversed(bb_chain): + chunk_bytes = bb_codec._decode_sync(chunk_bytes, spec) + + # array-bytes decode + ab_codec, ab_spec = ab_pair + chunk_array = ab_codec._decode_sync(chunk_bytes, ab_spec) + + # array-array decode (reverse order) + for aa_codec, spec in reversed(aa_chain): + 
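            # (each codec sees the spec captured at its stage of the chain,
            # resolved once per batch by _resolve_metadata_chain)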
chunk_array = aa_codec._decode_sync(chunk_array, spec) + + return chunk_array + + def _encode_one( + self, + chunk_array: NDBuffer | None, + chunk_spec: ArraySpec, + ) -> Buffer | None: + """Encode a single chunk through the full codec chain, synchronously.""" + if chunk_array is None: + return None + + spec = chunk_spec + + # array-array encode + for aa_codec in self.array_array_codecs: + chunk_array = aa_codec._encode_sync(chunk_array, spec) + spec = aa_codec.resolve_metadata(spec) + + # array-bytes encode + chunk_bytes = self.array_bytes_codec._encode_sync(chunk_array, spec) + spec = self.array_bytes_codec.resolve_metadata(spec) + + # bytes-bytes encode + for bb_codec in self.bytes_bytes_codecs: + chunk_bytes = bb_codec._encode_sync(chunk_bytes, spec) + spec = bb_codec.resolve_metadata(spec) + + return chunk_bytes + + # ------------------------------------------------------------------- + # Top-level decode / encode (pool.map over full chain per chunk) + # ------------------------------------------------------------------- + + async def _decode_async( + self, + chunk_bytes_and_specs: list[tuple[Buffer | None, ArraySpec]], + ) -> Iterable[NDBuffer | None]: + """Async fallback: walk codecs one at a time (like BatchedCodecPipeline).""" + chunk_bytes_batch, chunk_specs = _unzip2(chunk_bytes_and_specs) + + for bb_codec in self.bytes_bytes_codecs[::-1]: + chunk_bytes_batch = list(await bb_codec.decode( + zip(chunk_bytes_batch, chunk_specs, strict=False) + )) + + chunk_array_batch: list[NDBuffer | None] = list(await self.array_bytes_codec.decode( + zip(chunk_bytes_batch, chunk_specs, strict=False) + )) + + for aa_codec in self.array_array_codecs[::-1]: + chunk_array_batch = list(await aa_codec.decode( + zip(chunk_array_batch, chunk_specs, strict=False) + )) + + return chunk_array_batch + + async def _encode_async( + self, + chunk_arrays_and_specs: list[tuple[NDBuffer | None, ArraySpec]], + ) -> Iterable[Buffer | None]: + """Async fallback: walk codecs one at a time (like BatchedCodecPipeline).""" + chunk_array_batch, chunk_specs = _unzip2(chunk_arrays_and_specs) + + for aa_codec in self.array_array_codecs: + chunk_array_batch = list(await aa_codec.encode( + zip(chunk_array_batch, chunk_specs, strict=False) + )) + chunk_specs = list(resolve_batched(aa_codec, chunk_specs)) + + chunk_bytes_batch: list[Buffer | None] = list(await self.array_bytes_codec.encode( + zip(chunk_array_batch, chunk_specs, strict=False) + )) + chunk_specs = list(resolve_batched(self.array_bytes_codec, chunk_specs)) + + for bb_codec in self.bytes_bytes_codecs: + chunk_bytes_batch = list(await bb_codec.encode( + zip(chunk_bytes_batch, chunk_specs, strict=False) + )) + chunk_specs = list(resolve_batched(bb_codec, chunk_specs)) + + return chunk_bytes_batch + + async def decode( + self, + chunk_bytes_and_specs: Iterable[tuple[Buffer | None, ArraySpec]], + ) -> Iterable[NDBuffer | None]: + items = list(chunk_bytes_and_specs) + if not items: + return [] + + if not self._all_sync: + return await self._decode_async(items) + + # Precompute the metadata chain once (same for all chunks in a batch) + _, first_spec = items[0] + aa_chain, ab_pair, bb_chain = self._resolve_metadata_chain(first_spec) + + pool = _get_pool() + loop = asyncio.get_running_loop() + + # Submit each chunk to the pool and wrap each Future for asyncio. 
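+        # (asyncio.wrap_future adapts the concurrent.futures.Future returned
+        # by pool.submit into an awaitable asyncio future, so gather() can
+        # collect results without blocking the event loop.)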
+ async_futures = [ + asyncio.wrap_future( + pool.submit(self._decode_one, item[0], item[1], aa_chain, ab_pair, bb_chain), + loop=loop, + ) + for item in items + ] + return await asyncio.gather(*async_futures) + + async def encode( + self, + chunk_arrays_and_specs: Iterable[tuple[NDBuffer | None, ArraySpec]], + ) -> Iterable[Buffer | None]: + items = list(chunk_arrays_and_specs) + if not items: + return [] + + if not self._all_sync: + return await self._encode_async(items) + + pool = _get_pool() + loop = asyncio.get_running_loop() + + # Submit each chunk to the pool and wrap each Future for asyncio. + async_futures = [ + asyncio.wrap_future( + pool.submit(self._encode_one, item[0], item[1]), + loop=loop, + ) + for item in items + ] + return await asyncio.gather(*async_futures) + + # ------------------------------------------------------------------- + # read / write (IO stays async, compute goes through pool.map) + # ------------------------------------------------------------------- + + async def read( + self, + batch_info: Iterable[tuple[ByteGetter, ArraySpec, SelectorTuple, SelectorTuple, bool]], + out: NDBuffer, + drop_axes: tuple[int, ...] = (), + ) -> None: + await concurrent_map( + [ + (single_batch_info, out, drop_axes) + for single_batch_info in _batched(batch_info, self.batch_size) + ], + self._read_batch, + config.get("async.concurrency"), + ) + + async def _read_batch( + self, + batch_info: Iterable[tuple[ByteGetter, ArraySpec, SelectorTuple, SelectorTuple, bool]], + out: NDBuffer, + drop_axes: tuple[int, ...] = (), + ) -> None: + batch_info = list(batch_info) + # Phase 1: IO -- fetch bytes from store (always async) + chunk_bytes_batch = await concurrent_map( + [(byte_getter, array_spec.prototype) for byte_getter, array_spec, *_ in batch_info], + lambda byte_getter, prototype: byte_getter.get(prototype), + config.get("async.concurrency"), + ) + + # Phase 2: Compute -- decode via pool.map + decode_items = [ + (chunk_bytes, chunk_spec) + for chunk_bytes, (_, chunk_spec, *_) in zip( + chunk_bytes_batch, batch_info, strict=False + ) + ] + chunk_array_batch: Iterable[NDBuffer | None] = await self.decode(decode_items) + + # Phase 3: Scatter into output buffer + for chunk_array, (_, chunk_spec, chunk_selection, out_selection, _) in zip( + chunk_array_batch, batch_info, strict=False + ): + if chunk_array is not None: + tmp = chunk_array[chunk_selection] + if drop_axes != (): + tmp = tmp.squeeze(axis=drop_axes) + out[out_selection] = tmp + else: + out[out_selection] = _fill_value_or_default(chunk_spec) + + async def write( + self, + batch_info: Iterable[tuple[ByteSetter, ArraySpec, SelectorTuple, SelectorTuple, bool]], + value: NDBuffer, + drop_axes: tuple[int, ...] 
= (), + ) -> None: + await concurrent_map( + [ + (single_batch_info, value, drop_axes) + for single_batch_info in _batched(batch_info, self.batch_size) + ], + self._write_batch, + config.get("async.concurrency"), + ) + + def _merge_chunk_array( + self, + existing_chunk_array: NDBuffer | None, + value: NDBuffer, + out_selection: SelectorTuple, + chunk_spec: ArraySpec, + chunk_selection: SelectorTuple, + is_complete_chunk: bool, + drop_axes: tuple[int, ...], + ) -> NDBuffer: + if ( + is_complete_chunk + and value.shape == chunk_spec.shape + and value[out_selection].shape == chunk_spec.shape + ): + return value + if existing_chunk_array is None: + chunk_array = chunk_spec.prototype.nd_buffer.create( + shape=chunk_spec.shape, + dtype=chunk_spec.dtype.to_native_dtype(), + order=chunk_spec.order, + fill_value=_fill_value_or_default(chunk_spec), + ) + else: + chunk_array = existing_chunk_array.copy() + if chunk_selection == () or is_scalar( + value.as_ndarray_like(), chunk_spec.dtype.to_native_dtype() + ): + chunk_value = value + else: + chunk_value = value[out_selection] + if drop_axes != (): + item = tuple( + None if idx in drop_axes else slice(None) + for idx in range(chunk_spec.ndim) + ) + chunk_value = chunk_value[item] + chunk_array[chunk_selection] = chunk_value + return chunk_array + + async def _write_batch( + self, + batch_info: Iterable[tuple[ByteSetter, ArraySpec, SelectorTuple, SelectorTuple, bool]], + value: NDBuffer, + drop_axes: tuple[int, ...] = (), + ) -> None: + batch_info = list(batch_info) + + # Phase 1: IO -- read existing bytes for non-complete chunks + async def _read_key( + byte_setter: ByteSetter | None, prototype: BufferPrototype + ) -> Buffer | None: + if byte_setter is None: + return None + return await byte_setter.get(prototype=prototype) + + chunk_bytes_batch: Iterable[Buffer | None] + chunk_bytes_batch = await concurrent_map( + [ + ( + None if is_complete_chunk else byte_setter, + chunk_spec.prototype, + ) + for byte_setter, chunk_spec, chunk_selection, _, is_complete_chunk in batch_info + ], + _read_key, + config.get("async.concurrency"), + ) + + # Phase 2: Compute -- decode existing chunks via pool.map + decode_items = [ + (chunk_bytes, chunk_spec) + for chunk_bytes, (_, chunk_spec, *_) in zip( + chunk_bytes_batch, batch_info, strict=False + ) + ] + chunk_array_decoded: Iterable[NDBuffer | None] = await self.decode(decode_items) + + # Phase 3: Merge (pure compute, single-threaded -- touches shared `value` buffer) + chunk_array_merged = [ + self._merge_chunk_array( + chunk_array, + value, + out_selection, + chunk_spec, + chunk_selection, + is_complete_chunk, + drop_axes, + ) + for chunk_array, ( + _, + chunk_spec, + chunk_selection, + out_selection, + is_complete_chunk, + ) in zip(chunk_array_decoded, batch_info, strict=False) + ] + + chunk_array_batch: list[NDBuffer | None] = [] + for chunk_array, (_, chunk_spec, *_) in zip(chunk_array_merged, batch_info, strict=False): + if chunk_array is None: + chunk_array_batch.append(None) + else: + if not chunk_spec.config.write_empty_chunks and chunk_array.all_equal( + _fill_value_or_default(chunk_spec) + ): + chunk_array_batch.append(None) + else: + chunk_array_batch.append(chunk_array) + + # Phase 4: Compute -- encode via pool.map + encode_items = [ + (chunk_array, chunk_spec) + for chunk_array, (_, chunk_spec, *_) in zip( + chunk_array_batch, batch_info, strict=False + ) + ] + chunk_bytes_batch = await self.encode(encode_items) + + # Phase 5: IO -- write to store + async def _write_key(byte_setter: ByteSetter, 
chunk_bytes: Buffer | None) -> None: + if chunk_bytes is None: + await byte_setter.delete() + else: + await byte_setter.set(chunk_bytes) + + await concurrent_map( + [ + (byte_setter, chunk_bytes) + for chunk_bytes, (byte_setter, *_) in zip( + chunk_bytes_batch, batch_info, strict=False + ) + ], + _write_key, + config.get("async.concurrency"), + ) + + +register_pipeline(SyncCodecPipeline) diff --git a/tests/test_config.py b/tests/test_config.py index c3102e8efe..fc33bd87cb 100644 --- a/tests/test_config.py +++ b/tests/test_config.py @@ -59,7 +59,7 @@ def test_config_defaults_set() -> None: "threading": {"max_workers": None}, "json_indent": 2, "codec_pipeline": { - "path": "zarr.core.codec_pipeline.BatchedCodecPipeline", + "path": "zarr.experimental.sync_codecs.SyncCodecPipeline", "batch_size": 1, }, "codecs": { @@ -132,7 +132,7 @@ def test_config_codec_pipeline_class(store: Store) -> None: # has default value assert get_pipeline_class().__name__ != "" - config.set({"codec_pipeline.name": "zarr.core.codec_pipeline.BatchedCodecPipeline"}) + config.set({"codec_pipeline.path": "zarr.core.codec_pipeline.BatchedCodecPipeline"}) assert get_pipeline_class() == zarr.core.codec_pipeline.BatchedCodecPipeline _mock = Mock() @@ -191,6 +191,10 @@ async def _encode_single(self, chunk_bytes: Buffer, chunk_spec: ArraySpec) -> Bu _mock.call() return None + def _encode_sync(self, chunk_bytes: Buffer, chunk_spec: ArraySpec) -> Buffer | None: + _mock.call() + return None + register_codec("blosc", MockBloscCodec) with config.set({"codecs.blosc": fully_qualified_name(MockBloscCodec)}): assert get_codec_class("blosc") == MockBloscCodec diff --git a/tests/test_sync_codec_pipeline.py b/tests/test_sync_codec_pipeline.py new file mode 100644 index 0000000000..ac5ec8fa2e --- /dev/null +++ b/tests/test_sync_codec_pipeline.py @@ -0,0 +1,306 @@ +"""Tests for zarr.experimental.sync_codecs module.""" + +from __future__ import annotations + +import numpy as np +import pytest + +import zarr +from zarr.codecs.bytes import BytesCodec +from zarr.codecs.gzip import GzipCodec +from zarr.codecs.transpose import TransposeCodec +from zarr.codecs.zstd import ZstdCodec +from zarr.core.array_spec import ArrayConfig, ArraySpec +from zarr.core.buffer import default_buffer_prototype +from zarr.core.dtype import get_data_type_from_native_dtype +from zarr.experimental.sync_codecs import SyncCodecPipeline +from zarr.storage import MemoryStore + + +def _make_array_spec( + shape: tuple[int, ...], dtype: np.dtype +) -> ArraySpec: + zdtype = get_data_type_from_native_dtype(dtype) + return ArraySpec( + shape=shape, + dtype=zdtype, + fill_value=zdtype.default_scalar(), + config=ArrayConfig(order="C", write_empty_chunks=True), + prototype=default_buffer_prototype(), + ) + + +def _make_nd_buffer(arr: np.ndarray) -> zarr.core.buffer.NDBuffer: + return default_buffer_prototype().nd_buffer.from_numpy_array(arr) + + +# --------------------------------------------------------------------------- +# Unit tests: supports_sync property +# --------------------------------------------------------------------------- + + +class TestSupportsSync: + def test_gzip_supports_sync(self): + assert GzipCodec().supports_sync + + def test_zstd_supports_sync(self): + assert ZstdCodec().supports_sync + + def test_bytes_supports_sync(self): + assert BytesCodec().supports_sync + + def test_transpose_supports_sync(self): + assert TransposeCodec(order=(0, 1)).supports_sync + + def test_sharding_does_not_support_sync(self): + from zarr.codecs.sharding import ShardingCodec + + 
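        # ShardingCodec implements only the async codec interface, so the
        # base-class sync hooks are left in place and supports_sync is False.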
assert not ShardingCodec(chunk_shape=(8,)).supports_sync + + +# --------------------------------------------------------------------------- +# Unit tests: individual codec sync roundtrips +# --------------------------------------------------------------------------- + + +class TestGzipCodecSync: + def test_roundtrip(self): + codec = GzipCodec(level=1) + arr = np.arange(100, dtype="float64") + spec = _make_array_spec(arr.shape, arr.dtype) + buf = default_buffer_prototype().buffer.from_array_like(arr.view("B")) + + encoded = codec._encode_sync(buf, spec) + assert encoded is not None + decoded = codec._decode_sync(encoded, spec) + result = np.frombuffer(decoded.as_numpy_array(), dtype="float64") + np.testing.assert_array_equal(arr, result) + + +class TestZstdCodecSync: + def test_roundtrip(self): + codec = ZstdCodec(level=1) + arr = np.arange(100, dtype="float64") + spec = _make_array_spec(arr.shape, arr.dtype) + buf = default_buffer_prototype().buffer.from_array_like(arr.view("B")) + + encoded = codec._encode_sync(buf, spec) + assert encoded is not None + decoded = codec._decode_sync(encoded, spec) + result = np.frombuffer(decoded.as_numpy_array(), dtype="float64") + np.testing.assert_array_equal(arr, result) + + +class TestBytesCodecSync: + def test_roundtrip(self): + codec = BytesCodec() + arr = np.arange(100, dtype="float64") + spec = _make_array_spec(arr.shape, arr.dtype) + nd_buf = _make_nd_buffer(arr) + + # Evolve from array spec (handles endianness) + codec = codec.evolve_from_array_spec(spec) + + encoded = codec._encode_sync(nd_buf, spec) + assert encoded is not None + decoded = codec._decode_sync(encoded, spec) + np.testing.assert_array_equal(arr, decoded.as_numpy_array()) + + +class TestTransposeCodecSync: + def test_roundtrip(self): + codec = TransposeCodec(order=(1, 0)) + arr = np.arange(12, dtype="float64").reshape(3, 4) + spec = _make_array_spec(arr.shape, arr.dtype) + nd_buf = _make_nd_buffer(arr) + + encoded = codec._encode_sync(nd_buf, spec) + assert encoded is not None + resolved_spec = codec.resolve_metadata(spec) + decoded = codec._decode_sync(encoded, resolved_spec) + np.testing.assert_array_equal(arr, decoded.as_numpy_array()) + + +# --------------------------------------------------------------------------- +# Unit tests: SyncCodecPipeline construction +# --------------------------------------------------------------------------- + + +class TestSyncCodecPipelineConstruction: + def test_from_codecs_valid(self): + pipeline = SyncCodecPipeline.from_codecs([BytesCodec(), GzipCodec(level=1)]) + assert isinstance(pipeline, SyncCodecPipeline) + assert len(pipeline.bytes_bytes_codecs) == 1 + assert isinstance(pipeline.array_bytes_codec, BytesCodec) + + def test_from_codecs_accepts_sharding(self): + from zarr.codecs.sharding import ShardingCodec + + pipeline = SyncCodecPipeline.from_codecs([ShardingCodec(chunk_shape=(8,))]) + assert isinstance(pipeline, SyncCodecPipeline) + assert not pipeline._all_sync + + def test_from_codecs_rejects_missing_array_bytes(self): + with pytest.raises(ValueError, match="Required ArrayBytesCodec"): + SyncCodecPipeline.from_codecs([GzipCodec()]) + + def test_from_codecs_with_transpose(self): + pipeline = SyncCodecPipeline.from_codecs([ + TransposeCodec(order=(1, 0)), + BytesCodec(), + GzipCodec(level=1), + ]) + assert len(pipeline.array_array_codecs) == 1 + assert isinstance(pipeline.array_array_codecs[0], TransposeCodec) + + +# --------------------------------------------------------------------------- +# Unit tests: SyncCodecPipeline 
encode/decode roundtrip +# --------------------------------------------------------------------------- + + +class TestSyncCodecPipelineRoundtrip: + @pytest.mark.asyncio + async def test_encode_decode_single_chunk(self): + pipeline = SyncCodecPipeline.from_codecs([BytesCodec(), GzipCodec(level=1)]) + arr = np.random.default_rng(42).standard_normal((32, 32)).astype("float64") + spec = _make_array_spec(arr.shape, arr.dtype) + pipeline = pipeline.evolve_from_array_spec(spec) + nd_buf = _make_nd_buffer(arr) + + encoded = await pipeline.encode([(nd_buf, spec)]) + decoded = await pipeline.decode([(list(encoded)[0], spec)]) + result = list(decoded)[0] + assert result is not None + np.testing.assert_array_equal(arr, result.as_numpy_array()) + + @pytest.mark.asyncio + async def test_encode_decode_multiple_chunks(self): + pipeline = SyncCodecPipeline.from_codecs([BytesCodec(), GzipCodec(level=1)]) + rng = np.random.default_rng(42) + spec = _make_array_spec((16, 16), np.dtype("float64")) + pipeline = pipeline.evolve_from_array_spec(spec) + chunks = [rng.standard_normal((16, 16)).astype("float64") for _ in range(10)] + nd_bufs = [_make_nd_buffer(c) for c in chunks] + + encoded = list(await pipeline.encode([(buf, spec) for buf in nd_bufs])) + decoded = list(await pipeline.decode([(enc, spec) for enc in encoded])) + for original, dec in zip(chunks, decoded): + assert dec is not None + np.testing.assert_array_equal(original, dec.as_numpy_array()) + + @pytest.mark.asyncio + async def test_encode_decode_empty_batch(self): + pipeline = SyncCodecPipeline.from_codecs([BytesCodec(), GzipCodec(level=1)]) + encoded = await pipeline.encode([]) + assert list(encoded) == [] + decoded = await pipeline.decode([]) + assert list(decoded) == [] + + @pytest.mark.asyncio + async def test_encode_decode_none_chunk(self): + pipeline = SyncCodecPipeline.from_codecs([BytesCodec(), GzipCodec(level=1)]) + spec = _make_array_spec((8,), np.dtype("float64")) + pipeline = pipeline.evolve_from_array_spec(spec) + + encoded = list(await pipeline.encode([(None, spec)])) + assert encoded[0] is None + + decoded = list(await pipeline.decode([(None, spec)])) + assert decoded[0] is None + + +# --------------------------------------------------------------------------- +# Integration tests: SyncCodecPipeline is the default pipeline +# --------------------------------------------------------------------------- + + +class TestSyncCodecPipelineDefault: + def test_create_array_uses_sync_pipeline(self): + store = MemoryStore() + arr = zarr.create_array( + store, + shape=(100, 100), + chunks=(32, 32), + dtype="float64", + ) + assert isinstance(arr.async_array.codec_pipeline, SyncCodecPipeline) + + data = np.random.default_rng(42).standard_normal((100, 100)) + arr[:] = data + np.testing.assert_array_equal(arr[:], data) + + def test_open_uses_sync_pipeline(self): + store = MemoryStore() + arr = zarr.create_array( + store, + shape=(50, 50), + chunks=(25, 25), + dtype="float64", + ) + data = np.random.default_rng(42).standard_normal((50, 50)) + arr[:] = data + + arr2 = zarr.open_array(store=store) + assert isinstance(arr2.async_array.codec_pipeline, SyncCodecPipeline) + np.testing.assert_array_equal(arr2[:], data) + + def test_from_array_uses_sync_pipeline(self): + store1 = MemoryStore() + arr1 = zarr.create_array( + store1, + shape=(20, 20), + chunks=(10, 10), + dtype="float64", + ) + data = np.random.default_rng(42).standard_normal((20, 20)) + arr1[:] = data + + store2 = MemoryStore() + arr2 = zarr.from_array(store2, data=arr1) + assert 
isinstance(arr2.async_array.codec_pipeline, SyncCodecPipeline) + np.testing.assert_array_equal(arr2[:], data) + + def test_partial_write(self): + store = MemoryStore() + arr = zarr.create_array( + store, + shape=(100,), + chunks=(10,), + dtype="int32", + fill_value=0, + ) + arr[5:15] = np.arange(10, dtype="int32") + 1 + result = arr[:] + expected = np.zeros(100, dtype="int32") + expected[5:15] = np.arange(10, dtype="int32") + 1 + np.testing.assert_array_equal(result, expected) + + def test_zstd_codec(self): + store = MemoryStore() + arr = zarr.create_array( + store, + shape=(50,), + chunks=(10,), + dtype="float32", + compressors=ZstdCodec(level=3), + ) + data = np.random.default_rng(42).standard_normal(50).astype("float32") + arr[:] = data + np.testing.assert_array_equal(arr[:], data) + + def test_config_switch_to_batched(self): + """Verify we can switch back to BatchedCodecPipeline via config.""" + from zarr.core.codec_pipeline import BatchedCodecPipeline + + zarr.config.set( + {"codec_pipeline.path": "zarr.core.codec_pipeline.BatchedCodecPipeline"} + ) + try: + store = MemoryStore() + arr = zarr.create_array(store, shape=(10,), dtype="float64") + assert isinstance(arr.async_array.codec_pipeline, BatchedCodecPipeline) + finally: + zarr.config.set( + {"codec_pipeline.path": "zarr.experimental.sync_codecs.SyncCodecPipeline"} + ) From 65d12301c0280f24d91a73e24a783acf1526761f Mon Sep 17 00:00:00 2001 From: Davis Vann Bennett Date: Thu, 19 Feb 2026 08:45:13 +0100 Subject: [PATCH 02/23] fix perf regressions --- src/zarr/experimental/sync_codecs.py | 232 +++++++++++++-------------- 1 file changed, 110 insertions(+), 122 deletions(-) diff --git a/src/zarr/experimental/sync_codecs.py b/src/zarr/experimental/sync_codecs.py index a1cb6b76ee..5418e45228 100644 --- a/src/zarr/experimental/sync_codecs.py +++ b/src/zarr/experimental/sync_codecs.py @@ -2,29 +2,18 @@ The standard zarr codec pipeline (``BatchedCodecPipeline``) wraps fundamentally synchronous operations (e.g. gzip compress/decompress) in ``asyncio.to_thread``. -The ``SyncCodecPipeline`` in this module eliminates that overhead by dispatching -the full codec chain for each chunk via ``ThreadPoolExecutor.map``, achieving -2-11x throughput improvements. +The ``SyncCodecPipeline`` in this module eliminates that overhead by running +per-chunk codec chains synchronously, achieving 2-11x throughput improvements. 
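+Store IO still goes through the async machinery; only the per-chunk compute
+path runs inline. Codecs opt in by overriding ``_decode_sync`` and
+``_encode_sync`` from ``zarr.abc.codec.BaseCodec``, which flips
+``supports_sync`` to ``True``. A minimal, hypothetical sketch (an identity
+codec for illustration only; a real codec must implement the rest of the
+codec interface as well)::
+
+    from zarr.abc.codec import BytesBytesCodec
+
+    class NoOpCodec(BytesBytesCodec):
+        def _decode_sync(self, chunk_bytes, chunk_spec):
+            return chunk_bytes  # identity: hand the bytes straight back
+
+        def _encode_sync(self, chunk_bytes, chunk_spec):
+            return chunk_bytes
+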
Usage:: import zarr - from zarr.experimental.sync_codecs import SyncCodecPipeline - - arr = zarr.create_array( - store, - shape=(100, 100), - chunks=(32, 32), - dtype="float64", - codec_pipeline_class=SyncCodecPipeline, - ) + + zarr.config.set({"codec_pipeline.path": "zarr.experimental.sync_codecs.SyncCodecPipeline"}) """ from __future__ import annotations -import asyncio -import os -from concurrent.futures import ThreadPoolExecutor from dataclasses import dataclass from itertools import islice from typing import TYPE_CHECKING, TypeVar @@ -64,6 +53,7 @@ # Pipeline helpers # --------------------------------------------------------------------------- + def _batched(iterable: Iterable[T], n: int) -> Iterable[tuple[T, ...]]: if n < 1: raise ValueError("n must be at least one") @@ -79,33 +69,22 @@ def _fill_value_or_default(chunk_spec: ArraySpec) -> Any: return fill_value -def _get_pool() -> ThreadPoolExecutor: - """Lazily get or create the module-level thread pool.""" - global _POOL - if _POOL is None: - _POOL = ThreadPoolExecutor(max_workers=os.cpu_count()) - return _POOL - - -_POOL: ThreadPoolExecutor | None = None - - # --------------------------------------------------------------------------- # SyncCodecPipeline # --------------------------------------------------------------------------- + @dataclass(frozen=True) class SyncCodecPipeline(CodecPipeline): - """A codec pipeline that runs full per-chunk codec chains in a thread pool. + """A codec pipeline that runs per-chunk codec chains synchronously. When all codecs implement ``_decode_sync`` / ``_encode_sync`` (i.e. - ``supports_sync`` is ``True``), the entire per-chunk codec chain is - dispatched as a single work item via ``ThreadPoolExecutor.map``. + ``supports_sync`` is ``True``), the per-chunk codec chain runs synchronously + without any ``asyncio.to_thread`` overhead. When a codec does *not* support sync (e.g. ``ShardingCodec``), the pipeline - falls back to the standard async ``decode`` / ``encode`` path from the base - class for that batch, preserving correctness while still benefiting from - sync dispatch for the inner pipeline. + falls back to the standard async ``decode`` / ``encode`` path, preserving + correctness while still benefiting from sync dispatch for the inner pipeline. """ array_array_codecs: tuple[ArrayArrayCodec, ...] @@ -165,10 +144,12 @@ def compute_encoded_size(self, byte_length: int, array_spec: ArraySpec) -> int: return byte_length # ------------------------------------------------------------------- - # Per-chunk codec chain (for pool.map dispatch) + # Per-chunk sync codec chain # ------------------------------------------------------------------- - def _resolve_metadata_chain(self, chunk_spec: ArraySpec) -> tuple[ + def _resolve_metadata_chain( + self, chunk_spec: ArraySpec + ) -> tuple[ list[tuple[ArrayArrayCodec, ArraySpec]], tuple[ArrayBytesCodec, ArraySpec], list[tuple[BytesBytesCodec, ArraySpec]], @@ -244,7 +225,7 @@ def _encode_one( return chunk_bytes # ------------------------------------------------------------------- - # Top-level decode / encode (pool.map over full chain per chunk) + # Async fallback for codecs that don't support sync (e.g. 
sharding) # ------------------------------------------------------------------- async def _decode_async( @@ -255,18 +236,18 @@ async def _decode_async( chunk_bytes_batch, chunk_specs = _unzip2(chunk_bytes_and_specs) for bb_codec in self.bytes_bytes_codecs[::-1]: - chunk_bytes_batch = list(await bb_codec.decode( - zip(chunk_bytes_batch, chunk_specs, strict=False) - )) + chunk_bytes_batch = list( + await bb_codec.decode(zip(chunk_bytes_batch, chunk_specs, strict=False)) + ) - chunk_array_batch: list[NDBuffer | None] = list(await self.array_bytes_codec.decode( - zip(chunk_bytes_batch, chunk_specs, strict=False) - )) + chunk_array_batch: list[NDBuffer | None] = list( + await self.array_bytes_codec.decode(zip(chunk_bytes_batch, chunk_specs, strict=False)) + ) for aa_codec in self.array_array_codecs[::-1]: - chunk_array_batch = list(await aa_codec.decode( - zip(chunk_array_batch, chunk_specs, strict=False) - )) + chunk_array_batch = list( + await aa_codec.decode(zip(chunk_array_batch, chunk_specs, strict=False)) + ) return chunk_array_batch @@ -278,24 +259,28 @@ async def _encode_async( chunk_array_batch, chunk_specs = _unzip2(chunk_arrays_and_specs) for aa_codec in self.array_array_codecs: - chunk_array_batch = list(await aa_codec.encode( - zip(chunk_array_batch, chunk_specs, strict=False) - )) + chunk_array_batch = list( + await aa_codec.encode(zip(chunk_array_batch, chunk_specs, strict=False)) + ) chunk_specs = list(resolve_batched(aa_codec, chunk_specs)) - chunk_bytes_batch: list[Buffer | None] = list(await self.array_bytes_codec.encode( - zip(chunk_array_batch, chunk_specs, strict=False) - )) + chunk_bytes_batch: list[Buffer | None] = list( + await self.array_bytes_codec.encode(zip(chunk_array_batch, chunk_specs, strict=False)) + ) chunk_specs = list(resolve_batched(self.array_bytes_codec, chunk_specs)) for bb_codec in self.bytes_bytes_codecs: - chunk_bytes_batch = list(await bb_codec.encode( - zip(chunk_bytes_batch, chunk_specs, strict=False) - )) + chunk_bytes_batch = list( + await bb_codec.encode(zip(chunk_bytes_batch, chunk_specs, strict=False)) + ) chunk_specs = list(resolve_batched(bb_codec, chunk_specs)) return chunk_bytes_batch + # ------------------------------------------------------------------- + # Top-level decode / encode + # ------------------------------------------------------------------- + async def decode( self, chunk_bytes_and_specs: Iterable[tuple[Buffer | None, ArraySpec]], @@ -307,22 +292,14 @@ async def decode( if not self._all_sync: return await self._decode_async(items) - # Precompute the metadata chain once (same for all chunks in a batch) + # All codecs support sync -- run the full chain inline (no threading). _, first_spec = items[0] aa_chain, ab_pair, bb_chain = self._resolve_metadata_chain(first_spec) - pool = _get_pool() - loop = asyncio.get_running_loop() - - # Submit each chunk to the pool and wrap each Future for asyncio. - async_futures = [ - asyncio.wrap_future( - pool.submit(self._decode_one, item[0], item[1], aa_chain, ab_pair, bb_chain), - loop=loop, - ) - for item in items + return [ + self._decode_one(chunk_bytes, chunk_spec, aa_chain, ab_pair, bb_chain) + for chunk_bytes, chunk_spec in items ] - return await asyncio.gather(*async_futures) async def encode( self, @@ -335,21 +312,11 @@ async def encode( if not self._all_sync: return await self._encode_async(items) - pool = _get_pool() - loop = asyncio.get_running_loop() - - # Submit each chunk to the pool and wrap each Future for asyncio. 
- async_futures = [ - asyncio.wrap_future( - pool.submit(self._encode_one, item[0], item[1]), - loop=loop, - ) - for item in items - ] - return await asyncio.gather(*async_futures) + # All codecs support sync -- run the full chain inline (no threading). + return [self._encode_one(chunk_array, chunk_spec) for chunk_array, chunk_spec in items] # ------------------------------------------------------------------- - # read / write (IO stays async, compute goes through pool.map) + # read / write (IO stays async, compute runs inline) # ------------------------------------------------------------------- async def read( @@ -381,16 +348,22 @@ async def _read_batch( config.get("async.concurrency"), ) - # Phase 2: Compute -- decode via pool.map + # Phase 2: Compute -- decode + scatter decode_items = [ (chunk_bytes, chunk_spec) - for chunk_bytes, (_, chunk_spec, *_) in zip( - chunk_bytes_batch, batch_info, strict=False - ) + for chunk_bytes, (_, chunk_spec, *_) in zip(chunk_bytes_batch, batch_info, strict=False) ] + chunk_array_batch: Iterable[NDBuffer | None] = await self.decode(decode_items) + self._scatter(chunk_array_batch, batch_info, out, drop_axes) - # Phase 3: Scatter into output buffer + @staticmethod + def _scatter( + chunk_array_batch: Iterable[NDBuffer | None], + batch_info: list[tuple[ByteGetter, ArraySpec, SelectorTuple, SelectorTuple, bool]], + out: NDBuffer, + drop_axes: tuple[int, ...], + ) -> None: for chunk_array, (_, chunk_spec, chunk_selection, out_selection, _) in zip( chunk_array_batch, batch_info, strict=False ): @@ -450,8 +423,7 @@ def _merge_chunk_array( chunk_value = value[out_selection] if drop_axes != (): item = tuple( - None if idx in drop_axes else slice(None) - for idx in range(chunk_spec.ndim) + None if idx in drop_axes else slice(None) for idx in range(chunk_spec.ndim) ) chunk_value = chunk_value[item] chunk_array[chunk_selection] = chunk_value @@ -473,7 +445,7 @@ async def _read_key( return None return await byte_setter.get(prototype=prototype) - chunk_bytes_batch: Iterable[Buffer | None] + chunk_bytes_batch: list[Buffer | None] chunk_bytes_batch = await concurrent_map( [ ( @@ -486,16 +458,58 @@ async def _read_key( config.get("async.concurrency"), ) - # Phase 2: Compute -- decode existing chunks via pool.map + # Phase 2: Compute -- decode, merge, encode decode_items = [ (chunk_bytes, chunk_spec) - for chunk_bytes, (_, chunk_spec, *_) in zip( - chunk_bytes_batch, batch_info, strict=False - ) + for chunk_bytes, (_, chunk_spec, *_) in zip(chunk_bytes_batch, batch_info, strict=False) ] + + encoded_batch = await self._write_batch_compute(decode_items, batch_info, value, drop_axes) + + # Phase 3: IO -- write to store + async def _write_key(byte_setter: ByteSetter, chunk_bytes: Buffer | None) -> None: + if chunk_bytes is None: + await byte_setter.delete() + else: + await byte_setter.set(chunk_bytes) + + await concurrent_map( + [ + (byte_setter, chunk_bytes) + for chunk_bytes, (byte_setter, *_) in zip(encoded_batch, batch_info, strict=False) + ], + _write_key, + config.get("async.concurrency"), + ) + + async def _write_batch_compute( + self, + decode_items: list[tuple[Buffer | None, ArraySpec]], + batch_info: list[tuple[ByteSetter, ArraySpec, SelectorTuple, SelectorTuple, bool]], + value: NDBuffer, + drop_axes: tuple[int, ...], + ) -> list[Buffer | None]: + """Async fallback for compute phase of _write_batch.""" chunk_array_decoded: Iterable[NDBuffer | None] = await self.decode(decode_items) - # Phase 3: Merge (pure compute, single-threaded -- touches shared `value` buffer) 
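+        # Merge new data into the decoded chunks and drop any chunk that is
+        # now entirely fill-value (see _merge_and_filter below).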
+ chunk_array_batch = self._merge_and_filter( + chunk_array_decoded, batch_info, value, drop_axes + ) + + encode_items = [ + (chunk_array, chunk_spec) + for chunk_array, (_, chunk_spec, *_) in zip(chunk_array_batch, batch_info, strict=False) + ] + return list(await self.encode(encode_items)) + + def _merge_and_filter( + self, + chunk_array_decoded: Iterable[NDBuffer | None], + batch_info: list[tuple[ByteSetter, ArraySpec, SelectorTuple, SelectorTuple, bool]], + value: NDBuffer, + drop_axes: tuple[int, ...], + ) -> list[NDBuffer | None]: + """Merge decoded chunks with new data and filter empty chunks.""" chunk_array_merged = [ self._merge_chunk_array( chunk_array, @@ -515,44 +529,18 @@ async def _read_key( ) in zip(chunk_array_decoded, batch_info, strict=False) ] - chunk_array_batch: list[NDBuffer | None] = [] + result: list[NDBuffer | None] = [] for chunk_array, (_, chunk_spec, *_) in zip(chunk_array_merged, batch_info, strict=False): if chunk_array is None: - chunk_array_batch.append(None) + result.append(None) else: if not chunk_spec.config.write_empty_chunks and chunk_array.all_equal( _fill_value_or_default(chunk_spec) ): - chunk_array_batch.append(None) + result.append(None) else: - chunk_array_batch.append(chunk_array) - - # Phase 4: Compute -- encode via pool.map - encode_items = [ - (chunk_array, chunk_spec) - for chunk_array, (_, chunk_spec, *_) in zip( - chunk_array_batch, batch_info, strict=False - ) - ] - chunk_bytes_batch = await self.encode(encode_items) - - # Phase 5: IO -- write to store - async def _write_key(byte_setter: ByteSetter, chunk_bytes: Buffer | None) -> None: - if chunk_bytes is None: - await byte_setter.delete() - else: - await byte_setter.set(chunk_bytes) - - await concurrent_map( - [ - (byte_setter, chunk_bytes) - for chunk_bytes, (byte_setter, *_) in zip( - chunk_bytes_batch, batch_info, strict=False - ) - ], - _write_key, - config.get("async.concurrency"), - ) + result.append(chunk_array) + return result register_pipeline(SyncCodecPipeline) From f979eaa2a01bb15cebe542dc6a15e6197a06817d Mon Sep 17 00:00:00 2001 From: Davis Vann Bennett Date: Thu, 19 Feb 2026 10:42:33 +0100 Subject: [PATCH 03/23] add partial encode / decode --- src/zarr/experimental/sync_codecs.py | 40 ++++++++++++++++++++++++++-- 1 file changed, 38 insertions(+), 2 deletions(-) diff --git a/src/zarr/experimental/sync_codecs.py b/src/zarr/experimental/sync_codecs.py index 5418e45228..a576f33a3c 100644 --- a/src/zarr/experimental/sync_codecs.py +++ b/src/zarr/experimental/sync_codecs.py @@ -341,6 +341,24 @@ async def _read_batch( drop_axes: tuple[int, ...] 
= (), ) -> None: batch_info = list(batch_info) + + if self.supports_partial_decode: + assert isinstance(self.array_bytes_codec, ArrayBytesCodecPartialDecodeMixin) + chunk_array_batch = await self.array_bytes_codec.decode_partial( + [ + (byte_getter, chunk_selection, chunk_spec) + for byte_getter, chunk_spec, chunk_selection, *_ in batch_info + ] + ) + for chunk_array, (_, chunk_spec, _, out_selection, _) in zip( + chunk_array_batch, batch_info, strict=False + ): + if chunk_array is not None: + out[out_selection] = chunk_array + else: + out[out_selection] = _fill_value_or_default(chunk_spec) + return + # Phase 1: IO -- fetch bytes from store (always async) chunk_bytes_batch = await concurrent_map( [(byte_getter, array_spec.prototype) for byte_getter, array_spec, *_ in batch_info], @@ -354,8 +372,8 @@ async def _read_batch( for chunk_bytes, (_, chunk_spec, *_) in zip(chunk_bytes_batch, batch_info, strict=False) ] - chunk_array_batch: Iterable[NDBuffer | None] = await self.decode(decode_items) - self._scatter(chunk_array_batch, batch_info, out, drop_axes) + chunk_array_batch_decoded: Iterable[NDBuffer | None] = await self.decode(decode_items) + self._scatter(chunk_array_batch_decoded, batch_info, out, drop_axes) @staticmethod def _scatter( @@ -437,6 +455,24 @@ async def _write_batch( ) -> None: batch_info = list(batch_info) + if self.supports_partial_encode: + assert isinstance(self.array_bytes_codec, ArrayBytesCodecPartialEncodeMixin) + if len(value.shape) == 0: + await self.array_bytes_codec.encode_partial( + [ + (byte_setter, value, chunk_selection, chunk_spec) + for byte_setter, chunk_spec, chunk_selection, _, _ in batch_info + ], + ) + else: + await self.array_bytes_codec.encode_partial( + [ + (byte_setter, value[out_selection], chunk_selection, chunk_spec) + for byte_setter, chunk_spec, chunk_selection, out_selection, _ in batch_info + ], + ) + return + # Phase 1: IO -- read existing bytes for non-complete chunks async def _read_key( byte_setter: ByteSetter | None, prototype: BufferPrototype From a934899eabec1abf6565511f0a2c3b125528a5d6 Mon Sep 17 00:00:00 2001 From: Davis Vann Bennett Date: Thu, 19 Feb 2026 11:25:35 +0100 Subject: [PATCH 04/23] add sync hotpath --- src/zarr/abc/codec.py | 27 ++- src/zarr/abc/store.py | 26 +++ src/zarr/codecs/gzip.py | 9 +- src/zarr/core/array.py | 276 ++++++++++++++++++++++++++- src/zarr/experimental/sync_codecs.py | 97 ++++++++++ src/zarr/storage/_common.py | 23 +++ src/zarr/storage/_local.py | 49 +++++ src/zarr/storage/_memory.py | 40 ++++ 8 files changed, 534 insertions(+), 13 deletions(-) diff --git a/src/zarr/abc/codec.py b/src/zarr/abc/codec.py index 8b0401d6bd..85f9e5eedd 100644 --- a/src/zarr/abc/codec.py +++ b/src/zarr/abc/codec.py @@ -142,9 +142,7 @@ def _decode_sync(self, chunk_data: CodecOutput, chunk_spec: ArraySpec) -> CodecI SyncCodecPipeline support.""" raise NotImplementedError # pragma: no cover - def _encode_sync( - self, chunk_data: CodecInput, chunk_spec: ArraySpec - ) -> CodecOutput | None: + def _encode_sync(self, chunk_data: CodecInput, chunk_spec: ArraySpec) -> CodecOutput | None: """Synchronously encode a single chunk. Override in subclasses to enable SyncCodecPipeline support.""" raise NotImplementedError # pragma: no cover @@ -476,6 +474,29 @@ async def write( """ ... 
+ @property + def supports_sync_io(self) -> bool: + """Whether this pipeline supports fully synchronous read/write.""" + return False + + def read_sync( + self, + batch_info: Iterable[tuple[ByteGetter, ArraySpec, SelectorTuple, SelectorTuple, bool]], + out: NDBuffer, + drop_axes: tuple[int, ...] = (), + ) -> None: + """Synchronous read path. Only available on pipelines that support it.""" + raise NotImplementedError + + def write_sync( + self, + batch_info: Iterable[tuple[ByteSetter, ArraySpec, SelectorTuple, SelectorTuple, bool]], + value: NDBuffer, + drop_axes: tuple[int, ...] = (), + ) -> None: + """Synchronous write path. Only available on pipelines that support it.""" + raise NotImplementedError + async def _batching_helper( func: Callable[[CodecInput, ArraySpec], Awaitable[CodecOutput | None]], diff --git a/src/zarr/abc/store.py b/src/zarr/abc/store.py index 87df89a683..59c711b773 100644 --- a/src/zarr/abc/store.py +++ b/src/zarr/abc/store.py @@ -524,6 +524,32 @@ def supports_partial_writes(self) -> Literal[False]: """ return False + @property + def supports_sync(self) -> bool: + """Does the store support synchronous get/set/delete? + + When True, the sync codec pipeline can bypass the event loop for IO. + Override in subclasses that have native sync implementations. + """ + return False + + def get_sync( + self, + key: str, + prototype: BufferPrototype, + byte_range: ByteRequest | None = None, + ) -> Buffer | None: + """Synchronous version of get(). Only available when supports_sync is True.""" + raise NotImplementedError + + def set_sync(self, key: str, value: Buffer) -> None: + """Synchronous version of set(). Only available when supports_sync is True.""" + raise NotImplementedError + + def delete_sync(self, key: str) -> None: + """Synchronous version of delete(). 
Only available when supports_sync is True.""" + raise NotImplementedError + @property @abstractmethod def supports_listing(self) -> bool: diff --git a/src/zarr/codecs/gzip.py b/src/zarr/codecs/gzip.py index 2b165c108c..1a027fe8a2 100644 --- a/src/zarr/codecs/gzip.py +++ b/src/zarr/codecs/gzip.py @@ -2,6 +2,7 @@ import asyncio from dataclasses import dataclass +from functools import cached_property from typing import TYPE_CHECKING from numcodecs.gzip import GZip @@ -48,11 +49,15 @@ def from_dict(cls, data: dict[str, JSON]) -> Self: def to_dict(self) -> dict[str, JSON]: return {"name": "gzip", "configuration": {"level": self.level}} + @cached_property + def _gzip_codec(self) -> GZip: + return GZip(self.level) + def _decode_sync(self, chunk_bytes: Buffer, chunk_spec: ArraySpec) -> Buffer: - return as_numpy_array_wrapper(GZip(self.level).decode, chunk_bytes, chunk_spec.prototype) + return as_numpy_array_wrapper(self._gzip_codec.decode, chunk_bytes, chunk_spec.prototype) def _encode_sync(self, chunk_bytes: Buffer, chunk_spec: ArraySpec) -> Buffer | None: - return as_numpy_array_wrapper(GZip(self.level).encode, chunk_bytes, chunk_spec.prototype) + return as_numpy_array_wrapper(self._gzip_codec.encode, chunk_bytes, chunk_spec.prototype) async def _decode_single( self, diff --git a/src/zarr/core/array.py b/src/zarr/core/array.py index 564d0e915a..5800bf0cec 100644 --- a/src/zarr/core/array.py +++ b/src/zarr/core/array.py @@ -1973,6 +1973,16 @@ def config(self) -> ArrayConfig: """ return self.async_array.config + def _can_use_sync_path(self) -> bool: + """Check if we can bypass the event loop entirely for read/write.""" + pipeline = self.async_array.codec_pipeline + store_path = self.async_array.store_path + return ( + getattr(pipeline, "supports_sync_io", False) + and not pipeline.supports_partial_decode + and getattr(store_path, "supports_sync", False) + ) + @classmethod @deprecated("Use zarr.create_array instead.", category=ZarrDeprecationWarning) def create( @@ -3049,9 +3059,21 @@ def get_basic_selection( if prototype is None: prototype = default_buffer_prototype() + indexer = BasicIndexer(selection, self.shape, self.metadata.chunk_grid) + if self._can_use_sync_path(): + return _get_selection_sync( + self.async_array.store_path, + self.async_array.metadata, + self.async_array.codec_pipeline, + self.async_array.config, + indexer, + out=out, + fields=fields, + prototype=prototype, + ) return sync( self.async_array._get_selection( - BasicIndexer(selection, self.shape, self.metadata.chunk_grid), + indexer, out=out, fields=fields, prototype=prototype, @@ -3159,6 +3181,18 @@ def set_basic_selection( if prototype is None: prototype = default_buffer_prototype() indexer = BasicIndexer(selection, self.shape, self.metadata.chunk_grid) + if self._can_use_sync_path(): + _set_selection_sync( + self.async_array.store_path, + self.async_array.metadata, + self.async_array.codec_pipeline, + self.async_array.config, + indexer, + value, + fields=fields, + prototype=prototype, + ) + return sync(self.async_array._set_selection(indexer, value, fields=fields, prototype=prototype)) def get_orthogonal_selection( @@ -3287,6 +3321,17 @@ def get_orthogonal_selection( if prototype is None: prototype = default_buffer_prototype() indexer = OrthogonalIndexer(selection, self.shape, self.metadata.chunk_grid) + if self._can_use_sync_path(): + return _get_selection_sync( + self.async_array.store_path, + self.async_array.metadata, + self.async_array.codec_pipeline, + self.async_array.config, + indexer, + out=out, + fields=fields, + 
prototype=prototype, + ) return sync( self.async_array._get_selection( indexer=indexer, out=out, fields=fields, prototype=prototype @@ -3406,9 +3451,19 @@ def set_orthogonal_selection( if prototype is None: prototype = default_buffer_prototype() indexer = OrthogonalIndexer(selection, self.shape, self.metadata.chunk_grid) - return sync( - self.async_array._set_selection(indexer, value, fields=fields, prototype=prototype) - ) + if self._can_use_sync_path(): + _set_selection_sync( + self.async_array.store_path, + self.async_array.metadata, + self.async_array.codec_pipeline, + self.async_array.config, + indexer, + value, + fields=fields, + prototype=prototype, + ) + return + sync(self.async_array._set_selection(indexer, value, fields=fields, prototype=prototype)) def get_mask_selection( self, @@ -3494,6 +3549,17 @@ def get_mask_selection( if prototype is None: prototype = default_buffer_prototype() indexer = MaskIndexer(mask, self.shape, self.metadata.chunk_grid) + if self._can_use_sync_path(): + return _get_selection_sync( + self.async_array.store_path, + self.async_array.metadata, + self.async_array.codec_pipeline, + self.async_array.config, + indexer, + out=out, + fields=fields, + prototype=prototype, + ) return sync( self.async_array._get_selection( indexer=indexer, out=out, fields=fields, prototype=prototype @@ -3584,6 +3650,18 @@ def set_mask_selection( if prototype is None: prototype = default_buffer_prototype() indexer = MaskIndexer(mask, self.shape, self.metadata.chunk_grid) + if self._can_use_sync_path(): + _set_selection_sync( + self.async_array.store_path, + self.async_array.metadata, + self.async_array.codec_pipeline, + self.async_array.config, + indexer, + value, + fields=fields, + prototype=prototype, + ) + return sync(self.async_array._set_selection(indexer, value, fields=fields, prototype=prototype)) def get_coordinate_selection( @@ -3672,11 +3750,23 @@ def get_coordinate_selection( if prototype is None: prototype = default_buffer_prototype() indexer = CoordinateIndexer(selection, self.shape, self.metadata.chunk_grid) - out_array = sync( - self.async_array._get_selection( - indexer=indexer, out=out, fields=fields, prototype=prototype + if self._can_use_sync_path(): + out_array = _get_selection_sync( + self.async_array.store_path, + self.async_array.metadata, + self.async_array.codec_pipeline, + self.async_array.config, + indexer, + out=out, + fields=fields, + prototype=prototype, + ) + else: + out_array = sync( + self.async_array._get_selection( + indexer=indexer, out=out, fields=fields, prototype=prototype + ) ) - ) if hasattr(out_array, "shape"): # restore shape @@ -3786,6 +3876,18 @@ def set_coordinate_selection( f"elements with an array of {value.shape[0]} elements." 
) + if self._can_use_sync_path(): + _set_selection_sync( + self.async_array.store_path, + self.async_array.metadata, + self.async_array.codec_pipeline, + self.async_array.config, + indexer, + value, + fields=fields, + prototype=prototype, + ) + return sync(self.async_array._set_selection(indexer, value, fields=fields, prototype=prototype)) def get_block_selection( @@ -3887,6 +3989,17 @@ def get_block_selection( if prototype is None: prototype = default_buffer_prototype() indexer = BlockIndexer(selection, self.shape, self.metadata.chunk_grid) + if self._can_use_sync_path(): + return _get_selection_sync( + self.async_array.store_path, + self.async_array.metadata, + self.async_array.codec_pipeline, + self.async_array.config, + indexer, + out=out, + fields=fields, + prototype=prototype, + ) return sync( self.async_array._get_selection( indexer=indexer, out=out, fields=fields, prototype=prototype @@ -3988,6 +4101,18 @@ def set_block_selection( if prototype is None: prototype = default_buffer_prototype() indexer = BlockIndexer(selection, self.shape, self.metadata.chunk_grid) + if self._can_use_sync_path(): + _set_selection_sync( + self.async_array.store_path, + self.async_array.metadata, + self.async_array.codec_pipeline, + self.async_array.config, + indexer, + value, + fields=fields, + prototype=prototype, + ) + return sync(self.async_array._set_selection(indexer, value, fields=fields, prototype=prototype)) @property @@ -5619,6 +5744,141 @@ async def _get_selection( return out_buffer.as_ndarray_like() +def _get_selection_sync( + store_path: StorePath, + metadata: ArrayMetadata, + codec_pipeline: CodecPipeline, + config: ArrayConfig, + indexer: Indexer, + *, + prototype: BufferPrototype, + out: NDBuffer | None = None, + fields: Fields | None = None, +) -> NDArrayLikeOrScalar: + """Synchronous version of _get_selection — bypasses the event loop entirely.""" + # Get dtype from metadata + if metadata.zarr_format == 2: + zdtype = metadata.dtype + else: + zdtype = metadata.data_type + dtype = zdtype.to_native_dtype() + + # Determine memory order + if metadata.zarr_format == 2: + order = metadata.order + else: + order = config.order + + # check fields are sensible + out_dtype = check_fields(fields, dtype) + + # setup output buffer + if out is not None: + if isinstance(out, NDBuffer): + out_buffer = out + else: + raise TypeError(f"out argument needs to be an NDBuffer. Got {type(out)!r}") + if out_buffer.shape != indexer.shape: + raise ValueError( + f"shape of out argument doesn't match. 
Expected {indexer.shape}, got {out.shape}" + ) + else: + out_buffer = prototype.nd_buffer.empty( + shape=indexer.shape, + dtype=out_dtype, + order=order, + ) + if product(indexer.shape) > 0: + _config = config + if metadata.zarr_format == 2: + _config = replace(_config, order=order) + + codec_pipeline.read_sync( + [ + ( + store_path / metadata.encode_chunk_key(chunk_coords), + metadata.get_chunk_spec(chunk_coords, _config, prototype=prototype), + chunk_selection, + out_selection, + is_complete_chunk, + ) + for chunk_coords, chunk_selection, out_selection, is_complete_chunk in indexer + ], + out_buffer, + drop_axes=indexer.drop_axes, + ) + if isinstance(indexer, BasicIndexer) and indexer.shape == (): + return out_buffer.as_scalar() + return out_buffer.as_ndarray_like() + + +def _set_selection_sync( + store_path: StorePath, + metadata: ArrayMetadata, + codec_pipeline: CodecPipeline, + config: ArrayConfig, + indexer: Indexer, + value: npt.ArrayLike, + *, + prototype: BufferPrototype, + fields: Fields | None = None, +) -> None: + """Synchronous version of _set_selection — bypasses the event loop entirely.""" + # Get dtype from metadata + if metadata.zarr_format == 2: + zdtype = metadata.dtype + else: + zdtype = metadata.data_type + dtype = zdtype.to_native_dtype() + + # check fields are sensible + check_fields(fields, dtype) + fields = check_no_multi_fields(fields) + + # check value shape + if np.isscalar(value): + array_like = prototype.buffer.create_zero_length().as_array_like() + if isinstance(array_like, np._typing._SupportsArrayFunc): + array_like_ = cast("np._typing._SupportsArrayFunc", array_like) + value = np.asanyarray(value, dtype=dtype, like=array_like_) + else: + if not hasattr(value, "shape"): + value = np.asarray(value, dtype) + if not hasattr(value, "dtype") or value.dtype.name != dtype.name: + if hasattr(value, "astype"): + value = value.astype(dtype=dtype, order="A") + else: + value = np.array(value, dtype=dtype, order="A") + value = cast("NDArrayLike", value) + + value_buffer = prototype.nd_buffer.from_ndarray_like(value) + + # Determine memory order + if metadata.zarr_format == 2: + order = metadata.order + else: + order = config.order + + _config = config + if metadata.zarr_format == 2: + _config = replace(_config, order=order) + + codec_pipeline.write_sync( + [ + ( + store_path / metadata.encode_chunk_key(chunk_coords), + metadata.get_chunk_spec(chunk_coords, _config, prototype), + chunk_selection, + out_selection, + is_complete_chunk, + ) + for chunk_coords, chunk_selection, out_selection, is_complete_chunk in indexer + ], + value_buffer, + drop_axes=indexer.drop_axes, + ) + + async def _getitem( store_path: StorePath, metadata: ArrayMetadata, diff --git a/src/zarr/experimental/sync_codecs.py b/src/zarr/experimental/sync_codecs.py index a576f33a3c..70080cf3cb 100644 --- a/src/zarr/experimental/sync_codecs.py +++ b/src/zarr/experimental/sync_codecs.py @@ -578,5 +578,102 @@ def _merge_and_filter( result.append(chunk_array) return result + # ------------------------------------------------------------------- + # Fully synchronous read / write (bypass event loop entirely) + # ------------------------------------------------------------------- + + @property + def supports_sync_io(self) -> bool: + return self._all_sync + + def read_sync( + self, + batch_info: Iterable[tuple[Any, ArraySpec, SelectorTuple, SelectorTuple, bool]], + out: NDBuffer, + drop_axes: tuple[int, ...] 
= (), + ) -> None: + batch_info_list = list(batch_info) + if not batch_info_list: + return + + # Resolve metadata chain once (all chunks share the same spec structure) + _, first_spec, *_ = batch_info_list[0] + aa_chain, ab_pair, bb_chain = self._resolve_metadata_chain(first_spec) + + for byte_getter, chunk_spec, chunk_selection, out_selection, _ in batch_info_list: + # IO: sync store read + chunk_bytes: Buffer | None = byte_getter.get_sync(prototype=chunk_spec.prototype) + + # Compute: decode through codec chain + chunk_array = self._decode_one(chunk_bytes, chunk_spec, aa_chain, ab_pair, bb_chain) + + # Scatter into output buffer + if chunk_array is not None: + tmp = chunk_array[chunk_selection] + if drop_axes != (): + tmp = tmp.squeeze(axis=drop_axes) + out[out_selection] = tmp + else: + out[out_selection] = _fill_value_or_default(chunk_spec) + + def write_sync( + self, + batch_info: Iterable[tuple[Any, ArraySpec, SelectorTuple, SelectorTuple, bool]], + value: NDBuffer, + drop_axes: tuple[int, ...] = (), + ) -> None: + batch_info_list = list(batch_info) + if not batch_info_list: + return + + for ( + byte_setter, + chunk_spec, + chunk_selection, + out_selection, + is_complete_chunk, + ) in batch_info_list: + # Phase 1: Read existing chunk if needed (for partial writes) + existing_bytes: Buffer | None = None + if not is_complete_chunk: + existing_bytes = byte_setter.get_sync(prototype=chunk_spec.prototype) + + # Phase 2: Decode existing chunk + existing_array: NDBuffer | None = None + if existing_bytes is not None: + aa_chain, ab_pair, bb_chain = self._resolve_metadata_chain(chunk_spec) + existing_array = self._decode_one( + existing_bytes, chunk_spec, aa_chain, ab_pair, bb_chain + ) + + # Phase 3: Merge + chunk_array: NDBuffer | None = self._merge_chunk_array( + existing_array, + value, + out_selection, + chunk_spec, + chunk_selection, + is_complete_chunk, + drop_axes, + ) + + # Phase 4: Check empty chunk + if ( + chunk_array is not None + and not chunk_spec.config.write_empty_chunks + and chunk_array.all_equal(_fill_value_or_default(chunk_spec)) + ): + chunk_array = None + + # Phase 5: Encode and write + if chunk_array is None: + byte_setter.delete_sync() + else: + chunk_bytes = self._encode_one(chunk_array, chunk_spec) + if chunk_bytes is None: + byte_setter.delete_sync() + else: + byte_setter.set_sync(chunk_bytes) + register_pipeline(SyncCodecPipeline) diff --git a/src/zarr/storage/_common.py b/src/zarr/storage/_common.py index 4bea04f024..4814702239 100644 --- a/src/zarr/storage/_common.py +++ b/src/zarr/storage/_common.py @@ -228,6 +228,29 @@ async def is_empty(self) -> bool: """ return await self.store.is_empty(self.path) + @property + def supports_sync(self) -> bool: + """Whether the underlying store supports synchronous operations.""" + return self.store.supports_sync + + def get_sync( + self, + prototype: BufferPrototype | None = None, + byte_range: ByteRequest | None = None, + ) -> Buffer | None: + """Synchronous read from the store.""" + if prototype is None: + prototype = default_buffer_prototype() + return self.store.get_sync(self.path, prototype=prototype, byte_range=byte_range) + + def set_sync(self, value: Buffer) -> None: + """Synchronous write to the store.""" + self.store.set_sync(self.path, value) + + def delete_sync(self) -> None: + """Synchronous delete from the store.""" + self.store.delete_sync(self.path) + def __truediv__(self, other: str) -> StorePath: """Combine this store path with another path""" return self.__class__(self.store, _dereference_path(self.path, 
other)) diff --git a/src/zarr/storage/_local.py b/src/zarr/storage/_local.py index 80233a112d..61e0795e1e 100644 --- a/src/zarr/storage/_local.py +++ b/src/zarr/storage/_local.py @@ -187,6 +187,55 @@ def __repr__(self) -> str: def __eq__(self, other: object) -> bool: return isinstance(other, type(self)) and self.root == other.root + @property + def supports_sync(self) -> bool: + return True + + def get_sync( + self, + key: str, + prototype: BufferPrototype | None = None, + byte_range: ByteRequest | None = None, + ) -> Buffer | None: + if prototype is None: + prototype = default_buffer_prototype() + if not self._is_open: + if not self.read_only: + self.root.mkdir(parents=True, exist_ok=True) + if not self.root.exists(): + raise FileNotFoundError(f"{self.root} does not exist") + self._is_open = True + assert isinstance(key, str) + path = self.root / key + try: + return _get(path, prototype, byte_range) + except (FileNotFoundError, IsADirectoryError, NotADirectoryError): + return None + + def set_sync(self, key: str, value: Buffer) -> None: + if not self._is_open: + if not self.read_only: + self.root.mkdir(parents=True, exist_ok=True) + if not self.root.exists(): + raise FileNotFoundError(f"{self.root} does not exist") + self._is_open = True + self._check_writable() + assert isinstance(key, str) + if not isinstance(value, Buffer): + raise TypeError( + f"LocalStore.set(): `value` must be a Buffer instance. Got an instance of {type(value)} instead." + ) + path = self.root / key + _put(path, value) + + def delete_sync(self, key: str) -> None: + self._check_writable() + path = self.root / key + if path.is_dir(): + shutil.rmtree(path) + else: + path.unlink(missing_ok=True) + async def get( self, key: str, diff --git a/src/zarr/storage/_memory.py b/src/zarr/storage/_memory.py index e6f9b7a512..46b3e409da 100644 --- a/src/zarr/storage/_memory.py +++ b/src/zarr/storage/_memory.py @@ -77,6 +77,46 @@ def __eq__(self, other: object) -> bool: and self.read_only == other.read_only ) + @property + def supports_sync(self) -> bool: + return True + + def get_sync( + self, + key: str, + prototype: BufferPrototype | None = None, + byte_range: ByteRequest | None = None, + ) -> Buffer | None: + if prototype is None: + prototype = default_buffer_prototype() + if not self._is_open: + self._is_open = True + assert isinstance(key, str) + try: + value = self._store_dict[key] + start, stop = _normalize_byte_range_index(value, byte_range) + return prototype.buffer.from_buffer(value[start:stop]) + except KeyError: + return None + + def set_sync(self, key: str, value: Buffer) -> None: + self._check_writable() + if not self._is_open: + self._is_open = True + assert isinstance(key, str) + if not isinstance(value, Buffer): + raise TypeError( + f"MemoryStore.set(): `value` must be a Buffer instance. Got an instance of {type(value)} instead." 
+ ) + self._store_dict[key] = value + + def delete_sync(self, key: str) -> None: + self._check_writable() + try: + del self._store_dict[key] + except KeyError: + logger.debug("Key %s does not exist.", key) + async def get( self, key: str, From b53ac3e3d96e2f9fa5ef2167cdb4571ed94aca80 Mon Sep 17 00:00:00 2001 From: Davis Vann Bennett Date: Thu, 19 Feb 2026 11:49:13 +0100 Subject: [PATCH 05/23] add comments and documentation --- docs/design/sync-bypass.md | 228 +++++++++++++++++++++++++++ src/zarr/abc/codec.py | 38 ++++- src/zarr/abc/store.py | 42 ++++- src/zarr/codecs/gzip.py | 8 + src/zarr/core/array.py | 72 ++++++++- src/zarr/experimental/sync_codecs.py | 63 ++++++-- src/zarr/storage/_common.py | 21 ++- src/zarr/storage/_local.py | 18 +++ src/zarr/storage/_memory.py | 18 +++ 9 files changed, 483 insertions(+), 25 deletions(-) create mode 100644 docs/design/sync-bypass.md diff --git a/docs/design/sync-bypass.md b/docs/design/sync-bypass.md new file mode 100644 index 0000000000..fdb3702232 --- /dev/null +++ b/docs/design/sync-bypass.md @@ -0,0 +1,228 @@ +# Design: Fully Synchronous Read/Write Bypass + +## Problem + +Zarr-python's read/write path is inherently async: every `Array.__getitem__` +or `Array.__setitem__` call passes through several layers of async machinery +before any actual work happens. For workloads where both the codec chain and +the store are fundamentally synchronous (e.g. gzip + MemoryStore, or +zstd + LocalStore), this async overhead dominates latency. + +The call chain looks like this: + +``` +Array.__getitem__ + └─ sync() # (1) thread hop: submits coroutine to background event loop + └─ AsyncArray._get_selection # runs on the event loop thread + └─ CodecPipeline.read # async pipeline + ├─ concurrent_map # (2) launches tasks on event loop + │ └─ ByteGetter.get(prototype) # (3) async store IO + │ └─ MemoryStore.get() # just a dict lookup! + └─ codec.decode() + └─ asyncio.to_thread(...) # (4) thread hop for CPU work + └─ gzip.decompress(...) # actual compute +``` + +There are four sources of overhead, marked (1)-(4): + +1. **`sync()` bridge**: Every synchronous `Array` method calls `sync()`, which + uses `asyncio.run_coroutine_threadsafe()` to submit work to a background + event loop thread. Even when the coroutine does zero awaiting, this costs + ~30-50us for the round-trip through the event loop. + +2. **`concurrent_map` batching**: The pipeline groups chunks into batches and + dispatches them via `concurrent_map`, which creates asyncio tasks. For + single-chunk reads (the common case), this is pure overhead. + +3. **Async store IO**: `StorePath.get()` / `StorePath.set()` are `async def`. + For `MemoryStore` (a dict lookup) and `LocalStore` (a file read), the + underlying operation is synchronous — wrapping it in `async def` forces an + unnecessary context switch through the event loop. + +4. **`asyncio.to_thread` for codec compute**: `BatchedCodecPipeline` runs each + codec's encode/decode in `asyncio.to_thread()`, adding another thread hop. + `SyncCodecPipeline` (the foundation this work builds on) already eliminates + this by calling `_decode_sync` / `_encode_sync` inline. + +The net effect: a MemoryStore read of a single small chunk spends more time +in async machinery than in actual decompression. + + +## Solution + +When the codec pipeline and store both support synchronous operation, bypass +the event loop entirely: run IO, codec compute, and buffer scatter all on the +calling thread, with zero async overhead. 
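+To make the expected payoff concrete, here is a rough, hypothetical
+micro-benchmark sketch (not part of this patch) that times single-chunk
+reads on an in-memory array. Whether the bypass actually activates depends
+on `SyncCodecPipeline` being the configured pipeline and on the store
+supporting sync IO; the array-creation call below uses the public
+`zarr.create_array` entry point:
+
+```python
+# Hypothetical timing sketch: repeated single-chunk reads on a MemoryStore.
+# With the bypass active, each read runs entirely on the calling thread;
+# without it, each read pays the sync() -> event-loop round-trip.
+import time
+
+import numpy as np
+import zarr
+from zarr.storage import MemoryStore
+
+arr = zarr.create_array(store=MemoryStore(), shape=(64,), chunks=(64,), dtype="f8")
+arr[:] = np.arange(64, dtype="f8")
+
+n = 10_000
+start = time.perf_counter()
+for _ in range(n):
+    arr[0:64]  # single-chunk read -- the common case described above
+elapsed = time.perf_counter() - start
+print(f"{elapsed / n * 1e6:.1f} us per read")
+```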
+ +The solution has three layers: + +### Layer 1: Sync Store IO + +Add `supports_sync`, `get_sync()`, `set_sync()`, and `delete_sync()` to the +store abstraction. These are opt-in: the `Store` ABC provides default +implementations that raise `NotImplementedError`, and only stores with native +sync capabilities override them. + +``` +Store ABC (defaults: supports_sync=False, methods raise NotImplementedError) + ├── MemoryStore (supports_sync=True, direct dict access) + ├── LocalStore (supports_sync=True, direct file IO via _get/_put) + └── FsspecStore (unchanged, remains async-only) + +StorePath delegates to its underlying Store: + get_sync() → self.store.get_sync(self.path, ...) + set_sync() → self.store.set_sync(self.path, ...) +``` + +**Key decision**: `StorePath` is what gets passed to the codec pipeline as a +`ByteGetter` / `ByteSetter`. By adding sync methods to `StorePath`, the +pipeline can call them directly without knowing the concrete store type. + +**Protocol gap**: The `ByteGetter` / `ByteSetter` protocols only define async +methods (`get`, `set`, `delete`). Rather than modifying these widely-used +protocols, the sync pipeline methods use `Any` type annotations for the +byte_getter/byte_setter parameters and call `.get_sync()` etc. at runtime. +This is a pragmatic tradeoff: the sync path is an optimization that only +activates when `supports_sync` is True, so the runtime type is always a +`StorePath` that has these methods. + +### Layer 2: Sync Codec Pipeline IO + +Add `supports_sync_io`, `read_sync()`, and `write_sync()` to the +`CodecPipeline` ABC (non-abstract, default raises `NotImplementedError`). + +`SyncCodecPipeline` implements these with a simple sequential loop: + +```python +# read_sync: for each chunk +for byte_getter, chunk_spec, chunk_sel, out_sel, _ in batch_info: + chunk_bytes = byte_getter.get_sync(prototype=chunk_spec.prototype) # sync IO + chunk_array = self._decode_one(chunk_bytes, ...) # sync compute + out[out_selection] = chunk_array[chunk_selection] # scatter +``` + +No batching, no `concurrent_map`, no event loop — just a Python for-loop. + +**Sharding fallback**: When `supports_partial_decode` is True (i.e. the codec +pipeline uses sharding), `supports_sync_io` returns False and the Array falls +back to the standard `sync()` path. This is because `ShardingCodec`'s +`decode_partial` is async (it reads sub-ranges from the store) and does not +have a sync equivalent. + +### Layer 3: Array Bypass + +Each of the 10 sync `Array` selection methods (5 getters, 5 setters) gains a +fast path: + +```python +def get_basic_selection(self, selection, *, out=None, prototype=None, fields=None): + indexer = BasicIndexer(selection, self.shape, self.metadata.chunk_grid) + if self._can_use_sync_path(): + return _get_selection_sync( + self.async_array.store_path, self.async_array.metadata, + self.async_array.codec_pipeline, self.async_array.config, + indexer, out=out, fields=fields, prototype=prototype, + ) + return sync(self.async_array._get_selection(indexer, ...)) +``` + +`_can_use_sync_path()` checks three conditions: +1. The codec pipeline supports sync IO (`supports_sync_io`) +2. No partial decode is active (rules out sharding) +3. The store supports sync (`supports_sync`) + +When all three hold, `_get_selection_sync` / `_set_selection_sync` run the +entire operation on the calling thread. 
These functions mirror the async +`_get_selection` / `_set_selection` exactly, but call `codec_pipeline.read_sync()` +/ `write_sync()` instead of `await codec_pipeline.read()` / `write()`. + + +## Resulting Call Chain + +With the sync bypass active, the call chain becomes: + +``` +Array.__getitem__ + └─ _get_selection_sync # runs on calling thread + └─ SyncCodecPipeline.read_sync + ├─ StorePath.get_sync # direct dict/file access, no event loop + ├─ _decode_one # inline codec chain, no to_thread + └─ out[sel] = array # scatter into output +``` + +No `sync()`, no event loop, no `asyncio.to_thread`, no `concurrent_map`. + + +## Additional Optimization: Codec Instance Caching + +`GzipCodec` was creating a new `GZip(level)` instance on every encode/decode +call. `ZstdCodec` and `BloscCodec` already cache their codec instances via +`@cached_property`. We apply the same pattern to `GzipCodec`: + +```python +@cached_property +def _gzip_codec(self) -> GZip: + return GZip(self.level) +``` + +This is safe because `GzipCodec` is a frozen dataclass — `level` never +changes after construction, so the cached instance is always valid. + + +## What Stays Unchanged + +- **`BatchedCodecPipeline`**: Unmodified. It inherits the default + `supports_sync_io=False` from the ABC. +- **Remote stores** (`FsspecStore`): `supports_sync` stays `False`. All + remote IO remains async. +- **Sharded arrays**: Fall back to the `sync()` path because + `supports_partial_decode` is True. +- **All async APIs**: `AsyncArray`, `async def read/write`, etc. are + completely untouched. The sync bypass is an optimization of the + synchronous `Array` class only. + + +## Files Modified + +| File | Layer | Change | +|------|-------|--------| +| `src/zarr/abc/store.py` | 1 | `supports_sync`, `get_sync`, `set_sync`, `delete_sync` on `Store` ABC | +| `src/zarr/storage/_memory.py` | 1 | Sync store methods (direct dict access) | +| `src/zarr/storage/_local.py` | 1 | Sync store methods (direct `_get`/`_put` calls) | +| `src/zarr/storage/_common.py` | 1 | Sync methods on `StorePath` (delegates to store) | +| `src/zarr/abc/codec.py` | 2 | `supports_sync_io`, `read_sync`, `write_sync` on `CodecPipeline` ABC | +| `src/zarr/experimental/sync_codecs.py` | 2 | `read_sync`, `write_sync` implementation | +| `src/zarr/core/array.py` | 3 | `_can_use_sync_path`, `_get_selection_sync`, `_set_selection_sync`, 10 method modifications | +| `src/zarr/codecs/gzip.py` | — | `@cached_property` for GZip instance | + + +## Design Tradeoffs + +**Duplication of `_get_selection` / `_set_selection`**: The sync versions +(`_get_selection_sync`, `_set_selection_sync`) duplicate the setup logic +(dtype resolution, buffer creation, value coercion) from the async originals. +This is intentional: extracting shared helpers would add complexity and +indirection to the hot path for no functional benefit. The two versions +should be kept in sync manually. + +**Sequential chunk processing**: `read_sync` and `write_sync` process chunks +sequentially in a for-loop, with no parallelism. For the target use case +(MemoryStore, LocalStore), this is optimal: MemoryStore is a dict lookup +(~1us), LocalStore is a file read that benefits from OS page cache, and +Python's GIL prevents true parallelism for CPU-bound codec work anyway. The +async path with `concurrent_map` is better for remote stores where IO latency +can be overlapped. 
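+As a concrete (hypothetical) illustration of how small the opt-in surface
+from Layer 1 is: a third-party store only needs to override the sync trio
+plus the `supports_sync` flag. `MemoryStore` already opts in, so the
+subclass below (class name and logging invented for illustration) merely
+re-traces the methods this patch adds to the `Store` ABC:
+
+```python
+from zarr.abc.store import ByteRequest
+from zarr.core.buffer import Buffer, BufferPrototype
+from zarr.storage import MemoryStore
+
+
+class LoggingSyncStore(MemoryStore):
+    """MemoryStore variant that logs every synchronous chunk access."""
+
+    @property
+    def supports_sync(self) -> bool:
+        # Opting in lets Array._can_use_sync_path() select the bypass.
+        return True
+
+    def get_sync(
+        self,
+        key: str,
+        prototype: BufferPrototype | None = None,
+        byte_range: ByteRequest | None = None,
+    ) -> Buffer | None:
+        print(f"sync get: {key}")
+        return super().get_sync(key, prototype=prototype, byte_range=byte_range)
+
+    def set_sync(self, key: str, value: Buffer) -> None:
+        print(f"sync set: {key}")
+        super().set_sync(key, value)
+```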
+ +**`Any` type annotations**: The `read_sync` and `write_sync` methods on +`SyncCodecPipeline` use `Any` for the byte_getter/byte_setter type in the +`batch_info` tuples. This avoids modifying the `ByteGetter`/`ByteSetter` +protocols, which are public API. The runtime type is always `StorePath`, which +has the sync methods; the type system just can't express this constraint +through the existing protocol hierarchy. + +**No sync partial decode/encode**: Sharding's `decode_partial` / +`encode_partial` methods are inherently async (they issue byte-range reads to +the store). Rather than adding sync variants to the sharding codec (which +would require significant refactoring), we simply fall back to the `sync()` +path for sharded arrays. This is the right tradeoff because sharded arrays +typically involve remote stores where async IO is beneficial anyway. diff --git a/src/zarr/abc/codec.py b/src/zarr/abc/codec.py index 85f9e5eedd..2c459eaefa 100644 --- a/src/zarr/abc/codec.py +++ b/src/zarr/abc/codec.py @@ -474,9 +474,31 @@ async def write( """ ... + # ------------------------------------------------------------------- + # Fully synchronous read/write (opt-in) + # + # When a CodecPipeline subclass can run the entire read/write path + # (store IO + codec compute + buffer scatter) without touching the + # event loop, it overrides these methods and sets supports_sync_io + # to True. This lets Array selection methods bypass sync() entirely. + # + # The default implementations raise NotImplementedError, so + # BatchedCodecPipeline (the standard pipeline) is unaffected. + # + # See docs/design/sync-bypass.md for the full design rationale. + # ------------------------------------------------------------------- + @property def supports_sync_io(self) -> bool: - """Whether this pipeline supports fully synchronous read/write.""" + """Whether this pipeline can run read/write entirely on the calling thread. + + True when: + - All codecs support synchronous encode/decode (_decode_sync/_encode_sync) + - The pipeline's read_sync/write_sync methods are implemented + + Checked by ``Array._can_use_sync_path()`` to decide whether to bypass + the ``sync()`` event-loop bridge. + """ return False def read_sync( @@ -485,7 +507,12 @@ def read_sync( out: NDBuffer, drop_axes: tuple[int, ...] = (), ) -> None: - """Synchronous read path. Only available on pipelines that support it.""" + """Synchronous read: fetch bytes from store, decode, scatter into out. + + Runs entirely on the calling thread. Only available when + ``supports_sync_io`` is True. Called by ``_get_selection_sync`` in + ``array.py`` when the sync bypass is active. + """ raise NotImplementedError def write_sync( @@ -494,7 +521,12 @@ def write_sync( value: NDBuffer, drop_axes: tuple[int, ...] = (), ) -> None: - """Synchronous write path. Only available on pipelines that support it.""" + """Synchronous write: gather from value, encode, persist to store. + + Runs entirely on the calling thread. Only available when + ``supports_sync_io`` is True. Called by ``_set_selection_sync`` in + ``array.py`` when the sync bypass is active. 
+ """ raise NotImplementedError diff --git a/src/zarr/abc/store.py b/src/zarr/abc/store.py index 59c711b773..d52a642d3c 100644 --- a/src/zarr/abc/store.py +++ b/src/zarr/abc/store.py @@ -524,12 +524,29 @@ def supports_partial_writes(self) -> Literal[False]: """ return False + # ----------------------------------------------------------------------- + # Synchronous IO interface (opt-in) + # + # These methods enable the SyncCodecPipeline to bypass the event loop + # entirely for store IO. The default implementations raise + # NotImplementedError; stores that wrap fundamentally synchronous + # operations (MemoryStore, LocalStore) override them with direct + # implementations. Remote/cloud stores (FsspecStore) leave them as-is + # and remain async-only. + # + # See docs/design/sync-bypass.md for the full design rationale. + # ----------------------------------------------------------------------- + @property def supports_sync(self) -> bool: - """Does the store support synchronous get/set/delete? + """Whether this store has native synchronous get/set/delete methods. + + When True, ``SyncCodecPipeline.read_sync`` / ``write_sync`` will call + ``get_sync`` / ``set_sync`` / ``delete_sync`` directly on the calling + thread, avoiding the event loop overhead of the async equivalents. - When True, the sync codec pipeline can bypass the event loop for IO. - Override in subclasses that have native sync implementations. + Subclasses that override the sync methods below should also override + this property to return True. """ return False @@ -539,15 +556,28 @@ def get_sync( prototype: BufferPrototype, byte_range: ByteRequest | None = None, ) -> Buffer | None: - """Synchronous version of get(). Only available when supports_sync is True.""" + """Synchronous version of ``get()``. + + Called by ``SyncCodecPipeline.read_sync`` to fetch chunk bytes without + going through the event loop. Only called when ``supports_sync`` is + True, so the default ``NotImplementedError`` is never hit in practice. + """ raise NotImplementedError def set_sync(self, key: str, value: Buffer) -> None: - """Synchronous version of set(). Only available when supports_sync is True.""" + """Synchronous version of ``set()``. + + Called by ``SyncCodecPipeline.write_sync`` to persist encoded chunk + bytes without going through the event loop. + """ raise NotImplementedError def delete_sync(self, key: str) -> None: - """Synchronous version of delete(). Only available when supports_sync is True.""" + """Synchronous version of ``delete()``. + + Called by ``SyncCodecPipeline.write_sync`` when a chunk should be + removed (e.g. an empty chunk with ``write_empty_chunks=False``). + """ raise NotImplementedError @property diff --git a/src/zarr/codecs/gzip.py b/src/zarr/codecs/gzip.py index 1a027fe8a2..a883b0d640 100644 --- a/src/zarr/codecs/gzip.py +++ b/src/zarr/codecs/gzip.py @@ -49,11 +49,19 @@ def from_dict(cls, data: dict[str, JSON]) -> Self: def to_dict(self) -> dict[str, JSON]: return {"name": "gzip", "configuration": {"level": self.level}} + # Cache the numcodecs GZip instance. GzipCodec is a frozen dataclass, + # so `level` never changes after construction, making this safe. + # This matches the pattern used by ZstdCodec._zstd_codec and + # BloscCodec._blosc_codec. Without caching, a new GZip(level) was + # created on every encode/decode call. 
@cached_property def _gzip_codec(self) -> GZip: return GZip(self.level) def _decode_sync(self, chunk_bytes: Buffer, chunk_spec: ArraySpec) -> Buffer: + # Use the cached codec instance instead of creating GZip(self.level) + # each time. The async _decode_single delegates to this method via + # asyncio.to_thread, so both paths benefit from the cache. return as_numpy_array_wrapper(self._gzip_codec.decode, chunk_bytes, chunk_spec.prototype) def _encode_sync(self, chunk_bytes: Buffer, chunk_spec: ArraySpec) -> Buffer | None: diff --git a/src/zarr/core/array.py b/src/zarr/core/array.py index 5800bf0cec..2373976219 100644 --- a/src/zarr/core/array.py +++ b/src/zarr/core/array.py @@ -1974,7 +1974,31 @@ def config(self) -> ArrayConfig: return self.async_array.config def _can_use_sync_path(self) -> bool: - """Check if we can bypass the event loop entirely for read/write.""" + """Check if we can bypass the event loop entirely for read/write. + + Three conditions must hold: + + 1. The codec pipeline supports fully synchronous IO (all codecs in + the chain have _decode_sync/_encode_sync, and the pipeline + implements read_sync/write_sync). This is True for + SyncCodecPipeline when all codecs support sync. + + 2. Partial decode is NOT active. Partial decode is used by sharding, + whose decode_partial method is async (it issues byte-range reads). + When sharding is in use, we must fall back to the sync() bridge. + + 3. The store supports synchronous operations (MemoryStore, LocalStore). + Remote stores like FsspecStore remain async-only. + + When all three hold, the selection methods below call + _get_selection_sync / _set_selection_sync directly, running the + entire read/write path on the calling thread with zero async + overhead. + + Uses getattr() with defaults for forward compatibility — older or + third-party pipelines/stores that lack these attributes gracefully + fall back to the async path. + """ pipeline = self.async_array.codec_pipeline store_path = self.async_array.store_path return ( @@ -3060,6 +3084,10 @@ def get_basic_selection( if prototype is None: prototype = default_buffer_prototype() indexer = BasicIndexer(selection, self.shape, self.metadata.chunk_grid) + # Sync bypass: when the codec pipeline and store both support + # synchronous operation, skip the sync() → event loop bridge and + # run the entire read path on the calling thread. This pattern is + # repeated in all 10 get_*/set_* methods below. if self._can_use_sync_path(): return _get_selection_sync( self.async_array.store_path, @@ -3071,6 +3099,9 @@ def get_basic_selection( fields=fields, prototype=prototype, ) + # Fallback: submit the async coroutine to the background event loop + # thread via sync(). Used for remote stores, sharded arrays, or when + # SyncCodecPipeline is not active. return sync( self.async_array._get_selection( indexer, @@ -5755,8 +5786,23 @@ def _get_selection_sync( out: NDBuffer | None = None, fields: Fields | None = None, ) -> NDArrayLikeOrScalar: - """Synchronous version of _get_selection — bypasses the event loop entirely.""" - # Get dtype from metadata + """Synchronous version of _get_selection — bypasses the event loop entirely. + + This function mirrors ``_get_selection`` (the async version defined above) + exactly, with one critical difference: it calls ``codec_pipeline.read_sync()`` + instead of ``await codec_pipeline.read()``. This means the entire operation + — store IO, codec decode, buffer scatter — runs on the calling thread with + no event loop involvement. 
+ + Called by ``Array.get_basic_selection``, ``get_orthogonal_selection``, etc. + when ``Array._can_use_sync_path()`` returns True. + + The setup logic (dtype resolution, output buffer creation, field checks) is + duplicated from the async version rather than extracted into a shared helper. + This keeps the hot path simple and avoids adding indirection. The two + versions should be kept in sync manually. + """ + # Get dtype from metadata — same logic as async _get_selection if metadata.zarr_format == 2: zdtype = metadata.dtype else: @@ -5793,6 +5839,12 @@ def _get_selection_sync( if metadata.zarr_format == 2: _config = replace(_config, order=order) + # This is the key difference from the async version: read_sync() + # runs the entire pipeline (store fetch → codec decode → scatter) + # on this thread. Each entry in the list is a (StorePath, ArraySpec, + # chunk_selection, out_selection, is_complete_chunk) tuple. + # StorePath acts as the ByteGetter — its get_sync() method is called + # by the pipeline to fetch raw chunk bytes from the store. codec_pipeline.read_sync( [ ( @@ -5823,7 +5875,15 @@ def _set_selection_sync( prototype: BufferPrototype, fields: Fields | None = None, ) -> None: - """Synchronous version of _set_selection — bypasses the event loop entirely.""" + """Synchronous version of _set_selection — bypasses the event loop entirely. + + Mirrors ``_set_selection`` (the async version) with the same setup logic + (dtype coercion, value shape validation, buffer wrapping) but calls + ``codec_pipeline.write_sync()`` instead of ``await codec_pipeline.write()``. + + Called by ``Array.set_basic_selection``, ``set_orthogonal_selection``, etc. + when ``Array._can_use_sync_path()`` returns True. + """ # Get dtype from metadata if metadata.zarr_format == 2: zdtype = metadata.dtype @@ -5863,6 +5923,10 @@ def _set_selection_sync( if metadata.zarr_format == 2: _config = replace(_config, order=order) + # Key difference from async version: write_sync() runs the entire + # pipeline (read existing → decode → merge → encode → store write) + # on this thread. StorePath acts as ByteSetter — its set_sync() and + # delete_sync() methods persist/remove chunk bytes directly. codec_pipeline.write_sync( [ ( diff --git a/src/zarr/experimental/sync_codecs.py b/src/zarr/experimental/sync_codecs.py index 70080cf3cb..858579c056 100644 --- a/src/zarr/experimental/sync_codecs.py +++ b/src/zarr/experimental/sync_codecs.py @@ -580,10 +580,37 @@ def _merge_and_filter( # ------------------------------------------------------------------- # Fully synchronous read / write (bypass event loop entirely) + # + # These methods implement the same logic as the async read/write + # methods above, but run entirely on the calling thread: + # + # - Store IO uses byte_getter.get_sync() / byte_setter.set_sync() + # instead of the async get()/set() — direct dict lookup for + # MemoryStore, direct file IO for LocalStore. + # + # - Codec compute uses _decode_one() / _encode_one(), which call + # each codec's _decode_sync/_encode_sync inline (no to_thread). + # + # - Chunks are processed sequentially in a for-loop — no batching, + # no concurrent_map, no asyncio tasks. This is optimal for local + # stores where IO is ~1us (dict) or dominated by OS page cache + # (files), and where the GIL prevents true parallel codec work. + # + # The byte_getter/byte_setter parameters are typed as `Any` because + # the ByteGetter/ByteSetter protocols only define async methods. 
+ # At runtime, these are always StorePath instances which have the + # get_sync/set_sync/delete_sync methods. See docs/design/sync-bypass.md. + # + # These methods are only called when supports_sync_io is True (i.e. + # _all_sync is True), which guarantees every codec in the chain has + # _decode_sync/_encode_sync implementations. # ------------------------------------------------------------------- @property def supports_sync_io(self) -> bool: + # Only enable the fully-sync path when every codec in the chain + # supports synchronous dispatch. If any codec lacks _decode_sync + # (e.g. ShardingCodec), we fall back to the async path. return self._all_sync def read_sync( @@ -596,24 +623,32 @@ def read_sync( if not batch_info_list: return - # Resolve metadata chain once (all chunks share the same spec structure) + # Resolve the metadata chain once: compute the ArraySpec at each + # codec boundary. All chunks in a single array share the same codec + # structure, so this is invariant across the loop. _, first_spec, *_ = batch_info_list[0] aa_chain, ab_pair, bb_chain = self._resolve_metadata_chain(first_spec) for byte_getter, chunk_spec, chunk_selection, out_selection, _ in batch_info_list: - # IO: sync store read + # Step 1: Sync store read — e.g. dict[key] for MemoryStore, + # Path.read_bytes() for LocalStore. No event loop involvement. chunk_bytes: Buffer | None = byte_getter.get_sync(prototype=chunk_spec.prototype) - # Compute: decode through codec chain + # Step 2: Decode through the full codec chain (bytes→bytes + # codecs in reverse, then array→bytes, then array→array in + # reverse). All synchronous, all inline on this thread. chunk_array = self._decode_one(chunk_bytes, chunk_spec, aa_chain, ab_pair, bb_chain) - # Scatter into output buffer + # Step 3: Scatter decoded chunk data into the output buffer. + # chunk_selection picks the relevant region within the decoded + # chunk; out_selection places it in the output array. if chunk_array is not None: tmp = chunk_array[chunk_selection] if drop_axes != (): tmp = tmp.squeeze(axis=drop_axes) out[out_selection] = tmp else: + # Chunk not found in store — fill with the array's fill value. out[out_selection] = _fill_value_or_default(chunk_spec) def write_sync( @@ -633,12 +668,16 @@ def write_sync( out_selection, is_complete_chunk, ) in batch_info_list: - # Phase 1: Read existing chunk if needed (for partial writes) + # Phase 1: For partial writes (when we're only updating part of + # a chunk), read the existing chunk bytes from the store so we + # can merge the new data into it. For complete-chunk writes, + # skip this — we'll overwrite the entire chunk. existing_bytes: Buffer | None = None if not is_complete_chunk: existing_bytes = byte_setter.get_sync(prototype=chunk_spec.prototype) - # Phase 2: Decode existing chunk + # Phase 2: Decode the existing chunk bytes (if any) so we can + # merge new data into the decoded array. existing_array: NDBuffer | None = None if existing_bytes is not None: aa_chain, ab_pair, bb_chain = self._resolve_metadata_chain(chunk_spec) @@ -646,7 +685,10 @@ def write_sync( existing_bytes, chunk_spec, aa_chain, ab_pair, bb_chain ) - # Phase 3: Merge + # Phase 3: Merge new data into the chunk. For complete chunks + # that match the chunk shape, this is a direct passthrough. + # For partial writes, it creates a new buffer (or copies the + # existing one) and splices in the new values. 
chunk_array: NDBuffer | None = self._merge_chunk_array( existing_array, value, @@ -657,7 +699,8 @@ def write_sync( drop_axes, ) - # Phase 4: Check empty chunk + # Phase 4: If write_empty_chunks is False and the merged chunk + # is entirely fill values, skip writing it (delete instead). if ( chunk_array is not None and not chunk_spec.config.write_empty_chunks @@ -665,7 +708,9 @@ def write_sync( ): chunk_array = None - # Phase 5: Encode and write + # Phase 5: Encode and persist. If the chunk was determined to + # be empty (phase 4) or encoding returns None, delete the key. + # Otherwise, write the encoded bytes directly to the store. if chunk_array is None: byte_setter.delete_sync() else: diff --git a/src/zarr/storage/_common.py b/src/zarr/storage/_common.py index 4814702239..c57a717025 100644 --- a/src/zarr/storage/_common.py +++ b/src/zarr/storage/_common.py @@ -228,6 +228,21 @@ async def is_empty(self) -> bool: """ return await self.store.is_empty(self.path) + # ------------------------------------------------------------------- + # Synchronous IO delegation + # + # StorePath is what gets passed to the codec pipeline as a ByteGetter / + # ByteSetter. The async path uses get() / set() / delete(); the sync + # bypass uses these sync variants instead. They simply prepend + # self.path to the key and delegate to the underlying Store's sync + # methods. + # + # Note: The ByteGetter / ByteSetter protocols only define async + # methods. The sync pipeline uses `Any` type annotations to call + # these methods at runtime. See docs/design/sync-bypass.md for why + # we chose not to modify the protocols. + # ------------------------------------------------------------------- + @property def supports_sync(self) -> bool: """Whether the underlying store supports synchronous operations.""" @@ -238,17 +253,17 @@ def get_sync( prototype: BufferPrototype | None = None, byte_range: ByteRequest | None = None, ) -> Buffer | None: - """Synchronous read from the store.""" + """Synchronous read — delegates to ``self.store.get_sync(self.path, ...)``.""" if prototype is None: prototype = default_buffer_prototype() return self.store.get_sync(self.path, prototype=prototype, byte_range=byte_range) def set_sync(self, value: Buffer) -> None: - """Synchronous write to the store.""" + """Synchronous write — delegates to ``self.store.set_sync(self.path, value)``.""" self.store.set_sync(self.path, value) def delete_sync(self) -> None: - """Synchronous delete from the store.""" + """Synchronous delete — delegates to ``self.store.delete_sync(self.path)``.""" self.store.delete_sync(self.path) def __truediv__(self, other: str) -> StorePath: diff --git a/src/zarr/storage/_local.py b/src/zarr/storage/_local.py index 61e0795e1e..2296879cb2 100644 --- a/src/zarr/storage/_local.py +++ b/src/zarr/storage/_local.py @@ -187,6 +187,18 @@ def __repr__(self) -> str: def __eq__(self, other: object) -> bool: return isinstance(other, type(self)) and self.root == other.root + # ------------------------------------------------------------------- + # Synchronous store methods + # + # LocalStore's async get/set wrap the synchronous helpers _get() and + # _put() (defined at module level) in asyncio.to_thread(). These sync + # methods call _get/_put directly, removing the thread-hop overhead. + # + # The open-guard logic is inlined from _open(): create root dir if + # writable, check existence, set _is_open. We can't call the async + # _open() from a sync context, so we replicate its logic here. 
+ # ------------------------------------------------------------------- + @property def supports_sync(self) -> bool: return True @@ -199,6 +211,7 @@ def get_sync( ) -> Buffer | None: if prototype is None: prototype = default_buffer_prototype() + # Inline open guard: mirrors async _open() but without await. if not self._is_open: if not self.read_only: self.root.mkdir(parents=True, exist_ok=True) @@ -208,6 +221,8 @@ def get_sync( assert isinstance(key, str) path = self.root / key try: + # Call _get() directly — the async version wraps this same + # function in asyncio.to_thread(). return _get(path, prototype, byte_range) except (FileNotFoundError, IsADirectoryError, NotADirectoryError): return None @@ -226,11 +241,14 @@ def set_sync(self, key: str, value: Buffer) -> None: f"LocalStore.set(): `value` must be a Buffer instance. Got an instance of {type(value)} instead." ) path = self.root / key + # Call _put() directly — the async version wraps this in + # asyncio.to_thread(). _put(path, value) def delete_sync(self, key: str) -> None: self._check_writable() path = self.root / key + # Same logic as async delete(), but without await. if path.is_dir(): shutil.rmtree(path) else: diff --git a/src/zarr/storage/_memory.py b/src/zarr/storage/_memory.py index 46b3e409da..950fa197e9 100644 --- a/src/zarr/storage/_memory.py +++ b/src/zarr/storage/_memory.py @@ -77,6 +77,20 @@ def __eq__(self, other: object) -> bool: and self.read_only == other.read_only ) + # ------------------------------------------------------------------- + # Synchronous store methods + # + # MemoryStore is a thin wrapper around a Python dict. The async get/set + # methods are already synchronous in substance — they just happen to be + # ``async def``. These sync variants let SyncCodecPipeline.read_sync / + # write_sync access the dict directly without going through the event + # loop, eliminating the dominant source of overhead for in-memory arrays. + # + # The logic mirrors the async counterparts exactly, except: + # - We set _is_open = True inline instead of ``await self._open()``, + # since MemoryStore._open() is a no-op beyond setting the flag. + # ------------------------------------------------------------------- + @property def supports_sync(self) -> bool: return True @@ -89,10 +103,13 @@ def get_sync( ) -> Buffer | None: if prototype is None: prototype = default_buffer_prototype() + # Inline open: MemoryStore._open() just sets _is_open = True. if not self._is_open: self._is_open = True assert isinstance(key, str) try: + # Direct dict lookup — this is what async get() does too, + # but without the event loop round-trip. value = self._store_dict[key] start, stop = _normalize_byte_range_index(value, byte_range) return prototype.buffer.from_buffer(value[start:stop]) @@ -108,6 +125,7 @@ def set_sync(self, key: str, value: Buffer) -> None: raise TypeError( f"MemoryStore.set(): `value` must be a Buffer instance. Got an instance of {type(value)} instead." ) + # Direct dict assignment — no event loop overhead. 
self._store_dict[key] = value def delete_sync(self, key: str) -> None: From 73ac84526f56fb48eb546b72c768dd8535b74dac Mon Sep 17 00:00:00 2001 From: Davis Vann Bennett Date: Thu, 19 Feb 2026 12:29:40 +0100 Subject: [PATCH 06/23] refactor sharding to allow sync --- src/zarr/codecs/sharding.py | 406 +++++++++++++++++++++++++++ src/zarr/core/array.py | 15 +- src/zarr/experimental/sync_codecs.py | 44 ++- tests/test_sync_codec_pipeline.py | 6 +- 4 files changed, 456 insertions(+), 15 deletions(-) diff --git a/src/zarr/codecs/sharding.py b/src/zarr/codecs/sharding.py index 8124ea44ea..096d58b008 100644 --- a/src/zarr/codecs/sharding.py +++ b/src/zarr/codecs/sharding.py @@ -92,6 +92,12 @@ async def get( ) return self.shard_dict.get(self.chunk_coords) + def get_sync( + self, prototype: BufferPrototype | None = None, byte_range: ByteRequest | None = None + ) -> Buffer | None: + # Sync equivalent of get() — just a dict lookup, no IO. + return self.shard_dict.get(self.chunk_coords) + @dataclass(frozen=True) class _ShardingByteSetter(_ShardingByteGetter, ByteSetter): @@ -107,6 +113,12 @@ async def delete(self) -> None: async def set_if_not_exists(self, default: Buffer) -> None: self.shard_dict.setdefault(self.chunk_coords, default) + def set_sync(self, value: Buffer) -> None: + self.shard_dict[self.chunk_coords] = value + + def delete_sync(self) -> None: + del self.shard_dict[self.chunk_coords] + class _ShardIndex(NamedTuple): # dtype uint64, shape (chunks_per_shard_0, chunks_per_shard_1, ..., 2) @@ -195,6 +207,22 @@ async def from_bytes( obj.index = await codec._decode_shard_index(shard_index_bytes, chunks_per_shard) return obj + @classmethod + def from_bytes_sync( + cls, buf: Buffer, codec: ShardingCodec, chunks_per_shard: tuple[int, ...] + ) -> _ShardReader: + """Synchronous version of from_bytes — decodes the shard index inline.""" + shard_index_size = codec._shard_index_size(chunks_per_shard) + obj = cls() + obj.buf = buf + if codec.index_location == ShardingCodecIndexLocation.start: + shard_index_bytes = obj.buf[:shard_index_size] + else: + shard_index_bytes = obj.buf[-shard_index_size:] + + obj.index = codec._decode_shard_index_sync(shard_index_bytes, chunks_per_shard) + return obj + @classmethod def create_empty( cls, chunks_per_shard: tuple[int, ...], buffer_prototype: BufferPrototype | None = None @@ -372,6 +400,317 @@ async def _decode_single( return out + def _decode_sync( + self, + shard_bytes: Buffer, + shard_spec: ArraySpec, + ) -> NDBuffer: + """Synchronous full-shard decode. + + Receives the complete shard bytes, decodes the index inline, then + decodes each inner chunk through the inner codec pipeline's sync path. + The inner codec pipeline's read_sync uses _ShardingByteGetter.get_sync + (a dict lookup) for IO, so the entire operation is synchronous. + """ + shard_shape = shard_spec.shape + chunk_shape = self.chunk_shape + chunks_per_shard = self._get_chunks_per_shard(shard_spec) + chunk_spec = self._get_chunk_spec(shard_spec) + + indexer = BasicIndexer( + tuple(slice(0, s) for s in shard_shape), + shape=shard_shape, + chunk_grid=RegularChunkGrid(chunk_shape=chunk_shape), + ) + + # setup output array + out = chunk_spec.prototype.nd_buffer.empty( + shape=shard_shape, + dtype=shard_spec.dtype.to_native_dtype(), + order=shard_spec.order, + ) + shard_dict = _ShardReader.from_bytes_sync(shard_bytes, self, chunks_per_shard) + + if shard_dict.index.is_all_empty(): + out.fill(shard_spec.fill_value) + return out + + # Decode each inner chunk synchronously through the inner pipeline. 
+ # _ShardingByteGetter.get_sync is a dict lookup, so IO is trivial. + self.codec_pipeline.read_sync( + [ + ( + _ShardingByteGetter(shard_dict, chunk_coords), + chunk_spec, + chunk_selection, + out_selection, + is_complete_shard, + ) + for chunk_coords, chunk_selection, out_selection, is_complete_shard in indexer + ], + out, + ) + + return out + + def _encode_sync( + self, + shard_array: NDBuffer, + shard_spec: ArraySpec, + ) -> Buffer | None: + """Synchronous full-shard encode. + + Encodes each inner chunk through the inner codec pipeline's sync path, + then assembles the shard with index bytes. + """ + shard_shape = shard_spec.shape + chunk_shape = self.chunk_shape + chunks_per_shard = self._get_chunks_per_shard(shard_spec) + chunk_spec = self._get_chunk_spec(shard_spec) + + indexer = list( + BasicIndexer( + tuple(slice(0, s) for s in shard_shape), + shape=shard_shape, + chunk_grid=RegularChunkGrid(chunk_shape=chunk_shape), + ) + ) + + shard_builder = dict.fromkeys(morton_order_iter(chunks_per_shard)) + + # Encode each inner chunk synchronously + self.codec_pipeline.write_sync( + [ + ( + _ShardingByteSetter(shard_builder, chunk_coords), + chunk_spec, + chunk_selection, + out_selection, + is_complete_shard, + ) + for chunk_coords, chunk_selection, out_selection, is_complete_shard in indexer + ], + shard_array, + ) + + return self._encode_shard_dict_sync( + shard_builder, + chunks_per_shard=chunks_per_shard, + buffer_prototype=default_buffer_prototype(), + ) + + def _encode_shard_dict_sync( + self, + map: ShardMapping, + chunks_per_shard: tuple[int, ...], + buffer_prototype: BufferPrototype, + ) -> Buffer | None: + """Synchronous version of _encode_shard_dict.""" + index = _ShardIndex.create_empty(chunks_per_shard) + + buffers = [] + + template = buffer_prototype.buffer.create_zero_length() + chunk_start = 0 + for chunk_coords in morton_order_iter(chunks_per_shard): + value = map.get(chunk_coords) + if value is None: + continue + + if len(value) == 0: + continue + + chunk_length = len(value) + buffers.append(value) + index.set_chunk_slice(chunk_coords, slice(chunk_start, chunk_start + chunk_length)) + chunk_start += chunk_length + + if len(buffers) == 0: + return None + + index_bytes = self._encode_shard_index_sync(index) + if self.index_location == ShardingCodecIndexLocation.start: + empty_chunks_mask = index.offsets_and_lengths[..., 0] == MAX_UINT_64 + index.offsets_and_lengths[~empty_chunks_mask, 0] += len(index_bytes) + index_bytes = self._encode_shard_index_sync(index) # encode again with corrected offsets + buffers.insert(0, index_bytes) + else: + buffers.append(index_bytes) + + return template.combine(buffers) + + def _load_shard_index_maybe_sync( + self, byte_getter: Any, chunks_per_shard: tuple[int, ...] + ) -> _ShardIndex | None: + """Synchronous version of _load_shard_index_maybe. + + Reads the shard index bytes via byte_getter.get_sync (a sync byte-range + read from the store), then decodes the index inline. 
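+
+        The byte range depends on ``index_location``: a prefix read when the
+        index is stored at the start of the shard, a suffix read when it is
+        stored at the end.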
+ """ + shard_index_size = self._shard_index_size(chunks_per_shard) + if self.index_location == ShardingCodecIndexLocation.start: + index_bytes = byte_getter.get_sync( + prototype=numpy_buffer_prototype(), + byte_range=RangeByteRequest(0, shard_index_size), + ) + else: + index_bytes = byte_getter.get_sync( + prototype=numpy_buffer_prototype(), + byte_range=SuffixByteRequest(shard_index_size), + ) + if index_bytes is not None: + return self._decode_shard_index_sync(index_bytes, chunks_per_shard) + return None + + def _load_full_shard_maybe_sync( + self, + byte_getter: Any, + prototype: BufferPrototype, + chunks_per_shard: tuple[int, ...], + ) -> _ShardReader | None: + """Synchronous version of _load_full_shard_maybe.""" + shard_bytes = byte_getter.get_sync(prototype=prototype) + return ( + _ShardReader.from_bytes_sync(shard_bytes, self, chunks_per_shard) + if shard_bytes + else None + ) + + def _decode_partial_sync( + self, + byte_getter: Any, + selection: SelectorTuple, + shard_spec: ArraySpec, + ) -> NDBuffer | None: + """Synchronous partial decode: fetch shard index + requested chunks + via sync byte-range reads, then decode through the inner pipeline. + + The byte_getter is a StorePath with get_sync(). After fetching the + index (one byte-range read), each requested chunk is another byte-range + read. Once all bytes are in memory, the inner pipeline decodes them + synchronously via read_sync with _ShardingByteGetter (dict lookups). + """ + shard_shape = shard_spec.shape + chunk_shape = self.chunk_shape + chunks_per_shard = self._get_chunks_per_shard(shard_spec) + chunk_spec = self._get_chunk_spec(shard_spec) + + indexer = get_indexer( + selection, + shape=shard_shape, + chunk_grid=RegularChunkGrid(chunk_shape=chunk_shape), + ) + + # setup output array + out = shard_spec.prototype.nd_buffer.empty( + shape=indexer.shape, + dtype=shard_spec.dtype.to_native_dtype(), + order=shard_spec.order, + ) + + indexed_chunks = list(indexer) + all_chunk_coords = {chunk_coords for chunk_coords, *_ in indexed_chunks} + + # reading bytes of all requested chunks + shard_dict: ShardMapping = {} + if self._is_total_shard(all_chunk_coords, chunks_per_shard): + # read entire shard + shard_dict_maybe = self._load_full_shard_maybe_sync( + byte_getter=byte_getter, + prototype=chunk_spec.prototype, + chunks_per_shard=chunks_per_shard, + ) + if shard_dict_maybe is None: + return None + shard_dict = shard_dict_maybe + else: + # read some chunks within the shard + shard_index = self._load_shard_index_maybe_sync(byte_getter, chunks_per_shard) + if shard_index is None: + return None + shard_dict = {} + for chunk_coords in all_chunk_coords: + chunk_byte_slice = shard_index.get_chunk_slice(chunk_coords) + if chunk_byte_slice: + chunk_bytes = byte_getter.get_sync( + prototype=chunk_spec.prototype, + byte_range=RangeByteRequest(chunk_byte_slice[0], chunk_byte_slice[1]), + ) + if chunk_bytes: + shard_dict[chunk_coords] = chunk_bytes + + # decoding chunks and writing them into the output buffer + self.codec_pipeline.read_sync( + [ + ( + _ShardingByteGetter(shard_dict, chunk_coords), + chunk_spec, + chunk_selection, + out_selection, + is_complete_shard, + ) + for chunk_coords, chunk_selection, out_selection, is_complete_shard in indexer + ], + out, + ) + + if hasattr(indexer, "sel_shape"): + return out.reshape(indexer.sel_shape) + else: + return out + + def _encode_partial_sync( + self, + byte_setter: Any, + shard_array: NDBuffer, + selection: SelectorTuple, + shard_spec: ArraySpec, + ) -> None: + """Synchronous partial 
encode: read existing shard, merge new data, + encode and write back via sync store IO.""" + shard_shape = shard_spec.shape + chunk_shape = self.chunk_shape + chunks_per_shard = self._get_chunks_per_shard(shard_spec) + chunk_spec = self._get_chunk_spec(shard_spec) + + shard_reader = self._load_full_shard_maybe_sync( + byte_getter=byte_setter, + prototype=chunk_spec.prototype, + chunks_per_shard=chunks_per_shard, + ) + shard_reader = shard_reader or _ShardReader.create_empty(chunks_per_shard) + shard_dict = {k: shard_reader.get(k) for k in morton_order_iter(chunks_per_shard)} + + indexer = list( + get_indexer( + selection, shape=shard_shape, chunk_grid=RegularChunkGrid(chunk_shape=chunk_shape) + ) + ) + + self.codec_pipeline.write_sync( + [ + ( + _ShardingByteSetter(shard_dict, chunk_coords), + chunk_spec, + chunk_selection, + out_selection, + is_complete_shard, + ) + for chunk_coords, chunk_selection, out_selection, is_complete_shard in indexer + ], + shard_array, + ) + buf = self._encode_shard_dict_sync( + shard_dict, + chunks_per_shard=chunks_per_shard, + buffer_prototype=default_buffer_prototype(), + ) + + if buf is None: + byte_setter.delete_sync() + else: + byte_setter.set_sync(buf) + async def _decode_partial_single( self, byte_getter: ByteGetter, @@ -585,6 +924,73 @@ def _is_total_shard( chunk_coords in all_chunk_coords for chunk_coords in c_order_iter(chunks_per_shard) ) + def _decode_shard_index_sync( + self, index_bytes: Buffer, chunks_per_shard: tuple[int, ...] + ) -> _ShardIndex: + """Decode shard index synchronously by running index codecs inline. + + The index codecs are always simple codecs (BytesCodec + Crc32cCodec) + that support _decode_sync. We run them directly without going through + a pipeline: bytes-bytes codecs in reverse, then the array-bytes codec. 
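+
+        Running them inline avoids constructing a temporary pipeline object
+        for a simple two-codec chain.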
+ """ + index_chunk_spec = self._get_index_chunk_spec(chunks_per_shard) + + # Classify index codecs the same way a pipeline would + from zarr.core.codec_pipeline import codecs_from_list + + aa_codecs, ab_codec, bb_codecs = codecs_from_list(list(self.index_codecs)) + + # Resolve metadata through the chain + spec = index_chunk_spec + aa_with_spec = [] + for aa in aa_codecs: + aa_with_spec.append((aa, spec)) + spec = aa.resolve_metadata(spec) + ab_spec = spec + spec = ab_codec.resolve_metadata(spec) + bb_with_spec = [] + for bb in bb_codecs: + bb_with_spec.append((bb, spec)) + spec = bb.resolve_metadata(spec) + + # Decode: reverse bb, then ab, then reverse aa + chunk_bytes: Buffer | None = index_bytes + for bb_codec, s in reversed(bb_with_spec): + chunk_bytes = bb_codec._decode_sync(chunk_bytes, s) + chunk_array = ab_codec._decode_sync(chunk_bytes, ab_spec) + for aa_codec, s in reversed(aa_with_spec): + chunk_array = aa_codec._decode_sync(chunk_array, s) + + assert chunk_array is not None + return _ShardIndex(chunk_array.as_numpy_array()) + + def _encode_shard_index_sync(self, index: _ShardIndex) -> Buffer: + """Encode shard index synchronously by running index codecs inline.""" + index_chunk_spec = self._get_index_chunk_spec(index.chunks_per_shard) + + from zarr.core.codec_pipeline import codecs_from_list + + aa_codecs, ab_codec, bb_codecs = codecs_from_list(list(self.index_codecs)) + + chunk_array: NDBuffer | None = get_ndbuffer_class().from_numpy_array( + index.offsets_and_lengths + ) + + # Encode: aa forward, then ab, then bb forward + spec = index_chunk_spec + for aa_codec in aa_codecs: + chunk_array = aa_codec._encode_sync(chunk_array, spec) + spec = aa_codec.resolve_metadata(spec) + chunk_bytes = ab_codec._encode_sync(chunk_array, spec) + spec = ab_codec.resolve_metadata(spec) + for bb_codec in bb_codecs: + chunk_bytes = bb_codec._encode_sync(chunk_bytes, spec) + spec = bb_codec.resolve_metadata(spec) + + assert chunk_bytes is not None + assert isinstance(chunk_bytes, Buffer) + return chunk_bytes + async def _decode_shard_index( self, index_bytes: Buffer, chunks_per_shard: tuple[int, ...] ) -> _ShardIndex: diff --git a/src/zarr/core/array.py b/src/zarr/core/array.py index 2373976219..703fae1a24 100644 --- a/src/zarr/core/array.py +++ b/src/zarr/core/array.py @@ -1976,21 +1976,19 @@ def config(self) -> ArrayConfig: def _can_use_sync_path(self) -> bool: """Check if we can bypass the event loop entirely for read/write. - Three conditions must hold: + Two conditions must hold: 1. The codec pipeline supports fully synchronous IO (all codecs in the chain have _decode_sync/_encode_sync, and the pipeline implements read_sync/write_sync). This is True for - SyncCodecPipeline when all codecs support sync. + SyncCodecPipeline when all codecs support sync — including + ShardingCodec, which has _decode_sync/_encode_sync and + _decode_partial_sync/_encode_partial_sync for the sharding path. - 2. Partial decode is NOT active. Partial decode is used by sharding, - whose decode_partial method is async (it issues byte-range reads). - When sharding is in use, we must fall back to the sync() bridge. - - 3. The store supports synchronous operations (MemoryStore, LocalStore). + 2. The store supports synchronous operations (MemoryStore, LocalStore). Remote stores like FsspecStore remain async-only. 
- When all three hold, the selection methods below call + When both hold, the selection methods below call _get_selection_sync / _set_selection_sync directly, running the entire read/write path on the calling thread with zero async overhead. @@ -2003,7 +2001,6 @@ def _can_use_sync_path(self) -> bool: store_path = self.async_array.store_path return ( getattr(pipeline, "supports_sync_io", False) - and not pipeline.supports_partial_decode and getattr(store_path, "supports_sync", False) ) diff --git a/src/zarr/experimental/sync_codecs.py b/src/zarr/experimental/sync_codecs.py index 858579c056..d021969adb 100644 --- a/src/zarr/experimental/sync_codecs.py +++ b/src/zarr/experimental/sync_codecs.py @@ -608,9 +608,10 @@ def _merge_and_filter( @property def supports_sync_io(self) -> bool: - # Only enable the fully-sync path when every codec in the chain - # supports synchronous dispatch. If any codec lacks _decode_sync - # (e.g. ShardingCodec), we fall back to the async path. + # Enable the fully-sync path when every codec in the chain supports + # synchronous dispatch. This includes ShardingCodec, which has + # _decode_sync/_encode_sync (full shard) and _decode_partial_sync/ + # _encode_partial_sync (byte-range reads for partial shard access). return self._all_sync def read_sync( @@ -623,6 +624,27 @@ def read_sync( if not batch_info_list: return + # Partial decode path: when the array_bytes_codec supports partial + # decode (e.g. ShardingCodec), delegate to its _decode_partial_sync. + # This handles shard index fetch + per-chunk byte-range reads + inner + # codec decode, all synchronously. + if self.supports_partial_decode: + # The array_bytes_codec is a ShardingCodec (or similar) that has + # _decode_partial_sync. We use getattr to avoid coupling to the + # concrete type — the type system can't express this through the + # ArrayBytesCodecPartialDecodeMixin protocol. + ab_codec: Any = self.array_bytes_codec + for byte_getter, chunk_spec, chunk_selection, out_selection, _ in batch_info_list: + chunk_array: NDBuffer | None = ab_codec._decode_partial_sync( + byte_getter, chunk_selection, chunk_spec + ) + if chunk_array is not None: + out[out_selection] = chunk_array + else: + out[out_selection] = _fill_value_or_default(chunk_spec) + return + + # Non-partial path: standard sync decode through the full codec chain. # Resolve the metadata chain once: compute the ArraySpec at each # codec boundary. All chunks in a single array share the same codec # structure, so this is invariant across the loop. @@ -661,6 +683,22 @@ def write_sync( if not batch_info_list: return + # Partial encode path: when the array_bytes_codec supports partial + # encode (e.g. ShardingCodec), delegate to its _encode_partial_sync. + # This reads the existing shard, merges new data, encodes and writes + # back, all synchronously. 
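+        # Scalar (0-d) values are broadcast to every chunk in the batch;
+        # nd values are sliced with out_selection per chunk, as below.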
+ if self.supports_partial_encode: + ab_codec: Any = self.array_bytes_codec + if len(value.shape) == 0: + for byte_setter, chunk_spec, chunk_selection, _, _ in batch_info_list: + ab_codec._encode_partial_sync(byte_setter, value, chunk_selection, chunk_spec) + else: + for byte_setter, chunk_spec, chunk_selection, out_selection, _ in batch_info_list: + ab_codec._encode_partial_sync( + byte_setter, value[out_selection], chunk_selection, chunk_spec + ) + return + for ( byte_setter, chunk_spec, diff --git a/tests/test_sync_codec_pipeline.py b/tests/test_sync_codec_pipeline.py index ac5ec8fa2e..fa65d4fcc6 100644 --- a/tests/test_sync_codec_pipeline.py +++ b/tests/test_sync_codec_pipeline.py @@ -52,10 +52,10 @@ def test_bytes_supports_sync(self): def test_transpose_supports_sync(self): assert TransposeCodec(order=(0, 1)).supports_sync - def test_sharding_does_not_support_sync(self): + def test_sharding_supports_sync(self): from zarr.codecs.sharding import ShardingCodec - assert not ShardingCodec(chunk_shape=(8,)).supports_sync + assert ShardingCodec(chunk_shape=(8,)).supports_sync # --------------------------------------------------------------------------- @@ -138,7 +138,7 @@ def test_from_codecs_accepts_sharding(self): pipeline = SyncCodecPipeline.from_codecs([ShardingCodec(chunk_shape=(8,))]) assert isinstance(pipeline, SyncCodecPipeline) - assert not pipeline._all_sync + assert pipeline._all_sync def test_from_codecs_rejects_missing_array_bytes(self): with pytest.raises(ValueError, match="Required ArrayBytesCodec"): From aeecda865eba38ea5a3e60602fea823ec24ec84e Mon Sep 17 00:00:00 2001 From: Davis Vann Bennett Date: Thu, 19 Feb 2026 12:35:36 +0100 Subject: [PATCH 07/23] fix array spec propagation --- src/zarr/experimental/sync_codecs.py | 39 +++++++++++++++++++++++----- 1 file changed, 33 insertions(+), 6 deletions(-) diff --git a/src/zarr/experimental/sync_codecs.py b/src/zarr/experimental/sync_codecs.py index d021969adb..e59f06791c 100644 --- a/src/zarr/experimental/sync_codecs.py +++ b/src/zarr/experimental/sync_codecs.py @@ -232,21 +232,48 @@ async def _decode_async( self, chunk_bytes_and_specs: list[tuple[Buffer | None, ArraySpec]], ) -> Iterable[NDBuffer | None]: - """Async fallback: walk codecs one at a time (like BatchedCodecPipeline).""" + """Async fallback: walk codecs one at a time (like BatchedCodecPipeline). + + Metadata must be resolved forward through the codec chain so each codec + gets the correct spec during reverse (decode) traversal. This matches + BatchedCodecPipeline._codecs_with_resolved_metadata_batched. + """ chunk_bytes_batch, chunk_specs = _unzip2(chunk_bytes_and_specs) - for bb_codec in self.bytes_bytes_codecs[::-1]: + # Resolve metadata forward: aa → ab → bb, recording the spec at each step. + aa_specs: list[list[ArraySpec]] = [] + specs = list(chunk_specs) + for aa_codec in self.array_array_codecs: + aa_specs.append(specs) + specs = [aa_codec.resolve_metadata(s) for s in specs] + + ab_specs = specs + specs = [self.array_bytes_codec.resolve_metadata(s) for s in specs] + + bb_specs: list[list[ArraySpec]] = [] + for bb_codec in self.bytes_bytes_codecs: + bb_specs.append(specs) + specs = [bb_codec.resolve_metadata(s) for s in specs] + + # Decode in reverse, using the forward-resolved specs. 
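+        # (The spec lists are built from the same codec tuples, so the
+        # zipped lengths always match.)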
+ for bb_codec, bb_spec in zip( + self.bytes_bytes_codecs[::-1], bb_specs[::-1], strict=False + ): chunk_bytes_batch = list( - await bb_codec.decode(zip(chunk_bytes_batch, chunk_specs, strict=False)) + await bb_codec.decode(zip(chunk_bytes_batch, bb_spec, strict=False)) ) chunk_array_batch: list[NDBuffer | None] = list( - await self.array_bytes_codec.decode(zip(chunk_bytes_batch, chunk_specs, strict=False)) + await self.array_bytes_codec.decode( + zip(chunk_bytes_batch, ab_specs, strict=False) + ) ) - for aa_codec in self.array_array_codecs[::-1]: + for aa_codec, aa_spec in zip( + self.array_array_codecs[::-1], aa_specs[::-1], strict=False + ): chunk_array_batch = list( - await aa_codec.decode(zip(chunk_array_batch, chunk_specs, strict=False)) + await aa_codec.decode(zip(chunk_array_batch, aa_spec, strict=False)) ) return chunk_array_batch From 69172fb4edae4663f59efc1353aa6491f6695d91 Mon Sep 17 00:00:00 2001 From: Davis Vann Bennett Date: Thu, 19 Feb 2026 13:34:02 +0100 Subject: [PATCH 08/23] fix countingdict tests --- tests/test_indexing.py | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/tests/test_indexing.py b/tests/test_indexing.py index c0bf7dd270..ac54cd0cb4 100644 --- a/tests/test_indexing.py +++ b/tests/test_indexing.py @@ -34,6 +34,7 @@ if TYPE_CHECKING: from collections.abc import AsyncGenerator + from zarr.abc.store import ByteRequest from zarr.core.buffer import BufferPrototype from zarr.core.buffer.core import Buffer @@ -83,6 +84,21 @@ async def set(self, key: str, value: Buffer, byte_range: tuple[int, int] | None self.counter["__setitem__", key_suffix] += 1 return await super().set(key, value, byte_range) + def get_sync( + self, + key: str, + prototype: BufferPrototype | None = None, + byte_range: ByteRequest | None = None, + ) -> Buffer | None: + key_suffix = "/".join(key.split("/")[1:]) + self.counter["__getitem__", key_suffix] += 1 + return super().get_sync(key, prototype, byte_range) + + def set_sync(self, key: str, value: Buffer) -> None: + key_suffix = "/".join(key.split("/")[1:]) + self.counter["__setitem__", key_suffix] += 1 + return super().set_sync(key, value) + def test_normalize_integer_selection() -> None: assert 1 == normalize_integer_selection(1, 100) From 28d0def03a09d0bf440e0862e5d75f341c077abc Mon Sep 17 00:00:00 2001 From: Davis Vann Bennett Date: Thu, 19 Feb 2026 13:43:14 +0100 Subject: [PATCH 09/23] update design doc --- docs/design/sync-bypass.md | 138 +++++++++++++++++++++++++++++++------ 1 file changed, 116 insertions(+), 22 deletions(-) diff --git a/docs/design/sync-bypass.md b/docs/design/sync-bypass.md index fdb3702232..2e93f5b702 100644 --- a/docs/design/sync-bypass.md +++ b/docs/design/sync-bypass.md @@ -94,7 +94,7 @@ Add `supports_sync_io`, `read_sync()`, and `write_sync()` to the `SyncCodecPipeline` implements these with a simple sequential loop: ```python -# read_sync: for each chunk +# read_sync: for each chunk (non-sharded path) for byte_getter, chunk_spec, chunk_sel, out_sel, _ in batch_info: chunk_bytes = byte_getter.get_sync(prototype=chunk_spec.prototype) # sync IO chunk_array = self._decode_one(chunk_bytes, ...) # sync compute @@ -103,11 +103,13 @@ for byte_getter, chunk_spec, chunk_sel, out_sel, _ in batch_info: No batching, no `concurrent_map`, no event loop — just a Python for-loop. -**Sharding fallback**: When `supports_partial_decode` is True (i.e. the codec -pipeline uses sharding), `supports_sync_io` returns False and the Array falls -back to the standard `sync()` path. 
This is because `ShardingCodec`'s -`decode_partial` is async (it reads sub-ranges from the store) and does not -have a sync equivalent. +**Sharding support**: When the pipeline uses `ShardingCodec` (i.e. +`supports_partial_decode` is True), `read_sync` delegates to +`ShardingCodec._decode_partial_sync()` instead. This method fetches +the shard index and requested chunk bytes via sync byte-range reads +(`byte_getter.get_sync()` with `RangeByteRequest`/`SuffixByteRequest`), +then decodes through the inner pipeline's `read_sync` — all on the +calling thread. See [Sync Sharding](#sync-sharding) below for details. ### Layer 3: Array Bypass @@ -126,12 +128,11 @@ def get_basic_selection(self, selection, *, out=None, prototype=None, fields=Non return sync(self.async_array._get_selection(indexer, ...)) ``` -`_can_use_sync_path()` checks three conditions: +`_can_use_sync_path()` checks two conditions: 1. The codec pipeline supports sync IO (`supports_sync_io`) -2. No partial decode is active (rules out sharding) -3. The store supports sync (`supports_sync`) +2. The store supports sync (`supports_sync`) -When all three hold, `_get_selection_sync` / `_set_selection_sync` run the +When both hold, `_get_selection_sync` / `_set_selection_sync` run the entire operation on the calling thread. These functions mirror the async `_get_selection` / `_set_selection` exactly, but call `codec_pipeline.read_sync()` / `write_sync()` instead of `await codec_pipeline.read()` / `write()`. @@ -139,7 +140,7 @@ entire operation on the calling thread. These functions mirror the async ## Resulting Call Chain -With the sync bypass active, the call chain becomes: +With the sync bypass active, the call chain for non-sharded arrays becomes: ``` Array.__getitem__ @@ -150,9 +151,54 @@ Array.__getitem__ └─ out[sel] = array # scatter into output ``` +For sharded arrays: + +``` +Array.__getitem__ + └─ _get_selection_sync # runs on calling thread + └─ SyncCodecPipeline.read_sync + └─ ShardingCodec._decode_partial_sync + ├─ StorePath.get_sync(byte_range) # sync byte-range read for shard index + ├─ _decode_shard_index_sync # inline index codec chain + ├─ StorePath.get_sync(byte_range) # sync byte-range read per chunk + └─ inner_pipeline.read_sync # inner codec chain (sync) + ├─ _ShardingByteGetter.get_sync # dict lookup + ├─ _decode_one # inline codec chain + └─ out[sel] = array # scatter +``` + No `sync()`, no event loop, no `asyncio.to_thread`, no `concurrent_map`. +## Sync Sharding + +`ShardingCodec` participates in the fully-synchronous path through sync +variants of all its methods: + +**Shard index codec chain**: The index codecs (typically `BytesCodec` + +`Crc32cCodec`) are run inline via `_decode_shard_index_sync` / +`_encode_shard_index_sync`. These classify the index codecs using +`codecs_from_list`, resolve metadata forward through the chain, then +run the decode/encode in the correct order — all without constructing a +pipeline object. + +**Full shard decode/encode** (`_decode_sync` / `_encode_sync`): Receives +complete shard bytes, decodes the index, then delegates to the inner +codec pipeline's `read_sync` / `write_sync` with `_ShardingByteGetter` / +`_ShardingByteSetter` (dict-backed, so "IO" is a dict lookup). + +**Partial shard decode/encode** (`_decode_partial_sync` / +`_encode_partial_sync`): The partial path is where most of the IO happens — +it issues sync byte-range reads to fetch the shard index and individual +chunk data from the store. Once bytes are in memory, the inner pipeline +decodes them synchronously. 
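+
+A condensed sketch of the partial-decode flow, simplified from
+`_decode_partial_sync` above (`requested_chunks` and `batch_info` are
+schematic stand-ins for the indexer plumbing):
+
+```python
+# one sync byte-range read fetches the shard index
+index = self._load_shard_index_maybe_sync(byte_getter, chunks_per_shard)
+if index is None:
+    return None  # missing shard: caller fills with fill_value
+
+# one sync byte-range read per requested chunk
+shard_dict = {}
+for chunk_coords in requested_chunks:
+    byte_slice = index.get_chunk_slice(chunk_coords)
+    if byte_slice:  # skip chunks the index marks as empty
+        shard_dict[chunk_coords] = byte_getter.get_sync(
+            prototype=chunk_spec.prototype,
+            byte_range=RangeByteRequest(byte_slice[0], byte_slice[1]),
+        )
+
+# inner decode: _ShardingByteGetter turns "IO" into dict lookups
+self.codec_pipeline.read_sync(batch_info, out)
+```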
+ +**Inner pipeline**: `ShardingCodec.codec_pipeline` is obtained via +`get_pipeline_class()`. When `SyncCodecPipeline` is configured globally, +the inner pipeline is also a `SyncCodecPipeline`, enabling recursive sync +dispatch for nested sharding. + + ## Additional Optimization: Codec Instance Caching `GzipCodec` was creating a new `GZip(level)` instance on every encode/decode @@ -169,14 +215,25 @@ This is safe because `GzipCodec` is a frozen dataclass — `level` never changes after construction, so the cached instance is always valid. +## Bugfix: _decode_async Metadata Resolution + +The async fallback path in `SyncCodecPipeline._decode_async()` (used when +a codec in the chain doesn't support sync) had a metadata resolution bug: +it passed the same unresolved `chunk_specs` to every codec during decode. + +Size-changing codecs like `FixedScaleOffset` and `PackBits` alter the data +shape/dtype, so each codec needs specs resolved through the forward chain. +The fix resolves metadata forward (aa -> ab -> bb), records specs at each +step, then uses the correct resolved specs during reverse decode traversal. +This matches `BatchedCodecPipeline._codecs_with_resolved_metadata_batched`. + + ## What Stays Unchanged - **`BatchedCodecPipeline`**: Unmodified. It inherits the default `supports_sync_io=False` from the ABC. - **Remote stores** (`FsspecStore`): `supports_sync` stays `False`. All remote IO remains async. -- **Sharded arrays**: Fall back to the `sync()` path because - `supports_partial_decode` is True. - **All async APIs**: `AsyncArray`, `async def read/write`, etc. are completely untouched. The sync bypass is an optimization of the synchronous `Array` class only. @@ -190,10 +247,39 @@ changes after construction, so the cached instance is always valid. 
| `src/zarr/storage/_memory.py` | 1 | Sync store methods (direct dict access) | | `src/zarr/storage/_local.py` | 1 | Sync store methods (direct `_get`/`_put` calls) | | `src/zarr/storage/_common.py` | 1 | Sync methods on `StorePath` (delegates to store) | -| `src/zarr/abc/codec.py` | 2 | `supports_sync_io`, `read_sync`, `write_sync` on `CodecPipeline` ABC | -| `src/zarr/experimental/sync_codecs.py` | 2 | `read_sync`, `write_sync` implementation | +| `src/zarr/abc/codec.py` | 2 | `_decode_sync`, `_encode_sync`, `supports_sync` on `BaseCodec`; `supports_sync_io`, `read_sync`, `write_sync` on `CodecPipeline` | +| `src/zarr/experimental/sync_codecs.py` | 2 | `read_sync`, `write_sync`, `_decode_async` metadata fix | +| `src/zarr/codecs/sharding.py` | 2 | `_decode_sync`, `_encode_sync`, `_decode_partial_sync`, `_encode_partial_sync`, shard index sync codec chain | | `src/zarr/core/array.py` | 3 | `_can_use_sync_path`, `_get_selection_sync`, `_set_selection_sync`, 10 method modifications | | `src/zarr/codecs/gzip.py` | — | `@cached_property` for GZip instance | +| `src/zarr/codecs/blosc.py` | — | `_decode_sync`/`_encode_sync`; `_decode_single`/`_encode_single` delegate to sync | +| `src/zarr/codecs/zstd.py` | — | `_decode_sync`/`_encode_sync`; `_decode_single`/`_encode_single` delegate to sync | +| `src/zarr/codecs/bytes.py` | — | `_decode_sync`/`_encode_sync` (was `_decode_single`/`_encode_single`) | +| `src/zarr/codecs/crc32c_.py` | — | `_decode_sync`/`_encode_sync` (was `_decode_single`/`_encode_single`) | +| `src/zarr/codecs/transpose.py` | — | `_decode_sync`/`_encode_sync`; `_decode_single`/`_encode_single` delegate to sync | +| `src/zarr/codecs/vlen_utf8.py` | — | `_decode_sync`/`_encode_sync` for `VLenUTF8Codec` and `VLenBytesCodec` | + + +## Performance + +Benchmarks on MemoryStore with `SyncCodecPipeline` vs `BatchedCodecPipeline`: + +**Non-sharded arrays** (zstd compression, 100x100 float64, 32x32 chunks): +- Single-chunk read: ~2-4x faster +- Full-array read: ~2-11x faster (varies with chunk count) +- Single-chunk write: ~2-3x faster + +**Sharded arrays** (4x4 shard of 8x8 inner chunks, zstd, MemoryStore): +- Single-chunk read: ~1.5-2.5x faster +- Full-array read: ~1.5-2x faster +- Single-chunk write: ~1.3-1.6x faster +- Full-array write: ~1.3-1.5x faster + +The sharded speedup is smaller because the shard index decode and +per-chunk byte-range reads add overhead that wasn't present in the +non-sharded path. Still, eliminating the event loop round-trip and +`asyncio.to_thread` for each inner chunk decode provides a meaningful +improvement. ## Design Tradeoffs @@ -216,13 +302,21 @@ can be overlapped. **`Any` type annotations**: The `read_sync` and `write_sync` methods on `SyncCodecPipeline` use `Any` for the byte_getter/byte_setter type in the `batch_info` tuples. This avoids modifying the `ByteGetter`/`ByteSetter` -protocols, which are public API. The runtime type is always `StorePath`, which +protocols, which are public API. The runtime type is always `StorePath` (or +`_ShardingByteGetter`/`_ShardingByteSetter` for inner-shard access), which has the sync methods; the type system just can't express this constraint through the existing protocol hierarchy. -**No sync partial decode/encode**: Sharding's `decode_partial` / -`encode_partial` methods are inherently async (they issue byte-range reads to -the store). Rather than adding sync variants to the sharding codec (which -would require significant refactoring), we simply fall back to the `sync()` -path for sharded arrays. 
This is the right tradeoff because sharded arrays -typically involve remote stores where async IO is beneficial anyway. +**Sync sharding — sequential chunk reads**: The sync partial decode path +fetches each chunk's bytes sequentially via `byte_getter.get_sync()` with +byte-range requests. The async path can overlap these reads via +`concurrent_map`. For MemoryStore this doesn't matter (dict lookup is ~1us). +For LocalStore, OS page cache means sequential reads are fast for warm data. +For remote stores where overlapping IO would help, `supports_sync` is False +and the async path is used automatically. + +**Inline shard index codec chain**: `_decode_shard_index_sync` and +`_encode_shard_index_sync` run the index codecs (BytesCodec + Crc32cCodec) +directly rather than constructing a temporary `CodecPipeline`. This avoids +the overhead of pipeline construction for a simple two-codec chain and keeps +the sync path self-contained. From f8e39e6209b967549d365db3bd064daee30e01b5 Mon Sep 17 00:00:00 2001 From: Davis Vann Bennett Date: Thu, 19 Feb 2026 14:46:56 +0100 Subject: [PATCH 10/23] dynamic pool allocation --- src/zarr/experimental/sync_codecs.py | 353 +++++++++++++++++++++------ src/zarr/testing/buffer.py | 18 ++ 2 files changed, 293 insertions(+), 78 deletions(-) diff --git a/src/zarr/experimental/sync_codecs.py b/src/zarr/experimental/sync_codecs.py index e59f06791c..2637c3d562 100644 --- a/src/zarr/experimental/sync_codecs.py +++ b/src/zarr/experimental/sync_codecs.py @@ -14,6 +14,8 @@ from __future__ import annotations +import os +from concurrent.futures import ThreadPoolExecutor from dataclasses import dataclass from itertools import islice from typing import TYPE_CHECKING, TypeVar @@ -29,7 +31,7 @@ ) from zarr.core.buffer import Buffer, NDBuffer from zarr.core.codec_pipeline import _unzip2, codecs_from_list, resolve_batched -from zarr.core.common import concurrent_map +from zarr.core.common import concurrent_map, product from zarr.core.config import config from zarr.core.indexing import SelectorTuple, is_scalar from zarr.registry import register_pipeline @@ -69,6 +71,150 @@ def _fill_value_or_default(chunk_spec: ArraySpec) -> Any: return fill_value +def _get_pool(max_workers: int) -> ThreadPoolExecutor: + """Get a thread pool with at most *max_workers* threads. + + Reuses a cached pool when the requested size is <= the cached size. + CPU-heavy codecs (zstd, gzip, blosc) release the GIL during their C-level + compress/decompress calls, so real parallelism is achieved across threads. + """ + global _pool + if _pool is None or _pool._max_workers < max_workers: + _pool = ThreadPoolExecutor(max_workers=max_workers) + return _pool + + +_pool: ThreadPoolExecutor | None = None + +# Sentinel to distinguish "delete this key" from None (which _encode_one +# can return when a chunk encodes to nothing). +_DELETED = object() + +# --------------------------------------------------------------------------- +# Work estimation for thread pool sizing +# --------------------------------------------------------------------------- + +# Approximate nanoseconds-per-byte for codec decode and encode, measured on +# typical hardware. These don't need to be exact — they just need to rank +# codecs correctly so the pool-sizing heuristic makes good decisions. 
+# +# Decode and encode have very different costs for many codecs: +# - gzip decode ~5-10 ns/byte vs encode ~50-100 ns/byte +# - zstd decode ~1-2 ns/byte vs encode ~2-10 ns/byte +# - blosc decode ~0.5-1 ns/byte vs encode ~1-5 ns/byte +# +# "Cheap" codecs (memcpy-like): BytesCodec, Crc32cCodec, TransposeCodec +# → ~0.1-1 ns/byte, dominated by memcpy; no benefit from threading. +# "Medium" codecs: ZstdCodec, BloscCodec +# → decode ~1-2 ns/byte, encode ~2-5 ns/byte; GIL released in C. +# "Expensive" codecs: GzipCodec +# → decode ~5-10 ns/byte, encode ~50-100 ns/byte; GIL released in C. +# +# For unknown codecs (e.g. third-party numcodecs wrappers), we assume +# "medium" cost — better to over-parallelize slightly than miss a win. + +_CODEC_DECODE_NS_PER_BYTE: dict[str, float] = { + # Near-zero cost — just reshaping/copying/checksumming + "BytesCodec": 0, + "Crc32cCodec": 0, + "TransposeCodec": 0, + "VLenUTF8Codec": 0, + "VLenBytesCodec": 0, + # Medium cost — fast C codecs, GIL released + "ZstdCodec": 1, + "BloscCodec": 0.5, + # High cost — slower C codecs, GIL released + "GzipCodec": 8, +} + +_CODEC_ENCODE_NS_PER_BYTE: dict[str, float] = { + # Near-zero cost — just reshaping/copying/checksumming + "BytesCodec": 0, + "Crc32cCodec": 0, + "TransposeCodec": 0, + "VLenUTF8Codec": 0, + "VLenBytesCodec": 0, + # Medium cost — fast C codecs, GIL released + "ZstdCodec": 3, + "BloscCodec": 2, + # High cost — slower C codecs, GIL released + "GzipCodec": 50, +} + +_DEFAULT_DECODE_NS_PER_BYTE = 1 # assume medium for unknown codecs +_DEFAULT_ENCODE_NS_PER_BYTE = 3 # encode is typically slower + +# Thread pool dispatch overhead in nanoseconds (~50-100us per task). +# We only parallelize when the estimated per-chunk work exceeds this. +_POOL_OVERHEAD_NS = 200_000 + + +def _estimate_chunk_work_ns( + chunk_nbytes: int, + codecs: Iterable[Codec], + *, + is_encode: bool = False, +) -> float: + """Estimate nanoseconds of codec work for one chunk. + + Sums the per-byte cost of each codec in the chain, multiplied by the + chunk size. Uses separate decode/encode cost tables since compression + is typically much more expensive than decompression. + + This is a rough estimate — compression ratios, cache effects, + and hardware differences mean the actual time can vary 2-5x. But the + estimate is good enough to decide "use pool" vs "don't use pool". + """ + table = _CODEC_ENCODE_NS_PER_BYTE if is_encode else _CODEC_DECODE_NS_PER_BYTE + default = _DEFAULT_ENCODE_NS_PER_BYTE if is_encode else _DEFAULT_DECODE_NS_PER_BYTE + total_ns_per_byte = 0.0 + for codec in codecs: + name = type(codec).__name__ + total_ns_per_byte += table.get(name, default) + return chunk_nbytes * total_ns_per_byte + + +def _choose_workers( + n_chunks: int, + chunk_nbytes: int, + codecs: Iterable[Codec], + *, + is_encode: bool = False, +) -> int: + """Decide how many thread pool workers to use (0 = don't use pool). + + The model: + 1. Estimate per-chunk codec work in nanoseconds (decode or encode). + 2. If per-chunk work < pool dispatch overhead, return 0 (sequential). + Small chunks with fast codecs aren't worth the pool dispatch cost. + 3. Check that total codec work significantly exceeds total dispatch + overhead (n_chunks * per-task cost). If not, sequential is faster. + 4. Scale workers with total work, capped at CPU count and chunk count. 
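+
+    Example: 16 chunks of 1 MiB each, decoded with zstd (~1 ns/byte), give
+    roughly 1 ms of work per chunk (above the dispatch threshold) and ~16 ms
+    in total, which maps to 16 workers before the CPU and chunk-count caps.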
+ """ + if n_chunks < 2: + return 0 + + per_chunk_ns = _estimate_chunk_work_ns(chunk_nbytes, codecs, is_encode=is_encode) + + if per_chunk_ns < _POOL_OVERHEAD_NS: + return 0 + + # Total codec work must exceed total dispatch overhead by a margin. + # Each task submitted to pool.map has ~50us dispatch overhead. + total_work_ns = per_chunk_ns * n_chunks + total_dispatch_ns = n_chunks * 50_000 # ~50us per task + if total_work_ns < total_dispatch_ns * 3: + return 0 + + # Scale workers: each worker should do at least 1ms of work to + # amortize pool overhead. + target_per_worker_ns = 1_000_000 # 1ms + workers = max(1, int(total_work_ns / target_per_worker_ns)) + + cpu_count = os.cpu_count() or 4 + return min(workers, n_chunks, cpu_count) + + # --------------------------------------------------------------------------- # SyncCodecPipeline # --------------------------------------------------------------------------- @@ -618,10 +764,11 @@ def _merge_and_filter( # - Codec compute uses _decode_one() / _encode_one(), which call # each codec's _decode_sync/_encode_sync inline (no to_thread). # - # - Chunks are processed sequentially in a for-loop — no batching, - # no concurrent_map, no asyncio tasks. This is optimal for local - # stores where IO is ~1us (dict) or dominated by OS page cache - # (files), and where the GIL prevents true parallel codec work. + # - When there are multiple chunks, codec compute is parallelized + # across a thread pool. CPU-heavy codecs (zstd, gzip, blosc) + # release the GIL during C-level compress/decompress, so real + # parallelism is achieved. Store IO remains sequential (fast + # for local/memory stores). # # The byte_getter/byte_setter parameters are typed as `Any` because # the ByteGetter/ByteSetter protocols only define async methods. @@ -678,27 +825,86 @@ def read_sync( _, first_spec, *_ = batch_info_list[0] aa_chain, ab_pair, bb_chain = self._resolve_metadata_chain(first_spec) - for byte_getter, chunk_spec, chunk_selection, out_selection, _ in batch_info_list: - # Step 1: Sync store read — e.g. dict[key] for MemoryStore, - # Path.read_bytes() for LocalStore. No event loop involvement. - chunk_bytes: Buffer | None = byte_getter.get_sync(prototype=chunk_spec.prototype) + # Phase 1: IO — fetch all chunk bytes from the store sequentially. + # For MemoryStore this is a dict lookup (~1us), for LocalStore a + # file read that benefits from OS page cache. Sequential is fine. + chunk_bytes_list: list[Buffer | None] = [ + byte_getter.get_sync(prototype=chunk_spec.prototype) + for byte_getter, chunk_spec, *_ in batch_info_list + ] - # Step 2: Decode through the full codec chain (bytes→bytes - # codecs in reverse, then array→bytes, then array→array in - # reverse). All synchronous, all inline on this thread. - chunk_array = self._decode_one(chunk_bytes, chunk_spec, aa_chain, ab_pair, bb_chain) + # Phase 2: Decode — run the codec chain for each chunk. + # Estimate per-chunk codec work and decide whether to parallelize. 
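+        # All chunks in a batch share one spec and codec chain, so a single
+        # per-chunk estimate covers the whole batch.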
+ chunk_nbytes = product(first_spec.shape) * first_spec.dtype.item_size + n_workers = _choose_workers(len(batch_info_list), chunk_nbytes, self) + if n_workers > 0: + pool = _get_pool(n_workers) + chunk_arrays: list[NDBuffer | None] = list( + pool.map( + self._decode_one, + chunk_bytes_list, + [chunk_spec for _, chunk_spec, *_ in batch_info_list], + [aa_chain] * len(batch_info_list), + [ab_pair] * len(batch_info_list), + [bb_chain] * len(batch_info_list), + ) + ) + else: + chunk_arrays = [ + self._decode_one(chunk_bytes, chunk_spec, aa_chain, ab_pair, bb_chain) + for chunk_bytes, (_, chunk_spec, *_) in zip( + chunk_bytes_list, batch_info_list, strict=False + ) + ] - # Step 3: Scatter decoded chunk data into the output buffer. - # chunk_selection picks the relevant region within the decoded - # chunk; out_selection places it in the output array. - if chunk_array is not None: - tmp = chunk_array[chunk_selection] - if drop_axes != (): - tmp = tmp.squeeze(axis=drop_axes) - out[out_selection] = tmp - else: - # Chunk not found in store — fill with the array's fill value. - out[out_selection] = _fill_value_or_default(chunk_spec) + # Phase 3: Scatter decoded chunk data into the output buffer. + self._scatter(chunk_arrays, batch_info_list, out, drop_axes) + + def _write_chunk_compute( + self, + existing_bytes: Buffer | None, + chunk_spec: ArraySpec, + chunk_selection: SelectorTuple, + out_selection: SelectorTuple, + is_complete_chunk: bool, + value: NDBuffer, + drop_axes: tuple[int, ...], + ) -> Buffer | None | object: # object is _DELETED sentinel + """Per-chunk compute for write: decode existing → merge → encode. + + Returns encoded bytes, or _DELETED sentinel if the chunk should + be removed from the store. Thread-safe: operates only on its own + chunk data, no shared mutable state. + """ + # Decode existing chunk (for partial writes) + existing_array: NDBuffer | None = None + if existing_bytes is not None: + aa_chain, ab_pair, bb_chain = self._resolve_metadata_chain(chunk_spec) + existing_array = self._decode_one( + existing_bytes, chunk_spec, aa_chain, ab_pair, bb_chain + ) + + # Merge new data into the chunk + chunk_array: NDBuffer | None = self._merge_chunk_array( + existing_array, value, out_selection, chunk_spec, + chunk_selection, is_complete_chunk, drop_axes, + ) + + # Filter empty chunks + if ( + chunk_array is not None + and not chunk_spec.config.write_empty_chunks + and chunk_array.all_equal(_fill_value_or_default(chunk_spec)) + ): + chunk_array = None + + # Encode + if chunk_array is None: + return _DELETED + chunk_bytes = self._encode_one(chunk_array, chunk_spec) + if chunk_bytes is None: + return _DELETED + return chunk_bytes def write_sync( self, @@ -726,64 +932,55 @@ def write_sync( ) return - for ( - byte_setter, - chunk_spec, - chunk_selection, - out_selection, - is_complete_chunk, - ) in batch_info_list: - # Phase 1: For partial writes (when we're only updating part of - # a chunk), read the existing chunk bytes from the store so we - # can merge the new data into it. For complete-chunk writes, - # skip this — we'll overwrite the entire chunk. - existing_bytes: Buffer | None = None - if not is_complete_chunk: - existing_bytes = byte_setter.get_sync(prototype=chunk_spec.prototype) - - # Phase 2: Decode the existing chunk bytes (if any) so we can - # merge new data into the decoded array. 
- existing_array: NDBuffer | None = None - if existing_bytes is not None: - aa_chain, ab_pair, bb_chain = self._resolve_metadata_chain(chunk_spec) - existing_array = self._decode_one( - existing_bytes, chunk_spec, aa_chain, ab_pair, bb_chain - ) + # Phase 1: IO — read existing chunk bytes for partial writes. + existing_bytes_list: list[Buffer | None] = [ + byte_setter.get_sync(prototype=chunk_spec.prototype) + if not is_complete_chunk + else None + for byte_setter, chunk_spec, _, _, is_complete_chunk in batch_info_list + ] - # Phase 3: Merge new data into the chunk. For complete chunks - # that match the chunk shape, this is a direct passthrough. - # For partial writes, it creates a new buffer (or copies the - # existing one) and splices in the new values. - chunk_array: NDBuffer | None = self._merge_chunk_array( - existing_array, - value, - out_selection, - chunk_spec, - chunk_selection, - is_complete_chunk, - drop_axes, + # Phase 2: Compute — decode existing, merge new data, encode. + # Estimate per-chunk work to decide whether to parallelize. + # Use encode cost model since writes are dominated by compression. + _, first_spec, *_ = batch_info_list[0] + chunk_nbytes = product(first_spec.shape) * first_spec.dtype.item_size + n_workers = _choose_workers(len(batch_info_list), chunk_nbytes, self, is_encode=True) + if n_workers > 0: + pool = _get_pool(n_workers) + encoded_list: list[Buffer | None | object] = list( + pool.map( + self._write_chunk_compute, + existing_bytes_list, + [chunk_spec for _, chunk_spec, *_ in batch_info_list], + [chunk_selection for _, _, chunk_selection, _, _ in batch_info_list], + [out_selection for _, _, _, out_selection, _ in batch_info_list], + [is_complete for _, _, _, _, is_complete in batch_info_list], + [value] * len(batch_info_list), + [drop_axes] * len(batch_info_list), + ) ) - - # Phase 4: If write_empty_chunks is False and the merged chunk - # is entirely fill values, skip writing it (delete instead). - if ( - chunk_array is not None - and not chunk_spec.config.write_empty_chunks - and chunk_array.all_equal(_fill_value_or_default(chunk_spec)) - ): - chunk_array = None - - # Phase 5: Encode and persist. If the chunk was determined to - # be empty (phase 4) or encoding returns None, delete the key. - # Otherwise, write the encoded bytes directly to the store. - if chunk_array is None: + else: + encoded_list = [ + self._write_chunk_compute( + existing_bytes, chunk_spec, chunk_selection, + out_selection, is_complete_chunk, value, drop_axes, + ) + for existing_bytes, ( + _, chunk_spec, chunk_selection, out_selection, is_complete_chunk, + ) in zip(existing_bytes_list, batch_info_list, strict=False) + ] + + # Phase 3: IO — write encoded chunks to store. + # A sentinel _DELETED object distinguishes "delete key" from + # "no-op" (which doesn't arise here, but keeps the logic clean). 
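+        # A plain None (encoding produced no bytes) also maps to a delete.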
+ for encoded, (byte_setter, *_) in zip(encoded_list, batch_info_list, strict=False): + if encoded is _DELETED: byte_setter.delete_sync() + elif encoded is not None: + byte_setter.set_sync(encoded) else: - chunk_bytes = self._encode_one(chunk_array, chunk_spec) - if chunk_bytes is None: - byte_setter.delete_sync() - else: - byte_setter.set_sync(chunk_bytes) + byte_setter.delete_sync() register_pipeline(SyncCodecPipeline) diff --git a/src/zarr/testing/buffer.py b/src/zarr/testing/buffer.py index 6096ece2f8..abedb07306 100644 --- a/src/zarr/testing/buffer.py +++ b/src/zarr/testing/buffer.py @@ -72,6 +72,11 @@ async def set(self, key: str, value: Buffer, byte_range: tuple[int, int] | None assert isinstance(value, TestBuffer) await super().set(key, value, byte_range) + def set_sync(self, key: str, value: Buffer) -> None: + if "json" not in key: + assert isinstance(value, TestBuffer) + super().set_sync(key, value) + async def get( self, key: str, @@ -84,3 +89,16 @@ async def get( if ret is not None: assert isinstance(ret, prototype.buffer) return ret + + def get_sync( + self, + key: str, + prototype: BufferPrototype | None = None, + byte_range: Any = None, + ) -> Buffer | None: + if "json" not in key and prototype is not None: + assert prototype.buffer is TestBuffer + ret = super().get_sync(key=key, prototype=prototype, byte_range=byte_range) + if ret is not None and prototype is not None: + assert isinstance(ret, prototype.buffer) + return ret From b388911e415bfc7cd6705dcdb490d019202bce06 Mon Sep 17 00:00:00 2001 From: Davis Vann Bennett Date: Thu, 19 Feb 2026 15:03:37 +0100 Subject: [PATCH 11/23] default to 1 itemsize for data types that don't declare it --- src/zarr/experimental/sync_codecs.py | 50 +++++++++++++++++----------- 1 file changed, 31 insertions(+), 19 deletions(-) diff --git a/src/zarr/experimental/sync_codecs.py b/src/zarr/experimental/sync_codecs.py index 2637c3d562..138520a63e 100644 --- a/src/zarr/experimental/sync_codecs.py +++ b/src/zarr/experimental/sync_codecs.py @@ -402,22 +402,16 @@ async def _decode_async( specs = [bb_codec.resolve_metadata(s) for s in specs] # Decode in reverse, using the forward-resolved specs. - for bb_codec, bb_spec in zip( - self.bytes_bytes_codecs[::-1], bb_specs[::-1], strict=False - ): + for bb_codec, bb_spec in zip(self.bytes_bytes_codecs[::-1], bb_specs[::-1], strict=False): chunk_bytes_batch = list( await bb_codec.decode(zip(chunk_bytes_batch, bb_spec, strict=False)) ) chunk_array_batch: list[NDBuffer | None] = list( - await self.array_bytes_codec.decode( - zip(chunk_bytes_batch, ab_specs, strict=False) - ) + await self.array_bytes_codec.decode(zip(chunk_bytes_batch, ab_specs, strict=False)) ) - for aa_codec, aa_spec in zip( - self.array_array_codecs[::-1], aa_specs[::-1], strict=False - ): + for aa_codec, aa_spec in zip(self.array_array_codecs[::-1], aa_specs[::-1], strict=False): chunk_array_batch = list( await aa_codec.decode(zip(chunk_array_batch, aa_spec, strict=False)) ) @@ -835,7 +829,10 @@ def read_sync( # Phase 2: Decode — run the codec chain for each chunk. # Estimate per-chunk codec work and decide whether to parallelize. - chunk_nbytes = product(first_spec.shape) * first_spec.dtype.item_size + # Not all dtypes have item_size (e.g. custom dtypes), so fall back + # to sequential processing when we can't estimate chunk size. 
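+        # (An item_size of 1 underestimates the work, which biases the
+        # heuristic toward the sequential path, the safe default.)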
+ dtype_item_size = getattr(first_spec.dtype, "item_size", 1) + chunk_nbytes = product(first_spec.shape) * dtype_item_size n_workers = _choose_workers(len(batch_info_list), chunk_nbytes, self) if n_workers > 0: pool = _get_pool(n_workers) @@ -886,8 +883,13 @@ def _write_chunk_compute( # Merge new data into the chunk chunk_array: NDBuffer | None = self._merge_chunk_array( - existing_array, value, out_selection, chunk_spec, - chunk_selection, is_complete_chunk, drop_axes, + existing_array, + value, + out_selection, + chunk_spec, + chunk_selection, + is_complete_chunk, + drop_axes, ) # Filter empty chunks @@ -934,17 +936,18 @@ def write_sync( # Phase 1: IO — read existing chunk bytes for partial writes. existing_bytes_list: list[Buffer | None] = [ - byte_setter.get_sync(prototype=chunk_spec.prototype) - if not is_complete_chunk - else None + byte_setter.get_sync(prototype=chunk_spec.prototype) if not is_complete_chunk else None for byte_setter, chunk_spec, _, _, is_complete_chunk in batch_info_list ] # Phase 2: Compute — decode existing, merge new data, encode. # Estimate per-chunk work to decide whether to parallelize. # Use encode cost model since writes are dominated by compression. + # Not all dtypes have item_size (e.g. custom dtypes), so fall back + # to sequential processing when we can't estimate chunk size. _, first_spec, *_ = batch_info_list[0] - chunk_nbytes = product(first_spec.shape) * first_spec.dtype.item_size + dtype_item_size = getattr(first_spec.dtype, "item_size", 1) + chunk_nbytes = product(first_spec.shape) * dtype_item_size n_workers = _choose_workers(len(batch_info_list), chunk_nbytes, self, is_encode=True) if n_workers > 0: pool = _get_pool(n_workers) @@ -963,11 +966,20 @@ def write_sync( else: encoded_list = [ self._write_chunk_compute( - existing_bytes, chunk_spec, chunk_selection, - out_selection, is_complete_chunk, value, drop_axes, + existing_bytes, + chunk_spec, + chunk_selection, + out_selection, + is_complete_chunk, + value, + drop_axes, ) for existing_bytes, ( - _, chunk_spec, chunk_selection, out_selection, is_complete_chunk, + _, + chunk_spec, + chunk_selection, + out_selection, + is_complete_chunk, ) in zip(existing_bytes_list, batch_info_list, strict=False) ] From 9d77ca574ce8664766964b2b8e7dd6c158e5b2ac Mon Sep 17 00:00:00 2001 From: Davis Vann Bennett Date: Thu, 19 Feb 2026 16:50:00 +0100 Subject: [PATCH 12/23] remove extra codec pipeline --- src/zarr/__init__.py | 2 +- src/zarr/abc/codec.py | 4 +- src/zarr/abc/store.py | 17 +- src/zarr/codecs/sharding.py | 8 +- src/zarr/core/array.py | 14 +- src/zarr/core/codec_pipeline.py | 774 ++++++++++++++++----- src/zarr/core/config.py | 2 +- src/zarr/experimental/sync_codecs.py | 998 +-------------------------- src/zarr/storage/_memory.py | 2 +- tests/test_config.py | 10 +- tests/test_sync_codec_pipeline.py | 65 +- 11 files changed, 694 insertions(+), 1202 deletions(-) diff --git a/src/zarr/__init__.py b/src/zarr/__init__.py index e206892fb6..d10000ed29 100644 --- a/src/zarr/__init__.py +++ b/src/zarr/__init__.py @@ -37,7 +37,7 @@ from zarr.core.array import Array, AsyncArray from zarr.core.config import config from zarr.core.group import AsyncGroup, Group -from zarr.experimental.sync_codecs import SyncCodecPipeline # noqa: F401 (registers pipeline) +from zarr.experimental.sync_codecs import SyncCodecPipeline # noqa: F401 (backwards compat) # in case setuptools scm screw up and find version to be 0.0.0 assert not __version__.startswith("0.0.0") diff --git a/src/zarr/abc/codec.py b/src/zarr/abc/codec.py index 
2c459eaefa..78dd9add5d 100644 --- a/src/zarr/abc/codec.py +++ b/src/zarr/abc/codec.py @@ -139,12 +139,12 @@ def validate( def _decode_sync(self, chunk_data: CodecOutput, chunk_spec: ArraySpec) -> CodecInput: """Synchronously decode a single chunk. Override in subclasses to enable - SyncCodecPipeline support.""" + sync codec pipeline support.""" raise NotImplementedError # pragma: no cover def _encode_sync(self, chunk_data: CodecInput, chunk_spec: ArraySpec) -> CodecOutput | None: """Synchronously encode a single chunk. Override in subclasses to enable - SyncCodecPipeline support.""" + sync codec pipeline support.""" raise NotImplementedError # pragma: no cover @property diff --git a/src/zarr/abc/store.py b/src/zarr/abc/store.py index d52a642d3c..8625c33536 100644 --- a/src/zarr/abc/store.py +++ b/src/zarr/abc/store.py @@ -527,23 +527,22 @@ def supports_partial_writes(self) -> Literal[False]: # ----------------------------------------------------------------------- # Synchronous IO interface (opt-in) # - # These methods enable the SyncCodecPipeline to bypass the event loop + # These methods enable the codec pipeline to bypass the event loop # entirely for store IO. The default implementations raise # NotImplementedError; stores that wrap fundamentally synchronous # operations (MemoryStore, LocalStore) override them with direct # implementations. Remote/cloud stores (FsspecStore) leave them as-is # and remain async-only. - # - # See docs/design/sync-bypass.md for the full design rationale. # ----------------------------------------------------------------------- @property def supports_sync(self) -> bool: """Whether this store has native synchronous get/set/delete methods. - When True, ``SyncCodecPipeline.read_sync`` / ``write_sync`` will call - ``get_sync`` / ``set_sync`` / ``delete_sync`` directly on the calling - thread, avoiding the event loop overhead of the async equivalents. + When True, the codec pipeline's ``read_sync`` / ``write_sync`` will + call ``get_sync`` / ``set_sync`` / ``delete_sync`` directly on the + calling thread, avoiding the event loop overhead of the async + equivalents. Subclasses that override the sync methods below should also override this property to return True. @@ -558,7 +557,7 @@ def get_sync( ) -> Buffer | None: """Synchronous version of ``get()``. - Called by ``SyncCodecPipeline.read_sync`` to fetch chunk bytes without + Called by the codec pipeline's ``read_sync`` to fetch chunk bytes without going through the event loop. Only called when ``supports_sync`` is True, so the default ``NotImplementedError`` is never hit in practice. """ @@ -567,7 +566,7 @@ def get_sync( def set_sync(self, key: str, value: Buffer) -> None: """Synchronous version of ``set()``. - Called by ``SyncCodecPipeline.write_sync`` to persist encoded chunk + Called by the codec pipeline's ``write_sync`` to persist encoded chunk bytes without going through the event loop. """ raise NotImplementedError @@ -575,7 +574,7 @@ def set_sync(self, key: str, value: Buffer) -> None: def delete_sync(self, key: str) -> None: """Synchronous version of ``delete()``. - Called by ``SyncCodecPipeline.write_sync`` when a chunk should be + Called by the codec pipeline's ``write_sync`` when a chunk should be removed (e.g. an empty chunk with ``write_empty_chunks=False``). 
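+
+        Only called when ``supports_sync`` is True, mirroring ``get_sync``.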
""" raise NotImplementedError diff --git a/src/zarr/codecs/sharding.py b/src/zarr/codecs/sharding.py index af2fc85147..b7b8af2668 100644 --- a/src/zarr/codecs/sharding.py +++ b/src/zarr/codecs/sharding.py @@ -102,7 +102,13 @@ def get_sync( self, prototype: BufferPrototype | None = None, byte_range: ByteRequest | None = None ) -> Buffer | None: # Sync equivalent of get() — just a dict lookup, no IO. - return self.shard_dict.get(self.chunk_coords) + value = self.shard_dict.get(self.chunk_coords) + if value is None: + return None + if byte_range is None: + return value + start, stop = _normalize_byte_range_index(value, byte_range) + return value[start:stop] @dataclass(frozen=True) diff --git a/src/zarr/core/array.py b/src/zarr/core/array.py index 703fae1a24..66b146fea0 100644 --- a/src/zarr/core/array.py +++ b/src/zarr/core/array.py @@ -1979,11 +1979,8 @@ def _can_use_sync_path(self) -> bool: Two conditions must hold: 1. The codec pipeline supports fully synchronous IO (all codecs in - the chain have _decode_sync/_encode_sync, and the pipeline - implements read_sync/write_sync). This is True for - SyncCodecPipeline when all codecs support sync — including - ShardingCodec, which has _decode_sync/_encode_sync and - _decode_partial_sync/_encode_partial_sync for the sharding path. + the chain have _decode_sync/_encode_sync). This is True for + BatchedCodecPipeline when all codecs support sync. 2. The store supports synchronous operations (MemoryStore, LocalStore). Remote stores like FsspecStore remain async-only. @@ -1991,11 +1988,8 @@ def _can_use_sync_path(self) -> bool: When both hold, the selection methods below call _get_selection_sync / _set_selection_sync directly, running the entire read/write path on the calling thread with zero async - overhead. - - Uses getattr() with defaults for forward compatibility — older or - third-party pipelines/stores that lack these attributes gracefully - fall back to the async path. + overhead. Otherwise, the async path with concurrent IO overlap + is used automatically. """ pipeline = self.async_array.codec_pipeline store_path = self.async_array.store_path diff --git a/src/zarr/core/codec_pipeline.py b/src/zarr/core/codec_pipeline.py index fd557ac43e..c3a291dd41 100644 --- a/src/zarr/core/codec_pipeline.py +++ b/src/zarr/core/codec_pipeline.py @@ -1,5 +1,7 @@ from __future__ import annotations +import os +from concurrent.futures import ThreadPoolExecutor from dataclasses import dataclass from itertools import islice, pairwise from typing import TYPE_CHECKING, Any, TypeVar @@ -14,7 +16,7 @@ Codec, CodecPipeline, ) -from zarr.core.common import concurrent_map +from zarr.core.common import concurrent_map, product from zarr.core.config import config from zarr.core.indexing import SelectorTuple, is_scalar from zarr.errors import ZarrUserWarning @@ -68,13 +70,138 @@ def fill_value_or_default(chunk_spec: ArraySpec) -> Any: return fill_value +# --------------------------------------------------------------------------- +# Work estimation for thread pool sizing +# --------------------------------------------------------------------------- + +# Approximate nanoseconds-per-byte for codec decode and encode, measured on +# typical hardware. These don't need to be exact — they just need to rank +# codecs correctly so the pool-sizing heuristic makes good decisions. 
+#
+# Decode and encode have very different costs for many codecs:
+#   - gzip decode ~5-10 ns/byte vs encode ~50-100 ns/byte
+#   - zstd decode ~1-2 ns/byte vs encode ~2-10 ns/byte
+#   - blosc decode ~0.5-1 ns/byte vs encode ~1-5 ns/byte
+#
+# "Cheap" codecs (memcpy-like): BytesCodec, Crc32cCodec, TransposeCodec
+#   → ~0.1-1 ns/byte, dominated by memcpy; no benefit from threading.
+# "Medium" codecs: ZstdCodec, BloscCodec
+#   → decode ~1-2 ns/byte, encode ~2-5 ns/byte; GIL released in C.
+# "Expensive" codecs: GzipCodec
+#   → decode ~5-10 ns/byte, encode ~50-100 ns/byte; GIL released in C.
+#
+# For unknown codecs (e.g. third-party numcodecs wrappers), we assume
+# "medium" cost — better to over-parallelize slightly than miss a win.
+
+_CODEC_DECODE_NS_PER_BYTE: dict[str, float] = {
+    # Near-zero cost — just reshaping/copying/checksumming
+    "BytesCodec": 0,
+    "Crc32cCodec": 0,
+    "TransposeCodec": 0,
+    "VLenUTF8Codec": 0,
+    "VLenBytesCodec": 0,
+    # Medium cost — fast C codecs, GIL released
+    "ZstdCodec": 1,
+    "BloscCodec": 0.5,
+    # High cost — slower C codecs, GIL released
+    "GzipCodec": 8,
+}
+
+_CODEC_ENCODE_NS_PER_BYTE: dict[str, float] = {
+    # Near-zero cost — just reshaping/copying/checksumming
+    "BytesCodec": 0,
+    "Crc32cCodec": 0,
+    "TransposeCodec": 0,
+    "VLenUTF8Codec": 0,
+    "VLenBytesCodec": 0,
+    # Medium cost — fast C codecs, GIL released
+    "ZstdCodec": 3,
+    "BloscCodec": 2,
+    # High cost — slower C codecs, GIL released
+    "GzipCodec": 50,
+}
+
+_DEFAULT_DECODE_NS_PER_BYTE = 1  # assume medium for unknown codecs
+_DEFAULT_ENCODE_NS_PER_BYTE = 3  # encode is typically slower
+
+# Thread pool dispatch overhead in nanoseconds (~50-100us per task).
+# We only parallelize when the estimated per-chunk work exceeds this.
+_POOL_OVERHEAD_NS = 200_000
+
+
+def _estimate_chunk_work_ns(
+    chunk_nbytes: int,
+    codecs: Iterable[Codec],
+    *,
+    is_encode: bool = False,
+) -> float:
+    """Estimate nanoseconds of codec work for one chunk."""
+    table = _CODEC_ENCODE_NS_PER_BYTE if is_encode else _CODEC_DECODE_NS_PER_BYTE
+    default = _DEFAULT_ENCODE_NS_PER_BYTE if is_encode else _DEFAULT_DECODE_NS_PER_BYTE
+    total_ns_per_byte = 0.0
+    for codec in codecs:
+        name = type(codec).__name__
+        total_ns_per_byte += table.get(name, default)
+    return chunk_nbytes * total_ns_per_byte
+
+
+def _choose_workers(
+    n_chunks: int,
+    chunk_nbytes: int,
+    codecs: Iterable[Codec],
+    *,
+    is_encode: bool = False,
+) -> int:
+    """Decide how many thread pool workers to use (0 = don't use pool)."""
+    if n_chunks < 2:
+        return 0
+
+    per_chunk_ns = _estimate_chunk_work_ns(chunk_nbytes, codecs, is_encode=is_encode)
+
+    if per_chunk_ns < _POOL_OVERHEAD_NS:
+        return 0
+
+    total_work_ns = per_chunk_ns * n_chunks
+    total_dispatch_ns = n_chunks * 50_000  # ~50us per task
+    if total_work_ns < total_dispatch_ns * 3:
+        return 0
+
+    target_per_worker_ns = 1_000_000  # 1ms
+    workers = max(1, int(total_work_ns / target_per_worker_ns))
+
+    cpu_count = os.cpu_count() or 4
+    return min(workers, n_chunks, cpu_count)
+
+
+def _get_pool(max_workers: int) -> ThreadPoolExecutor:
+    """Get a thread pool with at least *max_workers* threads (a cached
+    pool is reused when it is already large enough)."""
+    global _pool
+    if _pool is None or _pool._max_workers < max_workers:
+        _pool = ThreadPoolExecutor(max_workers=max_workers)
+    return _pool
+
+
+_pool: ThreadPoolExecutor | None = None
+
+# Sentinel to distinguish "delete this key" from None.
+_DELETED = object()
+
+
 @dataclass(frozen=True)
 class BatchedCodecPipeline(CodecPipeline):
-    """Default codec pipeline.
+    """Codec pipeline that automatically selects the optimal execution strategy.
+
+    When all codecs support synchronous operations and the store supports
+    sync IO, this pipeline runs the entire read/write path on the calling
+    thread with zero async overhead, using a thread pool for parallel codec
+    compute on multi-chunk operations.
 
-    This batched codec pipeline divides the chunk batches into batches of a configurable
-    batch size ("mini-batch"). Fetching, decoding, encoding and storing are performed in
-    lock step for each mini-batch. Multiple mini-batches are processing concurrently.
+    When the store requires async IO (e.g. cloud stores), this pipeline uses
+    the async path with concurrent IO overlap via ``concurrent_map``.
+
+    This automatic dispatch eliminates the need for users to choose between
+    pipeline implementations — the right strategy is selected based on codec
+    and store capabilities.
     """
 
     array_array_codecs: tuple[ArrayArrayCodec, ...]
@@ -82,12 +209,17 @@ class BatchedCodecPipeline(CodecPipeline):
     bytes_bytes_codecs: tuple[BytesBytesCodec, ...]
     batch_size: int
 
+    @property
+    def _all_sync(self) -> bool:
+        """True when every codec in the chain supports synchronous dispatch."""
+        return all(c.supports_sync for c in self)
+
     def evolve_from_array_spec(self, array_spec: ArraySpec) -> Self:
         return type(self).from_codecs(c.evolve_from_array_spec(array_spec=array_spec) for c in self)
 
     @classmethod
     def from_codecs(cls, codecs: Iterable[Codec], *, batch_size: int | None = None) -> Self:
-        array_array_codecs, array_bytes_codec, bytes_bytes_codecs = codecs_from_list(codecs)
+        array_array_codecs, array_bytes_codec, bytes_bytes_codecs = codecs_from_list(list(codecs))
 
         return cls(
             array_array_codecs=array_array_codecs,
@@ -98,32 +230,12 @@ def from_codecs(cls, codecs: Iterable[Codec], *, batch_size: int | None = None)
 
     @property
     def supports_partial_decode(self) -> bool:
-        """Determines whether the codec pipeline supports partial decoding.
-
-        Currently, only codec pipelines with a single ArrayBytesCodec that supports
-        partial decoding can support partial decoding. This limitation is due to the fact
-        that ArrayArrayCodecs can change the slice selection leading to non-contiguous
-        slices and BytesBytesCodecs can change the chunk bytes in a way that slice
-        selections cannot be attributed to byte ranges anymore which renders partial
-        decoding infeasible.
-
-        This limitation may softened in the future."""
         return (len(self.array_array_codecs) + len(self.bytes_bytes_codecs)) == 0 and isinstance(
             self.array_bytes_codec, ArrayBytesCodecPartialDecodeMixin
         )
 
     @property
     def supports_partial_encode(self) -> bool:
-        """Determines whether the codec pipeline supports partial encoding.
-
-        Currently, only codec pipelines with a single ArrayBytesCodec that supports
-        partial encoding can support partial encoding. This limitation is due to the fact
-        that ArrayArrayCodecs can change the slice selection leading to non-contiguous
-        slices and BytesBytesCodecs can change the chunk bytes in a way that slice
-        selections cannot be attributed to byte ranges anymore which renders partial
-        encoding infeasible.
-
-        This limitation may softened in the future."""
         return (len(self.array_array_codecs) + len(self.bytes_bytes_codecs)) == 0 and isinstance(
             self.array_bytes_codec, ArrayBytesCodecPartialEncodeMixin
         )
@@ -149,6 +261,85 @@ def compute_encoded_size(self, byte_length: int, array_spec: ArraySpec) -> int:
             array_spec = codec.resolve_metadata(array_spec)
         return byte_length
 
+    # -------------------------------------------------------------------
+    # Per-chunk sync codec chain
+    # -------------------------------------------------------------------
+
+    def _resolve_metadata_chain(
+        self, chunk_spec: ArraySpec
+    ) -> tuple[
+        list[tuple[ArrayArrayCodec, ArraySpec]],
+        tuple[ArrayBytesCodec, ArraySpec],
+        list[tuple[BytesBytesCodec, ArraySpec]],
+    ]:
+        """Resolve metadata through the codec chain for a single chunk_spec."""
+        aa_codecs_with_spec: list[tuple[ArrayArrayCodec, ArraySpec]] = []
+        spec = chunk_spec
+        for aa_codec in self.array_array_codecs:
+            aa_codecs_with_spec.append((aa_codec, spec))
+            spec = aa_codec.resolve_metadata(spec)
+
+        ab_codec_with_spec = (self.array_bytes_codec, spec)
+        spec = self.array_bytes_codec.resolve_metadata(spec)
+
+        bb_codecs_with_spec: list[tuple[BytesBytesCodec, ArraySpec]] = []
+        for bb_codec in self.bytes_bytes_codecs:
+            bb_codecs_with_spec.append((bb_codec, spec))
+            spec = bb_codec.resolve_metadata(spec)
+
+        return (aa_codecs_with_spec, ab_codec_with_spec, bb_codecs_with_spec)
+
+    def _decode_one(
+        self,
+        chunk_bytes: Buffer | None,
+        chunk_spec: ArraySpec,
+        aa_chain: list[tuple[ArrayArrayCodec, ArraySpec]],
+        ab_pair: tuple[ArrayBytesCodec, ArraySpec],
+        bb_chain: list[tuple[BytesBytesCodec, ArraySpec]],
+    ) -> NDBuffer | None:
+        """Decode a single chunk through the full codec chain, synchronously."""
+        if chunk_bytes is None:
+            return None
+
+        for bb_codec, spec in reversed(bb_chain):
+            chunk_bytes = bb_codec._decode_sync(chunk_bytes, spec)
+
+        ab_codec, ab_spec = ab_pair
+        chunk_array = ab_codec._decode_sync(chunk_bytes, ab_spec)
+
+        for aa_codec, spec in reversed(aa_chain):
+            chunk_array = aa_codec._decode_sync(chunk_array, spec)
+
+        return chunk_array
+
+    def _encode_one(
+        self,
+        chunk_array: NDBuffer | None,
+        chunk_spec: ArraySpec,
+    ) -> Buffer | None:
+        """Encode a single chunk through the full codec chain, synchronously."""
+        if chunk_array is None:
+            return None
+
+        spec = chunk_spec
+
+        for aa_codec in self.array_array_codecs:
+            chunk_array = aa_codec._encode_sync(chunk_array, spec)
+            spec = aa_codec.resolve_metadata(spec)
+
+        chunk_bytes = self.array_bytes_codec._encode_sync(chunk_array, spec)
+        spec = self.array_bytes_codec.resolve_metadata(spec)
+
+        for bb_codec in self.bytes_bytes_codecs:
+            chunk_bytes = bb_codec._encode_sync(chunk_bytes, spec)
+            spec = bb_codec.resolve_metadata(spec)
+
+        return chunk_bytes
+
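+    # For example, with an assumed chain [TransposeCodec, BytesCodec,
+    # GzipCodec] (illustrative only), _encode_one applies transpose, then
+    # bytes, then gzip, while _decode_one applies the same stages in
+    # reverse: gzip -> bytes -> transpose.
+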
+    # -------------------------------------------------------------------
+    # Batched async decode/encode (layer-by-layer across all chunks)
+    # -------------------------------------------------------------------
+
     def _codecs_with_resolved_metadata_batched(
         self, chunk_specs: Iterable[ArraySpec]
     ) -> tuple[
@@ -246,12 +437,63 @@ async def encode_partial_batch(
         assert isinstance(self.array_bytes_codec, ArrayBytesCodecPartialEncodeMixin)
         await self.array_bytes_codec.encode_partial(batch_info)
 
+    # -------------------------------------------------------------------
+    # Top-level decode / encode
+    # -------------------------------------------------------------------
+
+    async def decode(
+        self,
+        chunk_bytes_and_specs: Iterable[tuple[Buffer | None, ArraySpec]],
+    ) -> Iterable[NDBuffer | None]:
+        items = list(chunk_bytes_and_specs)
+        if not items:
+            return []
+
+        if self._all_sync:
+            # All codecs support sync -- run the full chain inline (no threading).
+            _, first_spec = items[0]
+            aa_chain, ab_pair, bb_chain = self._resolve_metadata_chain(first_spec)
+            return [
+                self._decode_one(chunk_bytes, chunk_spec, aa_chain, ab_pair, bb_chain)
+                for chunk_bytes, chunk_spec in items
+            ]
+
+        # Async fallback: layer-by-layer across all chunks.
+        output: list[NDBuffer | None] = []
+        for batch_info in batched(items, self.batch_size):
+            output.extend(await self.decode_batch(batch_info))
+        return output
+
+    async def encode(
+        self,
+        chunk_arrays_and_specs: Iterable[tuple[NDBuffer | None, ArraySpec]],
+    ) -> Iterable[Buffer | None]:
+        items = list(chunk_arrays_and_specs)
+        if not items:
+            return []
+
+        if self._all_sync:
+            # All codecs support sync -- run the full chain inline (no threading).
+            return [self._encode_one(chunk_array, chunk_spec) for chunk_array, chunk_spec in items]
+
+        # Async fallback: layer-by-layer across all chunks.
+        output: list[Buffer | None] = []
+        for single_batch_info in batched(items, self.batch_size):
+            output.extend(await self.encode_batch(single_batch_info))
+        return output
+
+    # -------------------------------------------------------------------
+    # Async read / write (IO overlap via concurrent_map)
+    # -------------------------------------------------------------------
+
     async def read_batch(
         self,
         batch_info: Iterable[tuple[ByteGetter, ArraySpec, SelectorTuple, SelectorTuple, bool]],
         out: NDBuffer,
         drop_axes: tuple[int, ...] = (),
     ) -> None:
+        batch_info = list(batch_info)
+
         if self.supports_partial_decode:
             chunk_array_batch = await self.decode_partial_batch(
                 [
@@ -266,30 +508,56 @@ async def read_batch(
                     out[out_selection] = chunk_array
                 else:
                     out[out_selection] = fill_value_or_default(chunk_spec)
-        else:
-            chunk_bytes_batch = await concurrent_map(
-                [(byte_getter, array_spec.prototype) for byte_getter, array_spec, *_ in batch_info],
-                lambda byte_getter, prototype: byte_getter.get(prototype),
-                config.get("async.concurrency"),
-            )
-            chunk_array_batch = await self.decode_batch(
-                [
-                    (chunk_bytes, chunk_spec)
-                    for chunk_bytes, (_, chunk_spec, *_) in zip(
-                        chunk_bytes_batch, batch_info, strict=False
-                    )
-                ],
-            )
-            for chunk_array, (_, chunk_spec, chunk_selection, out_selection, _) in zip(
-                chunk_array_batch, batch_info, strict=False
-            ):
-                if chunk_array is not None:
-                    tmp = chunk_array[chunk_selection]
-                    if drop_axes != ():
-                        tmp = tmp.squeeze(axis=drop_axes)
-                    out[out_selection] = tmp
-                else:
-                    out[out_selection] = fill_value_or_default(chunk_spec)
+            return
+
+        # Phase 1: IO -- fetch bytes from store (always async)
+        chunk_bytes_batch = await concurrent_map(
+            [(byte_getter, array_spec.prototype) for byte_getter, array_spec, *_ in batch_info],
+            lambda byte_getter, prototype: byte_getter.get(prototype),
+            config.get("async.concurrency"),
+        )
+
+        # Phase 2: Compute -- decode + scatter
+        decode_items = [
+            (chunk_bytes, chunk_spec)
+            for chunk_bytes, (_, chunk_spec, *_) in zip(chunk_bytes_batch, batch_info, strict=False)
+        ]
+
+        chunk_array_batch_decoded: Iterable[NDBuffer | None] = await self.decode(decode_items)
+        self._scatter(chunk_array_batch_decoded, batch_info, out, drop_axes)
+
+    @staticmethod
+    def _scatter(
+        chunk_array_batch: Iterable[NDBuffer | None],
+        batch_info: list[tuple[ByteGetter, ArraySpec, SelectorTuple, SelectorTuple, bool]],
+        out: NDBuffer,
+        drop_axes: tuple[int, ...],
+    ) -> None:
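+        """Copy decoded chunks into ``out``.
+
+        For each batch entry, ``chunk_array[chunk_selection]`` is copied
+        into ``out[out_selection]`` (squeezing any ``drop_axes``); chunks
+        missing from the store (``None``) fill ``out[out_selection]`` with
+        the chunk's fill value.
+        """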
+        for chunk_array, (_, chunk_spec, chunk_selection, out_selection, _) in zip(
+            chunk_array_batch, batch_info, strict=False
+        ):
+            if chunk_array is not None:
+                tmp = chunk_array[chunk_selection]
+                if drop_axes != ():
+                    tmp = tmp.squeeze(axis=drop_axes)
+                out[out_selection] = tmp
+            else:
+                out[out_selection] = fill_value_or_default(chunk_spec)
+
+    async def read(
+        self,
+        batch_info: Iterable[tuple[ByteGetter, ArraySpec, SelectorTuple, SelectorTuple, bool]],
+        out: NDBuffer,
+        drop_axes: tuple[int, ...] = (),
+    ) -> None:
+        await concurrent_map(
+            [
+                (single_batch_info, out, drop_axes)
+                for single_batch_info in batched(batch_info, self.batch_size)
+            ],
+            self.read_batch,
+            config.get("async.concurrency"),
+        )
 
     def _merge_chunk_array(
         self,
@@ -304,7 +572,6 @@ def _merge_chunk_array(
         if (
             is_complete_chunk
             and value.shape == chunk_spec.shape
-            # Guard that this is not a partial chunk at the end with is_complete_chunk=True
             and value[out_selection].shape == chunk_spec.shape
         ):
             return value
@@ -316,20 +583,16 @@ def _merge_chunk_array(
                 fill_value=fill_value_or_default(chunk_spec),
             )
         else:
-            chunk_array = existing_chunk_array.copy()  # make a writable copy
+            chunk_array = existing_chunk_array.copy()
         if chunk_selection == () or is_scalar(
             value.as_ndarray_like(), chunk_spec.dtype.to_native_dtype()
         ):
             chunk_value = value
         else:
             chunk_value = value[out_selection]
-            # handle missing singleton dimensions
             if drop_axes != ():
                 item = tuple(
-                    None  # equivalent to np.newaxis
-                    if idx in drop_axes
-                    else slice(None)
-                    for idx in range(chunk_spec.ndim)
+                    None if idx in drop_axes else slice(None) for idx in range(chunk_spec.ndim)
                 )
                 chunk_value = chunk_value[item]
         chunk_array[chunk_selection] = chunk_value
@@ -341,8 +604,9 @@ async def write_batch(
         value: NDBuffer,
         drop_axes: tuple[int, ...] = (),
     ) -> None:
+        batch_info = list(batch_info)
+
         if self.supports_partial_encode:
-            # Pass scalar values as is
             if len(value.shape) == 0:
                 await self.encode_partial_batch(
                     [
@@ -357,128 +621,116 @@ async def write_batch(
                         for byte_setter, chunk_spec, chunk_selection, out_selection, _ in batch_info
                     ],
                 )
-
-        else:
-            # Read existing bytes if not total slice
-            async def _read_key(
-                byte_setter: ByteSetter | None, prototype: BufferPrototype
-            ) -> Buffer | None:
-                if byte_setter is None:
-                    return None
-                return await byte_setter.get(prototype=prototype)
-
-            chunk_bytes_batch: Iterable[Buffer | None]
-            chunk_bytes_batch = await concurrent_map(
-                [
-                    (
-                        None if is_complete_chunk else byte_setter,
-                        chunk_spec.prototype,
-                    )
-                    for byte_setter, chunk_spec, chunk_selection, _, is_complete_chunk in batch_info
-                ],
-                _read_key,
-                config.get("async.concurrency"),
-            )
-            chunk_array_decoded = await self.decode_batch(
-                [
-                    (chunk_bytes, chunk_spec)
-                    for chunk_bytes, (_, chunk_spec, *_) in zip(
-                        chunk_bytes_batch, batch_info, strict=False
-                    )
-                ],
-            )
-
-            chunk_array_merged = [
-                self._merge_chunk_array(
-                    chunk_array,
-                    value,
-                    out_selection,
-                    chunk_spec,
-                    chunk_selection,
-                    is_complete_chunk,
-                    drop_axes,
+            return
+
+        # Phase 1: IO -- read existing bytes for non-complete chunks
+        async def _read_key(
+            byte_setter: ByteSetter | None, prototype: BufferPrototype
+        ) -> Buffer | None:
+            if byte_setter is None:
+                return None
+            return await byte_setter.get(prototype=prototype)
+
+        chunk_bytes_batch: list[Buffer | None]
+        chunk_bytes_batch = await concurrent_map(
+            [
+                (
+                    None if is_complete_chunk else byte_setter,
+                    chunk_spec.prototype,
                 )
-                for chunk_array, (
-                    _,
-                    chunk_spec,
-                    chunk_selection,
-                    out_selection,
-                    is_complete_chunk,
-                ) in zip(chunk_array_decoded, batch_info, strict=False)
-            ]
-            chunk_array_batch: list[NDBuffer | None] = []
-            for chunk_array, (_, chunk_spec, *_) in zip(
-                chunk_array_merged, batch_info, strict=False
-            ):
-                if chunk_array is None:
-                    chunk_array_batch.append(None)  # type: ignore[unreachable]
-                else:
-                    if not chunk_spec.config.write_empty_chunks and chunk_array.all_equal(
-                        fill_value_or_default(chunk_spec)
-                    ):
-                        chunk_array_batch.append(None)
-                    else:
-                        chunk_array_batch.append(chunk_array)
-
-            chunk_bytes_batch = await self.encode_batch(
-                [
-                    (chunk_array, chunk_spec)
-                    for chunk_array, (_, chunk_spec, *_) in zip(
-                        chunk_array_batch, batch_info, strict=False
-                    )
-                ],
-            )
+                for byte_setter, chunk_spec, chunk_selection, _, is_complete_chunk in batch_info
+            ],
+            _read_key,
+            config.get("async.concurrency"),
+        )
 
-            async def _write_key(byte_setter: ByteSetter, chunk_bytes: Buffer | None) -> None:
-                if chunk_bytes is None:
-                    await byte_setter.delete()
-                else:
-                    await byte_setter.set(chunk_bytes)
+        # Phase 2: Compute -- decode, merge, encode
+        decode_items = [
+            (chunk_bytes, chunk_spec)
+            for chunk_bytes, (_, chunk_spec, *_) in zip(chunk_bytes_batch, batch_info, strict=False)
+        ]
 
-            await concurrent_map(
-                [
-                    (byte_setter, chunk_bytes)
-                    for chunk_bytes, (byte_setter, *_) in zip(
-                        chunk_bytes_batch, batch_info, strict=False
-                    )
-                ],
-                _write_key,
-                config.get("async.concurrency"),
-            )
+        encoded_batch = await self._write_batch_compute(decode_items, batch_info, value, drop_axes)
 
-    async def decode(
-        self,
-        chunk_bytes_and_specs: Iterable[tuple[Buffer | None, ArraySpec]],
-    ) -> Iterable[NDBuffer | None]:
-        output: list[NDBuffer | None] = []
-        for batch_info in batched(chunk_bytes_and_specs, self.batch_size):
-            output.extend(await self.decode_batch(batch_info))
-        return output
-
-    async def encode(
-        self,
-        chunk_arrays_and_specs: Iterable[tuple[NDBuffer | None, ArraySpec]],
-    ) -> Iterable[Buffer | None]:
-        output: list[Buffer | None] = []
-        for single_batch_info in batched(chunk_arrays_and_specs, self.batch_size):
-            output.extend(await self.encode_batch(single_batch_info))
-        return output
+        # Phase 3: IO -- write to store
+        async def _write_key(byte_setter: ByteSetter, chunk_bytes: Buffer | None) -> None:
+            if chunk_bytes is None:
+                await byte_setter.delete()
+            else:
+                await byte_setter.set(chunk_bytes)
 
-    async def read(
-        self,
-        batch_info: Iterable[tuple[ByteGetter, ArraySpec, SelectorTuple, SelectorTuple, bool]],
-        out: NDBuffer,
-        drop_axes: tuple[int, ...] = (),
-    ) -> None:
         await concurrent_map(
             [
-                (single_batch_info, out, drop_axes)
-                for single_batch_info in batched(batch_info, self.batch_size)
+                (byte_setter, chunk_bytes)
+                for chunk_bytes, (byte_setter, *_) in zip(encoded_batch, batch_info, strict=False)
             ],
-            self.read_batch,
+            _write_key,
             config.get("async.concurrency"),
         )
 
+    async def _write_batch_compute(
+        self,
+        decode_items: list[tuple[Buffer | None, ArraySpec]],
+        batch_info: list[tuple[ByteSetter, ArraySpec, SelectorTuple, SelectorTuple, bool]],
+        value: NDBuffer,
+        drop_axes: tuple[int, ...],
+    ) -> list[Buffer | None]:
+        chunk_array_decoded: Iterable[NDBuffer | None] = await self.decode(decode_items)
+
+        chunk_array_batch = self._merge_and_filter(
+            chunk_array_decoded, batch_info, value, drop_axes
+        )
+
+        encoded_batch: Iterable[Buffer | None] = await self.encode(
+            [
+                (chunk_array, chunk_spec)
+                for chunk_array, (_, chunk_spec, *_) in zip(
+                    chunk_array_batch, batch_info, strict=False
+                )
+            ]
+        )
+        return list(encoded_batch)
+
+    def _merge_and_filter(
+        self,
+        chunk_array_decoded: Iterable[NDBuffer | None],
+        batch_info: list,
+        value: NDBuffer,
+        drop_axes: tuple[int, ...],
+    ) -> list[NDBuffer | None]:
+        chunk_array_merged = [
+            self._merge_chunk_array(
+                chunk_array,
+                value,
+                out_selection,
+                chunk_spec,
+                chunk_selection,
+                is_complete_chunk,
+                drop_axes,
+            )
+            for chunk_array, (
+                _,
+                chunk_spec,
+                chunk_selection,
+                out_selection,
+                is_complete_chunk,
+            ) in zip(chunk_array_decoded, batch_info, strict=False)
+        ]
+        chunk_array_batch: list[NDBuffer | None] = []
+        for chunk_array, (_, chunk_spec, *_) in zip(
+            chunk_array_merged, batch_info, strict=False
+        ):
+            if chunk_array is None:
+                chunk_array_batch.append(None)  # type: ignore[unreachable]
+            else:
+                if not chunk_spec.config.write_empty_chunks and chunk_array.all_equal(
+                    fill_value_or_default(chunk_spec)
+                ):
+                    chunk_array_batch.append(None)
+                else:
+                    chunk_array_batch.append(chunk_array)
+        return chunk_array_batch
+
     async def write(
         self,
         batch_info: Iterable[tuple[ByteSetter, ArraySpec, SelectorTuple, SelectorTuple, bool]],
@@ -494,6 +746,190 @@ async def write(
             config.get("async.concurrency"),
         )
 
+    # -------------------------------------------------------------------
+    # Fully synchronous read / write (no event loop)
+    # -------------------------------------------------------------------
+
+    @property
+    def supports_sync_io(self) -> bool:
+        return self._all_sync
+
+    def read_sync(
+        self,
+        batch_info: Iterable[tuple[Any, ArraySpec, SelectorTuple, SelectorTuple, bool]],
+        out: NDBuffer,
+        drop_axes: tuple[int, ...] = (),
+    ) -> None:
+        batch_info_list = list(batch_info)
+        if not batch_info_list:
+            return
+
+        if self.supports_partial_decode:
+            ab_codec: Any = self.array_bytes_codec
+            for byte_getter, chunk_spec, chunk_selection, out_selection, _ in batch_info_list:
+                chunk_array: NDBuffer | None = ab_codec._decode_partial_sync(
+                    byte_getter, chunk_selection, chunk_spec
+                )
+                if chunk_array is not None:
+                    out[out_selection] = chunk_array
+                else:
+                    out[out_selection] = fill_value_or_default(chunk_spec)
+            return
+
+        _, first_spec, *_ = batch_info_list[0]
+        aa_chain, ab_pair, bb_chain = self._resolve_metadata_chain(first_spec)
+
+        # Phase 1: IO — fetch all chunk bytes sequentially.
+        chunk_bytes_list: list[Buffer | None] = [
+            byte_getter.get_sync(prototype=chunk_spec.prototype)
+            for byte_getter, chunk_spec, *_ in batch_info_list
+        ]
+
+        # Phase 2: Decode — run the codec chain for each chunk.
+        dtype_item_size = getattr(first_spec.dtype, "item_size", 1)
+        chunk_nbytes = product(first_spec.shape) * dtype_item_size
+        n_workers = _choose_workers(len(batch_info_list), chunk_nbytes, self)
+        if n_workers > 0:
+            pool = _get_pool(n_workers)
+            chunk_arrays: list[NDBuffer | None] = list(
+                pool.map(
+                    self._decode_one,
+                    chunk_bytes_list,
+                    [chunk_spec for _, chunk_spec, *_ in batch_info_list],
+                    [aa_chain] * len(batch_info_list),
+                    [ab_pair] * len(batch_info_list),
+                    [bb_chain] * len(batch_info_list),
+                )
+            )
+        else:
+            chunk_arrays = [
+                self._decode_one(chunk_bytes, chunk_spec, aa_chain, ab_pair, bb_chain)
+                for chunk_bytes, (_, chunk_spec, *_) in zip(
+                    chunk_bytes_list, batch_info_list, strict=False
+                )
+            ]
+
+        # Phase 3: Scatter decoded chunk data into the output buffer.
+        self._scatter(chunk_arrays, batch_info_list, out, drop_axes)
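+
+    # End-to-end, the fully synchronous read path is, in outline
+    # (illustrative, not the exact call chain):
+    #
+    #     arr = zarr.open(store)   # store with supports_sync=True
+    #     arr[:]                   # -> read_sync
+    #                              #    -> byte_getter.get_sync  (phase 1)
+    #                              #    -> _decode_one per chunk (phase 2)
+    #                              #    -> _scatter into out     (phase 3)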
+    def _write_chunk_compute(
+        self,
+        existing_bytes: Buffer | None,
+        chunk_spec: ArraySpec,
+        chunk_selection: SelectorTuple,
+        out_selection: SelectorTuple,
+        is_complete_chunk: bool,
+        value: NDBuffer,
+        drop_axes: tuple[int, ...],
+    ) -> Buffer | None | object:
+        """Per-chunk compute for write: decode existing -> merge -> encode."""
+        existing_array: NDBuffer | None = None
+        if existing_bytes is not None:
+            aa_chain, ab_pair, bb_chain = self._resolve_metadata_chain(chunk_spec)
+            existing_array = self._decode_one(
+                existing_bytes, chunk_spec, aa_chain, ab_pair, bb_chain
+            )
+
+        chunk_array: NDBuffer | None = self._merge_chunk_array(
+            existing_array,
+            value,
+            out_selection,
+            chunk_spec,
+            chunk_selection,
+            is_complete_chunk,
+            drop_axes,
+        )
+
+        if (
+            chunk_array is not None
+            and not chunk_spec.config.write_empty_chunks
+            and chunk_array.all_equal(fill_value_or_default(chunk_spec))
+        ):
+            chunk_array = None
+
+        if chunk_array is None:
+            return _DELETED
+        chunk_bytes = self._encode_one(chunk_array, chunk_spec)
+        if chunk_bytes is None:
+            return _DELETED
+        return chunk_bytes
+
+    def write_sync(
+        self,
+        batch_info: Iterable[tuple[Any, ArraySpec, SelectorTuple, SelectorTuple, bool]],
+        value: NDBuffer,
+        drop_axes: tuple[int, ...] = (),
+    ) -> None:
+        batch_info_list = list(batch_info)
+        if not batch_info_list:
+            return
+
+        if self.supports_partial_encode:
+            ab_codec: Any = self.array_bytes_codec
+            if len(value.shape) == 0:
+                for byte_setter, chunk_spec, chunk_selection, _, _ in batch_info_list:
+                    ab_codec._encode_partial_sync(byte_setter, value, chunk_selection, chunk_spec)
+            else:
+                for byte_setter, chunk_spec, chunk_selection, out_selection, _ in batch_info_list:
+                    ab_codec._encode_partial_sync(
+                        byte_setter, value[out_selection], chunk_selection, chunk_spec
+                    )
+            return
+
+        # Phase 1: IO — read existing chunk bytes for partial writes.
+        existing_bytes_list: list[Buffer | None] = [
+            byte_setter.get_sync(prototype=chunk_spec.prototype) if not is_complete_chunk else None
+            for byte_setter, chunk_spec, _, _, is_complete_chunk in batch_info_list
+        ]
+
+        # Phase 2: Compute — decode existing, merge new data, encode.
+        _, first_spec, *_ = batch_info_list[0]
+        dtype_item_size = getattr(first_spec.dtype, "item_size", 1)
+        chunk_nbytes = product(first_spec.shape) * dtype_item_size
+        n_workers = _choose_workers(len(batch_info_list), chunk_nbytes, self, is_encode=True)
+        if n_workers > 0:
+            pool = _get_pool(n_workers)
+            encoded_list: list[Buffer | None | object] = list(
+                pool.map(
+                    self._write_chunk_compute,
+                    existing_bytes_list,
+                    [chunk_spec for _, chunk_spec, *_ in batch_info_list],
+                    [chunk_selection for _, _, chunk_selection, _, _ in batch_info_list],
+                    [out_selection for _, _, _, out_selection, _ in batch_info_list],
+                    [is_complete for _, _, _, _, is_complete in batch_info_list],
+                    [value] * len(batch_info_list),
+                    [drop_axes] * len(batch_info_list),
+                )
+            )
+        else:
+            encoded_list = [
+                self._write_chunk_compute(
+                    existing_bytes,
+                    chunk_spec,
+                    chunk_selection,
+                    out_selection,
+                    is_complete_chunk,
+                    value,
+                    drop_axes,
+                )
+                for existing_bytes, (
+                    _,
+                    chunk_spec,
+                    chunk_selection,
+                    out_selection,
+                    is_complete_chunk,
+                ) in zip(existing_bytes_list, batch_info_list, strict=False)
+            ]
+
+        # Phase 3: IO — write encoded chunks to store.
+        for encoded, (byte_setter, *_) in zip(encoded_list, batch_info_list, strict=False):
+            if encoded is _DELETED:
+                byte_setter.delete_sync()
+            elif encoded is not None:
+                byte_setter.set_sync(encoded)
+            else:
+                byte_setter.delete_sync()
+
 
 def codecs_from_list(
     codecs: Iterable[Codec],
diff --git a/src/zarr/core/config.py b/src/zarr/core/config.py
index ddf38eaf25..f8f8ea4f5f 100644
--- a/src/zarr/core/config.py
+++ b/src/zarr/core/config.py
@@ -102,7 +102,7 @@ def enable_gpu(self) -> ConfigSet:
         "threading": {"max_workers": None},
         "json_indent": 2,
         "codec_pipeline": {
-            "path": "zarr.experimental.sync_codecs.SyncCodecPipeline",
+            "path": "zarr.core.codec_pipeline.BatchedCodecPipeline",
             "batch_size": 1,
         },
         "codecs": {
diff --git a/src/zarr/experimental/sync_codecs.py b/src/zarr/experimental/sync_codecs.py
index 138520a63e..2e89634f1b 100644
--- a/src/zarr/experimental/sync_codecs.py
+++ b/src/zarr/experimental/sync_codecs.py
@@ -1,998 +1,40 @@
-"""Experimental synchronous codec pipeline.
+"""Backwards-compatible alias for SyncCodecPipeline.
 
-The standard zarr codec pipeline (``BatchedCodecPipeline``) wraps fundamentally
-synchronous operations (e.g. gzip compress/decompress) in ``asyncio.to_thread``.
-The ``SyncCodecPipeline`` in this module eliminates that overhead by running
-per-chunk codec chains synchronously, achieving 2-11x throughput improvements.
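+
+Existing opt-in configuration continues to work unchanged; for example (the
+same ``zarr.config`` call the old module docstring recommended)::
+
+    import zarr
+
+    zarr.config.set(
+        {"codec_pipeline.path": "zarr.experimental.sync_codecs.SyncCodecPipeline"}
+    )
+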
+The synchronous codec optimizations (inline per-chunk codec chains, thread pool
+parallelism, fully synchronous read/write bypass) have been merged into
+``BatchedCodecPipeline``. That pipeline now automatically selects the optimal
+strategy based on codec and store capabilities — no configuration needed.
 
-Usage::
-
-    import zarr
-
-    zarr.config.set({"codec_pipeline.path": "zarr.experimental.sync_codecs.SyncCodecPipeline"})
+``SyncCodecPipeline`` is retained as a subclass alias so that existing config
+references (``codec_pipeline.path: zarr.experimental.sync_codecs.SyncCodecPipeline``)
+and imports continue to work.
 """
 
 from __future__ import annotations
 
-import os
-from concurrent.futures import ThreadPoolExecutor
 from dataclasses import dataclass
-from itertools import islice
-from typing import TYPE_CHECKING, TypeVar
 
-from zarr.abc.codec import (
-    ArrayArrayCodec,
-    ArrayBytesCodec,
-    ArrayBytesCodecPartialDecodeMixin,
-    ArrayBytesCodecPartialEncodeMixin,
-    BytesBytesCodec,
-    Codec,
-    CodecPipeline,
+from zarr.core.codec_pipeline import (
+    BatchedCodecPipeline,
+    _CODEC_DECODE_NS_PER_BYTE,  # noqa: F401
+    _CODEC_ENCODE_NS_PER_BYTE,  # noqa: F401
+    _choose_workers,  # noqa: F401
+    _estimate_chunk_work_ns,  # noqa: F401
 )
-from zarr.core.buffer import Buffer, NDBuffer
-from zarr.core.codec_pipeline import _unzip2, codecs_from_list, resolve_batched
-from zarr.core.common import concurrent_map, product
-from zarr.core.config import config
-from zarr.core.indexing import SelectorTuple, is_scalar
 from zarr.registry import register_pipeline
 
-if TYPE_CHECKING:
-    from collections.abc import Iterable, Iterator
-    from typing import Any, Self
-
-    from zarr.abc.store import ByteGetter, ByteSetter
-    from zarr.core.array_spec import ArraySpec
-    from zarr.core.buffer import BufferPrototype
-    from zarr.core.chunk_grids import ChunkGrid
-    from zarr.core.dtype.wrapper import TBaseDType, TBaseScalar, ZDType
-
 __all__ = ["SyncCodecPipeline"]
 
-T = TypeVar("T")
-
-
-# ---------------------------------------------------------------------------
-# Pipeline helpers
-# ---------------------------------------------------------------------------
-
-
-def _batched(iterable: Iterable[T], n: int) -> Iterable[tuple[T, ...]]:
-    if n < 1:
-        raise ValueError("n must be at least one")
-    it = iter(iterable)
-    while batch := tuple(islice(it, n)):
-        yield batch
-
-
-def _fill_value_or_default(chunk_spec: ArraySpec) -> Any:
-    fill_value = chunk_spec.fill_value
-    if fill_value is None:
-        return chunk_spec.dtype.default_scalar()
-    return fill_value
-
-
-def _get_pool(max_workers: int) -> ThreadPoolExecutor:
-    """Get a thread pool with at most *max_workers* threads.
-
-    Reuses a cached pool when the requested size is <= the cached size.
-    CPU-heavy codecs (zstd, gzip, blosc) release the GIL during their C-level
-    compress/decompress calls, so real parallelism is achieved across threads.
-    """
-    global _pool
-    if _pool is None or _pool._max_workers < max_workers:
-        _pool = ThreadPoolExecutor(max_workers=max_workers)
-    return _pool
-
-
-_pool: ThreadPoolExecutor | None = None
-
-# Sentinel to distinguish "delete this key" from None (which _encode_one
-# can return when a chunk encodes to nothing).
-_DELETED = object()
-
-# ---------------------------------------------------------------------------
-# Work estimation for thread pool sizing
-# ---------------------------------------------------------------------------
-
-# Approximate nanoseconds-per-byte for codec decode and encode, measured on
-# typical hardware. These don't need to be exact — they just need to rank
-# codecs correctly so the pool-sizing heuristic makes good decisions.
-#
-# Decode and encode have very different costs for many codecs:
-#   - gzip decode ~5-10 ns/byte vs encode ~50-100 ns/byte
-#   - zstd decode ~1-2 ns/byte vs encode ~2-10 ns/byte
-#   - blosc decode ~0.5-1 ns/byte vs encode ~1-5 ns/byte
-#
-# "Cheap" codecs (memcpy-like): BytesCodec, Crc32cCodec, TransposeCodec
-#   → ~0.1-1 ns/byte, dominated by memcpy; no benefit from threading.
-# "Medium" codecs: ZstdCodec, BloscCodec
-#   → decode ~1-2 ns/byte, encode ~2-5 ns/byte; GIL released in C.
-# "Expensive" codecs: GzipCodec
-#   → decode ~5-10 ns/byte, encode ~50-100 ns/byte; GIL released in C.
-#
-# For unknown codecs (e.g. third-party numcodecs wrappers), we assume
-# "medium" cost — better to over-parallelize slightly than miss a win.
-
-_CODEC_DECODE_NS_PER_BYTE: dict[str, float] = {
-    # Near-zero cost — just reshaping/copying/checksumming
-    "BytesCodec": 0,
-    "Crc32cCodec": 0,
-    "TransposeCodec": 0,
-    "VLenUTF8Codec": 0,
-    "VLenBytesCodec": 0,
-    # Medium cost — fast C codecs, GIL released
-    "ZstdCodec": 1,
-    "BloscCodec": 0.5,
-    # High cost — slower C codecs, GIL released
-    "GzipCodec": 8,
-}
-
-_CODEC_ENCODE_NS_PER_BYTE: dict[str, float] = {
-    # Near-zero cost — just reshaping/copying/checksumming
-    "BytesCodec": 0,
-    "Crc32cCodec": 0,
-    "TransposeCodec": 0,
-    "VLenUTF8Codec": 0,
-    "VLenBytesCodec": 0,
-    # Medium cost — fast C codecs, GIL released
-    "ZstdCodec": 3,
-    "BloscCodec": 2,
-    # High cost — slower C codecs, GIL released
-    "GzipCodec": 50,
-}
-
-_DEFAULT_DECODE_NS_PER_BYTE = 1  # assume medium for unknown codecs
-_DEFAULT_ENCODE_NS_PER_BYTE = 3  # encode is typically slower
-
-# Thread pool dispatch overhead in nanoseconds (~50-100us per task).
-# We only parallelize when the estimated per-chunk work exceeds this.
-_POOL_OVERHEAD_NS = 200_000
-
-
-def _estimate_chunk_work_ns(
-    chunk_nbytes: int,
-    codecs: Iterable[Codec],
-    *,
-    is_encode: bool = False,
-) -> float:
-    """Estimate nanoseconds of codec work for one chunk.
-
-    Sums the per-byte cost of each codec in the chain, multiplied by the
-    chunk size. Uses separate decode/encode cost tables since compression
-    is typically much more expensive than decompression.
-
-    This is a rough estimate — compression ratios, cache effects,
-    and hardware differences mean the actual time can vary 2-5x. But the
-    estimate is good enough to decide "use pool" vs "don't use pool".
-    """
-    table = _CODEC_ENCODE_NS_PER_BYTE if is_encode else _CODEC_DECODE_NS_PER_BYTE
-    default = _DEFAULT_ENCODE_NS_PER_BYTE if is_encode else _DEFAULT_DECODE_NS_PER_BYTE
-    total_ns_per_byte = 0.0
-    for codec in codecs:
-        name = type(codec).__name__
-        total_ns_per_byte += table.get(name, default)
-    return chunk_nbytes * total_ns_per_byte
-
-
-def _choose_workers(
-    n_chunks: int,
-    chunk_nbytes: int,
-    codecs: Iterable[Codec],
-    *,
-    is_encode: bool = False,
-) -> int:
-    """Decide how many thread pool workers to use (0 = don't use pool).
-
-    The model:
-    1. Estimate per-chunk codec work in nanoseconds (decode or encode).
-    2. If per-chunk work < pool dispatch overhead, return 0 (sequential).
-       Small chunks with fast codecs aren't worth the pool dispatch cost.
-    3. Check that total codec work significantly exceeds total dispatch
-       overhead (n_chunks * per-task cost). If not, sequential is faster.
-    4. Scale workers with total work, capped at CPU count and chunk count.
-    """
-    if n_chunks < 2:
-        return 0
-
-    per_chunk_ns = _estimate_chunk_work_ns(chunk_nbytes, codecs, is_encode=is_encode)
-
-    if per_chunk_ns < _POOL_OVERHEAD_NS:
-        return 0
-
-    # Total codec work must exceed total dispatch overhead by a margin.
-    # Each task submitted to pool.map has ~50us dispatch overhead.
-    total_work_ns = per_chunk_ns * n_chunks
-    total_dispatch_ns = n_chunks * 50_000  # ~50us per task
-    if total_work_ns < total_dispatch_ns * 3:
-        return 0
-
-    # Scale workers: each worker should do at least 1ms of work to
-    # amortize pool overhead.
-    target_per_worker_ns = 1_000_000  # 1ms
-    workers = max(1, int(total_work_ns / target_per_worker_ns))
-
-    cpu_count = os.cpu_count() or 4
-    return min(workers, n_chunks, cpu_count)
-
-
-# ---------------------------------------------------------------------------
-# SyncCodecPipeline
-# ---------------------------------------------------------------------------
-
-
-@dataclass(frozen=True)
-class SyncCodecPipeline(CodecPipeline):
-    """A codec pipeline that runs per-chunk codec chains synchronously.
-
-    When all codecs implement ``_decode_sync`` / ``_encode_sync`` (i.e.
-    ``supports_sync`` is ``True``), the per-chunk codec chain runs synchronously
-    without any ``asyncio.to_thread`` overhead.
+@dataclass(frozen=True)
+class SyncCodecPipeline(BatchedCodecPipeline):
+    """Backwards-compatible alias for BatchedCodecPipeline.
 
-    When a codec does *not* support sync (e.g. ``ShardingCodec``), the pipeline
-    falls back to the standard async ``decode`` / ``encode`` path, preserving
-    correctness while still benefiting from sync dispatch for the inner pipeline.
+    All synchronous codec optimizations are now built into
+    ``BatchedCodecPipeline``. This subclass exists only so that
+    existing ``codec_pipeline.path`` config values and imports
+    continue to work.
     """
 
-    array_array_codecs: tuple[ArrayArrayCodec, ...]
-    array_bytes_codec: ArrayBytesCodec
-    bytes_bytes_codecs: tuple[BytesBytesCodec, ...]
-    batch_size: int
-
-    @property
-    def _all_sync(self) -> bool:
-        """True when every codec in the chain supports synchronous dispatch."""
-        return all(c.supports_sync for c in self)
-
-    def evolve_from_array_spec(self, array_spec: ArraySpec) -> Self:
-        return type(self).from_codecs(c.evolve_from_array_spec(array_spec=array_spec) for c in self)
-
-    @classmethod
-    def from_codecs(cls, codecs: Iterable[Codec], *, batch_size: int | None = None) -> Self:
-        array_array, array_bytes, bytes_bytes = codecs_from_list(list(codecs))
-        return cls(
-            array_array_codecs=array_array,
-            array_bytes_codec=array_bytes,
-            bytes_bytes_codecs=bytes_bytes,
-            batch_size=batch_size or config.get("codec_pipeline.batch_size"),
-        )
-
-    @property
-    def supports_partial_decode(self) -> bool:
-        return (len(self.array_array_codecs) + len(self.bytes_bytes_codecs)) == 0 and isinstance(
-            self.array_bytes_codec, ArrayBytesCodecPartialDecodeMixin
-        )
-
-    @property
-    def supports_partial_encode(self) -> bool:
-        return (len(self.array_array_codecs) + len(self.bytes_bytes_codecs)) == 0 and isinstance(
-            self.array_bytes_codec, ArrayBytesCodecPartialEncodeMixin
-        )
-
-    def __iter__(self) -> Iterator[Codec]:
-        yield from self.array_array_codecs
-        yield self.array_bytes_codec
-        yield from self.bytes_bytes_codecs
-
-    def validate(
-        self,
-        *,
-        shape: tuple[int, ...],
-        dtype: ZDType[TBaseDType, TBaseScalar],
-        chunk_grid: ChunkGrid,
-    ) -> None:
-        for codec in self:
-            codec.validate(shape=shape, dtype=dtype, chunk_grid=chunk_grid)
-
-    def compute_encoded_size(self, byte_length: int, array_spec: ArraySpec) -> int:
-        for codec in self:
-            byte_length = codec.compute_encoded_size(byte_length, array_spec)
-            array_spec = codec.resolve_metadata(array_spec)
-        return byte_length
-
-    # -------------------------------------------------------------------
-    # Per-chunk sync codec chain
-    # -------------------------------------------------------------------
-
-    def _resolve_metadata_chain(
-        self, chunk_spec: ArraySpec
-    ) -> tuple[
-        list[tuple[ArrayArrayCodec, ArraySpec]],
-        tuple[ArrayBytesCodec, ArraySpec],
-        list[tuple[BytesBytesCodec, ArraySpec]],
-    ]:
-        """Resolve metadata through the codec chain for a single chunk_spec."""
-        aa_codecs_with_spec: list[tuple[ArrayArrayCodec, ArraySpec]] = []
-        spec = chunk_spec
-        for aa_codec in self.array_array_codecs:
-            aa_codecs_with_spec.append((aa_codec, spec))
-            spec = aa_codec.resolve_metadata(spec)
-
-        ab_codec_with_spec = (self.array_bytes_codec, spec)
-        spec = self.array_bytes_codec.resolve_metadata(spec)
-
-        bb_codecs_with_spec: list[tuple[BytesBytesCodec, ArraySpec]] = []
-        for bb_codec in self.bytes_bytes_codecs:
-            bb_codecs_with_spec.append((bb_codec, spec))
-            spec = bb_codec.resolve_metadata(spec)
-
-        return (aa_codecs_with_spec, ab_codec_with_spec, bb_codecs_with_spec)
-
-    def _decode_one(
-        self,
-        chunk_bytes: Buffer | None,
-        chunk_spec: ArraySpec,
-        aa_chain: list[tuple[ArrayArrayCodec, ArraySpec]],
-        ab_pair: tuple[ArrayBytesCodec, ArraySpec],
-        bb_chain: list[tuple[BytesBytesCodec, ArraySpec]],
-    ) -> NDBuffer | None:
-        """Decode a single chunk through the full codec chain, synchronously."""
-        if chunk_bytes is None:
-            return None
-
-        # bytes-bytes decode (reverse order)
-        for bb_codec, spec in reversed(bb_chain):
-            chunk_bytes = bb_codec._decode_sync(chunk_bytes, spec)
-
-        # array-bytes decode
-        ab_codec, ab_spec = ab_pair
-        chunk_array = ab_codec._decode_sync(chunk_bytes, ab_spec)
-
-        # array-array decode (reverse order)
-        for aa_codec, spec in reversed(aa_chain):
-            chunk_array = aa_codec._decode_sync(chunk_array, spec)
-
-        return chunk_array
-
-    def _encode_one(
-        self,
-        chunk_array: NDBuffer | None,
-        chunk_spec: ArraySpec,
-    ) -> Buffer | None:
-        """Encode a single chunk through the full codec chain, synchronously."""
-        if chunk_array is None:
-            return None
-
-        spec = chunk_spec
-
-        # array-array encode
-        for aa_codec in self.array_array_codecs:
-            chunk_array = aa_codec._encode_sync(chunk_array, spec)
-            spec = aa_codec.resolve_metadata(spec)
-
-        # array-bytes encode
-        chunk_bytes = self.array_bytes_codec._encode_sync(chunk_array, spec)
-        spec = self.array_bytes_codec.resolve_metadata(spec)
-
-        # bytes-bytes encode
-        for bb_codec in self.bytes_bytes_codecs:
-            chunk_bytes = bb_codec._encode_sync(chunk_bytes, spec)
-            spec = bb_codec.resolve_metadata(spec)
-
-        return chunk_bytes
-
-    # -------------------------------------------------------------------
-    # Async fallback for codecs that don't support sync (e.g. sharding)
-    # -------------------------------------------------------------------
-
-    async def _decode_async(
-        self,
-        chunk_bytes_and_specs: list[tuple[Buffer | None, ArraySpec]],
-    ) -> Iterable[NDBuffer | None]:
-        """Async fallback: walk codecs one at a time (like BatchedCodecPipeline).
-
-        Metadata must be resolved forward through the codec chain so each codec
-        gets the correct spec during reverse (decode) traversal. This matches
-        BatchedCodecPipeline._codecs_with_resolved_metadata_batched.
-        """
-        chunk_bytes_batch, chunk_specs = _unzip2(chunk_bytes_and_specs)
-
-        # Resolve metadata forward: aa → ab → bb, recording the spec at each step.
-        aa_specs: list[list[ArraySpec]] = []
-        specs = list(chunk_specs)
-        for aa_codec in self.array_array_codecs:
-            aa_specs.append(specs)
-            specs = [aa_codec.resolve_metadata(s) for s in specs]
-
-        ab_specs = specs
-        specs = [self.array_bytes_codec.resolve_metadata(s) for s in specs]
-
-        bb_specs: list[list[ArraySpec]] = []
-        for bb_codec in self.bytes_bytes_codecs:
-            bb_specs.append(specs)
-            specs = [bb_codec.resolve_metadata(s) for s in specs]
-
-        # Decode in reverse, using the forward-resolved specs.
-        for bb_codec, bb_spec in zip(self.bytes_bytes_codecs[::-1], bb_specs[::-1], strict=False):
-            chunk_bytes_batch = list(
-                await bb_codec.decode(zip(chunk_bytes_batch, bb_spec, strict=False))
-            )
-
-        chunk_array_batch: list[NDBuffer | None] = list(
-            await self.array_bytes_codec.decode(zip(chunk_bytes_batch, ab_specs, strict=False))
-        )
-
-        for aa_codec, aa_spec in zip(self.array_array_codecs[::-1], aa_specs[::-1], strict=False):
-            chunk_array_batch = list(
-                await aa_codec.decode(zip(chunk_array_batch, aa_spec, strict=False))
-            )
-
-        return chunk_array_batch
-
-    async def _encode_async(
-        self,
-        chunk_arrays_and_specs: list[tuple[NDBuffer | None, ArraySpec]],
-    ) -> Iterable[Buffer | None]:
-        """Async fallback: walk codecs one at a time (like BatchedCodecPipeline)."""
-        chunk_array_batch, chunk_specs = _unzip2(chunk_arrays_and_specs)
-
-        for aa_codec in self.array_array_codecs:
-            chunk_array_batch = list(
-                await aa_codec.encode(zip(chunk_array_batch, chunk_specs, strict=False))
-            )
-            chunk_specs = list(resolve_batched(aa_codec, chunk_specs))
-
-        chunk_bytes_batch: list[Buffer | None] = list(
-            await self.array_bytes_codec.encode(zip(chunk_array_batch, chunk_specs, strict=False))
-        )
-        chunk_specs = list(resolve_batched(self.array_bytes_codec, chunk_specs))
-
-        for bb_codec in self.bytes_bytes_codecs:
-            chunk_bytes_batch = list(
-                await bb_codec.encode(zip(chunk_bytes_batch, chunk_specs, strict=False))
-            )
-            chunk_specs = list(resolve_batched(bb_codec, chunk_specs))
-
-        return chunk_bytes_batch
-
-    # -------------------------------------------------------------------
-    # Top-level decode / encode
-    # -------------------------------------------------------------------
-
-    async def decode(
-        self,
-        chunk_bytes_and_specs: Iterable[tuple[Buffer | None, ArraySpec]],
-    ) -> Iterable[NDBuffer | None]:
-        items = list(chunk_bytes_and_specs)
-        if not items:
-            return []
-
-        if not self._all_sync:
-            return await self._decode_async(items)
-
-        # All codecs support sync -- run the full chain inline (no threading).
-        _, first_spec = items[0]
-        aa_chain, ab_pair, bb_chain = self._resolve_metadata_chain(first_spec)
-
-        return [
-            self._decode_one(chunk_bytes, chunk_spec, aa_chain, ab_pair, bb_chain)
-            for chunk_bytes, chunk_spec in items
-        ]
-
-    async def encode(
-        self,
-        chunk_arrays_and_specs: Iterable[tuple[NDBuffer | None, ArraySpec]],
-    ) -> Iterable[Buffer | None]:
-        items = list(chunk_arrays_and_specs)
-        if not items:
-            return []
-
-        if not self._all_sync:
-            return await self._encode_async(items)
-
-        # All codecs support sync -- run the full chain inline (no threading).
-        return [self._encode_one(chunk_array, chunk_spec) for chunk_array, chunk_spec in items]
-
-    # -------------------------------------------------------------------
-    # read / write (IO stays async, compute runs inline)
-    # -------------------------------------------------------------------
-
-    async def read(
-        self,
-        batch_info: Iterable[tuple[ByteGetter, ArraySpec, SelectorTuple, SelectorTuple, bool]],
-        out: NDBuffer,
-        drop_axes: tuple[int, ...] = (),
-    ) -> None:
-        await concurrent_map(
-            [
-                (single_batch_info, out, drop_axes)
-                for single_batch_info in _batched(batch_info, self.batch_size)
-            ],
-            self._read_batch,
-            config.get("async.concurrency"),
-        )
-
-    async def _read_batch(
-        self,
-        batch_info: Iterable[tuple[ByteGetter, ArraySpec, SelectorTuple, SelectorTuple, bool]],
-        out: NDBuffer,
-        drop_axes: tuple[int, ...] = (),
-    ) -> None:
-        batch_info = list(batch_info)
-
-        if self.supports_partial_decode:
-            assert isinstance(self.array_bytes_codec, ArrayBytesCodecPartialDecodeMixin)
-            chunk_array_batch = await self.array_bytes_codec.decode_partial(
-                [
-                    (byte_getter, chunk_selection, chunk_spec)
-                    for byte_getter, chunk_spec, chunk_selection, *_ in batch_info
-                ]
-            )
-            for chunk_array, (_, chunk_spec, _, out_selection, _) in zip(
-                chunk_array_batch, batch_info, strict=False
-            ):
-                if chunk_array is not None:
-                    out[out_selection] = chunk_array
-                else:
-                    out[out_selection] = _fill_value_or_default(chunk_spec)
-            return
-
-        # Phase 1: IO -- fetch bytes from store (always async)
-        chunk_bytes_batch = await concurrent_map(
-            [(byte_getter, array_spec.prototype) for byte_getter, array_spec, *_ in batch_info],
-            lambda byte_getter, prototype: byte_getter.get(prototype),
-            config.get("async.concurrency"),
-        )
-
-        # Phase 2: Compute -- decode + scatter
-        decode_items = [
-            (chunk_bytes, chunk_spec)
-            for chunk_bytes, (_, chunk_spec, *_) in zip(chunk_bytes_batch, batch_info, strict=False)
-        ]
-
-        chunk_array_batch_decoded: Iterable[NDBuffer | None] = await self.decode(decode_items)
-        self._scatter(chunk_array_batch_decoded, batch_info, out, drop_axes)
-
-    @staticmethod
-    def _scatter(
-        chunk_array_batch: Iterable[NDBuffer | None],
-        batch_info: list[tuple[ByteGetter, ArraySpec, SelectorTuple, SelectorTuple, bool]],
-        out: NDBuffer,
-        drop_axes: tuple[int, ...],
-    ) -> None:
-        for chunk_array, (_, chunk_spec, chunk_selection, out_selection, _) in zip(
-            chunk_array_batch, batch_info, strict=False
-        ):
-            if chunk_array is not None:
-                tmp = chunk_array[chunk_selection]
-                if drop_axes != ():
-                    tmp = tmp.squeeze(axis=drop_axes)
-                out[out_selection] = tmp
-            else:
-                out[out_selection] = _fill_value_or_default(chunk_spec)
-
-    async def write(
-        self,
-        batch_info: Iterable[tuple[ByteSetter, ArraySpec, SelectorTuple, SelectorTuple, bool]],
-        value: NDBuffer,
-        drop_axes: tuple[int, ...] = (),
-    ) -> None:
-        await concurrent_map(
-            [
-                (single_batch_info, value, drop_axes)
-                for single_batch_info in _batched(batch_info, self.batch_size)
-            ],
-            self._write_batch,
-            config.get("async.concurrency"),
-        )
-
-    def _merge_chunk_array(
-        self,
-        existing_chunk_array: NDBuffer | None,
-        value: NDBuffer,
-        out_selection: SelectorTuple,
-        chunk_spec: ArraySpec,
-        chunk_selection: SelectorTuple,
-        is_complete_chunk: bool,
-        drop_axes: tuple[int, ...],
-    ) -> NDBuffer:
-        if (
-            is_complete_chunk
-            and value.shape == chunk_spec.shape
-            and value[out_selection].shape == chunk_spec.shape
-        ):
-            return value
-        if existing_chunk_array is None:
-            chunk_array = chunk_spec.prototype.nd_buffer.create(
-                shape=chunk_spec.shape,
-                dtype=chunk_spec.dtype.to_native_dtype(),
-                order=chunk_spec.order,
-                fill_value=_fill_value_or_default(chunk_spec),
-            )
-        else:
-            chunk_array = existing_chunk_array.copy()
-        if chunk_selection == () or is_scalar(
-            value.as_ndarray_like(), chunk_spec.dtype.to_native_dtype()
-        ):
-            chunk_value = value
-        else:
-            chunk_value = value[out_selection]
-            if drop_axes != ():
-                item = tuple(
-                    None if idx in drop_axes else slice(None) for idx in range(chunk_spec.ndim)
-                )
-                chunk_value = chunk_value[item]
-        chunk_array[chunk_selection] = chunk_value
-        return chunk_array
-
-    async def _write_batch(
-        self,
-        batch_info: Iterable[tuple[ByteSetter, ArraySpec, SelectorTuple, SelectorTuple, bool]],
-        value: NDBuffer,
-        drop_axes: tuple[int, ...] = (),
-    ) -> None:
-        batch_info = list(batch_info)
-
-        if self.supports_partial_encode:
-            assert isinstance(self.array_bytes_codec, ArrayBytesCodecPartialEncodeMixin)
-            if len(value.shape) == 0:
-                await self.array_bytes_codec.encode_partial(
-                    [
-                        (byte_setter, value, chunk_selection, chunk_spec)
-                        for byte_setter, chunk_spec, chunk_selection, _, _ in batch_info
-                    ],
-                )
-            else:
-                await self.array_bytes_codec.encode_partial(
-                    [
-                        (byte_setter, value[out_selection], chunk_selection, chunk_spec)
-                        for byte_setter, chunk_spec, chunk_selection, out_selection, _ in batch_info
-                    ],
-                )
-            return
-
-        # Phase 1: IO -- read existing bytes for non-complete chunks
-        async def _read_key(
-            byte_setter: ByteSetter | None, prototype: BufferPrototype
-        ) -> Buffer | None:
-            if byte_setter is None:
-                return None
-            return await byte_setter.get(prototype=prototype)
-
-        chunk_bytes_batch: list[Buffer | None]
-        chunk_bytes_batch = await concurrent_map(
-            [
-                (
-                    None if is_complete_chunk else byte_setter,
-                    chunk_spec.prototype,
-                )
-                for byte_setter, chunk_spec, chunk_selection, _, is_complete_chunk in batch_info
-            ],
-            _read_key,
-            config.get("async.concurrency"),
-        )
-
-        # Phase 2: Compute -- decode, merge, encode
-        decode_items = [
-            (chunk_bytes, chunk_spec)
-            for chunk_bytes, (_, chunk_spec, *_) in zip(chunk_bytes_batch, batch_info, strict=False)
-        ]
-
-        encoded_batch = await self._write_batch_compute(decode_items, batch_info, value, drop_axes)
-
-        # Phase 3: IO -- write to store
-        async def _write_key(byte_setter: ByteSetter, chunk_bytes: Buffer | None) -> None:
-            if chunk_bytes is None:
-                await byte_setter.delete()
-            else:
-                await byte_setter.set(chunk_bytes)
-
-        await concurrent_map(
-            [
-                (byte_setter, chunk_bytes)
-                for chunk_bytes, (byte_setter, *_) in zip(encoded_batch, batch_info, strict=False)
-            ],
-            _write_key,
-            config.get("async.concurrency"),
-        )
-
-    async def _write_batch_compute(
-        self,
-        decode_items: list[tuple[Buffer | None, ArraySpec]],
-        batch_info: list[tuple[ByteSetter, ArraySpec, SelectorTuple, SelectorTuple, bool]],
-        value: NDBuffer,
-        drop_axes: tuple[int, ...],
-    ) -> list[Buffer | None]:
-        """Async fallback for compute phase of _write_batch."""
-        chunk_array_decoded: Iterable[NDBuffer | None] = await self.decode(decode_items)
-
-        chunk_array_batch = self._merge_and_filter(
-            chunk_array_decoded, batch_info, value, drop_axes
-        )
-
-        encode_items = [
-            (chunk_array, chunk_spec)
-            for chunk_array, (_, chunk_spec, *_) in zip(chunk_array_batch, batch_info, strict=False)
-        ]
-        return list(await self.encode(encode_items))
-
-    def _merge_and_filter(
-        self,
-        chunk_array_decoded: Iterable[NDBuffer | None],
-        batch_info: list[tuple[ByteSetter, ArraySpec, SelectorTuple, SelectorTuple, bool]],
-        value: NDBuffer,
-        drop_axes: tuple[int, ...],
-    ) -> list[NDBuffer | None]:
-        """Merge decoded chunks with new data and filter empty chunks."""
-        chunk_array_merged = [
-            self._merge_chunk_array(
-                chunk_array,
-                value,
-                out_selection,
-                chunk_spec,
-                chunk_selection,
-                is_complete_chunk,
-                drop_axes,
-            )
-            for chunk_array, (
-                _,
-                chunk_spec,
-                chunk_selection,
-                out_selection,
-                is_complete_chunk,
-            ) in zip(chunk_array_decoded, batch_info, strict=False)
-        ]
-
-        result: list[NDBuffer | None] = []
-        for chunk_array, (_, chunk_spec, *_) in zip(chunk_array_merged, batch_info, strict=False):
-            if chunk_array is None:
-                result.append(None)
-            else:
-                if not chunk_spec.config.write_empty_chunks and chunk_array.all_equal(
-                    _fill_value_or_default(chunk_spec)
-                ):
-                    result.append(None)
-                else:
-                    result.append(chunk_array)
-        return result
-
-    # -------------------------------------------------------------------
-    # Fully synchronous read / write (bypass event loop entirely)
-    #
-    # These methods implement the same logic as the async read/write
-    # methods above, but run entirely on the calling thread:
-    #
-    # - Store IO uses byte_getter.get_sync() / byte_setter.set_sync()
-    #   instead of the async get()/set() — direct dict lookup for
-    #   MemoryStore, direct file IO for LocalStore.
-    #
-    # - Codec compute uses _decode_one() / _encode_one(), which call
-    #   each codec's _decode_sync/_encode_sync inline (no to_thread).
-    #
-    # - When there are multiple chunks, codec compute is parallelized
-    #   across a thread pool. CPU-heavy codecs (zstd, gzip, blosc)
-    #   release the GIL during C-level compress/decompress, so real
-    #   parallelism is achieved. Store IO remains sequential (fast
-    #   for local/memory stores).
-    #
-    # The byte_getter/byte_setter parameters are typed as `Any` because
-    # the ByteGetter/ByteSetter protocols only define async methods.
-    # At runtime, these are always StorePath instances which have the
-    # get_sync/set_sync/delete_sync methods. See docs/design/sync-bypass.md.
-    #
-    # These methods are only called when supports_sync_io is True (i.e.
-    # _all_sync is True), which guarantees every codec in the chain has
-    # _decode_sync/_encode_sync implementations.
-    # -------------------------------------------------------------------
-
-    @property
-    def supports_sync_io(self) -> bool:
-        # Enable the fully-sync path when every codec in the chain supports
-        # synchronous dispatch. This includes ShardingCodec, which has
-        # _decode_sync/_encode_sync (full shard) and _decode_partial_sync/
-        # _encode_partial_sync (byte-range reads for partial shard access).
-        return self._all_sync
-
-    def read_sync(
-        self,
-        batch_info: Iterable[tuple[Any, ArraySpec, SelectorTuple, SelectorTuple, bool]],
-        out: NDBuffer,
-        drop_axes: tuple[int, ...] = (),
-    ) -> None:
-        batch_info_list = list(batch_info)
-        if not batch_info_list:
-            return
-
-        # Partial decode path: when the array_bytes_codec supports partial
-        # decode (e.g. ShardingCodec), delegate to its _decode_partial_sync.
-        # This handles shard index fetch + per-chunk byte-range reads + inner
-        # codec decode, all synchronously.
-        if self.supports_partial_decode:
-            # The array_bytes_codec is a ShardingCodec (or similar) that has
-            # _decode_partial_sync. We use getattr to avoid coupling to the
-            # concrete type — the type system can't express this through the
-            # ArrayBytesCodecPartialDecodeMixin protocol.
-            ab_codec: Any = self.array_bytes_codec
-            for byte_getter, chunk_spec, chunk_selection, out_selection, _ in batch_info_list:
-                chunk_array: NDBuffer | None = ab_codec._decode_partial_sync(
-                    byte_getter, chunk_selection, chunk_spec
-                )
-                if chunk_array is not None:
-                    out[out_selection] = chunk_array
-                else:
-                    out[out_selection] = _fill_value_or_default(chunk_spec)
-            return
-
-        # Non-partial path: standard sync decode through the full codec chain.
-        # Resolve the metadata chain once: compute the ArraySpec at each
-        # codec boundary. All chunks in a single array share the same codec
-        # structure, so this is invariant across the loop.
-        _, first_spec, *_ = batch_info_list[0]
-        aa_chain, ab_pair, bb_chain = self._resolve_metadata_chain(first_spec)
-
-        # Phase 1: IO — fetch all chunk bytes from the store sequentially.
-        # For MemoryStore this is a dict lookup (~1us), for LocalStore a
-        # file read that benefits from OS page cache. Sequential is fine.
-        chunk_bytes_list: list[Buffer | None] = [
-            byte_getter.get_sync(prototype=chunk_spec.prototype)
-            for byte_getter, chunk_spec, *_ in batch_info_list
-        ]
-
-        # Phase 2: Decode — run the codec chain for each chunk.
-        # Estimate per-chunk codec work and decide whether to parallelize.
-        # Not all dtypes have item_size (e.g. custom dtypes), so fall back
-        # to sequential processing when we can't estimate chunk size.
-        dtype_item_size = getattr(first_spec.dtype, "item_size", 1)
-        chunk_nbytes = product(first_spec.shape) * dtype_item_size
-        n_workers = _choose_workers(len(batch_info_list), chunk_nbytes, self)
-        if n_workers > 0:
-            pool = _get_pool(n_workers)
-            chunk_arrays: list[NDBuffer | None] = list(
-                pool.map(
-                    self._decode_one,
-                    chunk_bytes_list,
-                    [chunk_spec for _, chunk_spec, *_ in batch_info_list],
-                    [aa_chain] * len(batch_info_list),
-                    [ab_pair] * len(batch_info_list),
-                    [bb_chain] * len(batch_info_list),
-                )
-            )
-        else:
-            chunk_arrays = [
-                self._decode_one(chunk_bytes, chunk_spec, aa_chain, ab_pair, bb_chain)
-                for chunk_bytes, (_, chunk_spec, *_) in zip(
-                    chunk_bytes_list, batch_info_list, strict=False
-                )
-            ]
-
-        # Phase 3: Scatter decoded chunk data into the output buffer.
-        self._scatter(chunk_arrays, batch_info_list, out, drop_axes)
-
-    def _write_chunk_compute(
-        self,
-        existing_bytes: Buffer | None,
-        chunk_spec: ArraySpec,
-        chunk_selection: SelectorTuple,
-        out_selection: SelectorTuple,
-        is_complete_chunk: bool,
-        value: NDBuffer,
-        drop_axes: tuple[int, ...],
-    ) -> Buffer | None | object:  # object is _DELETED sentinel
-        """Per-chunk compute for write: decode existing → merge → encode.
-
-        Returns encoded bytes, or _DELETED sentinel if the chunk should
-        be removed from the store. Thread-safe: operates only on its own
-        chunk data, no shared mutable state.
-        """
-        # Decode existing chunk (for partial writes)
-        existing_array: NDBuffer | None = None
-        if existing_bytes is not None:
-            aa_chain, ab_pair, bb_chain = self._resolve_metadata_chain(chunk_spec)
-            existing_array = self._decode_one(
-                existing_bytes, chunk_spec, aa_chain, ab_pair, bb_chain
-            )
-
-        # Merge new data into the chunk
-        chunk_array: NDBuffer | None = self._merge_chunk_array(
-            existing_array,
-            value,
-            out_selection,
-            chunk_spec,
-            chunk_selection,
-            is_complete_chunk,
-            drop_axes,
-        )
-
-        # Filter empty chunks
-        if (
-            chunk_array is not None
-            and not chunk_spec.config.write_empty_chunks
-            and chunk_array.all_equal(_fill_value_or_default(chunk_spec))
-        ):
-            chunk_array = None
-
-        # Encode
-        if chunk_array is None:
-            return _DELETED
-        chunk_bytes = self._encode_one(chunk_array, chunk_spec)
-        if chunk_bytes is None:
-            return _DELETED
-        return chunk_bytes
-
-    def write_sync(
-        self,
-        batch_info: Iterable[tuple[Any, ArraySpec, SelectorTuple, SelectorTuple, bool]],
-        value: NDBuffer,
-        drop_axes: tuple[int, ...] = (),
-    ) -> None:
-        batch_info_list = list(batch_info)
-        if not batch_info_list:
-            return
-
-        # Partial encode path: when the array_bytes_codec supports partial
-        # encode (e.g. ShardingCodec), delegate to its _encode_partial_sync.
-        # This reads the existing shard, merges new data, encodes and writes
-        # back, all synchronously.
- if self.supports_partial_encode: - ab_codec: Any = self.array_bytes_codec - if len(value.shape) == 0: - for byte_setter, chunk_spec, chunk_selection, _, _ in batch_info_list: - ab_codec._encode_partial_sync(byte_setter, value, chunk_selection, chunk_spec) - else: - for byte_setter, chunk_spec, chunk_selection, out_selection, _ in batch_info_list: - ab_codec._encode_partial_sync( - byte_setter, value[out_selection], chunk_selection, chunk_spec - ) - return - - # Phase 1: IO — read existing chunk bytes for partial writes. - existing_bytes_list: list[Buffer | None] = [ - byte_setter.get_sync(prototype=chunk_spec.prototype) if not is_complete_chunk else None - for byte_setter, chunk_spec, _, _, is_complete_chunk in batch_info_list - ] - - # Phase 2: Compute — decode existing, merge new data, encode. - # Estimate per-chunk work to decide whether to parallelize. - # Use encode cost model since writes are dominated by compression. - # Not all dtypes have item_size (e.g. custom dtypes), so fall back - # to sequential processing when we can't estimate chunk size. - _, first_spec, *_ = batch_info_list[0] - dtype_item_size = getattr(first_spec.dtype, "item_size", 1) - chunk_nbytes = product(first_spec.shape) * dtype_item_size - n_workers = _choose_workers(len(batch_info_list), chunk_nbytes, self, is_encode=True) - if n_workers > 0: - pool = _get_pool(n_workers) - encoded_list: list[Buffer | None | object] = list( - pool.map( - self._write_chunk_compute, - existing_bytes_list, - [chunk_spec for _, chunk_spec, *_ in batch_info_list], - [chunk_selection for _, _, chunk_selection, _, _ in batch_info_list], - [out_selection for _, _, _, out_selection, _ in batch_info_list], - [is_complete for _, _, _, _, is_complete in batch_info_list], - [value] * len(batch_info_list), - [drop_axes] * len(batch_info_list), - ) - ) - else: - encoded_list = [ - self._write_chunk_compute( - existing_bytes, - chunk_spec, - chunk_selection, - out_selection, - is_complete_chunk, - value, - drop_axes, - ) - for existing_bytes, ( - _, - chunk_spec, - chunk_selection, - out_selection, - is_complete_chunk, - ) in zip(existing_bytes_list, batch_info_list, strict=False) - ] - - # Phase 3: IO — write encoded chunks to store. - # A sentinel _DELETED object distinguishes "delete key" from - # "no-op" (which doesn't arise here, but keeps the logic clean). - for encoded, (byte_setter, *_) in zip(encoded_list, batch_info_list, strict=False): - if encoded is _DELETED: - byte_setter.delete_sync() - elif encoded is not None: - byte_setter.set_sync(encoded) - else: - byte_setter.delete_sync() - register_pipeline(SyncCodecPipeline) diff --git a/src/zarr/storage/_memory.py b/src/zarr/storage/_memory.py index 950fa197e9..ebb57aeef0 100644 --- a/src/zarr/storage/_memory.py +++ b/src/zarr/storage/_memory.py @@ -82,7 +82,7 @@ def __eq__(self, other: object) -> bool: # # MemoryStore is a thin wrapper around a Python dict. The async get/set # methods are already synchronous in substance — they just happen to be - # ``async def``. These sync variants let SyncCodecPipeline.read_sync / + # ``async def``. These sync variants let the codec pipeline's read_sync / # write_sync access the dict directly without going through the event # loop, eliminating the dominant source of overhead for in-memory arrays. 
# diff --git a/tests/test_config.py b/tests/test_config.py index fc33bd87cb..f8ea3c6487 100644 --- a/tests/test_config.py +++ b/tests/test_config.py @@ -59,7 +59,7 @@ def test_config_defaults_set() -> None: "threading": {"max_workers": None}, "json_indent": 2, "codec_pipeline": { - "path": "zarr.experimental.sync_codecs.SyncCodecPipeline", + "path": "zarr.core.codec_pipeline.BatchedCodecPipeline", "batch_size": 1, }, "codecs": { @@ -146,6 +146,14 @@ async def write( ) -> None: _mock.call() + def write_sync( + self, + batch_info: Any, + value: NDBuffer, + drop_axes: tuple[int, ...] = (), + ) -> None: + _mock.call() + register_pipeline(MockCodecPipeline) config.set({"codec_pipeline.path": fully_qualified_name(MockCodecPipeline)}) diff --git a/tests/test_sync_codec_pipeline.py b/tests/test_sync_codec_pipeline.py index fa65d4fcc6..661dc25f36 100644 --- a/tests/test_sync_codec_pipeline.py +++ b/tests/test_sync_codec_pipeline.py @@ -1,4 +1,4 @@ -"""Tests for zarr.experimental.sync_codecs module.""" +"""Tests for sync codec capabilities in BatchedCodecPipeline.""" from __future__ import annotations @@ -12,8 +12,8 @@ from zarr.codecs.zstd import ZstdCodec from zarr.core.array_spec import ArrayConfig, ArraySpec from zarr.core.buffer import default_buffer_prototype +from zarr.core.codec_pipeline import BatchedCodecPipeline from zarr.core.dtype import get_data_type_from_native_dtype -from zarr.experimental.sync_codecs import SyncCodecPipeline from zarr.storage import MemoryStore @@ -122,30 +122,30 @@ def test_roundtrip(self): # --------------------------------------------------------------------------- -# Unit tests: SyncCodecPipeline construction +# Unit tests: pipeline construction # --------------------------------------------------------------------------- -class TestSyncCodecPipelineConstruction: +class TestPipelineConstruction: def test_from_codecs_valid(self): - pipeline = SyncCodecPipeline.from_codecs([BytesCodec(), GzipCodec(level=1)]) - assert isinstance(pipeline, SyncCodecPipeline) + pipeline = BatchedCodecPipeline.from_codecs([BytesCodec(), GzipCodec(level=1)]) + assert isinstance(pipeline, BatchedCodecPipeline) assert len(pipeline.bytes_bytes_codecs) == 1 assert isinstance(pipeline.array_bytes_codec, BytesCodec) def test_from_codecs_accepts_sharding(self): from zarr.codecs.sharding import ShardingCodec - pipeline = SyncCodecPipeline.from_codecs([ShardingCodec(chunk_shape=(8,))]) - assert isinstance(pipeline, SyncCodecPipeline) + pipeline = BatchedCodecPipeline.from_codecs([ShardingCodec(chunk_shape=(8,))]) + assert isinstance(pipeline, BatchedCodecPipeline) assert pipeline._all_sync def test_from_codecs_rejects_missing_array_bytes(self): with pytest.raises(ValueError, match="Required ArrayBytesCodec"): - SyncCodecPipeline.from_codecs([GzipCodec()]) + BatchedCodecPipeline.from_codecs([GzipCodec()]) def test_from_codecs_with_transpose(self): - pipeline = SyncCodecPipeline.from_codecs([ + pipeline = BatchedCodecPipeline.from_codecs([ TransposeCodec(order=(1, 0)), BytesCodec(), GzipCodec(level=1), @@ -155,14 +155,14 @@ def test_from_codecs_with_transpose(self): # --------------------------------------------------------------------------- -# Unit tests: SyncCodecPipeline encode/decode roundtrip +# Unit tests: pipeline encode/decode roundtrip # --------------------------------------------------------------------------- -class TestSyncCodecPipelineRoundtrip: +class TestPipelineRoundtrip: @pytest.mark.asyncio async def test_encode_decode_single_chunk(self): - pipeline = 
SyncCodecPipeline.from_codecs([BytesCodec(), GzipCodec(level=1)]) + pipeline = BatchedCodecPipeline.from_codecs([BytesCodec(), GzipCodec(level=1)]) arr = np.random.default_rng(42).standard_normal((32, 32)).astype("float64") spec = _make_array_spec(arr.shape, arr.dtype) pipeline = pipeline.evolve_from_array_spec(spec) @@ -176,7 +176,7 @@ async def test_encode_decode_single_chunk(self): @pytest.mark.asyncio async def test_encode_decode_multiple_chunks(self): - pipeline = SyncCodecPipeline.from_codecs([BytesCodec(), GzipCodec(level=1)]) + pipeline = BatchedCodecPipeline.from_codecs([BytesCodec(), GzipCodec(level=1)]) rng = np.random.default_rng(42) spec = _make_array_spec((16, 16), np.dtype("float64")) pipeline = pipeline.evolve_from_array_spec(spec) @@ -191,7 +191,7 @@ async def test_encode_decode_multiple_chunks(self): @pytest.mark.asyncio async def test_encode_decode_empty_batch(self): - pipeline = SyncCodecPipeline.from_codecs([BytesCodec(), GzipCodec(level=1)]) + pipeline = BatchedCodecPipeline.from_codecs([BytesCodec(), GzipCodec(level=1)]) encoded = await pipeline.encode([]) assert list(encoded) == [] decoded = await pipeline.decode([]) @@ -199,7 +199,7 @@ async def test_encode_decode_empty_batch(self): @pytest.mark.asyncio async def test_encode_decode_none_chunk(self): - pipeline = SyncCodecPipeline.from_codecs([BytesCodec(), GzipCodec(level=1)]) + pipeline = BatchedCodecPipeline.from_codecs([BytesCodec(), GzipCodec(level=1)]) spec = _make_array_spec((8,), np.dtype("float64")) pipeline = pipeline.evolve_from_array_spec(spec) @@ -211,12 +211,12 @@ async def test_encode_decode_none_chunk(self): # --------------------------------------------------------------------------- -# Integration tests: SyncCodecPipeline is the default pipeline +# Integration tests: default pipeline has sync capabilities # --------------------------------------------------------------------------- -class TestSyncCodecPipelineDefault: - def test_create_array_uses_sync_pipeline(self): +class TestDefaultPipelineSync: + def test_create_array_uses_batched_pipeline(self): store = MemoryStore() arr = zarr.create_array( store, @@ -224,13 +224,13 @@ def test_create_array_uses_sync_pipeline(self): chunks=(32, 32), dtype="float64", ) - assert isinstance(arr.async_array.codec_pipeline, SyncCodecPipeline) + assert isinstance(arr.async_array.codec_pipeline, BatchedCodecPipeline) data = np.random.default_rng(42).standard_normal((100, 100)) arr[:] = data np.testing.assert_array_equal(arr[:], data) - def test_open_uses_sync_pipeline(self): + def test_open_uses_batched_pipeline(self): store = MemoryStore() arr = zarr.create_array( store, @@ -242,10 +242,10 @@ def test_open_uses_sync_pipeline(self): arr[:] = data arr2 = zarr.open_array(store=store) - assert isinstance(arr2.async_array.codec_pipeline, SyncCodecPipeline) + assert isinstance(arr2.async_array.codec_pipeline, BatchedCodecPipeline) np.testing.assert_array_equal(arr2[:], data) - def test_from_array_uses_sync_pipeline(self): + def test_from_array_uses_batched_pipeline(self): store1 = MemoryStore() arr1 = zarr.create_array( store1, @@ -258,7 +258,7 @@ def test_from_array_uses_sync_pipeline(self): store2 = MemoryStore() arr2 = zarr.from_array(store2, data=arr1) - assert isinstance(arr2.async_array.codec_pipeline, SyncCodecPipeline) + assert isinstance(arr2.async_array.codec_pipeline, BatchedCodecPipeline) np.testing.assert_array_equal(arr2[:], data) def test_partial_write(self): @@ -289,18 +289,25 @@ def test_zstd_codec(self): arr[:] = data 
np.testing.assert_array_equal(arr[:], data) - def test_config_switch_to_batched(self): - """Verify we can switch back to BatchedCodecPipeline via config.""" - from zarr.core.codec_pipeline import BatchedCodecPipeline + def test_supports_sync_io(self): + """Default pipeline supports sync IO when all codecs are sync.""" + pipeline = BatchedCodecPipeline.from_codecs([BytesCodec(), GzipCodec(level=1)]) + assert pipeline.supports_sync_io + + def test_config_switch_to_sync_pipeline_compat(self): + """Verify backwards compat: SyncCodecPipeline config path still works.""" + from zarr.experimental.sync_codecs import SyncCodecPipeline zarr.config.set( - {"codec_pipeline.path": "zarr.core.codec_pipeline.BatchedCodecPipeline"} + {"codec_pipeline.path": "zarr.experimental.sync_codecs.SyncCodecPipeline"} ) try: store = MemoryStore() arr = zarr.create_array(store, shape=(10,), dtype="float64") + assert isinstance(arr.async_array.codec_pipeline, SyncCodecPipeline) + # SyncCodecPipeline is-a BatchedCodecPipeline assert isinstance(arr.async_array.codec_pipeline, BatchedCodecPipeline) finally: zarr.config.set( - {"codec_pipeline.path": "zarr.experimental.sync_codecs.SyncCodecPipeline"} + {"codec_pipeline.path": "zarr.core.codec_pipeline.BatchedCodecPipeline"} ) From 88a48756d8a26172c70bf6d13af8f57b1a2e9259 Mon Sep 17 00:00:00 2001 From: Davis Vann Bennett Date: Thu, 19 Feb 2026 17:03:20 +0100 Subject: [PATCH 13/23] remove garbage --- docs/design/sync-bypass.md | 322 --------------------------- src/zarr/abc/codec.py | 6 +- src/zarr/experimental/sync_codecs.py | 8 +- 3 files changed, 3 insertions(+), 333 deletions(-) delete mode 100644 docs/design/sync-bypass.md diff --git a/docs/design/sync-bypass.md b/docs/design/sync-bypass.md deleted file mode 100644 index 2e93f5b702..0000000000 --- a/docs/design/sync-bypass.md +++ /dev/null @@ -1,322 +0,0 @@ -# Design: Fully Synchronous Read/Write Bypass - -## Problem - -Zarr-python's read/write path is inherently async: every `Array.__getitem__` -or `Array.__setitem__` call passes through several layers of async machinery -before any actual work happens. For workloads where both the codec chain and -the store are fundamentally synchronous (e.g. gzip + MemoryStore, or -zstd + LocalStore), this async overhead dominates latency. - -The call chain looks like this: - -``` -Array.__getitem__ - └─ sync() # (1) thread hop: submits coroutine to background event loop - └─ AsyncArray._get_selection # runs on the event loop thread - └─ CodecPipeline.read # async pipeline - ├─ concurrent_map # (2) launches tasks on event loop - │ └─ ByteGetter.get(prototype) # (3) async store IO - │ └─ MemoryStore.get() # just a dict lookup! - └─ codec.decode() - └─ asyncio.to_thread(...) # (4) thread hop for CPU work - └─ gzip.decompress(...) # actual compute -``` - -There are four sources of overhead, marked (1)-(4): - -1. **`sync()` bridge**: Every synchronous `Array` method calls `sync()`, which - uses `asyncio.run_coroutine_threadsafe()` to submit work to a background - event loop thread. Even when the coroutine does zero awaiting, this costs - ~30-50us for the round-trip through the event loop. - -2. **`concurrent_map` batching**: The pipeline groups chunks into batches and - dispatches them via `concurrent_map`, which creates asyncio tasks. For - single-chunk reads (the common case), this is pure overhead. - -3. **Async store IO**: `StorePath.get()` / `StorePath.set()` are `async def`. 
- For `MemoryStore` (a dict lookup) and `LocalStore` (a file read), the - underlying operation is synchronous — wrapping it in `async def` forces an - unnecessary context switch through the event loop. - -4. **`asyncio.to_thread` for codec compute**: `BatchedCodecPipeline` runs each - codec's encode/decode in `asyncio.to_thread()`, adding another thread hop. - `SyncCodecPipeline` (the foundation this work builds on) already eliminates - this by calling `_decode_sync` / `_encode_sync` inline. - -The net effect: a MemoryStore read of a single small chunk spends more time -in async machinery than in actual decompression. - - -## Solution - -When the codec pipeline and store both support synchronous operation, bypass -the event loop entirely: run IO, codec compute, and buffer scatter all on the -calling thread, with zero async overhead. - -The solution has three layers: - -### Layer 1: Sync Store IO - -Add `supports_sync`, `get_sync()`, `set_sync()`, and `delete_sync()` to the -store abstraction. These are opt-in: the `Store` ABC provides default -implementations that raise `NotImplementedError`, and only stores with native -sync capabilities override them. - -``` -Store ABC (defaults: supports_sync=False, methods raise NotImplementedError) - ├── MemoryStore (supports_sync=True, direct dict access) - ├── LocalStore (supports_sync=True, direct file IO via _get/_put) - └── FsspecStore (unchanged, remains async-only) - -StorePath delegates to its underlying Store: - get_sync() → self.store.get_sync(self.path, ...) - set_sync() → self.store.set_sync(self.path, ...) -``` - -**Key decision**: `StorePath` is what gets passed to the codec pipeline as a -`ByteGetter` / `ByteSetter`. By adding sync methods to `StorePath`, the -pipeline can call them directly without knowing the concrete store type. - -**Protocol gap**: The `ByteGetter` / `ByteSetter` protocols only define async -methods (`get`, `set`, `delete`). Rather than modifying these widely-used -protocols, the sync pipeline methods use `Any` type annotations for the -byte_getter/byte_setter parameters and call `.get_sync()` etc. at runtime. -This is a pragmatic tradeoff: the sync path is an optimization that only -activates when `supports_sync` is True, so the runtime type is always a -`StorePath` that has these methods. - -### Layer 2: Sync Codec Pipeline IO - -Add `supports_sync_io`, `read_sync()`, and `write_sync()` to the -`CodecPipeline` ABC (non-abstract, default raises `NotImplementedError`). - -`SyncCodecPipeline` implements these with a simple sequential loop: - -```python -# read_sync: for each chunk (non-sharded path) -for byte_getter, chunk_spec, chunk_sel, out_sel, _ in batch_info: - chunk_bytes = byte_getter.get_sync(prototype=chunk_spec.prototype) # sync IO - chunk_array = self._decode_one(chunk_bytes, ...) # sync compute - out[out_selection] = chunk_array[chunk_selection] # scatter -``` - -No batching, no `concurrent_map`, no event loop — just a Python for-loop. - -**Sharding support**: When the pipeline uses `ShardingCodec` (i.e. -`supports_partial_decode` is True), `read_sync` delegates to -`ShardingCodec._decode_partial_sync()` instead. This method fetches -the shard index and requested chunk bytes via sync byte-range reads -(`byte_getter.get_sync()` with `RangeByteRequest`/`SuffixByteRequest`), -then decodes through the inner pipeline's `read_sync` — all on the -calling thread. See [Sync Sharding](#sync-sharding) below for details. 
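-
-For symmetry, the non-sharded write path follows the same shape: sync IO to
-read any existing chunk bytes, sync compute to merge and encode, then sync IO
-to write back. A minimal sketch (`merge` here abbreviates the
-decode-existing-chunk plus `_merge_chunk_array` step; the real `write_sync`
-also drops empty chunks before encoding):
-
-```python
-# write_sync: for each chunk (non-sharded path) -- illustrative sketch
-for byte_setter, chunk_spec, chunk_sel, out_sel, is_complete in batch_info:
-    existing = None if is_complete else byte_setter.get_sync(prototype=chunk_spec.prototype)
-    chunk_array = merge(existing, value[out_sel])            # decode + merge (sync)
-    chunk_bytes = self._encode_one(chunk_array, chunk_spec)  # sync compute
-    if chunk_bytes is None:
-        byte_setter.delete_sync()                            # empty chunk: remove key
-    else:
-        byte_setter.set_sync(chunk_bytes)                    # sync IO
-```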
- -### Layer 3: Array Bypass - -Each of the 10 sync `Array` selection methods (5 getters, 5 setters) gains a -fast path: - -```python -def get_basic_selection(self, selection, *, out=None, prototype=None, fields=None): - indexer = BasicIndexer(selection, self.shape, self.metadata.chunk_grid) - if self._can_use_sync_path(): - return _get_selection_sync( - self.async_array.store_path, self.async_array.metadata, - self.async_array.codec_pipeline, self.async_array.config, - indexer, out=out, fields=fields, prototype=prototype, - ) - return sync(self.async_array._get_selection(indexer, ...)) -``` - -`_can_use_sync_path()` checks two conditions: -1. The codec pipeline supports sync IO (`supports_sync_io`) -2. The store supports sync (`supports_sync`) - -When both hold, `_get_selection_sync` / `_set_selection_sync` run the -entire operation on the calling thread. These functions mirror the async -`_get_selection` / `_set_selection` exactly, but call `codec_pipeline.read_sync()` -/ `write_sync()` instead of `await codec_pipeline.read()` / `write()`. - - -## Resulting Call Chain - -With the sync bypass active, the call chain for non-sharded arrays becomes: - -``` -Array.__getitem__ - └─ _get_selection_sync # runs on calling thread - └─ SyncCodecPipeline.read_sync - ├─ StorePath.get_sync # direct dict/file access, no event loop - ├─ _decode_one # inline codec chain, no to_thread - └─ out[sel] = array # scatter into output -``` - -For sharded arrays: - -``` -Array.__getitem__ - └─ _get_selection_sync # runs on calling thread - └─ SyncCodecPipeline.read_sync - └─ ShardingCodec._decode_partial_sync - ├─ StorePath.get_sync(byte_range) # sync byte-range read for shard index - ├─ _decode_shard_index_sync # inline index codec chain - ├─ StorePath.get_sync(byte_range) # sync byte-range read per chunk - └─ inner_pipeline.read_sync # inner codec chain (sync) - ├─ _ShardingByteGetter.get_sync # dict lookup - ├─ _decode_one # inline codec chain - └─ out[sel] = array # scatter -``` - -No `sync()`, no event loop, no `asyncio.to_thread`, no `concurrent_map`. - - -## Sync Sharding - -`ShardingCodec` participates in the fully-synchronous path through sync -variants of all its methods: - -**Shard index codec chain**: The index codecs (typically `BytesCodec` + -`Crc32cCodec`) are run inline via `_decode_shard_index_sync` / -`_encode_shard_index_sync`. These classify the index codecs using -`codecs_from_list`, resolve metadata forward through the chain, then -run the decode/encode in the correct order — all without constructing a -pipeline object. - -**Full shard decode/encode** (`_decode_sync` / `_encode_sync`): Receives -complete shard bytes, decodes the index, then delegates to the inner -codec pipeline's `read_sync` / `write_sync` with `_ShardingByteGetter` / -`_ShardingByteSetter` (dict-backed, so "IO" is a dict lookup). - -**Partial shard decode/encode** (`_decode_partial_sync` / -`_encode_partial_sync`): The partial path is where most of the IO happens — -it issues sync byte-range reads to fetch the shard index and individual -chunk data from the store. Once bytes are in memory, the inner pipeline -decodes them synchronously. - -**Inner pipeline**: `ShardingCodec.codec_pipeline` is obtained via -`get_pipeline_class()`. When `SyncCodecPipeline` is configured globally, -the inner pipeline is also a `SyncCodecPipeline`, enabling recursive sync -dispatch for nested sharding. - - -## Additional Optimization: Codec Instance Caching - -`GzipCodec` was creating a new `GZip(level)` instance on every encode/decode -call. 
`ZstdCodec` and `BloscCodec` already cache their codec instances via -`@cached_property`. We apply the same pattern to `GzipCodec`: - -```python -@cached_property -def _gzip_codec(self) -> GZip: - return GZip(self.level) -``` - -This is safe because `GzipCodec` is a frozen dataclass — `level` never -changes after construction, so the cached instance is always valid. - - -## Bugfix: _decode_async Metadata Resolution - -The async fallback path in `SyncCodecPipeline._decode_async()` (used when -a codec in the chain doesn't support sync) had a metadata resolution bug: -it passed the same unresolved `chunk_specs` to every codec during decode. - -Size-changing codecs like `FixedScaleOffset` and `PackBits` alter the data -shape/dtype, so each codec needs specs resolved through the forward chain. -The fix resolves metadata forward (aa -> ab -> bb), records specs at each -step, then uses the correct resolved specs during reverse decode traversal. -This matches `BatchedCodecPipeline._codecs_with_resolved_metadata_batched`. - - -## What Stays Unchanged - -- **`BatchedCodecPipeline`**: Unmodified. It inherits the default - `supports_sync_io=False` from the ABC. -- **Remote stores** (`FsspecStore`): `supports_sync` stays `False`. All - remote IO remains async. -- **All async APIs**: `AsyncArray`, `async def read/write`, etc. are - completely untouched. The sync bypass is an optimization of the - synchronous `Array` class only. - - -## Files Modified - -| File | Layer | Change | -|------|-------|--------| -| `src/zarr/abc/store.py` | 1 | `supports_sync`, `get_sync`, `set_sync`, `delete_sync` on `Store` ABC | -| `src/zarr/storage/_memory.py` | 1 | Sync store methods (direct dict access) | -| `src/zarr/storage/_local.py` | 1 | Sync store methods (direct `_get`/`_put` calls) | -| `src/zarr/storage/_common.py` | 1 | Sync methods on `StorePath` (delegates to store) | -| `src/zarr/abc/codec.py` | 2 | `_decode_sync`, `_encode_sync`, `supports_sync` on `BaseCodec`; `supports_sync_io`, `read_sync`, `write_sync` on `CodecPipeline` | -| `src/zarr/experimental/sync_codecs.py` | 2 | `read_sync`, `write_sync`, `_decode_async` metadata fix | -| `src/zarr/codecs/sharding.py` | 2 | `_decode_sync`, `_encode_sync`, `_decode_partial_sync`, `_encode_partial_sync`, shard index sync codec chain | -| `src/zarr/core/array.py` | 3 | `_can_use_sync_path`, `_get_selection_sync`, `_set_selection_sync`, 10 method modifications | -| `src/zarr/codecs/gzip.py` | — | `@cached_property` for GZip instance | -| `src/zarr/codecs/blosc.py` | — | `_decode_sync`/`_encode_sync`; `_decode_single`/`_encode_single` delegate to sync | -| `src/zarr/codecs/zstd.py` | — | `_decode_sync`/`_encode_sync`; `_decode_single`/`_encode_single` delegate to sync | -| `src/zarr/codecs/bytes.py` | — | `_decode_sync`/`_encode_sync` (was `_decode_single`/`_encode_single`) | -| `src/zarr/codecs/crc32c_.py` | — | `_decode_sync`/`_encode_sync` (was `_decode_single`/`_encode_single`) | -| `src/zarr/codecs/transpose.py` | — | `_decode_sync`/`_encode_sync`; `_decode_single`/`_encode_single` delegate to sync | -| `src/zarr/codecs/vlen_utf8.py` | — | `_decode_sync`/`_encode_sync` for `VLenUTF8Codec` and `VLenBytesCodec` | - - -## Performance - -Benchmarks on MemoryStore with `SyncCodecPipeline` vs `BatchedCodecPipeline`: - -**Non-sharded arrays** (zstd compression, 100x100 float64, 32x32 chunks): -- Single-chunk read: ~2-4x faster -- Full-array read: ~2-11x faster (varies with chunk count) -- Single-chunk write: ~2-3x faster - -**Sharded arrays** (4x4 shard of 8x8 inner 
chunks, zstd, MemoryStore): -- Single-chunk read: ~1.5-2.5x faster -- Full-array read: ~1.5-2x faster -- Single-chunk write: ~1.3-1.6x faster -- Full-array write: ~1.3-1.5x faster - -The sharded speedup is smaller because the shard index decode and -per-chunk byte-range reads add overhead that wasn't present in the -non-sharded path. Still, eliminating the event loop round-trip and -`asyncio.to_thread` for each inner chunk decode provides a meaningful -improvement. - - -## Design Tradeoffs - -**Duplication of `_get_selection` / `_set_selection`**: The sync versions -(`_get_selection_sync`, `_set_selection_sync`) duplicate the setup logic -(dtype resolution, buffer creation, value coercion) from the async originals. -This is intentional: extracting shared helpers would add complexity and -indirection to the hot path for no functional benefit. The two versions -should be kept in sync manually. - -**Sequential chunk processing**: `read_sync` and `write_sync` process chunks -sequentially in a for-loop, with no parallelism. For the target use case -(MemoryStore, LocalStore), this is optimal: MemoryStore is a dict lookup -(~1us), LocalStore is a file read that benefits from OS page cache, and -Python's GIL prevents true parallelism for CPU-bound codec work anyway. The -async path with `concurrent_map` is better for remote stores where IO latency -can be overlapped. - -**`Any` type annotations**: The `read_sync` and `write_sync` methods on -`SyncCodecPipeline` use `Any` for the byte_getter/byte_setter type in the -`batch_info` tuples. This avoids modifying the `ByteGetter`/`ByteSetter` -protocols, which are public API. The runtime type is always `StorePath` (or -`_ShardingByteGetter`/`_ShardingByteSetter` for inner-shard access), which -has the sync methods; the type system just can't express this constraint -through the existing protocol hierarchy. - -**Sync sharding — sequential chunk reads**: The sync partial decode path -fetches each chunk's bytes sequentially via `byte_getter.get_sync()` with -byte-range requests. The async path can overlap these reads via -`concurrent_map`. For MemoryStore this doesn't matter (dict lookup is ~1us). -For LocalStore, OS page cache means sequential reads are fast for warm data. -For remote stores where overlapping IO would help, `supports_sync` is False -and the async path is used automatically. - -**Inline shard index codec chain**: `_decode_shard_index_sync` and -`_encode_shard_index_sync` run the index codecs (BytesCodec + Crc32cCodec) -directly rather than constructing a temporary `CodecPipeline`. This avoids -the overhead of pipeline construction for a simple two-codec chain and keeps -the sync path self-contained. diff --git a/src/zarr/abc/codec.py b/src/zarr/abc/codec.py index 78dd9add5d..d3b22ef252 100644 --- a/src/zarr/abc/codec.py +++ b/src/zarr/abc/codec.py @@ -482,10 +482,8 @@ async def write( # event loop, it overrides these methods and sets supports_sync_io # to True. This lets Array selection methods bypass sync() entirely. # - # The default implementations raise NotImplementedError, so - # BatchedCodecPipeline (the standard pipeline) is unaffected. - # - # See docs/design/sync-bypass.md for the full design rationale. + # The default implementations raise NotImplementedError. + # BatchedCodecPipeline overrides these when all codecs support sync. 
# ------------------------------------------------------------------- @property diff --git a/src/zarr/experimental/sync_codecs.py b/src/zarr/experimental/sync_codecs.py index 2e89634f1b..260875789d 100644 --- a/src/zarr/experimental/sync_codecs.py +++ b/src/zarr/experimental/sync_codecs.py @@ -14,13 +14,7 @@ from dataclasses import dataclass -from zarr.core.codec_pipeline import ( - BatchedCodecPipeline, - _CODEC_DECODE_NS_PER_BYTE, # noqa: F401 - _CODEC_ENCODE_NS_PER_BYTE, # noqa: F401 - _choose_workers, # noqa: F401 - _estimate_chunk_work_ns, # noqa: F401 -) +from zarr.core.codec_pipeline import BatchedCodecPipeline from zarr.registry import register_pipeline __all__ = ["SyncCodecPipeline"] From 284e5e22a3404051a6bbaab2255acd904d2801f1 Mon Sep 17 00:00:00 2001 From: Davis Vann Bennett Date: Thu, 19 Feb 2026 20:25:42 +0100 Subject: [PATCH 14/23] lint --- src/zarr/codecs/sharding.py | 19 ++++---- src/zarr/core/array.py | 5 +- src/zarr/core/codec_pipeline.py | 17 ++++--- tests/test_sync_codec_pipeline.py | 78 +++++++++++++++---------------- 4 files changed, 63 insertions(+), 56 deletions(-) diff --git a/src/zarr/codecs/sharding.py b/src/zarr/codecs/sharding.py index b7b8af2668..9fee51f740 100644 --- a/src/zarr/codecs/sharding.py +++ b/src/zarr/codecs/sharding.py @@ -544,7 +544,9 @@ def _encode_shard_dict_sync( if self.index_location == ShardingCodecIndexLocation.start: empty_chunks_mask = index.offsets_and_lengths[..., 0] == MAX_UINT_64 index.offsets_and_lengths[~empty_chunks_mask, 0] += len(index_bytes) - index_bytes = self._encode_shard_index_sync(index) # encode again with corrected offsets + index_bytes = self._encode_shard_index_sync( + index + ) # encode again with corrected offsets buffers.insert(0, index_bytes) else: buffers.append(index_bytes) @@ -966,10 +968,10 @@ def _decode_shard_index_sync( spec = bb.resolve_metadata(spec) # Decode: reverse bb, then ab, then reverse aa - chunk_bytes: Buffer | None = index_bytes + chunk_bytes: Buffer = index_bytes for bb_codec, s in reversed(bb_with_spec): chunk_bytes = bb_codec._decode_sync(chunk_bytes, s) - chunk_array = ab_codec._decode_sync(chunk_bytes, ab_spec) + chunk_array: NDBuffer = ab_codec._decode_sync(chunk_bytes, ab_spec) for aa_codec, s in reversed(aa_with_spec): chunk_array = aa_codec._decode_sync(chunk_array, s) @@ -984,18 +986,19 @@ def _encode_shard_index_sync(self, index: _ShardIndex) -> Buffer: aa_codecs, ab_codec, bb_codecs = codecs_from_list(list(self.index_codecs)) - chunk_array: NDBuffer | None = get_ndbuffer_class().from_numpy_array( - index.offsets_and_lengths - ) + aa_out: NDBuffer | None = get_ndbuffer_class().from_numpy_array(index.offsets_and_lengths) # Encode: aa forward, then ab, then bb forward spec = index_chunk_spec for aa_codec in aa_codecs: - chunk_array = aa_codec._encode_sync(chunk_array, spec) + assert aa_out is not None + aa_out = aa_codec._encode_sync(aa_out, spec) spec = aa_codec.resolve_metadata(spec) - chunk_bytes = ab_codec._encode_sync(chunk_array, spec) + assert aa_out is not None + chunk_bytes = ab_codec._encode_sync(aa_out, spec) spec = ab_codec.resolve_metadata(spec) for bb_codec in bb_codecs: + assert chunk_bytes is not None chunk_bytes = bb_codec._encode_sync(chunk_bytes, spec) spec = bb_codec.resolve_metadata(spec) diff --git a/src/zarr/core/array.py b/src/zarr/core/array.py index 66b146fea0..2d1276a888 100644 --- a/src/zarr/core/array.py +++ b/src/zarr/core/array.py @@ -1993,9 +1993,8 @@ def _can_use_sync_path(self) -> bool: """ pipeline = self.async_array.codec_pipeline store_path = 
self.async_array.store_path - return ( - getattr(pipeline, "supports_sync_io", False) - and getattr(store_path, "supports_sync", False) + return getattr(pipeline, "supports_sync_io", False) and getattr( + store_path, "supports_sync", False ) @classmethod diff --git a/src/zarr/core/codec_pipeline.py b/src/zarr/core/codec_pipeline.py index c3a291dd41..a75f673027 100644 --- a/src/zarr/core/codec_pipeline.py +++ b/src/zarr/core/codec_pipeline.py @@ -322,15 +322,22 @@ def _encode_one( return None spec = chunk_spec + aa_out: NDBuffer | None = chunk_array for aa_codec in self.array_array_codecs: - chunk_array = aa_codec._encode_sync(chunk_array, spec) + if aa_out is None: + return None + aa_out = aa_codec._encode_sync(aa_out, spec) spec = aa_codec.resolve_metadata(spec) - chunk_bytes = self.array_bytes_codec._encode_sync(chunk_array, spec) + if aa_out is None: + return None + chunk_bytes = self.array_bytes_codec._encode_sync(aa_out, spec) spec = self.array_bytes_codec.resolve_metadata(spec) for bb_codec in self.bytes_bytes_codecs: + if chunk_bytes is None: + return None chunk_bytes = bb_codec._encode_sync(chunk_bytes, spec) spec = bb_codec.resolve_metadata(spec) @@ -694,7 +701,7 @@ async def _write_batch_compute( def _merge_and_filter( self, chunk_array_decoded: Iterable[NDBuffer | None], - batch_info: list, + batch_info: list[tuple[Any, ArraySpec, SelectorTuple, SelectorTuple, bool]], value: NDBuffer, drop_axes: tuple[int, ...], ) -> list[NDBuffer | None]: @@ -717,9 +724,7 @@ def _merge_and_filter( ) in zip(chunk_array_decoded, batch_info, strict=False) ] chunk_array_batch: list[NDBuffer | None] = [] - for chunk_array, (_, chunk_spec, *_) in zip( - chunk_array_merged, batch_info, strict=False - ): + for chunk_array, (_, chunk_spec, *_) in zip(chunk_array_merged, batch_info, strict=False): if chunk_array is None: chunk_array_batch.append(None) # type: ignore[unreachable] else: diff --git a/tests/test_sync_codec_pipeline.py b/tests/test_sync_codec_pipeline.py index 661dc25f36..cbfc7867da 100644 --- a/tests/test_sync_codec_pipeline.py +++ b/tests/test_sync_codec_pipeline.py @@ -2,6 +2,8 @@ from __future__ import annotations +from typing import Any + import numpy as np import pytest @@ -17,9 +19,7 @@ from zarr.storage import MemoryStore -def _make_array_spec( - shape: tuple[int, ...], dtype: np.dtype -) -> ArraySpec: +def _make_array_spec(shape: tuple[int, ...], dtype: np.dtype[Any]) -> ArraySpec: zdtype = get_data_type_from_native_dtype(dtype) return ArraySpec( shape=shape, @@ -30,7 +30,7 @@ def _make_array_spec( ) -def _make_nd_buffer(arr: np.ndarray) -> zarr.core.buffer.NDBuffer: +def _make_nd_buffer(arr: np.ndarray[Any, Any]) -> zarr.core.buffer.NDBuffer: return default_buffer_prototype().nd_buffer.from_numpy_array(arr) @@ -40,19 +40,19 @@ def _make_nd_buffer(arr: np.ndarray) -> zarr.core.buffer.NDBuffer: class TestSupportsSync: - def test_gzip_supports_sync(self): + def test_gzip_supports_sync(self) -> None: assert GzipCodec().supports_sync - def test_zstd_supports_sync(self): + def test_zstd_supports_sync(self) -> None: assert ZstdCodec().supports_sync - def test_bytes_supports_sync(self): + def test_bytes_supports_sync(self) -> None: assert BytesCodec().supports_sync - def test_transpose_supports_sync(self): + def test_transpose_supports_sync(self) -> None: assert TransposeCodec(order=(0, 1)).supports_sync - def test_sharding_supports_sync(self): + def test_sharding_supports_sync(self) -> None: from zarr.codecs.sharding import ShardingCodec assert ShardingCodec(chunk_shape=(8,)).supports_sync 
@@ -64,7 +64,7 @@ def test_sharding_supports_sync(self): class TestGzipCodecSync: - def test_roundtrip(self): + def test_roundtrip(self) -> None: codec = GzipCodec(level=1) arr = np.arange(100, dtype="float64") spec = _make_array_spec(arr.shape, arr.dtype) @@ -78,7 +78,7 @@ def test_roundtrip(self): class TestZstdCodecSync: - def test_roundtrip(self): + def test_roundtrip(self) -> None: codec = ZstdCodec(level=1) arr = np.arange(100, dtype="float64") spec = _make_array_spec(arr.shape, arr.dtype) @@ -92,7 +92,7 @@ def test_roundtrip(self): class TestBytesCodecSync: - def test_roundtrip(self): + def test_roundtrip(self) -> None: codec = BytesCodec() arr = np.arange(100, dtype="float64") spec = _make_array_spec(arr.shape, arr.dtype) @@ -108,7 +108,7 @@ def test_roundtrip(self): class TestTransposeCodecSync: - def test_roundtrip(self): + def test_roundtrip(self) -> None: codec = TransposeCodec(order=(1, 0)) arr = np.arange(12, dtype="float64").reshape(3, 4) spec = _make_array_spec(arr.shape, arr.dtype) @@ -127,29 +127,31 @@ def test_roundtrip(self): class TestPipelineConstruction: - def test_from_codecs_valid(self): + def test_from_codecs_valid(self) -> None: pipeline = BatchedCodecPipeline.from_codecs([BytesCodec(), GzipCodec(level=1)]) assert isinstance(pipeline, BatchedCodecPipeline) assert len(pipeline.bytes_bytes_codecs) == 1 assert isinstance(pipeline.array_bytes_codec, BytesCodec) - def test_from_codecs_accepts_sharding(self): + def test_from_codecs_accepts_sharding(self) -> None: from zarr.codecs.sharding import ShardingCodec pipeline = BatchedCodecPipeline.from_codecs([ShardingCodec(chunk_shape=(8,))]) assert isinstance(pipeline, BatchedCodecPipeline) assert pipeline._all_sync - def test_from_codecs_rejects_missing_array_bytes(self): + def test_from_codecs_rejects_missing_array_bytes(self) -> None: with pytest.raises(ValueError, match="Required ArrayBytesCodec"): BatchedCodecPipeline.from_codecs([GzipCodec()]) - def test_from_codecs_with_transpose(self): - pipeline = BatchedCodecPipeline.from_codecs([ - TransposeCodec(order=(1, 0)), - BytesCodec(), - GzipCodec(level=1), - ]) + def test_from_codecs_with_transpose(self) -> None: + pipeline = BatchedCodecPipeline.from_codecs( + [ + TransposeCodec(order=(1, 0)), + BytesCodec(), + GzipCodec(level=1), + ] + ) assert len(pipeline.array_array_codecs) == 1 assert isinstance(pipeline.array_array_codecs[0], TransposeCodec) @@ -161,7 +163,7 @@ def test_from_codecs_with_transpose(self): class TestPipelineRoundtrip: @pytest.mark.asyncio - async def test_encode_decode_single_chunk(self): + async def test_encode_decode_single_chunk(self) -> None: pipeline = BatchedCodecPipeline.from_codecs([BytesCodec(), GzipCodec(level=1)]) arr = np.random.default_rng(42).standard_normal((32, 32)).astype("float64") spec = _make_array_spec(arr.shape, arr.dtype) @@ -169,13 +171,13 @@ async def test_encode_decode_single_chunk(self): nd_buf = _make_nd_buffer(arr) encoded = await pipeline.encode([(nd_buf, spec)]) - decoded = await pipeline.decode([(list(encoded)[0], spec)]) - result = list(decoded)[0] + decoded = await pipeline.decode([(next(iter(encoded)), spec)]) + result = next(iter(decoded)) assert result is not None np.testing.assert_array_equal(arr, result.as_numpy_array()) @pytest.mark.asyncio - async def test_encode_decode_multiple_chunks(self): + async def test_encode_decode_multiple_chunks(self) -> None: pipeline = BatchedCodecPipeline.from_codecs([BytesCodec(), GzipCodec(level=1)]) rng = np.random.default_rng(42) spec = _make_array_spec((16, 16), 
np.dtype("float64")) @@ -185,12 +187,12 @@ async def test_encode_decode_multiple_chunks(self): encoded = list(await pipeline.encode([(buf, spec) for buf in nd_bufs])) decoded = list(await pipeline.decode([(enc, spec) for enc in encoded])) - for original, dec in zip(chunks, decoded): + for original, dec in zip(chunks, decoded, strict=False): assert dec is not None np.testing.assert_array_equal(original, dec.as_numpy_array()) @pytest.mark.asyncio - async def test_encode_decode_empty_batch(self): + async def test_encode_decode_empty_batch(self) -> None: pipeline = BatchedCodecPipeline.from_codecs([BytesCodec(), GzipCodec(level=1)]) encoded = await pipeline.encode([]) assert list(encoded) == [] @@ -198,7 +200,7 @@ async def test_encode_decode_empty_batch(self): assert list(decoded) == [] @pytest.mark.asyncio - async def test_encode_decode_none_chunk(self): + async def test_encode_decode_none_chunk(self) -> None: pipeline = BatchedCodecPipeline.from_codecs([BytesCodec(), GzipCodec(level=1)]) spec = _make_array_spec((8,), np.dtype("float64")) pipeline = pipeline.evolve_from_array_spec(spec) @@ -216,7 +218,7 @@ async def test_encode_decode_none_chunk(self): class TestDefaultPipelineSync: - def test_create_array_uses_batched_pipeline(self): + def test_create_array_uses_batched_pipeline(self) -> None: store = MemoryStore() arr = zarr.create_array( store, @@ -230,7 +232,7 @@ def test_create_array_uses_batched_pipeline(self): arr[:] = data np.testing.assert_array_equal(arr[:], data) - def test_open_uses_batched_pipeline(self): + def test_open_uses_batched_pipeline(self) -> None: store = MemoryStore() arr = zarr.create_array( store, @@ -245,7 +247,7 @@ def test_open_uses_batched_pipeline(self): assert isinstance(arr2.async_array.codec_pipeline, BatchedCodecPipeline) np.testing.assert_array_equal(arr2[:], data) - def test_from_array_uses_batched_pipeline(self): + def test_from_array_uses_batched_pipeline(self) -> None: store1 = MemoryStore() arr1 = zarr.create_array( store1, @@ -261,7 +263,7 @@ def test_from_array_uses_batched_pipeline(self): assert isinstance(arr2.async_array.codec_pipeline, BatchedCodecPipeline) np.testing.assert_array_equal(arr2[:], data) - def test_partial_write(self): + def test_partial_write(self) -> None: store = MemoryStore() arr = zarr.create_array( store, @@ -276,7 +278,7 @@ def test_partial_write(self): expected[5:15] = np.arange(10, dtype="int32") + 1 np.testing.assert_array_equal(result, expected) - def test_zstd_codec(self): + def test_zstd_codec(self) -> None: store = MemoryStore() arr = zarr.create_array( store, @@ -289,18 +291,16 @@ def test_zstd_codec(self): arr[:] = data np.testing.assert_array_equal(arr[:], data) - def test_supports_sync_io(self): + def test_supports_sync_io(self) -> None: """Default pipeline supports sync IO when all codecs are sync.""" pipeline = BatchedCodecPipeline.from_codecs([BytesCodec(), GzipCodec(level=1)]) assert pipeline.supports_sync_io - def test_config_switch_to_sync_pipeline_compat(self): + def test_config_switch_to_sync_pipeline_compat(self) -> None: """Verify backwards compat: SyncCodecPipeline config path still works.""" from zarr.experimental.sync_codecs import SyncCodecPipeline - zarr.config.set( - {"codec_pipeline.path": "zarr.experimental.sync_codecs.SyncCodecPipeline"} - ) + zarr.config.set({"codec_pipeline.path": "zarr.experimental.sync_codecs.SyncCodecPipeline"}) try: store = MemoryStore() arr = zarr.create_array(store, shape=(10,), dtype="float64") From b1b876adba39028827c18f1eaa91659b8106c5ac Mon Sep 17 00:00:00 2001 From: 
Davis Vann Bennett Date: Fri, 20 Feb 2026 14:35:56 +0100 Subject: [PATCH 15/23] use protocols for new sync behavior --- src/zarr/abc/codec.py | 33 +++-- src/zarr/abc/store.py | 82 ++++------- src/zarr/codecs/sharding.py | 29 ++-- src/zarr/core/array.py | 14 +- src/zarr/core/codec_pipeline.py | 232 +++++++++++++++++++++--------- src/zarr/storage/_common.py | 19 +-- src/zarr/storage/_local.py | 4 - src/zarr/storage/_memory.py | 4 - tests/test_sync_codec_pipeline.py | 13 +- 9 files changed, 242 insertions(+), 188 deletions(-) diff --git a/src/zarr/abc/codec.py b/src/zarr/abc/codec.py index d3b22ef252..b7271a13ef 100644 --- a/src/zarr/abc/codec.py +++ b/src/zarr/abc/codec.py @@ -2,7 +2,7 @@ from abc import abstractmethod from collections.abc import Mapping -from typing import TYPE_CHECKING, Generic, TypeGuard, TypeVar +from typing import TYPE_CHECKING, Generic, Protocol, TypeGuard, TypeVar, runtime_checkable from typing_extensions import ReadOnly, TypedDict @@ -32,6 +32,7 @@ "CodecInput", "CodecOutput", "CodecPipeline", + "SupportsSyncCodec", ] CodecInput = TypeVar("CodecInput", bound=NDBuffer | Buffer) @@ -59,6 +60,19 @@ def _check_codecjson_v2(data: object) -> TypeGuard[CodecJSON_V2[str]]: """The widest type of JSON-like input that could specify a codec.""" +@runtime_checkable +class SupportsSyncCodec(Protocol): + """Protocol for codecs that support synchronous encode/decode.""" + + def _decode_sync( + self, chunk_data: NDBuffer | Buffer, chunk_spec: ArraySpec + ) -> NDBuffer | Buffer: ... + + def _encode_sync( + self, chunk_data: NDBuffer | Buffer, chunk_spec: ArraySpec + ) -> NDBuffer | Buffer | None: ... + + class BaseCodec(Metadata, Generic[CodecInput, CodecOutput]): """Generic base class for codecs. @@ -137,21 +151,6 @@ def validate( The array chunk grid """ - def _decode_sync(self, chunk_data: CodecOutput, chunk_spec: ArraySpec) -> CodecInput: - """Synchronously decode a single chunk. Override in subclasses to enable - sync codec pipeline support.""" - raise NotImplementedError # pragma: no cover - - def _encode_sync(self, chunk_data: CodecInput, chunk_spec: ArraySpec) -> CodecOutput | None: - """Synchronously encode a single chunk. Override in subclasses to enable - sync codec pipeline support.""" - raise NotImplementedError # pragma: no cover - - @property - def supports_sync(self) -> bool: - """Whether this codec has synchronous encode/decode implementations.""" - return type(self)._decode_sync is not BaseCodec._decode_sync - async def _decode_single(self, chunk_data: CodecOutput, chunk_spec: ArraySpec) -> CodecInput: raise NotImplementedError # pragma: no cover @@ -491,7 +490,7 @@ def supports_sync_io(self) -> bool: """Whether this pipeline can run read/write entirely on the calling thread. 
        True when:

-        - All codecs support synchronous encode/decode (_decode_sync/_encode_sync)
+        - All codecs implement ``SupportsSyncCodec``
        - The pipeline's read_sync/write_sync methods are implemented

        Checked by ``Array._can_use_sync_path()`` to decide whether to bypass
diff --git a/src/zarr/abc/store.py b/src/zarr/abc/store.py
index 8625c33536..104c63a6be 100644
--- a/src/zarr/abc/store.py
+++ b/src/zarr/abc/store.py
@@ -16,7 +16,14 @@
 from zarr.core.buffer import Buffer, BufferPrototype

-__all__ = ["ByteGetter", "ByteSetter", "Store", "set_or_delete"]
+__all__ = [
+    "ByteGetter",
+    "ByteSetter",
+    "Store",
+    "SyncByteGetter",
+    "SyncByteSetter",
+    "set_or_delete",
+]


 @dataclass
@@ -524,61 +531,6 @@ def supports_partial_writes(self) -> Literal[False]:
         """
         return False

-    # -----------------------------------------------------------------------
-    # Synchronous IO interface (opt-in)
-    #
-    # These methods enable the codec pipeline to bypass the event loop
-    # entirely for store IO. The default implementations raise
-    # NotImplementedError; stores that wrap fundamentally synchronous
-    # operations (MemoryStore, LocalStore) override them with direct
-    # implementations. Remote/cloud stores (FsspecStore) leave them as-is
-    # and remain async-only.
-    # -----------------------------------------------------------------------
-
-    @property
-    def supports_sync(self) -> bool:
-        """Whether this store has native synchronous get/set/delete methods.
-
-        When True, the codec pipeline's ``read_sync`` / ``write_sync`` will
-        call ``get_sync`` / ``set_sync`` / ``delete_sync`` directly on the
-        calling thread, avoiding the event loop overhead of the async
-        equivalents.
-
-        Subclasses that override the sync methods below should also override
-        this property to return True.
-        """
-        return False
-
-    def get_sync(
-        self,
-        key: str,
-        prototype: BufferPrototype,
-        byte_range: ByteRequest | None = None,
-    ) -> Buffer | None:
-        """Synchronous version of ``get()``.
-
-        Called by the codec pipeline's ``read_sync`` to fetch chunk bytes without
-        going through the event loop. Only called when ``supports_sync`` is
-        True, so the default ``NotImplementedError`` is never hit in practice.
-        """
-        raise NotImplementedError
-
-    def set_sync(self, key: str, value: Buffer) -> None:
-        """Synchronous version of ``set()``.
-
-        Called by the codec pipeline's ``write_sync`` to persist encoded chunk
-        bytes without going through the event loop.
-        """
-        raise NotImplementedError
-
-    def delete_sync(self, key: str) -> None:
-        """Synchronous version of ``delete()``.
-
-        Called by the codec pipeline's ``write_sync`` when a chunk should be
-        removed (e.g. an empty chunk with ``write_empty_chunks=False``).
-        """
-        raise NotImplementedError
-
     @property
     @abstractmethod
     def supports_listing(self) -> bool:
@@ -755,6 +707,24 @@ async def delete(self) -> None: ...
     async def set_if_not_exists(self, default: Buffer) -> None: ...


+@runtime_checkable
+class SyncByteGetter(Protocol):
+    """Protocol for byte getters (e.g. ``StorePath``) that support synchronous reads."""
+
+    def get_sync(
+        self, prototype: BufferPrototype, byte_range: ByteRequest | None = None
+    ) -> Buffer | None: ...
+
+
+@runtime_checkable
+class SyncByteSetter(SyncByteGetter, Protocol):
+    """Protocol for byte setters (e.g. ``StorePath``) with synchronous read, write, and delete."""
+
+    def set_sync(self, value: Buffer) -> None: ...
+
+    def delete_sync(self) -> None: ...
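+
+# Usage sketch (illustrative): both protocols are ``runtime_checkable``, so
+# callers can branch on structural support at runtime, e.g.:
+#
+#     if isinstance(byte_setter, SyncByteSetter):
+#         byte_setter.set_sync(buf)   # direct call, no event loop
+#     else:
+#         await byte_setter.set(buf)  # async fallback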
+ + async def set_or_delete(byte_setter: ByteSetter, value: Buffer | None) -> None: """Set or delete a value in a byte setter diff --git a/src/zarr/codecs/sharding.py b/src/zarr/codecs/sharding.py index 9fee51f740..9f5a83deb4 100644 --- a/src/zarr/codecs/sharding.py +++ b/src/zarr/codecs/sharding.py @@ -16,6 +16,7 @@ ArrayBytesCodecPartialEncodeMixin, Codec, CodecPipeline, + SupportsSyncCodec, ) from zarr.abc.store import ( ByteGetter, @@ -968,15 +969,15 @@ def _decode_shard_index_sync( spec = bb.resolve_metadata(spec) # Decode: reverse bb, then ab, then reverse aa - chunk_bytes: Buffer = index_bytes + bb_out: Any = index_bytes for bb_codec, s in reversed(bb_with_spec): - chunk_bytes = bb_codec._decode_sync(chunk_bytes, s) - chunk_array: NDBuffer = ab_codec._decode_sync(chunk_bytes, ab_spec) + bb_out = cast("SupportsSyncCodec", bb_codec)._decode_sync(bb_out, s) + ab_out: Any = cast("SupportsSyncCodec", ab_codec)._decode_sync(bb_out, ab_spec) for aa_codec, s in reversed(aa_with_spec): - chunk_array = aa_codec._decode_sync(chunk_array, s) + ab_out = cast("SupportsSyncCodec", aa_codec)._decode_sync(ab_out, s) - assert chunk_array is not None - return _ShardIndex(chunk_array.as_numpy_array()) + assert ab_out is not None + return _ShardIndex(ab_out.as_numpy_array()) def _encode_shard_index_sync(self, index: _ShardIndex) -> Buffer: """Encode shard index synchronously by running index codecs inline.""" @@ -986,25 +987,25 @@ def _encode_shard_index_sync(self, index: _ShardIndex) -> Buffer: aa_codecs, ab_codec, bb_codecs = codecs_from_list(list(self.index_codecs)) - aa_out: NDBuffer | None = get_ndbuffer_class().from_numpy_array(index.offsets_and_lengths) + aa_out: Any = get_ndbuffer_class().from_numpy_array(index.offsets_and_lengths) # Encode: aa forward, then ab, then bb forward spec = index_chunk_spec for aa_codec in aa_codecs: assert aa_out is not None - aa_out = aa_codec._encode_sync(aa_out, spec) + aa_out = cast("SupportsSyncCodec", aa_codec)._encode_sync(aa_out, spec) spec = aa_codec.resolve_metadata(spec) assert aa_out is not None - chunk_bytes = ab_codec._encode_sync(aa_out, spec) + bb_out: Any = cast("SupportsSyncCodec", ab_codec)._encode_sync(aa_out, spec) spec = ab_codec.resolve_metadata(spec) for bb_codec in bb_codecs: - assert chunk_bytes is not None - chunk_bytes = bb_codec._encode_sync(chunk_bytes, spec) + assert bb_out is not None + bb_out = cast("SupportsSyncCodec", bb_codec)._encode_sync(bb_out, spec) spec = bb_codec.resolve_metadata(spec) - assert chunk_bytes is not None - assert isinstance(chunk_bytes, Buffer) - return chunk_bytes + assert bb_out is not None + assert isinstance(bb_out, Buffer) + return bb_out async def _decode_shard_index( self, index_bytes: Buffer, chunks_per_shard: tuple[int, ...] diff --git a/src/zarr/core/array.py b/src/zarr/core/array.py index 2d1276a888..853ff6434a 100644 --- a/src/zarr/core/array.py +++ b/src/zarr/core/array.py @@ -1978,12 +1978,12 @@ def _can_use_sync_path(self) -> bool: Two conditions must hold: - 1. The codec pipeline supports fully synchronous IO (all codecs in - the chain have _decode_sync/_encode_sync). This is True for + 1. The codec pipeline supports fully synchronous IO (all codecs + implement ``SupportsSyncCodec``). This is True for BatchedCodecPipeline when all codecs support sync. - 2. The store supports synchronous operations (MemoryStore, LocalStore). - Remote stores like FsspecStore remain async-only. + 2. The store supports synchronous operations (has ``get_sync``). 
+ MemoryStore and LocalStore provide this; remote stores do not. When both hold, the selection methods below call _get_selection_sync / _set_selection_sync directly, running the @@ -1992,10 +1992,8 @@ def _can_use_sync_path(self) -> bool: is used automatically. """ pipeline = self.async_array.codec_pipeline - store_path = self.async_array.store_path - return getattr(pipeline, "supports_sync_io", False) and getattr( - store_path, "supports_sync", False - ) + store = self.async_array.store_path.store + return getattr(pipeline, "supports_sync_io", False) and hasattr(store, "get_sync") @classmethod @deprecated("Use zarr.create_array instead.", category=ZarrDeprecationWarning) diff --git a/src/zarr/core/codec_pipeline.py b/src/zarr/core/codec_pipeline.py index a75f673027..26ea93784c 100644 --- a/src/zarr/core/codec_pipeline.py +++ b/src/zarr/core/codec_pipeline.py @@ -4,7 +4,7 @@ from concurrent.futures import ThreadPoolExecutor from dataclasses import dataclass from itertools import islice, pairwise -from typing import TYPE_CHECKING, Any, TypeVar +from typing import TYPE_CHECKING, Any, TypeVar, cast from warnings import warn from zarr.abc.codec import ( @@ -15,6 +15,7 @@ BytesBytesCodec, Codec, CodecPipeline, + SupportsSyncCodec, ) from zarr.core.common import concurrent_map, product from zarr.core.config import config @@ -211,8 +212,8 @@ class BatchedCodecPipeline(CodecPipeline): @property def _all_sync(self) -> bool: - """True when every codec in the chain supports synchronous dispatch.""" - return all(c.supports_sync for c in self) + """True when every codec in the chain implements SupportsSyncCodec.""" + return all(isinstance(c, SupportsSyncCodec) for c in self) def evolve_from_array_spec(self, array_spec: ArraySpec) -> Self: return type(self).from_codecs(c.evolve_from_array_spec(array_spec=array_spec) for c in self) @@ -297,51 +298,62 @@ def _decode_one( ab_pair: tuple[ArrayBytesCodec, ArraySpec], bb_chain: list[tuple[BytesBytesCodec, ArraySpec]], ) -> NDBuffer | None: - """Decode a single chunk through the full codec chain, synchronously.""" + """Decode a single chunk through the full codec chain, synchronously. + + Only called when ``_all_sync`` is True, so every codec implements + ``SupportsSyncCodec``. + """ if chunk_bytes is None: return None + # Use Any to avoid verbose casts on every codec call — we know + # all codecs satisfy SupportsSyncCodec because _all_sync is True. + bb_out: Any = chunk_bytes for bb_codec, spec in reversed(bb_chain): - chunk_bytes = bb_codec._decode_sync(chunk_bytes, spec) + bb_out = cast("SupportsSyncCodec", bb_codec)._decode_sync(bb_out, spec) ab_codec, ab_spec = ab_pair - chunk_array = ab_codec._decode_sync(chunk_bytes, ab_spec) + ab_out: Any = cast("SupportsSyncCodec", ab_codec)._decode_sync(bb_out, ab_spec) for aa_codec, spec in reversed(aa_chain): - chunk_array = aa_codec._decode_sync(chunk_array, spec) + ab_out = cast("SupportsSyncCodec", aa_codec)._decode_sync(ab_out, spec) - return chunk_array + return ab_out # type: ignore[no-any-return] def _encode_one( self, chunk_array: NDBuffer | None, chunk_spec: ArraySpec, ) -> Buffer | None: - """Encode a single chunk through the full codec chain, synchronously.""" + """Encode a single chunk through the full codec chain, synchronously. + + Only called when ``_all_sync`` is True, so every codec implements + ``SupportsSyncCodec``. 
+ """ if chunk_array is None: return None spec = chunk_spec - aa_out: NDBuffer | None = chunk_array + aa_out: Any = chunk_array for aa_codec in self.array_array_codecs: if aa_out is None: return None - aa_out = aa_codec._encode_sync(aa_out, spec) + aa_out = cast("SupportsSyncCodec", aa_codec)._encode_sync(aa_out, spec) spec = aa_codec.resolve_metadata(spec) if aa_out is None: return None - chunk_bytes = self.array_bytes_codec._encode_sync(aa_out, spec) + bb_out: Any = cast("SupportsSyncCodec", self.array_bytes_codec)._encode_sync(aa_out, spec) spec = self.array_bytes_codec.resolve_metadata(spec) for bb_codec in self.bytes_bytes_codecs: - if chunk_bytes is None: + if bb_out is None: return None - chunk_bytes = bb_codec._encode_sync(chunk_bytes, spec) + bb_out = cast("SupportsSyncCodec", bb_codec)._encode_sync(bb_out, spec) spec = bb_codec.resolve_metadata(spec) - return chunk_bytes + return bb_out # type: ignore[no-any-return] # ------------------------------------------------------------------- # Batched async decode/encode (layer-by-layer across all chunks) @@ -517,21 +529,62 @@ async def read_batch( out[out_selection] = fill_value_or_default(chunk_spec) return - # Phase 1: IO -- fetch bytes from store (always async) - chunk_bytes_batch = await concurrent_map( - [(byte_getter, array_spec.prototype) for byte_getter, array_spec, *_ in batch_info], - lambda byte_getter, prototype: byte_getter.get(prototype), - config.get("async.concurrency"), - ) + if self._all_sync: + # Streaming per-chunk pipeline: each chunk flows through + # fetch → decode → scatter as a single task. Running N tasks + # concurrently overlaps IO with codec compute. + _, first_spec, *_ = batch_info[0] + aa_chain, ab_pair, bb_chain = self._resolve_metadata_chain(first_spec) - # Phase 2: Compute -- decode + scatter - decode_items = [ - (chunk_bytes, chunk_spec) - for chunk_bytes, (_, chunk_spec, *_) in zip(chunk_bytes_batch, batch_info, strict=False) - ] + async def _read_chunk( + byte_getter: ByteGetter, + chunk_spec: ArraySpec, + chunk_selection: SelectorTuple, + out_selection: SelectorTuple, + ) -> None: + # 1) Fetch + chunk_bytes = await byte_getter.get(prototype=chunk_spec.prototype) + + # 2) Decode (full chain, sync) + chunk_array = self._decode_one(chunk_bytes, chunk_spec, aa_chain, ab_pair, bb_chain) + + # 3) Scatter + if chunk_array is not None: + tmp = chunk_array[chunk_selection] + if drop_axes != (): + tmp = tmp.squeeze(axis=drop_axes) + out[out_selection] = tmp + else: + out[out_selection] = fill_value_or_default(chunk_spec) + + await concurrent_map( + [ + (byte_getter, chunk_spec, chunk_selection, out_selection) + for byte_getter, chunk_spec, chunk_selection, out_selection, _ in batch_info + ], + _read_chunk, + config.get("async.concurrency"), + ) + else: + # Async fallback: fetch all → decode all (async codec API) → scatter. + # Used for codecs that don't implement _decode_sync (e.g. numcodecs). 
+ + async def _fetch(byte_getter: ByteGetter, prototype: BufferPrototype) -> Buffer | None: + return await byte_getter.get(prototype=prototype) - chunk_array_batch_decoded: Iterable[NDBuffer | None] = await self.decode(decode_items) - self._scatter(chunk_array_batch_decoded, batch_info, out, drop_axes) + chunk_bytes_batch = await concurrent_map( + [(byte_getter, chunk_spec.prototype) for byte_getter, chunk_spec, *_ in batch_info], + _fetch, + config.get("async.concurrency"), + ) + chunk_array_batch = await self.decode_batch( + zip( + chunk_bytes_batch, + [chunk_spec for _, chunk_spec, *_ in batch_info], + strict=False, + ) + ) + self._scatter(chunk_array_batch, batch_info, out, drop_axes) @staticmethod def _scatter( @@ -630,50 +683,95 @@ async def write_batch( ) return - # Phase 1: IO -- read existing bytes for non-complete chunks - async def _read_key( - byte_setter: ByteSetter | None, prototype: BufferPrototype - ) -> Buffer | None: - if byte_setter is None: - return None - return await byte_setter.get(prototype=prototype) - - chunk_bytes_batch: list[Buffer | None] - chunk_bytes_batch = await concurrent_map( - [ - ( - None if is_complete_chunk else byte_setter, - chunk_spec.prototype, + if self._all_sync: + # Streaming per-chunk pipeline: each chunk flows through + # read_existing → decode → merge → encode → write as a single + # task. Running N tasks concurrently overlaps IO with compute. + async def _write_chunk( + byte_setter: ByteSetter, + chunk_spec: ArraySpec, + chunk_selection: SelectorTuple, + out_selection: SelectorTuple, + is_complete_chunk: bool, + ) -> None: + # 1) Read existing chunk (for partial writes) + existing_bytes: Buffer | None = None + if not is_complete_chunk: + existing_bytes = await byte_setter.get(prototype=chunk_spec.prototype) + + # 2) Compute: decode existing, merge, encode + chunk_bytes = self._write_chunk_compute( + existing_bytes, + chunk_spec, + chunk_selection, + out_selection, + is_complete_chunk, + value, + drop_axes, ) - for byte_setter, chunk_spec, chunk_selection, _, is_complete_chunk in batch_info - ], - _read_key, - config.get("async.concurrency"), - ) - # Phase 2: Compute -- decode, merge, encode - decode_items = [ - (chunk_bytes, chunk_spec) - for chunk_bytes, (_, chunk_spec, *_) in zip(chunk_bytes_batch, batch_info, strict=False) - ] + # 3) Write result + if chunk_bytes is _DELETED or chunk_bytes is None: + await byte_setter.delete() + else: + await byte_setter.set(chunk_bytes) # type: ignore[arg-type] - encoded_batch = await self._write_batch_compute(decode_items, batch_info, value, drop_axes) + await concurrent_map( + [ + (byte_setter, chunk_spec, chunk_selection, out_selection, is_complete_chunk) + for byte_setter, chunk_spec, chunk_selection, out_selection, is_complete_chunk in batch_info + ], + _write_chunk, + config.get("async.concurrency"), + ) + else: + # Async fallback: phased approach for codecs without sync support. + # Phase 1: Fetch existing chunks for partial writes. 
- # Phase 3: IO -- write to store - async def _write_key(byte_setter: ByteSetter, chunk_bytes: Buffer | None) -> None: - if chunk_bytes is None: - await byte_setter.delete() - else: - await byte_setter.set(chunk_bytes) + async def _fetch_existing( + byte_setter: ByteSetter, chunk_spec: ArraySpec, is_complete_chunk: bool + ) -> Buffer | None: + if is_complete_chunk: + return None + return await byte_setter.get(prototype=chunk_spec.prototype) - await concurrent_map( - [ - (byte_setter, chunk_bytes) - for chunk_bytes, (byte_setter, *_) in zip(encoded_batch, batch_info, strict=False) - ], - _write_key, - config.get("async.concurrency"), - ) + existing_bytes_list: list[Buffer | None] = await concurrent_map( + [ + (byte_setter, chunk_spec, is_complete_chunk) + for byte_setter, chunk_spec, _, _, is_complete_chunk in batch_info + ], + _fetch_existing, + config.get("async.concurrency"), + ) + + # Phase 2: Decode → merge → encode (async codec API). + decode_items: list[tuple[Buffer | None, ArraySpec]] = [ + (existing_bytes if not is_complete_chunk else None, chunk_spec) + for existing_bytes, (_, chunk_spec, _, _, is_complete_chunk) in zip( + existing_bytes_list, batch_info, strict=False + ) + ] + encoded_list = await self._write_batch_compute( + decode_items, batch_info, value, drop_axes + ) + + # Phase 3: Write encoded chunks to store. + async def _write_out(byte_setter: ByteSetter, chunk_bytes: Buffer | None) -> None: + if chunk_bytes is None: + await byte_setter.delete() + else: + await byte_setter.set(chunk_bytes) + + await concurrent_map( + [ + (byte_setter, chunk_bytes) + for (byte_setter, *_), chunk_bytes in zip( + batch_info, encoded_list, strict=False + ) + ], + _write_out, + config.get("async.concurrency"), + ) async def _write_batch_compute( self, diff --git a/src/zarr/storage/_common.py b/src/zarr/storage/_common.py index c57a717025..15a9b7846d 100644 --- a/src/zarr/storage/_common.py +++ b/src/zarr/storage/_common.py @@ -237,17 +237,12 @@ async def is_empty(self) -> bool: # self.path to the key and delegate to the underlying Store's sync # methods. # - # Note: The ByteGetter / ByteSetter protocols only define async - # methods. The sync pipeline uses `Any` type annotations to call - # these methods at runtime. See docs/design/sync-bypass.md for why - # we chose not to modify the protocols. + # Note: These methods satisfy the SyncByteGetter / SyncByteSetter + # protocols (from zarr.abc.store) only when the underlying Store + # also has get_sync / set_sync / delete_sync. Callers check the + # store before invoking these. 
# ------------------------------------------------------------------- - @property - def supports_sync(self) -> bool: - """Whether the underlying store supports synchronous operations.""" - return self.store.supports_sync - def get_sync( self, prototype: BufferPrototype | None = None, @@ -256,15 +251,15 @@ def get_sync( """Synchronous read — delegates to ``self.store.get_sync(self.path, ...)``.""" if prototype is None: prototype = default_buffer_prototype() - return self.store.get_sync(self.path, prototype=prototype, byte_range=byte_range) + return self.store.get_sync(self.path, prototype=prototype, byte_range=byte_range) # type: ignore[attr-defined, no-any-return] def set_sync(self, value: Buffer) -> None: """Synchronous write — delegates to ``self.store.set_sync(self.path, value)``.""" - self.store.set_sync(self.path, value) + self.store.set_sync(self.path, value) # type: ignore[attr-defined] def delete_sync(self) -> None: """Synchronous delete — delegates to ``self.store.delete_sync(self.path)``.""" - self.store.delete_sync(self.path) + self.store.delete_sync(self.path) # type: ignore[attr-defined] def __truediv__(self, other: str) -> StorePath: """Combine this store path with another path""" diff --git a/src/zarr/storage/_local.py b/src/zarr/storage/_local.py index 2296879cb2..28fea7ca0e 100644 --- a/src/zarr/storage/_local.py +++ b/src/zarr/storage/_local.py @@ -199,10 +199,6 @@ def __eq__(self, other: object) -> bool: # _open() from a sync context, so we replicate its logic here. # ------------------------------------------------------------------- - @property - def supports_sync(self) -> bool: - return True - def get_sync( self, key: str, diff --git a/src/zarr/storage/_memory.py b/src/zarr/storage/_memory.py index ebb57aeef0..1cb1da41f1 100644 --- a/src/zarr/storage/_memory.py +++ b/src/zarr/storage/_memory.py @@ -91,10 +91,6 @@ def __eq__(self, other: object) -> bool: # since MemoryStore._open() is a no-op beyond setting the flag. 
# ------------------------------------------------------------------- - @property - def supports_sync(self) -> bool: - return True - def get_sync( self, key: str, diff --git a/tests/test_sync_codec_pipeline.py b/tests/test_sync_codec_pipeline.py index cbfc7867da..af12befb40 100644 --- a/tests/test_sync_codec_pipeline.py +++ b/tests/test_sync_codec_pipeline.py @@ -8,6 +8,7 @@ import pytest import zarr +from zarr.abc.codec import SupportsSyncCodec from zarr.codecs.bytes import BytesCodec from zarr.codecs.gzip import GzipCodec from zarr.codecs.transpose import TransposeCodec @@ -35,27 +36,27 @@ def _make_nd_buffer(arr: np.ndarray[Any, Any]) -> zarr.core.buffer.NDBuffer: # --------------------------------------------------------------------------- -# Unit tests: supports_sync property +# Unit tests: SupportsSyncCodec protocol # --------------------------------------------------------------------------- class TestSupportsSync: def test_gzip_supports_sync(self) -> None: - assert GzipCodec().supports_sync + assert isinstance(GzipCodec(), SupportsSyncCodec) def test_zstd_supports_sync(self) -> None: - assert ZstdCodec().supports_sync + assert isinstance(ZstdCodec(), SupportsSyncCodec) def test_bytes_supports_sync(self) -> None: - assert BytesCodec().supports_sync + assert isinstance(BytesCodec(), SupportsSyncCodec) def test_transpose_supports_sync(self) -> None: - assert TransposeCodec(order=(0, 1)).supports_sync + assert isinstance(TransposeCodec(order=(0, 1)), SupportsSyncCodec) def test_sharding_supports_sync(self) -> None: from zarr.codecs.sharding import ShardingCodec - assert ShardingCodec(chunk_shape=(8,)).supports_sync + assert isinstance(ShardingCodec(chunk_shape=(8,)), SupportsSyncCodec) # --------------------------------------------------------------------------- From 69962848a769b938e4c043cca76dc5f9cc7e675e Mon Sep 17 00:00:00 2001 From: Davis Vann Bennett Date: Fri, 20 Feb 2026 16:49:33 +0100 Subject: [PATCH 16/23] remove batch size parameter; add changelog entry --- changes/3715.misc.md | 12 +++++ src/zarr/core/codec_pipeline.py | 66 +++++++++-------------- src/zarr/core/config.py | 6 ++- tests/package_with_entrypoint/__init__.py | 2 +- tests/test_config.py | 7 +-- 5 files changed, 45 insertions(+), 48 deletions(-) create mode 100644 changes/3715.misc.md diff --git a/changes/3715.misc.md b/changes/3715.misc.md new file mode 100644 index 0000000000..3c081a539e --- /dev/null +++ b/changes/3715.misc.md @@ -0,0 +1,12 @@ +Added several performance optimizations to chunk encoding and decoding. Low-latency stores that do not benefit from +`async` operations can now implement synchronous IO methods which will be used when available during chunk processing. +Similarly, codecs can implement a synchronous API which will be used if available during chunk processing. +These changes remove unnecessary interactions with the event loop. + +The synchronous chunk processing path optionally uses a thread pool to parallelize execution. The number of threads is chosen +based on the estimated compute load of each chunk, which takes into account known encoding and decoding profiles for +different codecs. This algorithm is aware of the latency required for setting up the thread pool, and for +single-chunk workloads we skip the thread pool entirely. + +Use of the thread pool can be disabled in the global configuration. The minimum number of threads +and the maximum number of threads can be set via the configuration as well. 
\ No newline at end of file diff --git a/src/zarr/core/codec_pipeline.py b/src/zarr/core/codec_pipeline.py index 26ea93784c..c273bff8a1 100644 --- a/src/zarr/core/codec_pipeline.py +++ b/src/zarr/core/codec_pipeline.py @@ -3,7 +3,7 @@ import os from concurrent.futures import ThreadPoolExecutor from dataclasses import dataclass -from itertools import islice, pairwise +from itertools import pairwise from typing import TYPE_CHECKING, Any, TypeVar, cast from warnings import warn @@ -46,14 +46,6 @@ def _unzip2(iterable: Iterable[tuple[T, U]]) -> tuple[list[T], list[U]]: return (out0, out1) -def batched(iterable: Iterable[T], n: int) -> Iterable[tuple[T, ...]]: - if n < 1: - raise ValueError("n must be at least one") - it = iter(iterable) - while batch := tuple(islice(it, n)): - yield batch - - def resolve_batched(codec: Codec, chunk_specs: Iterable[ArraySpec]) -> Iterable[ArraySpec]: return [codec.resolve_metadata(chunk_spec) for chunk_spec in chunk_specs] @@ -153,25 +145,37 @@ def _choose_workers( *, is_encode: bool = False, ) -> int: - """Decide how many thread pool workers to use (0 = don't use pool).""" - if n_chunks < 2: + """Decide how many thread pool workers to use (0 = don't use pool). + + Respects ``threading.codec_workers`` config: + - ``enabled``: if False, always returns 0. + - ``min``: floor for the number of workers. + - ``max``: ceiling for the number of workers (default: ``os.cpu_count()``). + """ + codec_workers = config.get("threading.codec_workers") + if not codec_workers.get("enabled", True): return 0 + min_workers: int = codec_workers.get("min", 0) + max_workers: int = codec_workers.get("max") or os.cpu_count() or 4 + + if n_chunks < 2: + return min_workers + per_chunk_ns = _estimate_chunk_work_ns(chunk_nbytes, codecs, is_encode=is_encode) - if per_chunk_ns < _POOL_OVERHEAD_NS: + if per_chunk_ns < _POOL_OVERHEAD_NS and min_workers == 0: return 0 total_work_ns = per_chunk_ns * n_chunks total_dispatch_ns = n_chunks * 50_000 # ~50us per task - if total_work_ns < total_dispatch_ns * 3: + if total_work_ns < total_dispatch_ns * 3 and min_workers == 0: return 0 target_per_worker_ns = 1_000_000 # 1ms workers = max(1, int(total_work_ns / target_per_worker_ns)) - cpu_count = os.cpu_count() or 4 - return min(workers, n_chunks, cpu_count) + return max(min_workers, min(workers, n_chunks, max_workers)) def _get_pool(max_workers: int) -> ThreadPoolExecutor: @@ -208,7 +212,6 @@ class BatchedCodecPipeline(CodecPipeline): array_array_codecs: tuple[ArrayArrayCodec, ...] array_bytes_codec: ArrayBytesCodec bytes_bytes_codecs: tuple[BytesBytesCodec, ...] - batch_size: int @property def _all_sync(self) -> bool: @@ -219,14 +222,13 @@ def evolve_from_array_spec(self, array_spec: ArraySpec) -> Self: return type(self).from_codecs(c.evolve_from_array_spec(array_spec=array_spec) for c in self) @classmethod - def from_codecs(cls, codecs: Iterable[Codec], *, batch_size: int | None = None) -> Self: + def from_codecs(cls, codecs: Iterable[Codec]) -> Self: array_array_codecs, array_bytes_codec, bytes_bytes_codecs = codecs_from_list(list(codecs)) return cls( array_array_codecs=array_array_codecs, array_bytes_codec=array_bytes_codec, bytes_bytes_codecs=bytes_bytes_codecs, - batch_size=batch_size or config.get("codec_pipeline.batch_size"), ) @property @@ -478,10 +480,7 @@ async def decode( ] # Async fallback: layer-by-layer across all chunks. 
- output: list[NDBuffer | None] = [] - for batch_info in batched(items, self.batch_size): - output.extend(await self.decode_batch(batch_info)) - return output + return list(await self.decode_batch(items)) async def encode( self, @@ -496,10 +495,7 @@ async def encode( return [self._encode_one(chunk_array, chunk_spec) for chunk_array, chunk_spec in items] # Async fallback: layer-by-layer across all chunks. - output: list[Buffer | None] = [] - for single_batch_info in batched(items, self.batch_size): - output.extend(await self.encode_batch(single_batch_info)) - return output + return list(await self.encode_batch(items)) # ------------------------------------------------------------------- # Async read / write (IO overlap via concurrent_map) @@ -610,14 +606,7 @@ async def read( out: NDBuffer, drop_axes: tuple[int, ...] = (), ) -> None: - await concurrent_map( - [ - (single_batch_info, out, drop_axes) - for single_batch_info in batched(batch_info, self.batch_size) - ], - self.read_batch, - config.get("async.concurrency"), - ) + await self.read_batch(batch_info, out, drop_axes) def _merge_chunk_array( self, @@ -840,14 +829,7 @@ async def write( value: NDBuffer, drop_axes: tuple[int, ...] = (), ) -> None: - await concurrent_map( - [ - (single_batch_info, value, drop_axes) - for single_batch_info in batched(batch_info, self.batch_size) - ], - self.write_batch, - config.get("async.concurrency"), - ) + await self.write_batch(batch_info, value, drop_axes) # ------------------------------------------------------------------- # Fully synchronous read / write (no event loop) diff --git a/src/zarr/core/config.py b/src/zarr/core/config.py index f8f8ea4f5f..f21637c495 100644 --- a/src/zarr/core/config.py +++ b/src/zarr/core/config.py @@ -99,11 +99,13 @@ def enable_gpu(self) -> ConfigSet: "target_shard_size_bytes": None, }, "async": {"concurrency": 10, "timeout": None}, - "threading": {"max_workers": None}, + "threading": { + "max_workers": None, + "codec_workers": {"enabled": True, "min": 0, "max": None}, + }, "json_indent": 2, "codec_pipeline": { "path": "zarr.core.codec_pipeline.BatchedCodecPipeline", - "batch_size": 1, }, "codecs": { "blosc": "zarr.codecs.blosc.BloscCodec", diff --git a/tests/package_with_entrypoint/__init__.py b/tests/package_with_entrypoint/__init__.py index 7b5dfb5a1e..7394b2e5c8 100644 --- a/tests/package_with_entrypoint/__init__.py +++ b/tests/package_with_entrypoint/__init__.py @@ -40,7 +40,7 @@ def compute_encoded_size(self, input_byte_length: int, chunk_spec: ArraySpec) -> class TestEntrypointCodecPipeline(CodecPipeline): - def __init__(self, batch_size: int = 1) -> None: + def __init__(self) -> None: pass async def encode( diff --git a/tests/test_config.py b/tests/test_config.py index f8ea3c6487..ba74140b75 100644 --- a/tests/test_config.py +++ b/tests/test_config.py @@ -56,11 +56,13 @@ def test_config_defaults_set() -> None: "target_shard_size_bytes": None, }, "async": {"concurrency": 10, "timeout": None}, - "threading": {"max_workers": None}, + "threading": { + "max_workers": None, + "codec_workers": {"enabled": True, "min": 0, "max": None}, + }, "json_indent": 2, "codec_pipeline": { "path": "zarr.core.codec_pipeline.BatchedCodecPipeline", - "batch_size": 1, }, "codecs": { "blosc": "zarr.codecs.blosc.BloscCodec", @@ -103,7 +105,6 @@ def test_config_defaults_set() -> None: assert config.get("array.order") == "C" assert config.get("async.concurrency") == 10 assert config.get("async.timeout") is None - assert config.get("codec_pipeline.batch_size") == 1 assert 
config.get("json_indent") == 2 From 204dda1eafd50f4d8c8634cca5ebbf8e4bc054e6 Mon Sep 17 00:00:00 2001 From: Davis Vann Bennett Date: Fri, 20 Feb 2026 20:34:22 +0100 Subject: [PATCH 17/23] prune dead code, make protocols useful --- src/zarr/__init__.py | 1 - src/zarr/core/array.py | 12 ++++++---- src/zarr/experimental/sync_codecs.py | 34 ---------------------------- tests/test_sync_codec_pipeline.py | 21 +++++------------ 4 files changed, 13 insertions(+), 55 deletions(-) delete mode 100644 src/zarr/experimental/sync_codecs.py diff --git a/src/zarr/__init__.py b/src/zarr/__init__.py index d10000ed29..3c6195c28f 100644 --- a/src/zarr/__init__.py +++ b/src/zarr/__init__.py @@ -37,7 +37,6 @@ from zarr.core.array import Array, AsyncArray from zarr.core.config import config from zarr.core.group import AsyncGroup, Group -from zarr.experimental.sync_codecs import SyncCodecPipeline # noqa: F401 (backwards compat) # in case setuptools scm screw up and find version to be 0.0.0 assert not __version__.startswith("0.0.0") diff --git a/src/zarr/core/array.py b/src/zarr/core/array.py index 853ff6434a..8c8b645ece 100644 --- a/src/zarr/core/array.py +++ b/src/zarr/core/array.py @@ -25,6 +25,7 @@ import zarr from zarr.abc.codec import ArrayArrayCodec, ArrayBytesCodec, BytesBytesCodec, Codec from zarr.abc.numcodec import Numcodec, _is_numcodec +from zarr.abc.store import SyncByteGetter from zarr.codecs._v2 import V2Codec from zarr.codecs.bytes import BytesCodec from zarr.codecs.vlen_utf8 import VLenBytesCodec, VLenUTF8Codec @@ -1982,8 +1983,9 @@ def _can_use_sync_path(self) -> bool: implement ``SupportsSyncCodec``). This is True for BatchedCodecPipeline when all codecs support sync. - 2. The store supports synchronous operations (has ``get_sync``). - MemoryStore and LocalStore provide this; remote stores do not. + 2. The store supports synchronous operations (implements + ``SyncByteGetter``). MemoryStore and LocalStore provide this; + remote stores do not. When both hold, the selection methods below call _get_selection_sync / _set_selection_sync directly, running the @@ -1993,7 +1995,7 @@ def _can_use_sync_path(self) -> bool: """ pipeline = self.async_array.codec_pipeline store = self.async_array.store_path.store - return getattr(pipeline, "supports_sync_io", False) and hasattr(store, "get_sync") + return getattr(pipeline, "supports_sync_io", False) and isinstance(store, SyncByteGetter) @classmethod @deprecated("Use zarr.create_array instead.", category=ZarrDeprecationWarning) @@ -3088,8 +3090,8 @@ def get_basic_selection( prototype=prototype, ) # Fallback: submit the async coroutine to the background event loop - # thread via sync(). Used for remote stores, sharded arrays, or when - # SyncCodecPipeline is not active. + # thread via sync(). Used for remote stores or when the sync bypass + # is not active. return sync( self.async_array._get_selection( indexer, diff --git a/src/zarr/experimental/sync_codecs.py b/src/zarr/experimental/sync_codecs.py deleted file mode 100644 index 260875789d..0000000000 --- a/src/zarr/experimental/sync_codecs.py +++ /dev/null @@ -1,34 +0,0 @@ -"""Backwards-compatible alias for SyncCodecPipeline. - -The synchronous codec optimizations (inline per-chunk codec chains, thread pool -parallelism, fully synchronous read/write bypass) have been merged into -``BatchedCodecPipeline``. That pipeline now automatically selects the optimal -strategy based on codec and store capabilities — no configuration needed. 
- -``SyncCodecPipeline`` is retained as a subclass alias so that existing config -references (``codec_pipeline.path: zarr.experimental.sync_codecs.SyncCodecPipeline``) -and imports continue to work. -""" - -from __future__ import annotations - -from dataclasses import dataclass - -from zarr.core.codec_pipeline import BatchedCodecPipeline -from zarr.registry import register_pipeline - -__all__ = ["SyncCodecPipeline"] - - -@dataclass(frozen=True) -class SyncCodecPipeline(BatchedCodecPipeline): - """Backwards-compatible alias for BatchedCodecPipeline. - - All synchronous codec optimizations are now built into - ``BatchedCodecPipeline``. This subclass exists only so that - existing ``codec_pipeline.path`` config values and imports - continue to work. - """ - - -register_pipeline(SyncCodecPipeline) diff --git a/tests/test_sync_codec_pipeline.py b/tests/test_sync_codec_pipeline.py index af12befb40..8fac3d54f6 100644 --- a/tests/test_sync_codec_pipeline.py +++ b/tests/test_sync_codec_pipeline.py @@ -297,18 +297,9 @@ def test_supports_sync_io(self) -> None: pipeline = BatchedCodecPipeline.from_codecs([BytesCodec(), GzipCodec(level=1)]) assert pipeline.supports_sync_io - def test_config_switch_to_sync_pipeline_compat(self) -> None: - """Verify backwards compat: SyncCodecPipeline config path still works.""" - from zarr.experimental.sync_codecs import SyncCodecPipeline - - zarr.config.set({"codec_pipeline.path": "zarr.experimental.sync_codecs.SyncCodecPipeline"}) - try: - store = MemoryStore() - arr = zarr.create_array(store, shape=(10,), dtype="float64") - assert isinstance(arr.async_array.codec_pipeline, SyncCodecPipeline) - # SyncCodecPipeline is-a BatchedCodecPipeline - assert isinstance(arr.async_array.codec_pipeline, BatchedCodecPipeline) - finally: - zarr.config.set( - {"codec_pipeline.path": "zarr.core.codec_pipeline.BatchedCodecPipeline"} - ) + def test_supports_sync_io_default(self) -> None: + """Default BatchedCodecPipeline is the sync pipeline — no config switch needed.""" + store = MemoryStore() + arr = zarr.create_array(store, shape=(10,), dtype="float64") + assert isinstance(arr.async_array.codec_pipeline, BatchedCodecPipeline) + assert arr.async_array.codec_pipeline.supports_sync_io From e9db616f58b16aa00af739932702ff93e213dbef Mon Sep 17 00:00:00 2001 From: Davis Vann Bennett Date: Fri, 20 Feb 2026 20:41:35 +0100 Subject: [PATCH 18/23] restore batch size but it's only there for warnings --- src/zarr/core/codec_pipeline.py | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/src/zarr/core/codec_pipeline.py b/src/zarr/core/codec_pipeline.py index c273bff8a1..3c12b169d0 100644 --- a/src/zarr/core/codec_pipeline.py +++ b/src/zarr/core/codec_pipeline.py @@ -212,6 +212,16 @@ class BatchedCodecPipeline(CodecPipeline): array_array_codecs: tuple[ArrayArrayCodec, ...] array_bytes_codec: ArrayBytesCodec bytes_bytes_codecs: tuple[BytesBytesCodec, ...] + batch_size: int | None = None + + def __post_init__(self) -> None: + if self.batch_size is not None: + warn( + "The 'batch_size' parameter is deprecated and has no effect. 
" + "Batch size is now determined automatically.", + FutureWarning, + stacklevel=2, + ) @property def _all_sync(self) -> bool: From 01e1f7328fcc135803a740c97b8d0bb7ddb3cc41 Mon Sep 17 00:00:00 2001 From: Davis Vann Bennett Date: Fri, 20 Feb 2026 21:09:08 +0100 Subject: [PATCH 19/23] fix type hints, prevent thread pool leakage, make codec pipeline introspection more efficient --- changes/3715.misc.md | 2 +- src/zarr/codecs/sharding.py | 5 ++++- src/zarr/core/codec_pipeline.py | 27 ++++++++++++++++----------- 3 files changed, 21 insertions(+), 13 deletions(-) diff --git a/changes/3715.misc.md b/changes/3715.misc.md index 3c081a539e..8862d33605 100644 --- a/changes/3715.misc.md +++ b/changes/3715.misc.md @@ -9,4 +9,4 @@ different codecs. This algorithm is aware of the latency required for setting up single-chunk workloads we skip the thread pool entirely. Use of the thread pool can be disabled in the global configuration. The minimum number of threads -and the maximum number of threads can be set via the configuration as well. \ No newline at end of file +and the maximum number of threads can be set via the configuration as well. diff --git a/src/zarr/codecs/sharding.py b/src/zarr/codecs/sharding.py index 9f5a83deb4..a5fe12921b 100644 --- a/src/zarr/codecs/sharding.py +++ b/src/zarr/codecs/sharding.py @@ -100,9 +100,12 @@ async def get( return value[start:stop] def get_sync( - self, prototype: BufferPrototype | None = None, byte_range: ByteRequest | None = None + self, prototype: BufferPrototype, byte_range: ByteRequest | None = None ) -> Buffer | None: # Sync equivalent of get() — just a dict lookup, no IO. + assert prototype == default_buffer_prototype(), ( + f"prototype is not supported within shards currently. diff: {prototype} != {default_buffer_prototype()}" + ) value = self.shard_dict.get(self.chunk_coords) if value is None: return None diff --git a/src/zarr/core/codec_pipeline.py b/src/zarr/core/codec_pipeline.py index 3c12b169d0..d7cc1ca110 100644 --- a/src/zarr/core/codec_pipeline.py +++ b/src/zarr/core/codec_pipeline.py @@ -2,7 +2,7 @@ import os from concurrent.futures import ThreadPoolExecutor -from dataclasses import dataclass +from dataclasses import dataclass, field from itertools import pairwise from typing import TYPE_CHECKING, Any, TypeVar, cast from warnings import warn @@ -182,7 +182,10 @@ def _get_pool(max_workers: int) -> ThreadPoolExecutor: """Get a thread pool with at most *max_workers* threads.""" global _pool if _pool is None or _pool._max_workers < max_workers: + old = _pool _pool = ThreadPoolExecutor(max_workers=max_workers) + if old is not None: + old.shutdown(wait=False) return _pool @@ -214,6 +217,8 @@ class BatchedCodecPipeline(CodecPipeline): bytes_bytes_codecs: tuple[BytesBytesCodec, ...] batch_size: int | None = None + _all_sync: bool = field(default=False, init=False, repr=False, compare=False) + def __post_init__(self) -> None: if self.batch_size is not None: warn( @@ -222,11 +227,12 @@ def __post_init__(self) -> None: FutureWarning, stacklevel=2, ) - - @property - def _all_sync(self) -> bool: - """True when every codec in the chain implements SupportsSyncCodec.""" - return all(isinstance(c, SupportsSyncCodec) for c in self) + # Compute once; frozen dataclass requires object.__setattr__. 
+ object.__setattr__( + self, + "_all_sync", + all(isinstance(c, SupportsSyncCodec) for c in self), + ) def evolve_from_array_spec(self, array_spec: ArraySpec) -> Self: return type(self).from_codecs(c.evolve_from_array_spec(array_spec=array_spec) for c in self) From 11534b07c6609f47231a7ee3dc0f85053e023b4d Mon Sep 17 00:00:00 2001 From: Davis Vann Bennett Date: Fri, 20 Feb 2026 21:21:59 +0100 Subject: [PATCH 20/23] restore old comments / docstrings --- src/zarr/core/codec_pipeline.py | 30 ++++++++++++++++++++++++++++-- 1 file changed, 28 insertions(+), 2 deletions(-) diff --git a/src/zarr/core/codec_pipeline.py b/src/zarr/core/codec_pipeline.py index d7cc1ca110..a24a2397ed 100644 --- a/src/zarr/core/codec_pipeline.py +++ b/src/zarr/core/codec_pipeline.py @@ -249,12 +249,32 @@ def from_codecs(cls, codecs: Iterable[Codec]) -> Self: @property def supports_partial_decode(self) -> bool: + """Determines whether the codec pipeline supports partial decoding. + + Currently, only codec pipelines with a single ArrayBytesCodec that supports + partial decoding can support partial decoding. This limitation is due to the fact + that ArrayArrayCodecs can change the slice selection leading to non-contiguous + slices and BytesBytesCodecs can change the chunk bytes in a way that slice + selections cannot be attributed to byte ranges anymore, which renders partial + decoding infeasible. + + This limitation may be softened in the future.""" return (len(self.array_array_codecs) + len(self.bytes_bytes_codecs)) == 0 and isinstance( self.array_bytes_codec, ArrayBytesCodecPartialDecodeMixin ) @property def supports_partial_encode(self) -> bool: + """Determines whether the codec pipeline supports partial encoding. + + Currently, only codec pipelines with a single ArrayBytesCodec that supports + partial encoding can support partial encoding. This limitation is due to the fact + that ArrayArrayCodecs can change the slice selection leading to non-contiguous + slices and BytesBytesCodecs can change the chunk bytes in a way that slice + selections cannot be attributed to byte ranges anymore, which renders partial + encoding infeasible. 
+ + This limitation may be softened in the future.""" return (len(self.array_array_codecs) + len(self.bytes_bytes_codecs)) == 0 and isinstance( self.array_bytes_codec, ArrayBytesCodecPartialEncodeMixin ) @@ -637,6 +657,7 @@ def _merge_chunk_array( if ( is_complete_chunk and value.shape == chunk_spec.shape + # Guard against a partial chunk at the end of the array that is still flagged is_complete_chunk=True and value[out_selection].shape == chunk_spec.shape ): return value @@ -648,16 +669,20 @@ def _merge_chunk_array( fill_value=fill_value_or_default(chunk_spec), ) else: - chunk_array = existing_chunk_array.copy() + chunk_array = existing_chunk_array.copy() # make a writable copy if chunk_selection == () or is_scalar( value.as_ndarray_like(), chunk_spec.dtype.to_native_dtype() ): chunk_value = value else: chunk_value = value[out_selection] + # handle missing singleton dimensions if drop_axes != (): item = tuple( - None if idx in drop_axes else slice(None) for idx in range(chunk_spec.ndim) + None # equivalent to np.newaxis + if idx in drop_axes + else slice(None) + for idx in range(chunk_spec.ndim) ) chunk_value = chunk_value[item] chunk_array[chunk_selection] = chunk_value @@ -672,6 +697,7 @@ async def write_batch( batch_info = list(batch_info) if self.supports_partial_encode: + # Pass scalar values as is if len(value.shape) == 0: await self.encode_partial_batch( [ From b40d53a78e1bba51a0f7d87071e140803a7054c6 Mon Sep 17 00:00:00 2001 From: Davis Vann Bennett Date: Fri, 20 Feb 2026 21:42:00 +0100 Subject: [PATCH 21/23] simplify threadpool management --- changes/3715.misc.md | 7 +- src/zarr/core/codec_pipeline.py | 138 ++++++++------------------------------ 2 files changed, 38 insertions(+), 107 deletions(-) diff --git a/changes/3715.misc.md b/changes/3715.misc.md index 8862d33605..caf06d1c54 100644 --- a/changes/3715.misc.md +++ b/changes/3715.misc.md @@ -3,10 +3,9 @@ Added several performance optimizations to chunk encoding and decoding. Low-late Similarly, codecs can implement a synchronous API which will be used if available during chunk processing. These changes remove unnecessary interactions with the event loop. -The synchronous chunk processing path optionally uses a thread pool to parallelize execution. The number of threads is chosen -based on the estimated compute load of each chunk, which takes into account known encoding and decoding profiles for -different codecs. This algorithm is aware of the latency required for setting up the thread pool, and for -single-chunk workloads we skip the thread pool entirely. +The synchronous chunk processing path optionally uses a thread pool to parallelize codec work across chunks. +The pool is skipped for single-chunk operations and for pipelines that only contain cheap codecs (e.g. endian +swap, transpose, checksum). Use of the thread pool can be disabled in the global configuration. The minimum number of threads and the maximum number of threads can be set via the configuration as well. 
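For reference, the configuration mentioned in the changelog is the ``threading.codec_workers`` table added to the default config earlier in this series. A usage sketch follows; since the module-level pool is created lazily and sized once, ``max`` should be set before the first chunk operation:

```python
# Sketch: tuning or disabling the codec thread pool via the global config.
# The keys mirror the defaults added to src/zarr/core/config.py in this PR.
import zarr

# Disable the pool entirely; all codec work then runs on the calling thread.
zarr.config.set({"threading.codec_workers.enabled": False})

# Or bound the pool size: never fewer than 2 workers, never more than 8.
zarr.config.set(
    {
        "threading.codec_workers.min": 2,
        "threading.codec_workers.max": 8,
    }
)
```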
diff --git a/src/zarr/core/codec_pipeline.py b/src/zarr/core/codec_pipeline.py index a24a2397ed..960a232308 100644 --- a/src/zarr/core/codec_pipeline.py +++ b/src/zarr/core/codec_pipeline.py @@ -64,87 +64,28 @@ def fill_value_or_default(chunk_spec: ArraySpec) -> Any: # --------------------------------------------------------------------------- -# Work estimation for thread pool sizing +# Thread pool for parallel codec compute # --------------------------------------------------------------------------- -# Approximate nanoseconds-per-byte for codec decode and encode, measured on -# typical hardware. These don't need to be exact — they just need to rank -# codecs correctly so the pool-sizing heuristic makes good decisions. -# -# Decode and encode have very different costs for many codecs: -# - gzip decode ~5-10 ns/byte vs encode ~50-100 ns/byte -# - zstd decode ~1-2 ns/byte vs encode ~2-10 ns/byte -# - blosc decode ~0.5-1 ns/byte vs encode ~1-5 ns/byte -# -# "Cheap" codecs (memcpy-like): BytesCodec, Crc32cCodec, TransposeCodec -# → ~0.1-1 ns/byte, dominated by memcpy; no benefit from threading. -# "Medium" codecs: ZstdCodec, BloscCodec -# → decode ~1-2 ns/byte, encode ~2-5 ns/byte; GIL released in C. -# "Expensive" codecs: GzipCodec -# → decode ~5-10 ns/byte, encode ~50-100 ns/byte; GIL released in C. -# -# For unknown codecs (e.g. third-party numcodecs wrappers), we assume -# "medium" cost — better to over-parallelize slightly than miss a win. - -_CODEC_DECODE_NS_PER_BYTE: dict[str, float] = { - # Near-zero cost — just reshaping/copying/checksumming - "BytesCodec": 0, - "Crc32cCodec": 0, - "TransposeCodec": 0, - "VLenUTF8Codec": 0, - "VLenBytesCodec": 0, - # Medium cost — fast C codecs, GIL released - "ZstdCodec": 1, - "BloscCodec": 0.5, - # High cost — slower C codecs, GIL released - "GzipCodec": 8, -} - -_CODEC_ENCODE_NS_PER_BYTE: dict[str, float] = { - # Near-zero cost — just reshaping/copying/checksumming - "BytesCodec": 0, - "Crc32cCodec": 0, - "TransposeCodec": 0, - "VLenUTF8Codec": 0, - "VLenBytesCodec": 0, - # Medium cost — fast C codecs, GIL released - "ZstdCodec": 3, - "BloscCodec": 2, - # High cost — slower C codecs, GIL released - "GzipCodec": 50, -} - -_DEFAULT_DECODE_NS_PER_BYTE = 1 # assume medium for unknown codecs -_DEFAULT_ENCODE_NS_PER_BYTE = 3 # encode is typically slower - -# Thread pool dispatch overhead in nanoseconds (~50-100us per task). -# We only parallelize when the estimated per-chunk work exceeds this. -_POOL_OVERHEAD_NS = 200_000 - - -def _estimate_chunk_work_ns( - chunk_nbytes: int, - codecs: Iterable[Codec], - *, - is_encode: bool = False, -) -> float: - """Estimate nanoseconds of codec work for one chunk.""" - table = _CODEC_ENCODE_NS_PER_BYTE if is_encode else _CODEC_DECODE_NS_PER_BYTE - default = _DEFAULT_ENCODE_NS_PER_BYTE if is_encode else _DEFAULT_DECODE_NS_PER_BYTE - total_ns_per_byte = 0.0 - for codec in codecs: - name = type(codec).__name__ - total_ns_per_byte += table.get(name, default) - return chunk_nbytes * total_ns_per_byte - - -def _choose_workers( - n_chunks: int, - chunk_nbytes: int, - codecs: Iterable[Codec], - *, - is_encode: bool = False, -) -> int: +# Codecs that are essentially free (memcpy / reshape / checksum). +# Threading overhead exceeds the work these do, so we skip the pool +# when these are the only codecs in the pipeline. 
+_CHEAP_CODECS: frozenset[str] = frozenset( + { + "BytesCodec", + "Crc32cCodec", + "TransposeCodec", + "VLenUTF8Codec", + "VLenBytesCodec", + } +) + +# Minimum chunk size (in bytes) to consider using the thread pool. +# Below this, per-chunk codec work is too small to offset dispatch overhead. +_MIN_CHUNK_NBYTES_FOR_POOL = 100_000 # 100 KB + + +def _choose_workers(n_chunks: int, chunk_nbytes: int, codecs: Iterable[Codec]) -> int: """Decide how many thread pool workers to use (0 = don't use pool). Respects ``threading.codec_workers`` config: @@ -162,30 +103,23 @@ def _choose_workers( if n_chunks < 2: return min_workers - per_chunk_ns = _estimate_chunk_work_ns(chunk_nbytes, codecs, is_encode=is_encode) - - if per_chunk_ns < _POOL_OVERHEAD_NS and min_workers == 0: + # Only use the pool when at least one codec does real work + # and the chunks are large enough to offset dispatch overhead. + has_expensive = any(type(c).__name__ not in _CHEAP_CODECS for c in codecs) + if not has_expensive and min_workers == 0: return 0 - - total_work_ns = per_chunk_ns * n_chunks - total_dispatch_ns = n_chunks * 50_000 # ~50us per task - if total_work_ns < total_dispatch_ns * 3 and min_workers == 0: + if chunk_nbytes < _MIN_CHUNK_NBYTES_FOR_POOL and min_workers == 0: return 0 - target_per_worker_ns = 1_000_000 # 1ms - workers = max(1, int(total_work_ns / target_per_worker_ns)) + return max(min_workers, min(n_chunks, max_workers)) - return max(min_workers, min(workers, n_chunks, max_workers)) - -def _get_pool(max_workers: int) -> ThreadPoolExecutor: - """Get a thread pool with at most *max_workers* threads.""" +def _get_pool() -> ThreadPoolExecutor: + """Get the module-level thread pool, creating it lazily.""" global _pool - if _pool is None or _pool._max_workers < max_workers: - old = _pool + if _pool is None: + max_workers: int = config.get("threading.codec_workers").get("max") or os.cpu_count() or 4 _pool = ThreadPoolExecutor(max_workers=max_workers) - if old is not None: - old.shutdown(wait=False) return _pool @@ -913,11 +847,10 @@ def read_sync( ] # Phase 2: Decode — run the codec chain for each chunk. - dtype_item_size = getattr(first_spec.dtype, "item_size", 1) - chunk_nbytes = product(first_spec.shape) * dtype_item_size + chunk_nbytes = product(first_spec.shape) * getattr(first_spec.dtype, "item_size", 1) n_workers = _choose_workers(len(batch_info_list), chunk_nbytes, self) if n_workers > 0: - pool = _get_pool(n_workers) + pool = _get_pool() chunk_arrays: list[NDBuffer | None] = list( pool.map( self._decode_one, @@ -1011,11 +944,10 @@ def write_sync( # Phase 2: Compute — decode existing, merge new data, encode. 
_, first_spec, *_ = batch_info_list[0] - dtype_item_size = getattr(first_spec.dtype, "item_size", 1) - chunk_nbytes = product(first_spec.shape) * dtype_item_size - n_workers = _choose_workers(len(batch_info_list), chunk_nbytes, self, is_encode=True) + chunk_nbytes = product(first_spec.shape) * getattr(first_spec.dtype, "item_size", 1) + n_workers = _choose_workers(len(batch_info_list), chunk_nbytes, self) if n_workers > 0: - pool = _get_pool(n_workers) + pool = _get_pool() encoded_list: list[Buffer | None | object] = list( pool.map( self._write_chunk_compute, From 83c1dc1902ef9fce56381cbd11608755bcc903ae Mon Sep 17 00:00:00 2001 From: Davis Vann Bennett Date: Sat, 21 Feb 2026 21:49:01 +0100 Subject: [PATCH 22/23] use isinstance instead of explicit list of codec names --- src/zarr/core/codec_pipeline.py | 17 ++--------------- 1 file changed, 2 insertions(+), 15 deletions(-) diff --git a/src/zarr/core/codec_pipeline.py b/src/zarr/core/codec_pipeline.py index 960a232308..1ead5e4811 100644 --- a/src/zarr/core/codec_pipeline.py +++ b/src/zarr/core/codec_pipeline.py @@ -67,19 +67,6 @@ def fill_value_or_default(chunk_spec: ArraySpec) -> Any: # Thread pool for parallel codec compute # --------------------------------------------------------------------------- -# Codecs that are essentially free (memcpy / reshape / checksum). -# Threading overhead exceeds the work these do, so we skip the pool -# when these are the only codecs in the pipeline. -_CHEAP_CODECS: frozenset[str] = frozenset( - { - "BytesCodec", - "Crc32cCodec", - "TransposeCodec", - "VLenUTF8Codec", - "VLenBytesCodec", - } -) - # Minimum chunk size (in bytes) to consider using the thread pool. # Below this, per-chunk codec work is too small to offset dispatch overhead. _MIN_CHUNK_NBYTES_FOR_POOL = 100_000 # 100 KB @@ -104,9 +91,9 @@ def _choose_workers(n_chunks: int, chunk_nbytes: int, codecs: Iterable[Codec]) - return min_workers # Only use the pool when at least one codec does real work + # (BytesBytesCodec = compression/checksum, which releases the GIL in C) # and the chunks are large enough to offset dispatch overhead. - has_expensive = any(type(c).__name__ not in _CHEAP_CODECS for c in codecs) - if not has_expensive and min_workers == 0: + if not any(isinstance(c, BytesBytesCodec) for c in codecs) and min_workers == 0: return 0 if chunk_nbytes < _MIN_CHUNK_NBYTES_FOR_POOL and min_workers == 0: return 0 From e8a0cc6bc0bb8d29c80520af8187b2faa52dfa8e Mon Sep 17 00:00:00 2001 From: Davis Vann Bennett Date: Sat, 21 Feb 2026 21:57:55 +0100 Subject: [PATCH 23/23] consolidate thread pool configuration --- src/zarr/core/codec_pipeline.py | 21 +++++++++++++++------ 1 file changed, 15 insertions(+), 6 deletions(-) diff --git a/src/zarr/core/codec_pipeline.py b/src/zarr/core/codec_pipeline.py index 1ead5e4811..26d56fa97a 100644 --- a/src/zarr/core/codec_pipeline.py +++ b/src/zarr/core/codec_pipeline.py @@ -72,6 +72,18 @@ def fill_value_or_default(chunk_spec: ArraySpec) -> Any: _MIN_CHUNK_NBYTES_FOR_POOL = 100_000 # 100 KB +def _get_codec_worker_config() -> tuple[bool, int, int]: + """Read the ``threading.codec_workers`` config. + + Returns (enabled, min_workers, max_workers). 
+ """ + codec_workers = config.get("threading.codec_workers") + enabled: bool = codec_workers.get("enabled", True) + min_workers: int = codec_workers.get("min", 0) + max_workers: int = max(codec_workers.get("max") or os.cpu_count() or 4, min_workers) + return enabled, min_workers, max_workers + + def _choose_workers(n_chunks: int, chunk_nbytes: int, codecs: Iterable[Codec]) -> int: """Decide how many thread pool workers to use (0 = don't use pool). @@ -80,13 +92,10 @@ def _choose_workers(n_chunks: int, chunk_nbytes: int, codecs: Iterable[Codec]) - - ``min``: floor for the number of workers. - ``max``: ceiling for the number of workers (default: ``os.cpu_count()``). """ - codec_workers = config.get("threading.codec_workers") - if not codec_workers.get("enabled", True): + enabled, min_workers, max_workers = _get_codec_worker_config() + if not enabled: return 0 - min_workers: int = codec_workers.get("min", 0) - max_workers: int = codec_workers.get("max") or os.cpu_count() or 4 - if n_chunks < 2: return min_workers @@ -105,7 +114,7 @@ def _get_pool() -> ThreadPoolExecutor: """Get the module-level thread pool, creating it lazily.""" global _pool if _pool is None: - max_workers: int = config.get("threading.codec_workers").get("max") or os.cpu_count() or 4 + _, _, max_workers = _get_codec_worker_config() _pool = ThreadPoolExecutor(max_workers=max_workers) return _pool