From 267a6c4511180bb0d24ca44d3a186af4a0e9a6e4 Mon Sep 17 00:00:00 2001 From: Mark Kittisopikul Date: Fri, 20 Feb 2026 13:45:33 -0500 Subject: [PATCH 1/6] tests: Add non-power-of-2 shard shapes to benchmarks Add (30,30,30) to large_morton_shards and (10,10,10), (20,20,20), (30,30,30) to morton_iter_shapes to benchmark the scalar fallback path for non-power-of-2 shapes, which are not fully covered by the vectorized hypercube path. Co-Authored-By: Claude Sonnet 4.6 --- tests/benchmarks/test_indexing.py | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/tests/benchmarks/test_indexing.py b/tests/benchmarks/test_indexing.py index d30d731f0f..57159076a6 100644 --- a/tests/benchmarks/test_indexing.py +++ b/tests/benchmarks/test_indexing.py @@ -106,7 +106,8 @@ def read_with_cache_clear() -> None: # Benchmark with larger chunks_per_shard to make Morton order impact more visible large_morton_shards = ( - (32,) * 3, # With 1x1x1 chunks: 32x32x32 = 32768 chunks per shard + (32,) * 3, # With 1x1x1 chunks: 32x32x32 = 32768 chunks per shard (power-of-2) + (30,) * 3, # With 1x1x1 chunks: 30x30x30 = 27000 chunks per shard (non-power-of-2) ) @@ -197,9 +198,12 @@ def read_with_cache_clear() -> None: # Benchmark for morton_order_iter directly (no I/O) morton_iter_shapes = ( - (8, 8, 8), # 512 elements - (16, 16, 16), # 4096 elements - (32, 32, 32), # 32768 elements + (8, 8, 8), # 512 elements (power-of-2) + (10, 10, 10), # 1000 elements (non-power-of-2) + (16, 16, 16), # 4096 elements (power-of-2) + (20, 20, 20), # 8000 elements (non-power-of-2) + (32, 32, 32), # 32768 elements (power-of-2) + (30, 30, 30), # 27000 elements (non-power-of-2) ) From 1dfd71dc1a2019eb0f90b521855ab6e83f2a9a0a Mon Sep 17 00:00:00 2001 From: Mark Kittisopikul Date: Fri, 20 Feb 2026 13:55:49 -0500 Subject: [PATCH 2/6] tests: Add near-miss power-of-2 shape (33,33,33) to benchmarks Documents the performance penalty when a shard shape is just above a power-of-2 boundary, causing n_z to jump from 32,768 to 262,144. Co-Authored-By: Claude Sonnet 4.6 --- tests/benchmarks/test_indexing.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/tests/benchmarks/test_indexing.py b/tests/benchmarks/test_indexing.py index 57159076a6..76278da7dd 100644 --- a/tests/benchmarks/test_indexing.py +++ b/tests/benchmarks/test_indexing.py @@ -108,6 +108,7 @@ def read_with_cache_clear() -> None: large_morton_shards = ( (32,) * 3, # With 1x1x1 chunks: 32x32x32 = 32768 chunks per shard (power-of-2) (30,) * 3, # With 1x1x1 chunks: 30x30x30 = 27000 chunks per shard (non-power-of-2) + (33,) * 3, # With 1x1x1 chunks: 33x33x33 = 35937 chunks per shard (near-miss: just above power-of-2) ) @@ -204,6 +205,7 @@ def read_with_cache_clear() -> None: (20, 20, 20), # 8000 elements (non-power-of-2) (32, 32, 32), # 32768 elements (power-of-2) (30, 30, 30), # 27000 elements (non-power-of-2) + (33, 33, 33), # 35937 elements (near-miss: just above power-of-2, n_z=262144) ) From 403c50b6275c6d5502f1cb367ecce21d358a6fc4 Mon Sep 17 00:00:00 2001 From: Mark Kittisopikul Date: Fri, 20 Feb 2026 16:48:53 -0500 Subject: [PATCH 3/6] style: Apply ruff format to benchmark file Co-Authored-By: Claude Sonnet 4.6 --- tests/benchmarks/test_indexing.py | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) diff --git a/tests/benchmarks/test_indexing.py b/tests/benchmarks/test_indexing.py index 76278da7dd..385a85b5b5 100644 --- a/tests/benchmarks/test_indexing.py +++ b/tests/benchmarks/test_indexing.py @@ -108,7 +108,8 @@ def read_with_cache_clear() -> None: large_morton_shards = ( (32,) * 3, # With 1x1x1 chunks: 32x32x32 = 32768 chunks per shard (power-of-2) (30,) * 3, # With 1x1x1 chunks: 30x30x30 = 27000 chunks per shard (non-power-of-2) - (33,) * 3, # With 1x1x1 chunks: 33x33x33 = 35937 chunks per shard (near-miss: just above power-of-2) + (33,) + * 3, # With 1x1x1 chunks: 33x33x33 = 35937 chunks per shard (near-miss: just above power-of-2) ) @@ -199,13 +200,13 @@ def read_with_cache_clear() -> None: # Benchmark for morton_order_iter directly (no I/O) morton_iter_shapes = ( - (8, 8, 8), # 512 elements (power-of-2) - (10, 10, 10), # 1000 elements (non-power-of-2) - (16, 16, 16), # 4096 elements (power-of-2) - (20, 20, 20), # 8000 elements (non-power-of-2) - (32, 32, 32), # 32768 elements (power-of-2) - (30, 30, 30), # 27000 elements (non-power-of-2) - (33, 33, 33), # 35937 elements (near-miss: just above power-of-2, n_z=262144) + (8, 8, 8), # 512 elements (power-of-2) + (10, 10, 10), # 1000 elements (non-power-of-2) + (16, 16, 16), # 4096 elements (power-of-2) + (20, 20, 20), # 8000 elements (non-power-of-2) + (32, 32, 32), # 32768 elements (power-of-2) + (30, 30, 30), # 27000 elements (non-power-of-2) + (33, 33, 33), # 35937 elements (near-miss: just above power-of-2, n_z=262144) ) From ffa30657981af2364daa76fd895577b82975256c Mon Sep 17 00:00:00 2001 From: Mark Kittisopikul Date: Fri, 20 Feb 2026 19:25:40 -0500 Subject: [PATCH 4/6] changes: Add changelog entry for PR #3717 Co-Authored-By: Claude Sonnet 4.6 --- changes/3717.misc.md | 1 + 1 file changed, 1 insertion(+) create mode 100644 changes/3717.misc.md diff --git a/changes/3717.misc.md b/changes/3717.misc.md new file mode 100644 index 0000000000..5fed76b2b7 --- /dev/null +++ b/changes/3717.misc.md @@ -0,0 +1 @@ +Add benchmarks for Morton order computation with non-power-of-2 and near-miss shard shapes, covering both pure computation and end-to-end read/write performance. From f339319483c39edf9dbad2baa7b136651a54c7f2 Mon Sep 17 00:00:00 2001 From: Mark Kittisopikul Date: Fri, 20 Feb 2026 14:41:11 -0500 Subject: [PATCH 5/6] perf: Fix near-miss penalty in _morton_order with hybrid ceiling+argsort strategy MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit For shapes just above a power-of-2 (e.g. (33,33,33)), the ceiling-only approach generates n_z=262,144 Morton codes for only 35,937 valid coordinates (7.3× overgeneration). The floor+scalar approach is even worse since the scalar loop iterates n_z-n_floor times (229,376 for (33,33,33)), not n_total-n_floor. The fix: when n_z > 4*n_total, use an argsort strategy that enumerates all n_total valid coordinates via meshgrid, encodes each to a Morton code using vectorized bit manipulation, then sorts by Morton code. This avoids the large overgeneration while remaining fully vectorized. Result for test_morton_order_iter: (30,30,30): 24ms (ceiling, ratio=1.21) (32,32,32): 28ms (ceiling, ratio=1.00) (33,33,33): 32ms (argsort, ratio=7.3 → fixed from ~820ms with scalar) Co-Authored-By: Claude Sonnet 4.6 --- src/zarr/core/indexing.py | 85 ++++++++++++++++++--------------------- 1 file changed, 39 insertions(+), 46 deletions(-) diff --git a/src/zarr/core/indexing.py b/src/zarr/core/indexing.py index 454f7e2290..3a23737f91 100644 --- a/src/zarr/core/indexing.py +++ b/src/zarr/core/indexing.py @@ -1512,54 +1512,47 @@ def _morton_order(chunk_shape: tuple[int, ...]) -> npt.NDArray[np.intp]: out.flags.writeable = False return out - # Optimization: Remove singleton dimensions to enable magic number usage - # for shapes like (1,1,32,32,32). Compute Morton on squeezed shape, then expand. - singleton_dims = tuple(i for i, s in enumerate(chunk_shape) if s == 1) - if singleton_dims: - squeezed_shape = tuple(s for s in chunk_shape if s != 1) - if squeezed_shape: - # Compute Morton order on squeezed shape, then expand singleton dims (always 0) - squeezed_order = np.asarray(_morton_order(squeezed_shape)) - out = np.zeros((n_total, n_dims), dtype=np.intp) - squeezed_col = 0 - for full_col in range(n_dims): - if chunk_shape[full_col] != 1: - out[:, full_col] = squeezed_order[:, squeezed_col] - squeezed_col += 1 - else: - # All dimensions are singletons, just return the single point - out = np.zeros((1, n_dims), dtype=np.intp) - out.flags.writeable = False - return out - - # Find the largest power-of-2 hypercube that fits within chunk_shape. - # Within this hypercube, Morton codes are guaranteed to be in bounds. - min_dim = min(chunk_shape) - if min_dim >= 1: - power = min_dim.bit_length() - 1 # floor(log2(min_dim)) - hypercube_size = 1 << power # 2^power - n_hypercube = hypercube_size**n_dims + # Ceiling hypercube: smallest power-of-2 hypercube whose Morton codes span + # all valid coordinates in chunk_shape. (c-1).bit_length() gives the number + # of bits needed to index c values (0 for singleton dims). n_z = 2**total_bits + # is the size of this hypercube. + total_bits = sum((c - 1).bit_length() for c in chunk_shape) + n_z = 1 << total_bits if total_bits > 0 else 1 + + # Decode all Morton codes in the ceiling hypercube, then filter to valid coords. + # This is fully vectorized. For shapes with n_z >> n_total (e.g. (33,33,33): + # n_z=262144, n_total=35937), consider the argsort strategy below. + if n_z <= 4 * n_total: + # Ceiling strategy: decode all n_z codes vectorized, filter in-bounds. + # Works well when the overgeneration ratio n_z/n_total is small (≤4). + z_values = np.arange(n_z, dtype=np.intp) + all_coords = decode_morton_vectorized(z_values, chunk_shape) + shape_arr = np.array(chunk_shape, dtype=np.intp) + valid_mask = np.all(all_coords < shape_arr, axis=1) + order = all_coords[valid_mask] else: - n_hypercube = 0 + # Argsort strategy: enumerate all n_total valid coordinates directly, + # encode each to a Morton code, then sort by code. Avoids the 8× or + # larger overgeneration penalty for near-miss shapes like (33,33,33). + # Cost: O(n_total * bits) encode + O(n_total log n_total) sort, + # vs O(n_z * bits) = O(8 * n_total * bits) for ceiling. + grids = np.meshgrid(*[np.arange(c, dtype=np.intp) for c in chunk_shape], indexing="ij") + all_coords = np.stack([g.ravel() for g in grids], axis=1) + + # Encode all coordinates to Morton codes (vectorized). + bits_per_dim = tuple((c - 1).bit_length() for c in chunk_shape) + max_coord_bits = max(bits_per_dim) + z_codes = np.zeros(n_total, dtype=np.intp) + output_bit = 0 + for coord_bit in range(max_coord_bits): + for dim in range(n_dims): + if coord_bit < bits_per_dim[dim]: + z_codes |= ((all_coords[:, dim] >> coord_bit) & 1) << output_bit + output_bit += 1 + + sort_idx = np.argsort(z_codes, kind="stable") + order = all_coords[sort_idx] - # Within the hypercube, no bounds checking needed - use vectorized decoding - if n_hypercube > 0: - z_values = np.arange(n_hypercube, dtype=np.intp) - order: npt.NDArray[np.intp] = decode_morton_vectorized(z_values, chunk_shape) - else: - order = np.empty((0, n_dims), dtype=np.intp) - - # For remaining elements outside the hypercube, bounds checking is needed - remaining: list[tuple[int, ...]] = [] - i = n_hypercube - while len(order) + len(remaining) < n_total: - m = decode_morton(i, chunk_shape) - if all(x < y for x, y in zip(m, chunk_shape, strict=False)): - remaining.append(m) - i += 1 - - if remaining: - order = np.vstack([order, np.array(remaining, dtype=np.intp)]) order.flags.writeable = False return order From 9f51a3176666dda1ce819befc454fb01bba6267d Mon Sep 17 00:00:00 2001 From: Mark Kittisopikul Date: Sat, 21 Feb 2026 12:52:35 -0500 Subject: [PATCH 6/6] fix: Address pre-commit CI failures in _morton_order MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Replace Unicode multiplication sign × with ASCII x in comment (RUF003) - Add explicit type annotation for np.argsort result to satisfy mypy Co-Authored-By: Claude Sonnet 4.6 --- src/zarr/core/indexing.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/zarr/core/indexing.py b/src/zarr/core/indexing.py index 3a23737f91..aaddc58e34 100644 --- a/src/zarr/core/indexing.py +++ b/src/zarr/core/indexing.py @@ -1532,7 +1532,7 @@ def _morton_order(chunk_shape: tuple[int, ...]) -> npt.NDArray[np.intp]: order = all_coords[valid_mask] else: # Argsort strategy: enumerate all n_total valid coordinates directly, - # encode each to a Morton code, then sort by code. Avoids the 8× or + # encode each to a Morton code, then sort by code. Avoids the 8x or # larger overgeneration penalty for near-miss shapes like (33,33,33). # Cost: O(n_total * bits) encode + O(n_total log n_total) sort, # vs O(n_z * bits) = O(8 * n_total * bits) for ceiling. @@ -1550,7 +1550,7 @@ def _morton_order(chunk_shape: tuple[int, ...]) -> npt.NDArray[np.intp]: z_codes |= ((all_coords[:, dim] >> coord_bit) & 1) << output_bit output_bit += 1 - sort_idx = np.argsort(z_codes, kind="stable") + sort_idx: npt.NDArray[np.intp] = np.argsort(z_codes, kind="stable") order = all_coords[sort_idx] order.flags.writeable = False