diff --git a/src/zarr/core/chunk_grids.py b/src/zarr/core/chunk_grids.py index 95a2780b30..ce9dda61cc 100644 --- a/src/zarr/core/chunk_grids.py +++ b/src/zarr/core/chunk_grids.py @@ -13,6 +13,7 @@ from typing import TYPE_CHECKING, Any, Literal, Self, TypeAlias, TypedDict, Union, cast import numpy as np +import numpy.typing as npt import zarr from zarr.abc.metadata import Metadata @@ -427,6 +428,28 @@ def array_index_to_chunk_coord( Coordinates of the chunk containing the array index. """ + @abstractmethod + def array_indices_to_chunk_dim( + self, array_shape: tuple[int, ...], dim: int, indices: npt.NDArray[np.intp] + ) -> npt.NDArray[np.intp]: + """ + Map an array of indices along one dimension to chunk coordinates (vectorized). + + Parameters + ---------- + array_shape : tuple[int, ...] + Shape of the full array. + dim : int + Dimension index. + indices : np.ndarray + Array of indices along the given dimension. + + Returns + ------- + np.ndarray + Array of chunk coordinates, same shape as indices. + """ + @abstractmethod def chunks_per_dim(self, array_shape: tuple[int, ...], dim: int) -> int: """ @@ -534,6 +557,19 @@ def array_index_to_chunk_coord( for idx, size in zip(array_index, self.chunk_shape, strict=False) ) + def array_indices_to_chunk_dim( + self, array_shape: tuple[int, ...], dim: int, indices: npt.NDArray[np.intp] + ) -> npt.NDArray[np.intp]: + """ + Vectorized mapping of array indices to chunk coordinates along one dimension. + + For RegularChunkGrid, this is simply indices // chunk_size. + """ + chunk_size = self.chunk_shape[dim] + if chunk_size == 0: + return np.zeros_like(indices) + return indices // chunk_size + def chunks_per_dim(self, array_shape: tuple[int, ...], dim: int) -> int: """ Get the number of chunks along a specific dimension. @@ -1071,6 +1107,17 @@ def array_index_to_chunk_coord( return tuple(result) + def array_indices_to_chunk_dim( + self, array_shape: tuple[int, ...], dim: int, indices: npt.NDArray[np.intp] + ) -> npt.NDArray[np.intp]: + """ + Vectorized mapping of array indices to chunk coordinates along one dimension. + + For RectilinearChunkGrid, uses np.searchsorted on cumulative sizes. + """ + cumsum = np.asarray(self._cumulative_sizes[dim]) + return np.searchsorted(cumsum, indices, side="right").astype(np.intp) - 1 + def chunks_in_selection( self, array_shape: tuple[int, ...], selection: tuple[slice, ...] ) -> Iterator[tuple[int, ...]]: diff --git a/src/zarr/core/indexing.py b/src/zarr/core/indexing.py index 256aa6ea12..bbe7623173 100644 --- a/src/zarr/core/indexing.py +++ b/src/zarr/core/indexing.py @@ -864,12 +864,7 @@ def __init__( boundscheck_indices(dim_sel, dim_len) # determine which chunk is needed for each selection item - # Use chunk grid to map each index to its chunk coordinate - dim_sel_chunk = np.empty(len(dim_sel), dtype=np.intp) - for i, idx in enumerate(dim_sel): - full_index = tuple(int(idx) if j == dim else 0 for j in range(len(array_shape))) - chunk_coords = chunk_grid.array_index_to_chunk_coord(array_shape, full_index) - dim_sel_chunk[i] = chunk_coords[dim] + dim_sel_chunk = chunk_grid.array_indices_to_chunk_dim(array_shape, dim, dim_sel) # determine order of indices if order == Order.UNKNOWN: @@ -1315,19 +1310,10 @@ def __init__( selection_broadcast = tuple(dim_sel.reshape(-1) for dim_sel in selection_broadcast) # compute chunk index for each point in the selection using chunk grid - # For each point, we need to find which chunk it belongs to - npoints = selection_broadcast[0].size - chunks_multi_index_list = [] - for dim in range(len(shape)): - dim_chunk_indices = np.empty(npoints, dtype=np.intp) - for i in range(npoints): - # Build full coordinate for this point - point_coords = tuple(int(selection_broadcast[d][i]) for d in range(len(shape))) - # Map to chunk coordinates - chunk_coords = chunk_grid.array_index_to_chunk_coord(shape, point_coords) - dim_chunk_indices[i] = chunk_coords[dim] - chunks_multi_index_list.append(dim_chunk_indices) - chunks_multi_index_broadcast = tuple(chunks_multi_index_list) + chunks_multi_index_broadcast = tuple( + chunk_grid.array_indices_to_chunk_dim(shape, dim, selection_broadcast[dim]) + for dim in range(len(shape)) + ) # ravel chunk indices chunks_raveled_indices = np.ravel_multi_index( diff --git a/tests/test_chunk_grids/test_common.py b/tests/test_chunk_grids/test_common.py index d5bf9b6074..95893c1d61 100644 --- a/tests/test_chunk_grids/test_common.py +++ b/tests/test_chunk_grids/test_common.py @@ -95,3 +95,52 @@ def test_normalize_chunks_dask_style_irregular_multiple_dims() -> None: # Should use first chunk size from each dimension assert result == (10, 20) + + +# ============================================================================= +# Tests for _is_nested_sequence() +# ============================================================================= + + +def test_is_nested_sequence_basic() -> None: + """Test _is_nested_sequence with typical inputs.""" + from zarr.core.chunk_grids import _is_nested_sequence + + # Nested sequences → True + assert _is_nested_sequence([[10, 20], [5, 5]]) is True + assert _is_nested_sequence([(10, 20), (5, 5)]) is True + assert _is_nested_sequence(([10, 20], [5, 5])) is True + + # Flat sequences → False + assert _is_nested_sequence((10, 10)) is False + assert _is_nested_sequence([10, 10]) is False + + +def test_is_nested_sequence_non_sequences() -> None: + """Test _is_nested_sequence with non-sequence types.""" + from zarr.core.chunk_grids import _is_nested_sequence + + assert _is_nested_sequence(10) is False + assert _is_nested_sequence("auto") is False + assert _is_nested_sequence(None) is False + assert _is_nested_sequence(3.14) is False + + +def test_is_nested_sequence_chunk_grid_instance() -> None: + """Test _is_nested_sequence with ChunkGrid instances.""" + from zarr.core.chunk_grids import RectilinearChunkGrid, RegularChunkGrid, _is_nested_sequence + + assert _is_nested_sequence(RegularChunkGrid(chunk_shape=(10, 10))) is False + assert _is_nested_sequence(RectilinearChunkGrid(chunk_shapes=[[10, 20], [5, 5]])) is False + + +def test_is_nested_sequence_empty_iterables() -> None: + """Test _is_nested_sequence with empty iterables. + + Empty sequences return False because there's no first element to inspect. + """ + from zarr.core.chunk_grids import _is_nested_sequence + + assert _is_nested_sequence([]) is False + assert _is_nested_sequence(()) is False + assert _is_nested_sequence([[]]) is True # outer has one element which is a list diff --git a/tests/test_chunk_grids/test_rectilinear.py b/tests/test_chunk_grids/test_rectilinear.py index 6221a23223..4fec2a1a1e 100644 --- a/tests/test_chunk_grids/test_rectilinear.py +++ b/tests/test_chunk_grids/test_rectilinear.py @@ -1804,3 +1804,142 @@ def test_invalid_chunk_coordinates_raise() -> None: with pytest.raises(IndexError): grid.get_chunk_start(array_shape, (0, 2)) # Only 2 chunks in dim 1 (0,1) + + +# =================================================================== +# End-to-end indexing tests through full zarr Array pipeline +# (SliceDimIndexer, IntDimIndexer, etc. with rectilinear grids) +# =================================================================== + + +def test_slice_at_exact_chunk_boundaries() -> None: + """Test slicing exactly at rectilinear chunk boundaries. + + This exercises the SliceDimIndexer code path end-to-end, verifying + that the refactored boundary calculation (stop-1 → +1 pattern) is correct. + Chunk boundaries for rows: [0:10, 10:30, 30:60] + Chunk boundaries for cols: [0:25, 25:50, 50:75, 75:100] + """ + store = MemoryStore() + data = np.arange(6000, dtype="float64").reshape(60, 100) + arr = zarr.create_array( + store=store, + shape=(60, 100), + chunks=[[10, 20, 30], [25, 25, 25, 25]], + dtype="float64", + zarr_format=3, + ) + arr[:] = data + + # Slice exactly one chunk (second row-chunk, first col-chunk) + np.testing.assert_array_equal(arr[10:30, 0:25], data[10:30, 0:25]) + + # Slice exactly at a boundary start + np.testing.assert_array_equal(arr[30:60, 75:100], data[30:60, 75:100]) + + # Slice spanning exactly two chunk boundaries + np.testing.assert_array_equal(arr[0:30, 0:50], data[0:30, 0:50]) + np.testing.assert_array_equal(arr[10:60, 25:75], data[10:60, 25:75]) + + # Single-element slices at each boundary + np.testing.assert_array_equal(arr[9:10, :], data[9:10, :]) # last row of chunk 0 + np.testing.assert_array_equal(arr[10:11, :], data[10:11, :]) # first row of chunk 1 + np.testing.assert_array_equal(arr[29:30, :], data[29:30, :]) # last row of chunk 1 + np.testing.assert_array_equal(arr[30:31, :], data[30:31, :]) # first row of chunk 2 + + +def test_strided_slicing_rectilinear() -> None: + """Test slicing with step > 1 on rectilinear arrays.""" + store = MemoryStore() + data = np.arange(6000, dtype="float64").reshape(60, 100) + arr = zarr.create_array( + store=store, + shape=(60, 100), + chunks=[[10, 20, 30], [25, 25, 25, 25]], + dtype="float64", + zarr_format=3, + ) + arr[:] = data + + np.testing.assert_array_equal(arr[::2, ::3], data[::2, ::3]) + np.testing.assert_array_equal(arr[5:55:5, 10:90:10], data[5:55:5, 10:90:10]) + np.testing.assert_array_equal(arr[::7, ::13], data[::7, ::13]) + + +def test_fancy_indexing_rectilinear() -> None: + """Test oindex and vindex on rectilinear arrays through the full Array pipeline.""" + store = MemoryStore() + data = np.arange(6000, dtype="float64").reshape(60, 100) + arr = zarr.create_array( + store=store, + shape=(60, 100), + chunks=[[10, 20, 30], [25, 25, 25, 25]], + dtype="float64", + zarr_format=3, + ) + arr[:] = data + + # oindex: orthogonal indexing with integer arrays spanning multiple chunks + rows = np.array([0, 9, 10, 29, 30, 59]) + cols = np.array([0, 24, 25, 49, 50, 99]) + np.testing.assert_array_equal(arr.oindex[rows, cols], data[np.ix_(rows, cols)]) + + # vindex: coordinate-based indexing + r = np.array([0, 10, 30, 59]) + c = np.array([0, 25, 50, 99]) + np.testing.assert_array_equal(arr.vindex[r, c], data[r, c]) + + # oindex with slice on one axis, int array on other + np.testing.assert_array_equal(arr.oindex[rows, 50:75], data[np.ix_(rows, np.arange(50, 75))]) + np.testing.assert_array_equal(arr.oindex[10:30, cols], data[np.ix_(np.arange(10, 30), cols)]) + + +def test_boolean_mask_indexing_rectilinear() -> None: + """Test boolean mask indexing on rectilinear arrays.""" + store = MemoryStore() + data = np.arange(6000, dtype="float64").reshape(60, 100) + arr = zarr.create_array( + store=store, + shape=(60, 100), + chunks=[[10, 20, 30], [25, 25, 25, 25]], + dtype="float64", + zarr_format=3, + ) + arr[:] = data + + # Full 2D boolean mask + mask = data > 3000 + np.testing.assert_array_equal(arr.vindex[mask], data[mask]) + + # 1D boolean mask on rows via oindex + row_mask = np.zeros(60, dtype=bool) + row_mask[[0, 9, 10, 29, 30, 59]] = True # hits all chunk boundaries + np.testing.assert_array_equal(arr.oindex[row_mask, :], data[row_mask, :]) + + # 1D boolean mask on cols via oindex + col_mask = np.zeros(100, dtype=bool) + col_mask[[0, 24, 25, 49, 50, 99]] = True # hits all chunk boundaries + np.testing.assert_array_equal(arr.oindex[:, col_mask], data[:, col_mask]) + + +def test_block_indexing_rectilinear() -> None: + """Test block (chunk-level) indexing on rectilinear arrays.""" + store = MemoryStore() + data = np.arange(6000, dtype="float64").reshape(60, 100) + arr = zarr.create_array( + store=store, + shape=(60, 100), + chunks=[[10, 20, 30], [25, 25, 25, 25]], + dtype="float64", + zarr_format=3, + ) + arr[:] = data + + # Individual blocks + np.testing.assert_array_equal(arr.blocks[0, 0], data[0:10, 0:25]) + np.testing.assert_array_equal(arr.blocks[1, 0], data[10:30, 0:25]) + np.testing.assert_array_equal(arr.blocks[2, 3], data[30:60, 75:100]) + + # Block range + np.testing.assert_array_equal(arr.blocks[0:2, 0:2], data[0:30, 0:50]) + np.testing.assert_array_equal(arr.blocks[1:3, 2:4], data[10:60, 50:100]) diff --git a/tests/test_indexing.py b/tests/test_indexing.py index c0bf7dd270..da8316eb4e 100644 --- a/tests/test_indexing.py +++ b/tests/test_indexing.py @@ -2138,3 +2138,173 @@ async def test_async_invalid_indexer(self, store): with pytest.raises(IndexError): await async_zarr.oindex.getitem("invalid_indexer") + + +# =================================================================== +# Rectilinear chunk grid indexing through the full Array pipeline +# =================================================================== + + +class TestRectilinearIndexing: + """Test that all indexing modes work correctly with RectilinearChunkGrid. + + These tests verify that the refactored indexing code (which delegates + to ChunkGrid methods instead of using direct division) produces correct + results when used with variable-sized chunks. + """ + + @pytest.fixture + def rectilinear_1d(self, store: StorePath) -> tuple[Array, npt.NDArray[Any]]: + """1D array with rectilinear chunks [5, 10, 15] (total=30).""" + a = np.arange(30, dtype=int) + z = zarr.create_array( + store=store / str(uuid4()), + shape=(30,), + dtype=a.dtype, + chunks=[[5, 10, 15]], + zarr_format=3, + ) + z[:] = a + return z, a + + @pytest.fixture + def rectilinear_2d(self, store: StorePath) -> tuple[Array, npt.NDArray[Any]]: + """2D array with rectilinear chunks [[10, 20, 30], [25, 25, 25, 25]] (60x100).""" + a = np.arange(6000, dtype=int).reshape(60, 100) + z = zarr.create_array( + store=store / str(uuid4()), + shape=(60, 100), + dtype=a.dtype, + chunks=[[10, 20, 30], [25, 25, 25, 25]], + zarr_format=3, + ) + z[:] = a + return z, a + + # --- Basic selection --- + + def test_basic_selection_1d(self, rectilinear_1d: tuple[Array, npt.NDArray[Any]]) -> None: + z, a = rectilinear_1d + for sel in [0, 4, 5, 14, 15, 29, -1, slice(None), slice(3, 18), slice(0, 0)]: + assert_array_equal(z[sel], a[sel]) + + def test_basic_selection_1d_strided( + self, rectilinear_1d: tuple[Array, npt.NDArray[Any]] + ) -> None: + z, a = rectilinear_1d + for sel in [slice(None, None, 2), slice(1, 25, 3), slice(0, 30, 7)]: + assert_array_equal(z[sel], a[sel]) + + def test_basic_selection_2d(self, rectilinear_2d: tuple[Array, npt.NDArray[Any]]) -> None: + z, a = rectilinear_2d + selections = [ + 42, + -1, + (9, 24), + (10, 25), + (30, 50), + (59, 99), + slice(None), + (slice(5, 35), slice(20, 80)), + (slice(0, 10), slice(0, 25)), # within one chunk + (slice(10, 10), slice(None)), # empty + (slice(None, None, 3), slice(None, None, 7)), # strided + ] + for sel in selections: + assert_array_equal(z[sel], a[sel]) + + # --- Orthogonal selection --- + + def test_orthogonal_selection_1d_bool( + self, rectilinear_1d: tuple[Array, npt.NDArray[Any]] + ) -> None: + z, a = rectilinear_1d + ix = np.zeros(30, dtype=bool) + ix[[0, 4, 5, 14, 15, 29]] = True + assert_array_equal(z.oindex[ix], a[ix]) + + def test_orthogonal_selection_1d_int( + self, rectilinear_1d: tuple[Array, npt.NDArray[Any]] + ) -> None: + z, a = rectilinear_1d + ix = np.array([0, 4, 5, 14, 15, 29]) + assert_array_equal(z.oindex[ix], a[ix]) + # Wraparound + ix_neg = np.array([0, -1, -15, -25]) + assert_array_equal(z.oindex[ix_neg], a[ix_neg]) + + def test_orthogonal_selection_2d_bool( + self, rectilinear_2d: tuple[Array, npt.NDArray[Any]] + ) -> None: + z, a = rectilinear_2d + ix0 = np.zeros(60, dtype=bool) + ix0[[0, 9, 10, 29, 30, 59]] = True + ix1 = np.zeros(100, dtype=bool) + ix1[[0, 24, 25, 49, 50, 99]] = True + assert_array_equal(z.oindex[ix0, ix1], a[np.ix_(ix0, ix1)]) + + def test_orthogonal_selection_2d_int( + self, rectilinear_2d: tuple[Array, npt.NDArray[Any]] + ) -> None: + z, a = rectilinear_2d + ix0 = np.array([0, 9, 10, 29, 30, 59]) + ix1 = np.array([0, 24, 25, 49, 50, 99]) + assert_array_equal(z.oindex[ix0, ix1], a[np.ix_(ix0, ix1)]) + + def test_orthogonal_selection_2d_mixed( + self, rectilinear_2d: tuple[Array, npt.NDArray[Any]] + ) -> None: + z, a = rectilinear_2d + ix = np.array([0, 9, 10, 29, 30, 59]) + assert_array_equal(z.oindex[ix, slice(25, 75)], a[np.ix_(ix, range(25, 75))]) + assert_array_equal(z.oindex[slice(10, 30), ix[:4]], a[np.ix_(range(10, 30), ix[:4])]) + + # --- Coordinate (vindex) selection --- + + def test_coordinate_selection_1d(self, rectilinear_1d: tuple[Array, npt.NDArray[Any]]) -> None: + z, a = rectilinear_1d + ix = np.array([0, 4, 5, 14, 15, 29]) + assert_array_equal(z.vindex[ix], a[ix]) + + def test_coordinate_selection_2d(self, rectilinear_2d: tuple[Array, npt.NDArray[Any]]) -> None: + z, a = rectilinear_2d + r = np.array([0, 9, 10, 29, 30, 59]) + c = np.array([0, 24, 25, 49, 50, 99]) + assert_array_equal(z.vindex[r, c], a[r, c]) + + def test_coordinate_selection_2d_bool_mask( + self, rectilinear_2d: tuple[Array, npt.NDArray[Any]] + ) -> None: + z, a = rectilinear_2d + mask = a > 3000 + assert_array_equal(z.vindex[mask], a[mask]) + + # --- Block selection --- + + def test_block_selection(self, rectilinear_2d: tuple[Array, npt.NDArray[Any]]) -> None: + z, a = rectilinear_2d + # Individual blocks + assert_array_equal(z.blocks[0, 0], a[0:10, 0:25]) + assert_array_equal(z.blocks[1, 2], a[10:30, 50:75]) + assert_array_equal(z.blocks[2, 3], a[30:60, 75:100]) + # Block range + assert_array_equal(z.blocks[0:2, 1:3], a[0:30, 25:75]) + + # --- Set selection --- + + def test_set_basic_selection(self, rectilinear_2d: tuple[Array, npt.NDArray[Any]]) -> None: + z, a = rectilinear_2d + # Write across chunk boundaries + new_data = np.full((20, 50), -1, dtype=int) + z[5:25, 10:60] = new_data + a[5:25, 10:60] = new_data + assert_array_equal(z[:], a) + + def test_set_orthogonal_selection(self, rectilinear_2d: tuple[Array, npt.NDArray[Any]]) -> None: + z, a = rectilinear_2d + rows = np.array([0, 10, 30]) + cols = np.array([0, 25, 50, 75]) + val = np.full((3, 4), -99, dtype=int) + z.oindex[rows, cols] = val + a[np.ix_(rows, cols)] = val + assert_array_equal(z[:], a)