Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion .claude/sweep-performance-state.csv
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@ aspect,2026-05-29,SAFE,compute-bound,1,2688,"dask+cupy geodesic densified full l
balanced_allocation,2026-04-16T12:00:00Z,WILL OOM,memory-bound,8,1114,"Re-audit 2026-04-16 after PR 1203 float32 fix. 8 HIGH found (friction.compute L339, argmin.compute in iter loop L182, double all_nan recompute L206, stacked cost_surfaces allocation). Covered by existing documented limitation on #1114. Not refiled."
bilateral,2026-03-31T18:00:00Z,SAFE,compute-bound,0,,
bump,2026-04-16T12:00:00Z,SAFE,compute-bound,0,1206,Re-audit 2026-04-16: fix verified SAFE. No HIGH findings. MEDIUM: CuPy backend runs CPU kernel then transfers to GPU (documented limitation).
classify,2026-04-16T18:00:00Z,SAFE,compute-bound,0,fixed-in-tree,"Fixed-in-tree 2026-04-16: _run_dask_head_tail_breaks now persists data_clean once and fuses mean+head_count per iter (912ms -> 339ms, 0.37x IMPROVED); added _run_dask_box_plot that samples via _generate_sample_indices instead of boolean fancy indexing on dask array; _run_dask_cupy_box_plot likewise. 85 existing classify tests pass."
classify,2026-06-20,RISKY,graph-bound,1,3412,"Re-audit 2026-06-20 (CUDA host). 1 HIGH: _generate_sample_indices >10M branch used RandomState.choice(replace=False) which builds a full arange(num_data) permutation -> O(num_data) host alloc (160MB for 20M pop, OOM at 30TB) despite docstring claiming O(num_sample). Backed dask/dask+cupy natural_breaks/maximum_breaks/quantile/percentiles/box_plot. Fixed via np.random.default_rng().choice (Floyd, O(num_sample), still deterministic); peak 160MB->0.4MB. Other paths SAFE: head_tail_breaks already persists+fuses; box_plot samples; cupy kernels low-register; no .values/np.asarray-on-dask/.compute-in-loop. 93 classify tests pass incl GPU."
contour,2026-03-31T18:00:00Z,SAFE,compute-bound,0,,
convolution,2026-03-31T18:00:00Z,SAFE,compute-bound,0,,
corridor,2026-03-31T18:00:00Z,SAFE,compute-bound,0,,
Expand Down
10 changes: 7 additions & 3 deletions xrspatial/classify.py
Original file line number Diff line number Diff line change
Expand Up @@ -745,17 +745,21 @@ def _generate_sample_indices(num_data, num_sample, seed=1234567890):

For small datasets (<=10M elements), uses the same linspace+shuffle
approach as the numpy backend for exact reproducibility.
For large datasets, uses memory-efficient RandomState.choice()
which is O(num_sample) rather than O(num_data).
For large datasets, draws the sample with ``Generator.choice`` (Floyd's
algorithm) so peak host memory is O(num_sample) rather than O(num_data).
The legacy ``RandomState.choice`` builds a full ``arange(num_data)``
permutation internally, which materialises the whole index space on the
driver host and OOMs on very large dask arrays.
"""
generator = np.random.RandomState(seed)
if num_data <= 10_000_000:
generator = np.random.RandomState(seed)
idx = np.linspace(
0, num_data, num_data, endpoint=False, dtype=np.uint32
)
generator.shuffle(idx)
sample_idx = idx[:num_sample]
else:
generator = np.random.default_rng(seed)
sample_idx = generator.choice(
num_data, size=num_sample, replace=False
)
Expand Down
45 changes: 45 additions & 0 deletions xrspatial/tests/test_classify.py
Original file line number Diff line number Diff line change
Expand Up @@ -1078,3 +1078,48 @@ def test_percentiles_dask_no_unknown_chunks():
dask_result.data.compute(),
equal_nan=True,
)


# ===================================================================
# Regression test: large-array sampler must be O(num_sample) in memory
# ===================================================================

def test_generate_sample_indices_large_is_sublinear_memory():
"""The >10M-element branch of _generate_sample_indices must not
allocate an O(num_data) index permutation on the driver host.

The legacy RandomState.choice(replace=False) builds a full
arange(num_data) internally, so peak host memory scaled with the
whole array and OOM'd on very large dask arrays. The Generator.choice
(Floyd) path keeps peak memory proportional to num_sample.
"""
import tracemalloc
from xrspatial.classify import _generate_sample_indices

num_data = 20_000_000 # exceeds the 10M small-branch threshold
num_sample = 20_000

tracemalloc.start()
idx = _generate_sample_indices(num_data, num_sample)
_, peak = tracemalloc.get_traced_memory()
tracemalloc.stop()

assert idx.size == num_sample
# sorted ascending for efficient dask chunk access
assert np.all(np.diff(idx) >= 0)
assert idx.max() < num_data
# A full int64 permutation of num_data would be ~160 MB. The Floyd
# sampler stays far below that; allow generous head-room.
assert peak < 20 * 1024 ** 2, (
f"peak host memory {peak / 1024 ** 2:.1f} MB scales with num_data, "
"not num_sample"
)


def test_generate_sample_indices_large_is_deterministic():
"""The large-array sampler is seeded and reproducible across calls."""
from xrspatial.classify import _generate_sample_indices

a = _generate_sample_indices(20_000_000, 20_000)
b = _generate_sample_indices(20_000_000, 20_000)
np.testing.assert_array_equal(a, b)
Loading