From 1225d7b5fd7260638794a3629ce90cc8b2825b75 Mon Sep 17 00:00:00 2001 From: Brendan Collins Date: Sat, 20 Jun 2026 02:52:55 -0700 Subject: [PATCH] classify: keep large-array sampler O(num_sample) in host memory _generate_sample_indices used RandomState.choice(replace=False) on the >10M-element branch, which builds a full arange(num_data) permutation internally. Peak driver-host memory scaled with num_data instead of num_sample, so the sample-index step OOMed on very large dask arrays - the opposite of the branch's documented intent. Switch the large branch to np.random.default_rng().choice, which uses Floyd's algorithm and stays O(num_sample) when num_sample << num_data. It remains seeded and deterministic, and the small-array reproducibility branch is unchanged. Peak for a 20k sample from a 20M population drops from ~160 MB to under 1 MB. Backs the dask and dask+cupy paths of natural_breaks, maximum_breaks, quantile, percentiles, and box_plot. Closes #3412 --- .claude/sweep-performance-state.csv | 2 +- xrspatial/classify.py | 10 +++++-- xrspatial/tests/test_classify.py | 45 +++++++++++++++++++++++++++++ 3 files changed, 53 insertions(+), 4 deletions(-) diff --git a/.claude/sweep-performance-state.csv b/.claude/sweep-performance-state.csv index 266f9a8d0..bd3d19559 100644 --- a/.claude/sweep-performance-state.csv +++ b/.claude/sweep-performance-state.csv @@ -3,7 +3,7 @@ aspect,2026-05-29,SAFE,compute-bound,1,2688,"dask+cupy geodesic densified full l balanced_allocation,2026-04-16T12:00:00Z,WILL OOM,memory-bound,8,1114,"Re-audit 2026-04-16 after PR 1203 float32 fix. 8 HIGH found (friction.compute L339, argmin.compute in iter loop L182, double all_nan recompute L206, stacked cost_surfaces allocation). Covered by existing documented limitation on #1114. Not refiled." bilateral,2026-03-31T18:00:00Z,SAFE,compute-bound,0,, bump,2026-04-16T12:00:00Z,SAFE,compute-bound,0,1206,Re-audit 2026-04-16: fix verified SAFE. No HIGH findings. MEDIUM: CuPy backend runs CPU kernel then transfers to GPU (documented limitation). -classify,2026-04-16T18:00:00Z,SAFE,compute-bound,0,fixed-in-tree,"Fixed-in-tree 2026-04-16: _run_dask_head_tail_breaks now persists data_clean once and fuses mean+head_count per iter (912ms -> 339ms, 0.37x IMPROVED); added _run_dask_box_plot that samples via _generate_sample_indices instead of boolean fancy indexing on dask array; _run_dask_cupy_box_plot likewise. 85 existing classify tests pass." +classify,2026-06-20,RISKY,graph-bound,1,3412,"Re-audit 2026-06-20 (CUDA host). 1 HIGH: _generate_sample_indices >10M branch used RandomState.choice(replace=False) which builds a full arange(num_data) permutation -> O(num_data) host alloc (160MB for 20M pop, OOM at 30TB) despite docstring claiming O(num_sample). Backed dask/dask+cupy natural_breaks/maximum_breaks/quantile/percentiles/box_plot. Fixed via np.random.default_rng().choice (Floyd, O(num_sample), still deterministic); peak 160MB->0.4MB. Other paths SAFE: head_tail_breaks already persists+fuses; box_plot samples; cupy kernels low-register; no .values/np.asarray-on-dask/.compute-in-loop. 93 classify tests pass incl GPU." contour,2026-03-31T18:00:00Z,SAFE,compute-bound,0,, convolution,2026-03-31T18:00:00Z,SAFE,compute-bound,0,, corridor,2026-03-31T18:00:00Z,SAFE,compute-bound,0,, diff --git a/xrspatial/classify.py b/xrspatial/classify.py index 8144a4afa..fd26c5a82 100644 --- a/xrspatial/classify.py +++ b/xrspatial/classify.py @@ -745,17 +745,21 @@ def _generate_sample_indices(num_data, num_sample, seed=1234567890): For small datasets (<=10M elements), uses the same linspace+shuffle approach as the numpy backend for exact reproducibility. - For large datasets, uses memory-efficient RandomState.choice() - which is O(num_sample) rather than O(num_data). + For large datasets, draws the sample with ``Generator.choice`` (Floyd's + algorithm) so peak host memory is O(num_sample) rather than O(num_data). + The legacy ``RandomState.choice`` builds a full ``arange(num_data)`` + permutation internally, which materialises the whole index space on the + driver host and OOMs on very large dask arrays. """ - generator = np.random.RandomState(seed) if num_data <= 10_000_000: + generator = np.random.RandomState(seed) idx = np.linspace( 0, num_data, num_data, endpoint=False, dtype=np.uint32 ) generator.shuffle(idx) sample_idx = idx[:num_sample] else: + generator = np.random.default_rng(seed) sample_idx = generator.choice( num_data, size=num_sample, replace=False ) diff --git a/xrspatial/tests/test_classify.py b/xrspatial/tests/test_classify.py index 950409e5c..0ba007a12 100644 --- a/xrspatial/tests/test_classify.py +++ b/xrspatial/tests/test_classify.py @@ -1078,3 +1078,48 @@ def test_percentiles_dask_no_unknown_chunks(): dask_result.data.compute(), equal_nan=True, ) + + +# =================================================================== +# Regression test: large-array sampler must be O(num_sample) in memory +# =================================================================== + +def test_generate_sample_indices_large_is_sublinear_memory(): + """The >10M-element branch of _generate_sample_indices must not + allocate an O(num_data) index permutation on the driver host. + + The legacy RandomState.choice(replace=False) builds a full + arange(num_data) internally, so peak host memory scaled with the + whole array and OOM'd on very large dask arrays. The Generator.choice + (Floyd) path keeps peak memory proportional to num_sample. + """ + import tracemalloc + from xrspatial.classify import _generate_sample_indices + + num_data = 20_000_000 # exceeds the 10M small-branch threshold + num_sample = 20_000 + + tracemalloc.start() + idx = _generate_sample_indices(num_data, num_sample) + _, peak = tracemalloc.get_traced_memory() + tracemalloc.stop() + + assert idx.size == num_sample + # sorted ascending for efficient dask chunk access + assert np.all(np.diff(idx) >= 0) + assert idx.max() < num_data + # A full int64 permutation of num_data would be ~160 MB. The Floyd + # sampler stays far below that; allow generous head-room. + assert peak < 20 * 1024 ** 2, ( + f"peak host memory {peak / 1024 ** 2:.1f} MB scales with num_data, " + "not num_sample" + ) + + +def test_generate_sample_indices_large_is_deterministic(): + """The large-array sampler is seeded and reproducible across calls.""" + from xrspatial.classify import _generate_sample_indices + + a = _generate_sample_indices(20_000_000, 20_000) + b = _generate_sample_indices(20_000_000, 20_000) + np.testing.assert_array_equal(a, b)