From 1225d7b5fd7260638794a3629ce90cc8b2825b75 Mon Sep 17 00:00:00 2001
From: Brendan Collins <brendancol@gmail.com>
Date: Sat, 20 Jun 2026 02:52:55 -0700
Subject: [PATCH] classify: keep large-array sampler O(num_sample) in host
 memory

_generate_sample_indices used RandomState.choice(replace=False) on the
>10M-element branch, which builds a full arange(num_data) permutation
internally. Peak driver-host memory scaled with num_data instead of
num_sample, so the sample-index step OOMed on very large dask arrays -
the opposite of the branch's documented intent.

Switch the large branch to np.random.default_rng().choice, which uses
Floyd's algorithm and stays O(num_sample) when num_sample << num_data.
It remains seeded and deterministic, and the small-array reproducibility
branch is unchanged. Peak for a 20k sample from a 20M population drops
from ~160 MB to under 1 MB.

Backs the dask and dask+cupy paths of natural_breaks, maximum_breaks,
quantile, percentiles, and box_plot.

Closes #3412
---
 .claude/sweep-performance-state.csv |  2 +-
 xrspatial/classify.py               | 10 +++++--
 xrspatial/tests/test_classify.py    | 45 +++++++++++++++++++++++++++++
 3 files changed, 53 insertions(+), 4 deletions(-)

diff --git a/.claude/sweep-performance-state.csv b/.claude/sweep-performance-state.csv
index 266f9a8d0..bd3d19559 100644
--- a/.claude/sweep-performance-state.csv
+++ b/.claude/sweep-performance-state.csv
@@ -3,7 +3,7 @@ aspect,2026-05-29,SAFE,compute-bound,1,2688,"dask+cupy geodesic densified full l
 balanced_allocation,2026-04-16T12:00:00Z,WILL OOM,memory-bound,8,1114,"Re-audit 2026-04-16 after PR 1203 float32 fix. 8 HIGH found (friction.compute L339, argmin.compute in iter loop L182, double all_nan recompute L206, stacked cost_surfaces allocation). Covered by existing documented limitation on #1114. Not refiled."
 bilateral,2026-03-31T18:00:00Z,SAFE,compute-bound,0,,
 bump,2026-04-16T12:00:00Z,SAFE,compute-bound,0,1206,Re-audit 2026-04-16: fix verified SAFE. No HIGH findings. MEDIUM: CuPy backend runs CPU kernel then transfers to GPU (documented limitation).
-classify,2026-04-16T18:00:00Z,SAFE,compute-bound,0,fixed-in-tree,"Fixed-in-tree 2026-04-16: _run_dask_head_tail_breaks now persists data_clean once and fuses mean+head_count per iter (912ms -> 339ms, 0.37x IMPROVED); added _run_dask_box_plot that samples via _generate_sample_indices instead of boolean fancy indexing on dask array; _run_dask_cupy_box_plot likewise. 85 existing classify tests pass."
+classify,2026-06-20,RISKY,graph-bound,1,3412,"Re-audit 2026-06-20 (CUDA host). 1 HIGH: _generate_sample_indices >10M branch used RandomState.choice(replace=False) which builds a full arange(num_data) permutation -> O(num_data) host alloc (160MB for 20M pop, OOM at 30TB) despite docstring claiming O(num_sample). Backed dask/dask+cupy natural_breaks/maximum_breaks/quantile/percentiles/box_plot. Fixed via np.random.default_rng().choice (Floyd, O(num_sample), still deterministic); peak 160MB->0.4MB. Other paths SAFE: head_tail_breaks already persists+fuses; box_plot samples; cupy kernels low-register; no .values/np.asarray-on-dask/.compute-in-loop. 93 classify tests pass incl GPU."
 contour,2026-03-31T18:00:00Z,SAFE,compute-bound,0,,
 convolution,2026-03-31T18:00:00Z,SAFE,compute-bound,0,,
 corridor,2026-03-31T18:00:00Z,SAFE,compute-bound,0,,
diff --git a/xrspatial/classify.py b/xrspatial/classify.py
index 8144a4afa..fd26c5a82 100644
--- a/xrspatial/classify.py
+++ b/xrspatial/classify.py
@@ -745,17 +745,21 @@ def _generate_sample_indices(num_data, num_sample, seed=1234567890):
 
     For small datasets (<=10M elements), uses the same linspace+shuffle
     approach as the numpy backend for exact reproducibility.
-    For large datasets, uses memory-efficient RandomState.choice()
-    which is O(num_sample) rather than O(num_data).
+    For large datasets, draws the sample with ``Generator.choice`` (Floyd's
+    algorithm) so peak host memory is O(num_sample) rather than O(num_data).
+    The legacy ``RandomState.choice`` builds a full ``arange(num_data)``
+    permutation internally, which materialises the whole index space on the
+    driver host and OOMs on very large dask arrays.
     """
-    generator = np.random.RandomState(seed)
     if num_data <= 10_000_000:
+        generator = np.random.RandomState(seed)
         idx = np.linspace(
             0, num_data, num_data, endpoint=False, dtype=np.uint32
         )
         generator.shuffle(idx)
         sample_idx = idx[:num_sample]
     else:
+        generator = np.random.default_rng(seed)
         sample_idx = generator.choice(
             num_data, size=num_sample, replace=False
         )
diff --git a/xrspatial/tests/test_classify.py b/xrspatial/tests/test_classify.py
index 950409e5c..0ba007a12 100644
--- a/xrspatial/tests/test_classify.py
+++ b/xrspatial/tests/test_classify.py
@@ -1078,3 +1078,48 @@ def test_percentiles_dask_no_unknown_chunks():
         dask_result.data.compute(),
         equal_nan=True,
     )
+
+
+# ===================================================================
+# Regression test: large-array sampler must be O(num_sample) in memory
+# ===================================================================
+
+def test_generate_sample_indices_large_is_sublinear_memory():
+    """The >10M-element branch of _generate_sample_indices must not
+    allocate an O(num_data) index permutation on the driver host.
+
+    The legacy RandomState.choice(replace=False) builds a full
+    arange(num_data) internally, so peak host memory scaled with the
+    whole array and OOM'd on very large dask arrays. The Generator.choice
+    (Floyd) path keeps peak memory proportional to num_sample.
+    """
+    import tracemalloc
+    from xrspatial.classify import _generate_sample_indices
+
+    num_data = 20_000_000  # exceeds the 10M small-branch threshold
+    num_sample = 20_000
+
+    tracemalloc.start()
+    idx = _generate_sample_indices(num_data, num_sample)
+    _, peak = tracemalloc.get_traced_memory()
+    tracemalloc.stop()
+
+    assert idx.size == num_sample
+    # sorted ascending for efficient dask chunk access
+    assert np.all(np.diff(idx) >= 0)
+    assert idx.max() < num_data
+    # A full int64 permutation of num_data would be ~160 MB. The Floyd
+    # sampler stays far below that; allow generous head-room.
+    assert peak < 20 * 1024 ** 2, (
+        f"peak host memory {peak / 1024 ** 2:.1f} MB scales with num_data, "
+        "not num_sample"
+    )
+
+
+def test_generate_sample_indices_large_is_deterministic():
+    """The large-array sampler is seeded and reproducible across calls."""
+    from xrspatial.classify import _generate_sample_indices
+
+    a = _generate_sample_indices(20_000_000, 20_000)
+    b = _generate_sample_indices(20_000_000, 20_000)
+    np.testing.assert_array_equal(a, b)