From 267a6c4511180bb0d24ca44d3a186af4a0e9a6e4 Mon Sep 17 00:00:00 2001
From: Mark Kittisopikul <kittisopikulm@janelia.hhmi.org>
Date: Fri, 20 Feb 2026 13:45:33 -0500
Subject: [PATCH 1/4] tests: Add non-power-of-2 shard shapes to benchmarks

Add (30,30,30) to large_morton_shards, and (10,10,10), (20,20,20) and
(30,30,30) to morton_iter_shapes, so the benchmarks exercise the scalar
fallback path. Non-power-of-2 shapes are not fully covered by the
vectorized hypercube path.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
---
 tests/benchmarks/test_indexing.py | 12 ++++++++----
 1 file changed, 8 insertions(+), 4 deletions(-)

diff --git a/tests/benchmarks/test_indexing.py b/tests/benchmarks/test_indexing.py
index d30d731f0f..57159076a6 100644
--- a/tests/benchmarks/test_indexing.py
+++ b/tests/benchmarks/test_indexing.py
@@ -106,7 +106,8 @@ def read_with_cache_clear() -> None:
 
 # Benchmark with larger chunks_per_shard to make Morton order impact more visible
 large_morton_shards = (
-    (32,) * 3,  # With 1x1x1 chunks: 32x32x32 = 32768 chunks per shard
+    (32,) * 3,  # With 1x1x1 chunks: 32x32x32 = 32768 chunks per shard (power-of-2)
+    (30,) * 3,  # With 1x1x1 chunks: 30x30x30 = 27000 chunks per shard (non-power-of-2)
 )
 
 
@@ -197,9 +198,12 @@ def read_with_cache_clear() -> None:
 
 # Benchmark for morton_order_iter directly (no I/O)
 morton_iter_shapes = (
-    (8, 8, 8),  # 512 elements
-    (16, 16, 16),  # 4096 elements
-    (32, 32, 32),  # 32768 elements
+    (8, 8, 8),    # 512 elements    (power-of-2)
+    (10, 10, 10), # 1000 elements   (non-power-of-2)
+    (16, 16, 16), # 4096 elements   (power-of-2)
+    (20, 20, 20), # 8000 elements   (non-power-of-2)
+    (32, 32, 32), # 32768 elements  (power-of-2)
+    (30, 30, 30), # 27000 elements  (non-power-of-2)
 )
 
 

From 1dfd71dc1a2019eb0f90b521855ab6e83f2a9a0a Mon Sep 17 00:00:00 2001
From: Mark Kittisopikul <kittisopikulm@janelia.hhmi.org>
Date: Fri, 20 Feb 2026 13:55:49 -0500
Subject: [PATCH 2/4] tests: Add near-miss power-of-2 shape (33,33,33) to
 benchmarks

Add (33,33,33) to large_morton_shards and morton_iter_shapes to document
the performance penalty when a shard shape is just above a power-of-2
boundary, causing n_z to jump from 32,768 (32^3) to 262,144 (64^3).

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
---
 tests/benchmarks/test_indexing.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/tests/benchmarks/test_indexing.py b/tests/benchmarks/test_indexing.py
index 57159076a6..76278da7dd 100644
--- a/tests/benchmarks/test_indexing.py
+++ b/tests/benchmarks/test_indexing.py
@@ -108,6 +108,7 @@ def read_with_cache_clear() -> None:
 large_morton_shards = (
     (32,) * 3,  # With 1x1x1 chunks: 32x32x32 = 32768 chunks per shard (power-of-2)
     (30,) * 3,  # With 1x1x1 chunks: 30x30x30 = 27000 chunks per shard (non-power-of-2)
+    (33,) * 3,  # With 1x1x1 chunks: 33x33x33 = 35937 chunks per shard (near-miss: just above power-of-2)
 )
 
 
@@ -204,6 +205,7 @@ def read_with_cache_clear() -> None:
     (20, 20, 20), # 8000 elements   (non-power-of-2)
     (32, 32, 32), # 32768 elements  (power-of-2)
     (30, 30, 30), # 27000 elements  (non-power-of-2)
+    (33, 33, 33), # 35937 elements  (near-miss: just above power-of-2, n_z=262144)
 )
 
 

From 403c50b6275c6d5502f1cb367ecce21d358a6fc4 Mon Sep 17 00:00:00 2001
From: Mark Kittisopikul <kittisopikulm@janelia.hhmi.org>
Date: Fri, 20 Feb 2026 16:48:53 -0500
Subject: [PATCH 3/4] style: Apply ruff format to benchmark file

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
---
 tests/benchmarks/test_indexing.py | 17 +++++++++--------
 1 file changed, 9 insertions(+), 8 deletions(-)

diff --git a/tests/benchmarks/test_indexing.py b/tests/benchmarks/test_indexing.py
index 76278da7dd..385a85b5b5 100644
--- a/tests/benchmarks/test_indexing.py
+++ b/tests/benchmarks/test_indexing.py
@@ -108,7 +108,8 @@ def read_with_cache_clear() -> None:
 large_morton_shards = (
     (32,) * 3,  # With 1x1x1 chunks: 32x32x32 = 32768 chunks per shard (power-of-2)
     (30,) * 3,  # With 1x1x1 chunks: 30x30x30 = 27000 chunks per shard (non-power-of-2)
-    (33,) * 3,  # With 1x1x1 chunks: 33x33x33 = 35937 chunks per shard (near-miss: just above power-of-2)
+    (33,)
+    * 3,  # With 1x1x1 chunks: 33x33x33 = 35937 chunks per shard (near-miss: just above power-of-2)
 )
 
 
@@ -199,13 +200,13 @@ def read_with_cache_clear() -> None:
 
 # Benchmark for morton_order_iter directly (no I/O)
 morton_iter_shapes = (
-    (8, 8, 8),    # 512 elements    (power-of-2)
-    (10, 10, 10), # 1000 elements   (non-power-of-2)
-    (16, 16, 16), # 4096 elements   (power-of-2)
-    (20, 20, 20), # 8000 elements   (non-power-of-2)
-    (32, 32, 32), # 32768 elements  (power-of-2)
-    (30, 30, 30), # 27000 elements  (non-power-of-2)
-    (33, 33, 33), # 35937 elements  (near-miss: just above power-of-2, n_z=262144)
+    (8, 8, 8),  # 512 elements    (power-of-2)
+    (10, 10, 10),  # 1000 elements   (non-power-of-2)
+    (16, 16, 16),  # 4096 elements   (power-of-2)
+    (20, 20, 20),  # 8000 elements   (non-power-of-2)
+    (32, 32, 32),  # 32768 elements  (power-of-2)
+    (30, 30, 30),  # 27000 elements  (non-power-of-2)
+    (33, 33, 33),  # 35937 elements  (near-miss: just above power-of-2, n_z=262144)
 )
 
 

From ffa30657981af2364daa76fd895577b82975256c Mon Sep 17 00:00:00 2001
From: Mark Kittisopikul <kittisopikulm@janelia.hhmi.org>
Date: Fri, 20 Feb 2026 19:25:40 -0500
Subject: [PATCH 4/4] changes: Add changelog entry for PR #3717

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
---
 changes/3717.misc.md | 1 +
 1 file changed, 1 insertion(+)
 create mode 100644 changes/3717.misc.md

diff --git a/changes/3717.misc.md b/changes/3717.misc.md
new file mode 100644
index 0000000000..5fed76b2b7
--- /dev/null
+++ b/changes/3717.misc.md
@@ -0,0 +1 @@
+Add benchmarks for Morton order computation with non-power-of-2 and near-miss (just above a power of 2) shard shapes, covering both pure computation and end-to-end read/write performance.