diff --git a/CMakeLists.txt b/CMakeLists.txt
index 31e4838f31..c3f01d9e74 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -147,6 +147,13 @@ add_feature_info(TENSOR_MEM_PROFILE TA_TENSOR_MEM_PROFILE "instrumented profilin
 option(TA_TENSOR_ASSERT_NO_MUTABLE_OPS_WHILE_SHARED "Turn on TA_ASSERT that no mutable operations occur on TA::{Tensor,Tile} objects that share data" OFF)
 add_feature_info(TENSOR_ASSERT_NO_MUTABLE_OPS_WHILE_SHARED TA_TENSOR_ASSERT_NO_MUTABLE_OPS_WHILE_SHARED "TA_ASSERT that no mutable operations occur on TA::{Tensor,Tile} objects that share data")
 
+option(TA_STRIDED_DGEMM_COUNT "Compile-in atomic counters that witness strided-DGEMM firing (tests/benches)" OFF)
+add_feature_info(STRIDED_DGEMM_COUNT TA_STRIDED_DGEMM_COUNT "atomic counters that witness strided-DGEMM firing (tests/benches)")
+# Directory-scope definition: applies to targets of this directory and of subdirectories added after this point.
+if (TA_STRIDED_DGEMM_COUNT)
+  add_compile_definitions(TA_STRIDED_DGEMM_COUNT)
+endif()
+
 option(TA_EXPERT "TiledArray Expert mode: disables automatically downloading or building dependencies" OFF)
 
 redefaultable_option(TA_WERROR "Treat compiler warnings as errors when compiling TiledArray's own translation units (does not propagate to consumers of installed TiledArray targets)" OFF)
diff --git a/examples/CMakeLists.txt b/examples/CMakeLists.txt
index d240192893..0a14b99fa6 100644
--- a/examples/CMakeLists.txt
+++ b/examples/CMakeLists.txt
@@ -37,3 +37,4 @@ add_subdirectory (fock)
 add_subdirectory (mpi_tests)
 add_subdirectory (pmap_test)
 add_subdirectory (vector_tests)
+add_subdirectory (tot_bench)
diff --git a/examples/tot_bench/CMakeLists.txt b/examples/tot_bench/CMakeLists.txt
new file mode 100644
index 0000000000..5236b48e2d
--- /dev/null
+++ b/examples/tot_bench/CMakeLists.txt
@@ -0,0 +1,26 @@
+#
+#  This file is a part of TiledArray.
+#  Copyright (C) 2013  Virginia Tech
+#
+#  This program is free software: you can redistribute it and/or modify
+#  it under the terms of the GNU General Public License as published by
+#  the Free Software Foundation, either version 3 of the License, or
+#  (at your option) any later version.
+#
+#  This program is distributed in the hope that it will be useful,
+#  but WITHOUT ANY WARRANTY; without even the implied warranty of
+#  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+#  GNU General Public License for more details.
+#
+#  You should have received a copy of the GNU General Public License
+#  along with this program.  If not, see <http://www.gnu.org/licenses/>.
+#
+#  CMakeLists.txt
+#
+
+# Strided-DGEMM ToT throughput benches (strided-vs-current arena DGEMM).
+
+foreach(_exec opa_strided_arena_dgemm opb_strided_arena_dgemm regime_a_hce_e_strided_bench ce_ce_segmented_strided_bench)
+  add_ta_executable(${_exec} "${_exec}.cpp" "tiledarray")  # each bench is built from <name>.cpp
+  add_dependencies(examples-tiledarray ${_exec})  # examples-tiledarray target depends on each bench
+endforeach()
diff --git a/examples/tot_bench/ce_ce_segmented_strided_bench.cpp b/examples/tot_bench/ce_ce_segmented_strided_bench.cpp
new file mode 100644
index 0000000000..5732b95dcb
--- /dev/null
+++ b/examples/tot_bench/ce_ce_segmented_strided_bench.cpp
@@ -0,0 +1,266 @@
+// ce_ce_segmented_strided_bench.cpp
+// ---------------------------------------------------------------------------
+// Tile/BLAS-level benchmark for the hce+ce per-k SEGMENTED strided DGEMM
+// (arena_strided_dgemm_ce_ce_right). It times the SAME kernel on the SAME
+// hole-containing arena operands under the two states of the runtime kill
+// switch TiledArray::detail::ce_ce_strided_disabled():
+//
+//   segmented (disabled=false): per k, walk μ̃ and emit one strided GEMM per
+//             maximal contiguous present+uniform-stride segment; skip holes.
+//   per-cell  (disabled=true) : the legacy path -- one length-Q GEMV per
+//             present (μ̃) cell (what TA did before, reverting to per-cell
+//             whenever results/operands contained holes).
+//
+// The ONLY variable between the two timings is that toggle, so the ratio
+// isolates the kernel strategy swap. Operands model the measured CSV-CCk
+// fallback regime: present cells are CLUSTERED (mean segment length ~ --cluster)
+// and per-k MISALIGNED (each k shifts its hole phase), the pattern the old
+// all-or-nothing gate fell back to scalar on. The right-kernel walker is
+// identical to the left's, so this speedup represents hce+ce overall.
+// ---------------------------------------------------------------------------
+
+#include <tiledarray.h>
+#include <TiledArray/math/blas.h>
+#include <TiledArray/tensor/arena_einsum.h>
+#include <TiledArray/tensor/arena_tensor.h>
+
+#include <algorithm>
+#include <chrono>
+#include <cstdio>
+#include <cstdlib>
+#include <functional>
+#include <string>
+#include <vector>
+
+namespace TA = TiledArray;  // shorthand used throughout this file
+namespace tablas = TA::math::blas;  // source of the NoTranspose / Transpose tags passed to the kernel
+
+using Inner = TA::ArenaTensor<double, TA::Range>;  // inner (per-cell) tensor type
+using Outer = TA::Tensor<Inner>;  // outer tensor whose elements are Inner cells
+
+using clock_type = std::chrono::steady_clock;
+static double ms_since(clock_type::time_point t0) {
+  // Elapsed time from t0 to now, reported in (fractional) milliseconds.
+  const auto elapsed = clock_type::now() - t0;
+  const auto ns = std::chrono::duration_cast<std::chrono::nanoseconds>(elapsed);
+  return ns.count() / 1.0e6;
+}
+
+struct Cli {  // command-line options; the defaults match the text printed by usage()
+  int reps = 30;       // timed reps per path
+  int warmup = 5;      // untimed warmup reps per path
+  long Mmu = 256;      // strided axis (right external) -> BLAS M
+  long nK = 8;         // outer-contraction slabs (looped, beta=1)
+  long P = 16;         // result inner free (a1)
+  long Q = 16;         // contraction inner (a4)
+  long cluster = 6;    // mean present-run length (~ mean segment M)
+  double c_fraction = 0.0;  // fraction of otherwise-present C cells to drop to holes (sparse result)
+};
+
+static void usage() {
+  std::fputs(  // fixed help text (no format arguments), written to stderr
+               "ce_ce_segmented_strided_bench\n"
+               "  --reps R       timed reps per path     (default 30)\n"
+               "  --warmup W     untimed warmup reps      (default 5)\n"
+               "  --Mmu N        strided axis extent      (default 256)\n"
+               "  --nK N         outer-contraction slabs  (default 8)\n"
+               "  --P N          result inner free a1     (default 16)\n"
+               "  --Q N          contraction inner a4     (default 16)\n"
+               "  --cluster N    mean present-run length  (default 6)\n"
+               "  --c_fraction F fraction of C cells dropped to holes (default 0.0)\n", stderr);
+}
+
+static Cli parse_cli(int argc, char** argv) {  // exits on --help (0) or on any bad input (1)
+  Cli c;  // starts from the defaults declared in Cli
+  for (int i = 1; i < argc; ++i) {
+    const std::string a = argv[i];
+    auto need = [&]() -> std::string {  // value that must follow flag `a`; fatal if absent
+      if (i + 1 >= argc) { usage(); std::exit(1); }
+      return argv[++i]; };
+    try {  // std::sto{i,l,d} throw on malformed or out-of-range text
+      if (a == "--reps") c.reps = std::stoi(need());
+      else if (a == "--warmup") c.warmup = std::stoi(need());
+      else if (a == "--Mmu") c.Mmu = std::stol(need());
+      else if (a == "--nK") c.nK = std::stol(need());
+      else if (a == "--P") c.P = std::stol(need());
+      else if (a == "--Q") c.Q = std::stol(need());
+      else if (a == "--cluster") c.cluster = std::stol(need());
+      else if (a == "--c_fraction") c.c_fraction = std::stod(need());
+      else if (a == "-h" || a == "--help") { usage(); std::exit(0); }
+      else { std::fprintf(stderr, "unknown flag: %s\n", a.c_str()); usage(); std::exit(1); }
+    } catch (const std::exception&) { std::fprintf(stderr, "bad value for flag: %s\n", a.c_str()); usage(); std::exit(1); }
+  }
+  return c;
+}
+
+// Build an arena Outer with holes: dense_shape(o) unless is_hole(o) -> Range{}.
+static Outer make_sparse(const TA::Range& outer_range, std::size_t nbatch,
+                         const std::function<TA::Range(std::size_t)>& dense_shape,
+                         const std::function<bool(std::size_t)>& is_hole,
+                         double base) {
+  auto cell_shape = [&](std::size_t o) { return is_hole(o) ? TA::Range{} : dense_shape(o); };
+  Outer t = TA::detail::arena_outer_init<Outer>(outer_range, nbatch, cell_shape);
+  const std::size_t ncells = t.range().volume() * nbatch;
+  for (std::size_t o = 0; o < ncells; ++o) {
+    Inner& cell = t.data()[o];
+    if (!cell) continue;  // holes carry no data to fill
+    for (std::size_t e = 0; e < cell.size(); ++e) cell.data()[e] = base + 0.001 * o + e;
+  }
+  return t;
+}
+
+int main(int argc, char** argv) {  // returns 1 on rejected CLI values, 0 on success
+  Cli cli = parse_cli(argc, argv);
+  if (cli.reps < 1) { std::fprintf(stderr, "--reps must be >= 1\n"); return 1; }
+  if (cli.warmup < 0 || cli.Mmu < 1 || cli.nK < 1 || cli.P < 1 || cli.Q < 1) {  // cast to size_t below
+    std::fprintf(stderr, "--warmup must be >= 0; --Mmu/--nK/--P/--Q must be >= 1\n"); return 1; }
+  auto& world = TA_SCOPED_INITIALIZE(argc, argv);
+  (void)world;
+
+  const std::size_t Mo = 1;
+  const std::size_t Mmu = static_cast<std::size_t>(cli.Mmu);
+  const std::size_t nK = static_cast<std::size_t>(cli.nK);
+  const long P = cli.P, Q = cli.Q;
+  const long cl = std::max<long>(1, cli.cluster);
+
+  // Clustered + per-k-misaligned presence on R[mu,k] (canonical mu slow, k fast,
+  // ordinal o = mu*nK + k). A cell is a hole when, within its k-shifted phase,
+  // it falls in the 1-wide gap after each run of length `cl`. period = cl + 1.
+  auto rhole = [&](std::size_t o) {
+    const std::size_t mu = o / nK, k = o % nK;
+    const long period = cl + 1;
+    const long phase = (static_cast<long>(mu) + static_cast<long>(k) * 2) % period;
+    return phase == cl;  // the single gap cell each period
+  };
+  const double c_frac = std::max(0.0, std::min(1.0, cli.c_fraction));
+  // C[mu] present iff present for at least one k (the union the kernel writes),
+  // optionally thinned: a deterministic fraction c_frac of otherwise-present
+  // cells are dropped to holes to model a genuinely SPARSE result. A hole C cell
+  // is absent regardless of operand presence (its (k,mu) contributions skip).
+  auto chole = [&](std::size_t o) {
+    bool union_present = false;
+    for (std::size_t k = 0; k < nK; ++k)
+      if (!rhole(o * nK + k)) { union_present = true; break; }
+    if (!union_present) return true;            // absent for all k -> hole
+    if (c_frac > 0.0) {
+      const std::size_t h = (o * 2654435761ull) & 0xffffull;  // cheap hash
+      if (static_cast<double>(h) / 65536.0 < c_frac) return true;
+    }
+    return false;
+  };
+
+  Outer L = TA::detail::arena_outer_init<Outer>(
+      TA::Range{nK}, 1, [&](std::size_t) {
+        return TA::Range{static_cast<std::size_t>(P),
+                         static_cast<std::size_t>(Q)};
+      });
+  for (std::size_t o = 0; o < L.range().volume(); ++o)
+    for (std::size_t e = 0; e < L.data()[o].size(); ++e)
+      L.data()[o].data()[e] = 1.0 + 0.001 * o + e;
+  Outer R = make_sparse(TA::Range{Mmu, nK}, 1,
+                        [&](std::size_t){ return TA::Range{static_cast<std::size_t>(Q)}; },
+                        rhole, 2.0);
+  Outer Ctemplate = make_sparse(TA::Range{Mmu}, 1,
+                        [&](std::size_t){ return TA::Range{static_cast<std::size_t>(P)}; },
+                        chole, 0.0);
+
+  std::size_t present_C = 0, present_R = 0;  // non-hole cell counts, reported in the banner
+  for (std::size_t o = 0; o < Ctemplate.range().volume(); ++o)
+    if (Ctemplate.data()[o]) ++present_C;
+  for (std::size_t o = 0; o < R.range().volume(); ++o)
+    if (R.data()[o]) ++present_R;
+
+  std::printf("=== hce+ce segmented-vs-per-cell strided DGEMM bench ===\n");
+  std::printf("Mmu=%zu nK=%zu P=%ld Q=%ld cluster=%ld  "
+              "present C=%zu/%zu  present R=%zu/%zu\n",
+              Mmu, nK, P, Q, cl, present_C, Ctemplate.range().volume(),
+              present_R, R.range().volume());
+  std::printf("reps=%d warmup=%d\n", cli.reps, cli.warmup);
+
+  // FLOP estimate: 2*P*Q per present (mu,k) contributing cell.
+  double flop = 0.0;
+  for (std::size_t mu = 0; mu < Mmu; ++mu)
+    for (std::size_t k = 0; k < nK; ++k)
+      if (R.data()[mu * nK + k] && Ctemplate.data()[mu])
+        flop += 2.0 * P * Q;
+
+  auto zero_C = [&](Outer& C) {
+    for (std::size_t o = 0; o < C.range().volume(); ++o) {
+      Inner& c = C.data()[o];
+      if (!c) continue;
+      for (std::size_t e = 0; e < c.size(); ++e) c.data()[e] = 0.0;
+    }
+  };
+  auto make_C = [&]() {
+    return make_sparse(TA::Range{Mmu}, 1,
+        [&](std::size_t){ return TA::Range{static_cast<std::size_t>(P)}; },
+        chole, 0.0);
+  };
+  auto median = [](std::vector<double> v) -> double {
+    std::sort(v.begin(), v.end());
+    const std::size_t n = v.size();
+    if (n == 0) return 0.0;
+    return (n % 2) ? v[n / 2] : 0.5 * (v[n / 2 - 1] + v[n / 2]);
+  };
+  // Time ONLY the kernel call. C is allocated once per path and re-zeroed
+  // OUTSIDE the timed window each rep (beta=1 needs a zero start), so the timing
+  // isolates segment-walker vs per-cell strategy from per-rep tile allocation
+  // (identical on both sides; it would otherwise dominate this ~0.1 ms kernel).
+  auto time_path = [&](bool disabled) {
+    Outer C = make_C();
+    TA::detail::ce_ce_strided_disabled() = disabled;
+    for (int w = 0; w < cli.warmup; ++w) { zero_C(C);
+      TA::detail::arena_strided_dgemm_ce_ce_right(
+          C, L, R, Mo, Mmu, nK, tablas::NoTranspose, tablas::Transpose, 1.0); }
+    std::vector<double> ms;
+    ms.reserve(cli.reps);
+    for (int r = 0; r < cli.reps; ++r) {
+      zero_C(C);  // untimed
+      auto t0 = clock_type::now();
+      TA::detail::arena_strided_dgemm_ce_ce_right(
+          C, L, R, Mo, Mmu, nK, tablas::NoTranspose, tablas::Transpose, 1.0);
+      ms.push_back(ms_since(t0));
+    }
+    return ms;
+  };
+
+#ifdef TA_STRIDED_DGEMM_COUNT
+  TA::detail::g_strided_dgemm_ce_ce_right_calls.store(0);
+#endif
+  auto seg_ms = time_path(/*disabled=*/false);
+#ifdef TA_STRIDED_DGEMM_COUNT
+  const std::size_t seg_calls =
+      TA::detail::g_strided_dgemm_ce_ce_right_calls.load();
+#endif
+  auto pc_ms = time_path(/*disabled=*/true);
+  TA::detail::ce_ce_strided_disabled() = false;  // restore production default
+
+  const double seg_min = *std::min_element(seg_ms.begin(), seg_ms.end());
+  const double seg_med = median(seg_ms);
+  const double pc_min = *std::min_element(pc_ms.begin(), pc_ms.end());
+  const double pc_med = median(pc_ms);
+
+  std::printf("\n--- results (per kernel call, ms) ---\n");
+  std::printf("per-cell  : min=%9.5f ms  median=%9.5f ms  (%.2f GFLOP/s)\n",
+              pc_min, pc_med, flop / (pc_min * 1e6));
+  std::printf("segmented : min=%9.5f ms  median=%9.5f ms  (%.2f GFLOP/s)\n",
+              seg_min, seg_med, flop / (seg_min * 1e6));
+  std::printf("speedup   : min=%6.3fx  median=%6.3fx  (per-cell / segmented)\n",
+              seg_min > 0 ? pc_min / seg_min : 0.0,
+              seg_med > 0 ? pc_med / seg_med : 0.0);
+
+#ifdef TA_STRIDED_DGEMM_COUNT
+  std::printf("\n--- firing witness (TA_STRIDED_DGEMM_COUNT) ---\n");
+  std::printf("segment GEMMs over %d+%d (warmup+timed) reps = %zu  "
+              "(mean %.1f per rep)\n",
+              cli.warmup, cli.reps, seg_calls,
+              double(seg_calls) / double(cli.warmup + cli.reps));
+  if (seg_calls == 0) {
+    std::fprintf(stderr, "ERROR: segmented path never issued a GEMM -- the "
+                 "reported speedup would reflect a silent fallback.\n");
+    std::abort();
+  }
+  std::printf("OK: segmented path fired (counter > 0).\n");
+#endif
+  return 0;
+}
diff --git a/examples/tot_bench/opa_strided_arena_dgemm.cpp b/examples/tot_bench/opa_strided_arena_dgemm.cpp
new file mode 100644
index 0000000000..e36100064b
--- /dev/null
+++ b/examples/tot_bench/opa_strided_arena_dgemm.cpp
@@ -0,0 +1,860 @@
+// opa_strided_arena_dgemm.cpp
+// ---------------------------------------------------------------------------
+// Standalone tile/BLAS-level benchmark for the op_A ToT x ToT -> ToT
+// product, on ArenaTensor inner cells. Companion to opb_strided_arena_dgemm.cpp.
+//
+//   op_A:  I(i_2,i_1,Κ; a_1,a_4) * I(i_2,i_1,μ̃,Κ; a_4) -> I(μ̃,i_2,i_1; a_1)
+//          Hadamard = {i_1,i_2}, outer-contracted = {Κ}, outer-external = {μ̃},
+//          inner-contracted = {a_4}, inner-external (kept) = {a_1}.
+//          P = |a_1|, Q = |a_4|  depend (jaggedly) on the Hadamard slice.
+//
+//   C(μ̃,i_2,i_1; a_1) = sum_Κ sum_{a_4} L(i_2,i_1,Κ; a_1,a_4)*R(i_2,i_1,μ̃,Κ; a_4)
+//
+// Unlike op_B (inner OUTER-product), op_A's inner part is a real CONTRACTION
+// over a_4, and the full reduction spans an OUTER index (Κ) AND an INNER index
+// (a_4). That means the *direct* op_B-style fusion of the outer contraction Κ
+// would have to merge (Κ ⊗ a_4) into a single BLAS K axis -- a two-level stride
+// no single leading-dimension can express -> a pack/deep-clone is required.
+//
+// But a DIFFERENT, ZERO-COPY fusion exists: ride the outer EXTERNAL μ̃ into the
+// GEMM M axis (one outer index, via inter-cell stride), keep a_4 as the
+// contiguous inner K, keep a_1 as inner N, and loop the outer contraction Κ
+// with beta-accumulation:
+//
+//   for Κ:  C̃[μ̃, a_1] += R̃_Κ[μ̃, a_4] · L_Κ[a_1, a_4]^T      (M=|μ̃|, N=P, K=Q)
+//
+// This benchmark compares four ways to evaluate one Hadamard slice, all reading
+// already-fusable arena slabs (operands laid contracted/contiguous-friendly):
+//
+//   current_gemm : Mμ·nK tiny (P x 1, K=Q) GEMMs -- exactly what TA dispatches
+//                  today (per result cell, per Κ: one inner gemm with N=1).
+//   current_gemv : Mμ·nK BLAS dgemv calls (the natural mat-vec primitive).
+//   strided      : nK strided GEMMs (μ̃ ridden into M), zero-copy, beta-accum.
+//   packed       : pack (Κ,a_4) contiguous then ONE GEMM (M=Mμ,N=P,K=nK·Q);
+//                  the pack cost (= the deep-clone tradeoff) is timed too.
+//
+// Work units {P,Q,Mμ,nK} per Hadamard slice are reconstructed from op_dump.txt.
+// (Idealization: the μ̃ x Κ block per slice is treated as dense; Mμ and nK are
+//  the per-slice nonzero μ̃ / Κ counts. The shape distribution -- which is what
+//  drives the GEMV->GEMM win -- is faithful.)
+// ---------------------------------------------------------------------------
+
+#include <tiledarray.h>
+#include <TiledArray/math/blas.h>
+#include <TiledArray/tensor/arena_kernels.h>
+#include <TiledArray/tensor/arena_tensor.h>
+
+#include <blas.hh>
+#include <btas/zb/range.h>
+
+#include <algorithm>
+#include <chrono>
+#include <cmath>
+#include <cstdint>
+#include <cstdio>
+#include <cstdlib>
+#include <cstring>
+#include <fstream>
+#include <limits>
+#include <map>
+#include <random>
+#include <string>
+#include <unordered_map>
+#include <vector>
+
+namespace TA = TiledArray;  // shorthand used throughout this file
+
+using Inner = TA::ArenaTensor<double>;  // inner (per-cell) tensor type
+using InnerRange = typename Inner::range_type;  // range type used to shape each Inner cell
+using Outer = TA::Tensor<Inner>;  // outer tensor whose elements are Inner cells
+
+using clock_type = std::chrono::steady_clock;
+static double ms_since(clock_type::time_point t0) {
+  // Milliseconds elapsed since t0, derived from a whole-nanosecond tick count.
+  namespace chr = std::chrono;
+  const auto ticks = chr::duration_cast<chr::nanoseconds>(clock_type::now() - t0);
+  return ticks.count() / 1.0e6;
+}
+
+// ===========================================================================
+// op_dump.txt parser (same structure as opb_strided_arena_dgemm.cpp)
+// ===========================================================================
+
+struct InnerRangeKey {  // one distinct inner range listed for a tile in the dump
+  std::vector<long> lo, hi;  // filled from the inner_lo=[...] / inner_hi=[...] lists of a "range[" line
+};
+struct TileSpec {  // per-tile record, started by an "  ord=" line inside a tiles section
+  long outer_vol = 0;  // value of outer_vol=; parse_dump sizes cell_labels with it
+  std::vector<InnerRangeKey> distinct;  // one entry per "    range[" line, in file order
+  std::vector<int> cell_labels;  // per cell: label from cells_rle, or -1 (initial value, also used for "E")
+};
+struct ArraySpec {  // one operand (L, R or C) of a dumped op
+  int outer_rank = 0;  // value following "<X>.outer_rank="
+  std::vector<std::vector<long>> dim_tile_bounds;  // one tile_bounds=[...] list per ".dim[" line
+  std::map<std::size_t, TileSpec> tiles;  // keyed by the tile's ord= value
+};
+struct DumpEntry {  // everything parsed for one "===== OP DUMP id=" section
+  std::string annot_left, annot_right, annot_out;  // text after annot_left= / annot_right= / annot_out=
+  double t_einsum = 0;  // number following "t_einsum=" on the section header, when present
+  ArraySpec L, R, C;  // filled from the L.* / R.* / C.* lines of the section
+};
+
+static std::vector<long> parse_long_list(std::string s) {
+  // Collects the integers of a "[a, b, c]"-style list, stopping at the first ']'.
+  std::vector<long> out;
+  std::string digits;
+  auto flush = [&] {  // convert the pending token, if any
+    if (digits.empty()) return;
+    out.push_back(std::stol(digits));
+    digits.clear();
+  };
+  for (const char ch : s) {
+    if (ch == ']') break;
+    if (ch == ',') flush();
+    else if (ch != '[' && ch != ' ') digits.push_back(ch);
+  }
+  flush();
+  return out;
+}
+static std::string get_kv(std::string const& line, std::string const& key) {
+  // Token following the first "key=" in line (up to space/tab/newline), or "" if absent.
+  const auto hit = line.find(key + "=");
+  if (hit == std::string::npos) return {};
+  const auto first = hit + key.size() + 1;  // skip past "key="
+  const auto last = line.find_first_of(" \t\n", first);
+  return (last == std::string::npos) ? line.substr(first) : line.substr(first, last - first);
+}
+static std::string get_kv_bracket(std::string const& line,
+                                  std::string const& key) {
+  // Returns the "[...]" text (brackets included) after the first "key=[", else "".
+  const auto hit = line.find(key + "=[");
+  const auto open = (hit == std::string::npos) ? hit : hit + key.size() + 1;
+  const auto close = (open == std::string::npos) ? open : line.find(']', open);
+  if (close == std::string::npos) return {};
+  return line.substr(open, close - open + 1);
+}
+struct CellRun {  // one "[lo..hi:LABEL]" group of a cells_rle list
+  std::size_t lo, hi;  // cell ordinal bounds; parse_dump applies them inclusively (k <= hi)
+  int label;  // -1 for label "E", otherwise the integer after the label's first character
+};
+static std::vector<CellRun> parse_cells_rle(std::string const& s) {
+  // Parses a list of "[lo..hi:LABEL]" groups. LABEL "E" maps to -1; otherwise the
+  // label's first character is dropped and the rest is read as an int.
+  std::vector<CellRun> runs;
+  std::size_t i = 0;
+  while (i < s.size()) {
+    if (s[i] != '[') { ++i; continue; }
+    const auto end = s.find(']', i);
+    if (end == std::string::npos) break;  // unterminated group: stop parsing
+    const auto inner = s.substr(i + 1, end - i - 1);
+    i = end + 1;  // the next scan resumes after this group, whatever happens to it
+    const auto dotdot = inner.find("..");
+    const auto colon = inner.find(':');
+    if (dotdot == std::string::npos || colon == std::string::npos) continue;
+    try {  // substr / std::stoull / std::stoi throw on an empty label or non-numeric text
+      CellRun r;
+      r.lo = std::stoull(inner.substr(0, dotdot));
+      r.hi = std::stoull(inner.substr(dotdot + 2, colon - dotdot - 2));
+      const auto lab = inner.substr(colon + 1);
+      r.label = (lab == "E") ? -1 : std::stoi(lab.substr(1));
+      runs.push_back(r);
+    } catch (const std::exception&) {  // previously escaped and aborted the whole parse
+      std::fprintf(stderr, "WARNING: skipping malformed cells_rle run [%s]\n", inner.c_str());
+    }
+  }
+  return runs;
+}
+static std::vector<DumpEntry> parse_dump(std::string const& path) {  // one DumpEntry per section of the file
+  std::ifstream f(path);
+  if (!f) {  // an unreadable dump is fatal
+    std::fprintf(stderr, "ERROR: cannot open %s\n", path.c_str());
+    std::exit(1);
+  }
+  std::vector<DumpEntry> entries;
+  DumpEntry* cur = nullptr;  // section currently being filled (last element of entries)
+  ArraySpec* arr = nullptr;  // operand (L/R/C) of cur currently being filled
+  TileSpec* tile = nullptr;  // tile of arr currently being filled
+  std::string line;
+  bool in_tiles = false;  // true between a ".tiles_BEGIN" and the matching ".tiles_END" line
+  while (std::getline(f, line)) {
+    if (line.rfind("===== OP DUMP id=", 0) == 0) {  // section header: start a new entry
+      entries.emplace_back();
+      cur = &entries.back();  // re-taken on every header, since emplace_back may reallocate
+      auto t = line.find("t_einsum=");
+      if (t != std::string::npos) cur->t_einsum = std::stod(line.substr(t + 9));
+      arr = nullptr;  // pointers into the previous entry are dropped here
+      tile = nullptr;
+      in_tiles = false;
+    } else if (!cur) {  // ignore everything before the first section header
+      continue;
+    } else if (line.rfind("annot_left=", 0) == 0) {
+      cur->annot_left = line.substr(11);  // 11 == strlen("annot_left=")
+    } else if (line.rfind("annot_right=", 0) == 0) {
+      cur->annot_right = line.substr(12);  // 12 == strlen("annot_right=")
+    } else if (line.rfind("annot_out=", 0) == 0) {
+      cur->annot_out = line.substr(10);  // 10 == strlen("annot_out=")
+    } else if (line.rfind("L.outer_rank=", 0) == 0) {  // subsequent dim/tile lines go to L
+      arr = &cur->L;
+      arr->outer_rank = std::stoi(line.substr(13));
+      in_tiles = false;
+    } else if (line.rfind("R.outer_rank=", 0) == 0) {  // subsequent dim/tile lines go to R
+      arr = &cur->R;
+      arr->outer_rank = std::stoi(line.substr(13));
+      in_tiles = false;
+    } else if (line.rfind("C.outer_rank=", 0) == 0) {  // subsequent dim/tile lines go to C
+      arr = &cur->C;
+      arr->outer_rank = std::stoi(line.substr(13));
+      in_tiles = false;
+    } else if (arr && line.find(".dim[") != std::string::npos &&
+               line.find("tile_bounds=") != std::string::npos) {  // one tiling list per dimension line
+      arr->dim_tile_bounds.push_back(
+          parse_long_list(get_kv_bracket(line, "tile_bounds")));
+    } else if (arr && line.find(".tiles_BEGIN") != std::string::npos) {
+      in_tiles = true;
+      tile = nullptr;
+    } else if (arr && line.find(".tiles_END") != std::string::npos) {
+      in_tiles = false;
+      tile = nullptr;
+    } else if (in_tiles && arr && line.rfind("  ord=", 0) == 0) {  // start of a tile record
+      std::size_t ord = std::stoull(get_kv(line, "ord"));
+      auto& ts = arr->tiles[ord];  // creates the record on first sight of this ord
+      ts.outer_vol = std::stol(get_kv(line, "outer_vol"));
+      ts.cell_labels.assign(ts.outer_vol, -1);  // every cell starts unlabeled (-1)
+      tile = &ts;  // std::map references stay valid across later insertions
+    } else if (in_tiles && tile && line.rfind("    range[", 0) == 0) {  // one distinct inner range
+      InnerRangeKey key;
+      key.lo = parse_long_list(get_kv_bracket(line, "inner_lo"));
+      key.hi = parse_long_list(get_kv_bracket(line, "inner_hi"));
+      tile->distinct.push_back(std::move(key));
+    } else if (in_tiles && tile && line.rfind("    cells_rle=", 0) == 0) {
+      for (auto& r : parse_cells_rle(line.substr(14)))  // 14 == strlen("    cells_rle=")
+        for (std::size_t k = r.lo; k <= r.hi && k < tile->cell_labels.size();  // inclusive run, clipped to the tile
+             ++k)
+          tile->cell_labels[k] = r.label;
+    }
+  }
+  for (auto& e : entries)  // post-pass over every tile of every operand
+    for (auto* a : {&e.L, &e.R, &e.C})
+      for (auto& kv : a->tiles) {
+        auto& ts = kv.second;
+        bool any = false;  // does any cell carry a non-negative label?
+        for (int l : ts.cell_labels)
+          if (l >= 0) {
+            any = true;
+            break;
+          }
+        if (!any && !ts.distinct.empty())  // ranges listed but no cell labeled: label every cell 0
+          std::fill(ts.cell_labels.begin(), ts.cell_labels.end(), 0);
+      }
+  return entries;
+}
+
+// ===========================================================================
+// Fast cell-presence queries
+// ===========================================================================
+
+struct DimTiling {  // tiling of one outer dimension
+  std::vector<long> bounds;  // tile boundaries; make_dim_tiling maps [bounds[t], bounds[t+1]) to tile t
+  std::vector<int> elem_to_tile;  // element index -> tile index (filled by make_dim_tiling)
+  long extent() const { return bounds.empty() ? 0 : bounds.back(); }  // last boundary, 0 if none
+  int ntiles() const { return static_cast<int>(bounds.size()) - 1; }  // yields -1 when bounds is empty
+};
+static DimTiling make_dim_tiling(std::vector<long> const& bounds) {
+  // Precompute, for every element index e, the tile t with bounds[t] <= e < bounds[t+1].
+  DimTiling dt;
+  dt.bounds = bounds;
+  dt.elem_to_tile.assign(dt.extent(), 0);
+  for (int t = 0, nt = dt.ntiles(); t < nt; ++t)
+    for (long e = bounds[t], e_end = bounds[t + 1]; e < e_end; ++e) dt.elem_to_tile[e] = t;
+  return dt;
+}
+struct OperandIndex {  // lookup helper: outer element index -> cell label of an ArraySpec
+  ArraySpec const* spec = nullptr;  // non-owning; set by build() and dereferenced by rank()/query()
+  std::vector<DimTiling> dims;  // one tiling per entry of spec->dim_tile_bounds
+  std::vector<std::size_t> tile_strides;  // strides of the tile ordinal, last dimension fastest
+  void build(ArraySpec const& s) {
+    spec = &s;
+    dims.clear();
+    for (auto const& b : s.dim_tile_bounds) dims.push_back(make_dim_tiling(b));
+    int rank = s.outer_rank;
+    tile_strides.assign(rank, 1);
+    for (int d = rank - 2; d >= 0; --d)  // stride[d] = product of ntiles() over dims d+1 .. rank-1
+      tile_strides[d] = tile_strides[d + 1] * dims[d + 1].ntiles();
+  }
+  int rank() const { return spec->outer_rank; }
+  long dim_extent(int d) const { return dims[d].extent(); }
+  int query(std::vector<long> const& idx,  // returns the cell's label, or -1 when tile/cell is not found
+            std::vector<long>* out_hi = nullptr) const {  // optional out: distinct[label].hi of the cell
+    int rank = spec->outer_rank;
+    std::size_t tord = 0;  // ordinal of the tile containing idx
+    std::vector<long> tile_lo(rank), tile_ext(rank);
+    for (int d = 0; d < rank; ++d) {
+      int t = dims[d].elem_to_tile[idx[d]];  // unchecked: idx[d] must be a valid element index
+      tord += static_cast<std::size_t>(t) * tile_strides[d];
+      tile_lo[d] = dims[d].bounds[t];
+      tile_ext[d] = dims[d].bounds[t + 1] - dims[d].bounds[t];
+    }
+    auto it = spec->tiles.find(tord);
+    if (it == spec->tiles.end()) return -1;  // tile has no record in the dump
+    auto const& ts = it->second;
+    std::size_t k = 0;  // ordinal of the cell within its tile, last dimension fastest
+    for (int d = 0; d < rank; ++d)
+      k = k * static_cast<std::size_t>(tile_ext[d]) + (idx[d] - tile_lo[d]);
+    if (k >= ts.cell_labels.size()) return -1;
+    int lbl = ts.cell_labels[k];
+    if (lbl >= 0 && out_hi) *out_hi = ts.distinct[lbl].hi;  // label indexes the tile's distinct ranges
+    return lbl;
+  }
+};
+
+// ===========================================================================
+// Annotation classification
+// ===========================================================================
+
+static void split_annot(std::string const& annot, std::vector<std::string>& o,
+                        std::vector<std::string>& in) {
+  // Splits "a,b;c,d" at the first ';': the labels before it go to o, the labels
+  // after it go to in. Both outputs are reset first; empty fields are dropped.
+  o.clear();
+  in.clear();
+  // Appends the non-empty ','-separated fields of text to dst.
+  auto tokenize = [](std::string const& text, std::vector<std::string>& dst) {
+    std::size_t from = 0;
+    while (from <= text.size()) {
+      // End of the current field: the next comma, or the end of text.
+      const auto comma = std::min(text.find(',', from), text.size());
+      if (comma > from) dst.push_back(text.substr(from, comma - from));
+      from = comma + 1;
+    }
+  };
+  const auto semi = annot.find(';');
+  tokenize(annot.substr(0, semi), o);  // semi == npos means "the whole string"
+  // Without a ';' there is no inner part at all.
+  if (semi != std::string::npos) tokenize(annot.substr(semi + 1), in);
+}
+static int find_idx(std::vector<std::string> const& v, std::string const& s) {
+  // Position of the first element equal to s, or -1 when s is absent.
+  const auto it = std::find(v.begin(), v.end(), s);
+  return it == v.end() ? -1 : static_cast<int>(it - v.begin());
+}
+
+// ===========================================================================
+// Per-Hadamard-slice work unit
+// ===========================================================================
+
+struct WorkUnit {  // sizes describing the work of one Hadamard slice
+  long P, Q, Mmu, nK;  // |a_1|, |a_4|, per-slice μ̃ count, per-slice Κ count
+};
+
+// Ready-to-fuse arena operands for one slice (populated by build_slice).
+struct SliceOperand {
+  Outer Lslab;   // outer {nK},        inner {P,Q}: L_Κ[a_1,a_4]
+  Outer Rslab;   // outer {nK, Mμ},    inner {Q}  : R(Κ,μ̃)[a_4], μ̃ fastest
+  Outer Cslab;   // outer {Mμ},        inner {P}  : C(μ̃)[a_1]
+  long P, Q, Mmu, nK;  // copied from the WorkUnit by build_slice
+  std::ptrdiff_t sR;  // inter-μ̃-cell stride within a Κ block (elements)
+  std::ptrdiff_t sC;  // inter-μ̃-cell stride of C (elements)
+  std::vector<double> ref;  // golden Mμ*P reference
+  // scratch pack buffers (reused across reps for the packed path)
+  std::vector<double> Lpacked, Rpacked;  // sized P*nK*Q and Mμ*nK*Q by build_slice
+};
+
+static void build_slice(SliceOperand& s, WorkUnit const& wu,  // fills every field of s from wu
+                        std::mt19937_64& rng) {  // rng supplies the operand values (advanced in place)
+  s.P = wu.P;
+  s.Q = wu.Q;
+  s.Mmu = wu.Mmu;
+  s.nK = wu.nK;
+  const long P = wu.P, Q = wu.Q, Mmu = wu.Mmu, nK = wu.nK;
+
+  s.Lslab = TA::detail::arena_outer_init<Outer>(  // nK cells, each shaped {P, Q}
+      TA::Range{static_cast<std::size_t>(nK)}, 1,
+      [P, Q](std::size_t) { return InnerRange{P, Q}; }, /*zero_init=*/false);
+  s.Rslab = TA::detail::arena_outer_init<Outer>(  // nK x Mmu cells, each shaped {Q}
+      TA::Range{static_cast<std::size_t>(nK), static_cast<std::size_t>(Mmu)}, 1,
+      [Q](std::size_t) { return InnerRange{Q}; }, /*zero_init=*/false);
+  s.Cslab = TA::detail::arena_outer_init<Outer>(  // Mmu cells, each shaped {P}
+      TA::Range{static_cast<std::size_t>(Mmu)}, 1,
+      [P](std::size_t) { return InnerRange{P}; }, /*zero_init=*/true);
+
+  std::uniform_real_distribution<double> dist(-1.0, 1.0);
+  for (long k = 0; k < nK; ++k) {  // draw order: L_k first, then every R(k, mu) of that k
+    double* l = s.Lslab.data()[k].data();
+    for (long e = 0; e < P * Q; ++e) l[e] = dist(rng);
+    for (long mu = 0; mu < Mmu; ++mu) {
+      double* r = s.Rslab.data()[k * Mmu + mu].data();  // cell ordinal k*Mmu + mu
+      for (long q = 0; q < Q; ++q) r[q] = dist(rng);
+    }
+  }
+  // strides (constant within a uniform-Q / uniform-P slice)
+  s.sR = (Mmu > 1)  // measured from the first two cells; falls back to Q for a single cell
+             ? (s.Rslab.data()[1].data() - s.Rslab.data()[0].data())  // μ̃ fast
+             : Q;
+  s.sC = (Mmu > 1) ? (s.Cslab.data()[1].data() - s.Cslab.data()[0].data()) : P;  // same scheme for C
+
+  // golden reference: C[μ̃,a_1] = sum_Κ sum_{a_4} L_Κ[a_1,a_4] * R(Κ,μ̃)[a_4]
+  s.ref.assign(static_cast<std::size_t>(Mmu) * P, 0.0);
+  for (long k = 0; k < nK; ++k) {
+    const double* l = s.Lslab.data()[k].data();  // P x Q row-major
+    for (long mu = 0; mu < Mmu; ++mu) {
+      const double* r = s.Rslab.data()[k * Mmu + mu].data();  // Q
+      double* c = &s.ref[mu * P];  // row mu of the dense Mmu x P reference
+      for (long a1 = 0; a1 < P; ++a1) {
+        double acc = 0;
+        const double* lr = l + a1 * Q;  // row a1 of L_k
+        for (long a4 = 0; a4 < Q; ++a4) acc += lr[a4] * r[a4];
+        c[a1] += acc;
+      }
+    }
+  }
+  s.Lpacked.assign(static_cast<std::size_t>(P) * nK * Q, 0.0);  // zero-filled scratch buffers
+  s.Rpacked.assign(static_cast<std::size_t>(Mmu) * nK * Q, 0.0);
+}
+
+// ===========================================================================
+// The four evaluation paths (one slice)
+// ===========================================================================
+
+namespace tamb = TiledArray::math::blas;  // source of gemm() and Op used by the eval_* paths
+using integer = tamb::integer;  // integer type used for the GEMM/GEMV size and leading-dimension arguments
+
+static void zero_C(SliceOperand& s) {
+  // Overwrite the first P doubles of each of the Mmu result cells with 0.0.
+  for (long mu = 0; mu < s.Mmu; ++mu) std::fill_n(s.Cslab.data()[mu].data(), s.P, 0.0);
+}
+
+// current_gemm: Mμ·nK tiny (P x 1, K=Q) GEMMs -- TA's actual inner-gemm shape.
+static void eval_current_gemm(SliceOperand& s) {
+  const integer rows = s.P, depth = s.Q;
+  for (long slab = 0; slab < s.nK; ++slab) {
+    const double* lhs = s.Lslab.data()[slab].data();  // L_Κ, P x Q
+    for (long cell = 0; cell < s.Mmu; ++cell) {
+      const double* rhs = s.Rslab.data()[slab * s.Mmu + cell].data();  // R(Κ,μ̃), length Q
+      double* acc = s.Cslab.data()[cell].data();                       // C(μ̃), length P
+      // One GEMM per (Κ, μ̃) pair with N = 1:  acc(P x 1) += lhs(P x Q) * rhs(Q x 1)
+      tamb::gemm(tamb::Op::NoTrans, tamb::Op::NoTrans, rows, /*N=*/1, depth,
+                 /*alpha=*/1.0, lhs, /*lda=*/depth, rhs, /*ldb=*/1, /*beta=*/1.0,
+                 acc, /*ldc=*/1);
+    }
+  }
+}
+
+// current_gemv: Mμ·nK BLAS dgemv calls.
+static void eval_current_gemv(SliceOperand& s) {
+  const integer rows = s.P, cols = s.Q;
+  for (long slab = 0; slab < s.nK; ++slab) {
+    const double* mat = s.Lslab.data()[slab].data();  // L_Κ, P x Q
+    for (long cell = 0; cell < s.Mmu; ++cell) {
+      const double* x = s.Rslab.data()[slab * s.Mmu + cell].data();  // R(Κ,μ̃), length Q
+      double* y = s.Cslab.data()[cell].data();                       // C(μ̃), length P
+      // Row-major mat-vec with unit increments and beta = 1:  y += mat * x
+      ::blas::gemv(::blas::Layout::RowMajor, ::blas::Op::NoTrans, rows, cols,
+                   /*alpha=*/1.0, mat, /*lda=*/cols, x, /*incx=*/1, /*beta=*/1.0, y, /*incy=*/1);
+    }
+  }
+}
+
+// strided: nK strided GEMMs, μ̃ ridden into M, a_4 the contiguous inner K.
+//   C̃[μ̃,a_1] += R̃_Κ[μ̃,a_4] · L_Κ[a_1,a_4]^T   (the first Κ uses beta=0)
+//   R̃_Κ: M x K = Mμ x Q, lda = sR (μ̃ rows strided; a_4 contiguous)
+//   L_Κ: stored a_1 x a_4 = N x K, op=Trans, ldb = Q
+//   C̃ : M x N = Mμ x P, ldc = sC
+static void eval_strided(SliceOperand& s) {
+  const integer m = s.Mmu, n = s.P, kdim = s.Q;
+  const integer ld_r = static_cast<integer>(s.sR);
+  const integer ld_c = static_cast<integer>(s.sC);
+  for (long kap = 0; kap < s.nK; ++kap) {
+    const double beta = (kap == 0) ? 0.0 : 1.0;  // overwrite, then accumulate
+    tamb::gemm(tamb::Op::NoTrans, tamb::Op::Trans, /*M=*/m, /*N=*/n, /*K=*/kdim,
+               1.0, /*A=*/s.Rslab.data()[kap * s.Mmu].data(), /*lda=*/ld_r,
+               /*B=*/s.Lslab.data()[kap].data(), /*ldb=*/kdim, beta,
+               /*C=*/s.Cslab.data()[0].data(), /*ldc=*/ld_c);
+  }
+}
+
+// packed: gather (Κ,a_4) contiguously for both operands, then ONE GEMM. The
+// gather is timed too (this is the deep-clone tradeoff).
+//   C̃[μ̃,a_1] = Rp[μ̃,(Κ,a_4)] · Lp[a_1,(Κ,a_4)]^T
+static void eval_packed(SliceOperand& s) {
+  const integer Mmu = s.Mmu, P = s.P, Q = s.Q, nK = s.nK;
+  const integer KK = nK * Q;                   // fused contraction length
+  const std::size_t run = sizeof(double) * Q;  // bytes in one a_4 run
+  double* const Lp = s.Lpacked.data();         // P x KK row-major
+  double* const Rp = s.Rpacked.data();         // Mμ x KK row-major
+  for (long k = 0; k < nK; ++k) {
+    const double* const src = s.Lslab.data()[k].data();  // P x Q
+    double* const lcol = Lp + k * Q;  // column offset of Κ block k
+    double* const rcol = Rp + k * Q;
+    for (long a1 = 0; a1 < P; ++a1) std::memcpy(lcol + a1 * KK, src + a1 * Q, run);
+    for (long mu = 0; mu < Mmu; ++mu)
+      std::memcpy(rcol + mu * KK, s.Rslab.data()[k * Mmu + mu].data(), run);
+  }
+  tamb::gemm(tamb::Op::NoTrans, tamb::Op::Trans, /*M=*/Mmu, /*N=*/P, /*K=*/KK,
+             1.0, /*A=*/Rp, /*lda=*/KK, /*B=*/Lp, /*ldb=*/KK, /*beta=*/0.0,
+             /*C=*/s.Cslab.data()[0].data(), /*ldc=*/static_cast<integer>(s.sC));
+}
+
+static double max_abs_diff_ref(SliceOperand const& s) {  // max |C - ref|
+  double d = 0;  // std::max(d, NaN) returns d, so NaN is mapped to HUGE_VAL
+  for (long mu = 0; mu < s.Mmu; ++mu) {
+    const double *c = s.Cslab.data()[mu].data(), *ref = &s.ref[mu * s.P];
+    for (long a1 = 0; a1 < s.P; ++a1)
+      d = std::max(d, std::isnan(c[a1] - ref[a1]) ? HUGE_VAL : std::abs(c[a1] - ref[a1]));
+  }
+  return d;
+}
+
+// ===========================================================================
+// CLI
+// ===========================================================================
+
+// Command-line options. The default dump path is relative to the working
+// directory so the benchmark does not depend on one developer's home layout.
+struct Cli {
+  std::string dump = "op_dump.txt";  // --dump PATH
+  long max_slices = 16;  // timed-pool cap (slices can be large); 0 = all
+  int repeats = 5;       // --repeats: timed repetitions
+  int warmup = 1;        // --warmup: untimed repetitions
+  int seed = 42;         // --seed: RNG seed
+  std::string mode = "all";  // current_gemm|current_gemv|strided|packed|all
+  bool check = true;     // cleared by --no_check
+};
+static void usage() {  // writes the flag summary to stderr
+  std::fputs("opa_strided_arena_dgemm\n"
+             "  --dump PATH        op_dump.txt path\n"
+             "  --max_slices N     timed-pool cap (0=all)  (default 16)\n"
+             "  --repeats R        timed reps              (default 5)\n"
+             "  --warmup W         untimed warmup reps      (default 1)\n"
+             "  --seed S           RNG seed                 (default 42)\n"
+             "  --mode M  current_gemm|current_gemv|strided|packed|all\n"
+             "  --no_check         skip correctness check\n",
+             stderr);
+}
+static Cli parse_cli(int argc, char** argv) {  // exits with 1 on bad input
+  Cli c;
+  auto die = [](std::string const& msg) {  // report, show usage, exit(1)
+    std::fprintf(stderr, "%s\n", msg.c_str());
+    usage();
+    std::exit(1);
+  };
+  for (int i = 1; i < argc; ++i) {
+    const std::string a = argv[i];
+    auto need = [&]() -> std::string {  // value following flag `a`
+      if (i + 1 >= argc) die("missing value for " + a);
+      return argv[++i];
+    };
+    // Whole-string base-10 parse. The former std::stoi/std::stol calls threw an
+    // uncaught exception on non-numeric input and accepted junk such as "5x".
+    auto need_num = [&](bool as_int) -> long {
+      const std::string v = need();
+      char* end = nullptr;
+      const long x = std::strtol(v.c_str(), &end, 10);
+      const bool fits = !as_int || x == static_cast<long>(static_cast<int>(x));
+      if (v.empty() || *end != '\0' || !fits) die("invalid value for " + a + ": " + v);
+      return x;
+    };
+    if (a == "--dump") c.dump = need();
+    else if (a == "--max_slices") c.max_slices = need_num(false);
+    else if (a == "--repeats") c.repeats = static_cast<int>(need_num(true));
+    else if (a == "--warmup") c.warmup = static_cast<int>(need_num(true));
+    else if (a == "--seed") c.seed = static_cast<int>(need_num(true));
+    else if (a == "--mode") c.mode = need();
+    else if (a == "--no_check") c.check = false;
+    else if (a == "-h" || a == "--help") { usage(); std::exit(0); }
+    else die("unknown flag: " + a);
+  }
+  if (c.repeats < 1) die("--repeats must be >= 1 (timing needs one sample)");
+  return c;
+}
+
+// ===========================================================================
+// main
+// ===========================================================================
+
+int main(int argc, char** argv) {  // returns 0 ok, 1 bad dump/op, 2 check failed
+  Cli cli = parse_cli(argc, argv);  // exits on unknown flags / missing values
+  auto& world = TA_SCOPED_INITIALIZE(argc, argv);
+  (void)world;
+
+  std::printf("=== op_A strided-vs-current arena DGEMM bench ===\n");
+  std::printf("dump=%s\n", cli.dump.c_str());
+  auto dump = parse_dump(cli.dump);
+
+  // op_A: result annotation has exactly 1 inner index.
+  auto n_inner = [](std::string const& annot) {
+    auto p = annot.find(';');
+    if (p == std::string::npos) return 0;
+    auto in = annot.substr(p + 1);
+    if (in.empty()) return 0;
+    int n = 1;
+    for (char c : in)
+      if (c == ',') ++n;
+    return n;
+  };
+  DumpEntry const* op = nullptr;  // first dump entry of op_A form
+  for (auto const& e : dump)
+    if (n_inner(e.annot_out) == 1) {
+      op = &e;
+      break;
+    }
+  if (!op) {
+    std::fprintf(stderr, "ERROR: no op_A (1 inner index) in dump\n");
+    return 1;
+  }
+
+  std::vector<std::string> Lo, Li, Ro, Ri, Co, Ci;  // outer/inner index names
+  split_annot(op->annot_left, Lo, Li);
+  split_annot(op->annot_right, Ro, Ri);
+  split_annot(op->annot_out, Co, Ci);
+
+  std::vector<std::string> hadamard, contracted, ext_left, ext_right;
+  for (auto const& s : Lo) {  // classify each left outer index by where it occurs
+    bool inR = find_idx(Ro, s) >= 0, inC = find_idx(Co, s) >= 0;
+    if (inR && inC)
+      hadamard.push_back(s);
+    else if (inR && !inC)
+      contracted.push_back(s);
+    else if (!inR && inC)
+      ext_left.push_back(s);
+  }
+  for (auto const& s : Ro)
+    if (find_idx(Lo, s) < 0 && find_idx(Co, s) >= 0) ext_right.push_back(s);
+
+  auto join = [](std::vector<std::string> const& v) {
+    std::string s;
+    for (std::size_t i = 0; i < v.size(); ++i) s += (i ? "," : "") + v[i];
+    return s;
+  };
+  std::printf("  annot_left  = %s\n", op->annot_left.c_str());
+  std::printf("  annot_right = %s\n", op->annot_right.c_str());
+  std::printf("  annot_out   = %s\n", op->annot_out.c_str());
+  std::printf(
+      "  hadamard={%s} outer-contracted={%s} ext_left={%s} ext_right={%s}\n",
+      join(hadamard).c_str(), join(contracted).c_str(), join(ext_left).c_str(),
+      join(ext_right).c_str());
+  std::printf("  inner-contracted={%s} inner-kept={%s}\n",
+              [&] {  // R inner not in C inner = contracted
+                std::string s;
+                for (auto const& x : Ri)
+                  if (find_idx(Ci, x) < 0) s += (s.empty() ? "" : ",") + x;
+                return s;
+              }()
+                  .c_str(),
+              join(Ci).c_str());
+
+  if (contracted.size() != 1 || ext_right.size() != 1 || hadamard.empty()) {
+    std::fprintf(stderr,
+                 "ERROR: expected op_A form (1 outer-contracted, 1 outer-right-"
+                 "external, >=1 Hadamard); got c=%zu er=%zu h=%zu\n",
+                 contracted.size(), ext_right.size(), hadamard.size());
+    return 1;
+  }
+
+  OperandIndex Lidx, Ridx, Cidx;
+  Lidx.build(op->L);
+  Ridx.build(op->R);
+  Cidx.build(op->C);
+  (void)Ridx;  // built but not queried below
+
+  // position maps (index name -> dim position) for L and C queries
+  std::unordered_map<std::string, int> Lpos, Cpos;
+  for (int d = 0; d < (int)Lo.size(); ++d) Lpos[Lo[d]] = d;
+  for (int d = 0; d < (int)Co.size(); ++d) Cpos[Co[d]] = d;
+
+  const std::string Kname = contracted[0];   // Κ
+  const std::string MUname = ext_right[0];    // μ̃
+  const long Kext = Lidx.dim_extent(Lpos[Kname]);
+  const long MUext = Cidx.dim_extent(Cpos[MUname]);
+  std::vector<long> hext;  // extent of each Hadamard index, taken from L
+  for (auto const& hn : hadamard) hext.push_back(Lidx.dim_extent(Lpos[hn]));
+  std::printf("  Κ extent=%ld   μ̃ extent=%ld   hadamard extents=", Kext, MUext);
+  for (long h : hext) std::printf("%ld ", h);
+  std::printf("\n");
+
+  // --- reconstruct per-Hadamard-slice work units ---
+  std::printf("\nreconstructing per-slice work units from dump ...\n");
+  auto t_recon = clock_type::now();
+  std::vector<WorkUnit> units;
+  long total_slices = 0, total_result_cells = 0, total_current_calls = 0,
+       total_strided_calls = 0;
+  double total_flops = 0;
+  std::map<long, long> P_hist, Mmu_hist, nK_hist;
+
+  long n_had = 1;
+  for (long h : hext) n_had *= h;
+  std::vector<long> hidx(hadamard.size());
+  std::vector<long> lq(Lo.size()), cq(Co.size()), hi;
+  for (long hlin = 0; hlin < n_had; ++hlin) {
+    long rem = hlin;  // unravel hlin, last Hadamard index fastest
+    for (int d = (int)hadamard.size() - 1; d >= 0; --d) {
+      hidx[d] = rem % hext[d];
+      rem /= hext[d];
+    }
+    // place hadamard values into L and C query templates
+    for (int d = 0; d < (int)hadamard.size(); ++d) {
+      lq[Lpos[hadamard[d]]] = hidx[d];
+      cq[Cpos[hadamard[d]]] = hidx[d];
+    }
+    // P,Q and nK from L(i_2,i_1,Κ)
+    long P = -1, Q = -1, nK = 0;
+    for (long k = 0; k < Kext; ++k) {
+      lq[Lpos[Kname]] = k;
+      int lbl = Lidx.query(lq, &hi);
+      if (lbl >= 0) {
+        ++nK;
+        if (P < 0 && !hi.empty()) {  // a range without bounds cannot give P/Q
+          P = hi[0];
+          Q = (hi.size() > 1 ? hi[1] : hi[0]);
+        }
+      }
+    }
+    if (nK == 0 || P <= 0 || Q <= 0) continue;
+    // Mμ from C(μ̃,i_2,i_1)
+    long Mmu = 0;
+    for (long mu = 0; mu < MUext; ++mu) {
+      cq[Cpos[MUname]] = mu;
+      if (Cidx.query(cq) >= 0) ++Mmu;
+    }
+    if (Mmu == 0) continue;
+    units.push_back({P, Q, Mmu, nK});
+    ++total_slices;
+    total_result_cells += Mmu;
+    total_current_calls += Mmu * nK;
+    total_strided_calls += nK;
+    total_flops += 2.0 * P * Q * Mmu * nK;
+    ++P_hist[P];
+    ++Mmu_hist[Mmu];
+    ++nK_hist[nK];
+  }
+  std::printf("  reconstructed %ld active Hadamard slices in %.1f ms\n",
+              total_slices, ms_since(t_recon));
+  if (units.empty()) {
+    std::fprintf(stderr, "ERROR: no work units\n");
+    return 1;
+  }
+
+  std::printf("\n--- FAITHFUL whole-op work (from dump) ---\n");
+  std::printf("  Hadamard slices        : %ld\n", total_slices);
+  std::printf("  result cells (=ΣMμ)    : %ld\n", total_result_cells);
+  std::printf("  current calls (=ΣMμ·nK): %ld\n", total_current_calls);
+  std::printf("  strided calls   (=ΣnK)   : %ld   (reduction %.1fx)\n",
+              total_strided_calls,
+              double(total_current_calls) / double(total_strided_calls));
+  std::printf("  packed calls  (=slices): %ld\n", total_slices);
+  std::printf("  total flops            : %.3e\n", total_flops);
+  auto dump_hist = [](const char* lbl, std::map<long, long> const& h) {
+    std::printf("  %-22s: ", lbl);
+    int n = 0;
+    for (auto const& kv : h) {
+      std::printf("%ld:%ld  ", kv.first, kv.second);
+      if (++n >= 16) {  // print at most 16 buckets
+        std::printf("...");
+        break;
+      }
+    }
+    std::printf("\n");
+  };
+  dump_hist("P(=|a_1|) dist", P_hist);
+  dump_hist("Mμ dist", Mmu_hist);
+  dump_hist("nK dist", nK_hist);
+
+  // --- bounded timed sample (stride-sampled to span the distribution) ---
+  long n_sample =
+      (cli.max_slices <= 0) ? total_slices
+                            : std::min(cli.max_slices, total_slices);
+  std::vector<WorkUnit> sample;
+  sample.reserve(n_sample);
+  double sample_flops = 0;
+  long sample_current_calls = 0, sample_strided_calls = 0;
+  {
+    double step = double(total_slices) / double(n_sample);
+    for (long s = 0; s < n_sample; ++s) {
+      long i = std::min<long>(total_slices - 1, (long)std::llround(s * step));
+      sample.push_back(units[i]);
+      sample_flops += 2.0 * units[i].P * units[i].Q * units[i].Mmu * units[i].nK;
+      sample_current_calls += units[i].Mmu * units[i].nK;
+      sample_strided_calls += units[i].nK;
+    }
+  }
+  const double extrap = double(total_slices) / double(n_sample);
+  std::printf("\n--- timed sample ---\n");
+  std::printf("  sampling %ld / %ld slices (extrapolation x%.2f)\n", n_sample,
+              total_slices, extrap);
+
+  std::mt19937_64 rng(cli.seed);
+  std::printf("  building arena slice pool ...\n");
+  auto t_pool = clock_type::now();
+  std::vector<SliceOperand> pool(n_sample);
+  for (long s = 0; s < n_sample; ++s) build_slice(pool[s], sample[s], rng);
+  std::printf("  pool built in %.1f ms\n", ms_since(t_pool));
+
+  if (cli.check) {  // all four paths are checked, whatever --mode selects
+    auto check_mode = [&](const char* name, void (*fn)(SliceOperand&)) {
+      double d = 0;
+      for (auto& s : pool) {
+        zero_C(s);
+        fn(s);
+        d = std::max(d, max_abs_diff_ref(s));
+      }
+      std::printf("  check %-13s max_abs_diff=%.3e  %s\n", name, d,
+                  d < 1e-9 ? "pass" : "FAIL");
+      return d < 1e-9;
+    };
+    bool ok = true;
+    ok &= check_mode("current_gemm", eval_current_gemm);
+    ok &= check_mode("current_gemv", eval_current_gemv);
+    ok &= check_mode("strided", eval_strided);
+    ok &= check_mode("packed", eval_packed);
+    if (!ok) {
+      std::fprintf(stderr, "correctness FAILED -- aborting timing\n");
+      return 2;
+    }
+  }
+
+  std::printf("\nresults (min/median over %d reps; sample of %ld slices)\n",
+              std::max(1, cli.repeats), n_sample);
+  double g_cg = 0, g_cv = 0, g_fu = 0, g_pk = 0;
+  auto run = [&](const char* name, void (*fn)(SliceOperand&), long calls,
+                 double& slot) {
+    if (cli.mode != "all" && cli.mode != name) return;
+    for (int w = 0; w < cli.warmup; ++w)
+      for (auto& s : pool) {
+        zero_C(s);
+        fn(s);
+      }
+    std::vector<double> times;  // must not be empty: front() is read below
+    for (int r = 0; r < std::max(1, cli.repeats); ++r) {  // >= 1 timed rep
+      for (auto& s : pool) zero_C(s);
+      auto t0 = clock_type::now();
+      for (auto& s : pool) fn(s);
+      times.push_back(ms_since(t0));
+    }
+    std::sort(times.begin(), times.end());
+    double mn = times.front(), md = times[times.size() / 2];
+    double gf = (sample_flops / 1e9) / (mn / 1e3);
+    slot = mn;
+    std::printf(
+        "  %-13s  min=%8.2f ms  median=%8.2f ms  %7.2f GFLOPS  calls=%ld   "
+        "(whole-op est: %8.1f ms)\n",
+        name, mn, md, gf, calls, mn * extrap);
+  };
+  run("current_gemm", eval_current_gemm, sample_current_calls, g_cg);
+  run("current_gemv", eval_current_gemv, sample_current_calls, g_cv);
+  run("strided", eval_strided, sample_strided_calls, g_fu);
+  run("packed", eval_packed, n_sample, g_pk);
+
+  if (cli.mode == "all") {
+    std::printf("\n--- speedups (min time) ---\n");
+    if (g_fu > 0) {
+      std::printf("  strided  vs current_gemm : %.2fx\n", g_cg / g_fu);
+      std::printf("  strided  vs current_gemv : %.2fx\n", g_cv / g_fu);
+    }
+    if (g_pk > 0) {
+      std::printf("  packed vs current_gemm : %.2fx\n", g_cg / g_pk);
+      std::printf("  packed vs strided        : %.2fx  (pack incl.)\n",
+                  g_fu / g_pk);
+    }
+  }
+  std::printf("\n");
+  return 0;
+}
diff --git a/examples/tot_bench/opb_strided_arena_dgemm.cpp b/examples/tot_bench/opb_strided_arena_dgemm.cpp
new file mode 100644
index 0000000000..d4b08a3966
--- /dev/null
+++ b/examples/tot_bench/opb_strided_arena_dgemm.cpp
@@ -0,0 +1,811 @@
+// opb_strided_arena_dgemm.cpp
+// ---------------------------------------------------------------------------
+// Standalone tile/BLAS-level benchmark for the op_B ToT x ToT -> ToT
+// outer-contraction with an inner OUTER-PRODUCT, on ArenaTensor inner cells.
+//
+//   op_B:  I(i_2,i_1,μ̃,Κ; a_1) * I(μ̃,i_1,i_2; a_4) -> I(i_2,i_1,Κ; a_1,a_4)
+//          Hadamard = {i_1,i_2}, contracted = {μ̃} (=J), ext-left = {Κ}
+//          P = |a_1|, Q = |a_4|  depend (jaggedly) on the Hadamard slice.
+//
+//   Per result cell (i_2,i_1,Κ):
+//       C(p,q) = sum_{μ̃} L(i_2,i_1,μ̃,Κ)(p) * R(μ̃,i_1,i_2)(q)
+//   i.e. an accumulation over J=|μ̃| rank-1 outer products into a P x Q block.
+//
+// We compare three ways to evaluate one result cell, ALL reading the SAME
+// already-fusable arena slabs (operands built contracted-index-fastest, so the
+// J reduced cells are one contiguous, constant-stride run -- the "already
+// permuted" precondition; achieving that layout from the real scattered operand
+// is a separate cost, not measured here):
+//
+//   SEQ-gemm : per cell, J tiny PxQ GEMMs with K=1, beta=1 (TA's current
+//              per-cell inner-op shape).
+//   SEQ-ger  : per cell, J BLAS dger rank-1 updates.
+//   FUSED    : per cell, ONE PxQ GEMM with K=J, riding μ̃ into the BLAS K axis
+//              by passing the inter-cell slab stride as the leading dimension
+//              (zero-copy). The number of BLAS calls drops from J to 1 per cell.
+//
+// Work units {P,Q,J} are reconstructed FAITHFULLY from op_dump.txt (jagged P,Q
+// per Hadamard slice, real per-cell J after sparsity). The full distribution is
+// reported; timing runs over a bounded sample (--max_cells) and is extrapolated
+// to the whole op.
+// ---------------------------------------------------------------------------
+
+#include <tiledarray.h>
+#include <TiledArray/math/blas.h>
+#include <TiledArray/tensor/arena_kernels.h>
+#include <TiledArray/tensor/arena_tensor.h>
+
+#include <blas.hh>
+#include <btas/zb/range.h>
+
+#include <algorithm>
+#include <chrono>
+#include <cmath>
+#include <cstdint>
+#include <cstdio>
+#include <cstdlib>
+#include <cstring>
+#include <fstream>
+#include <limits>
+#include <map>
+#include <random>
+#include <set>
+#include <string>
+#include <unordered_map>
+#include <vector>
+
+namespace TA = TiledArray;
+
+using Inner = TA::ArenaTensor<double>;  // default btas::zb range, matches MPQC
+using InnerRange = typename Inner::range_type;  // range type of one inner cell
+using Outer = TA::Tensor<Inner>;  // tensor whose elements are Inner cells
+
+using clock_type = std::chrono::steady_clock;
+static double ms_since(clock_type::time_point t0) {
+  // Milliseconds elapsed since t0, derived from the nanosecond tick count.
+  namespace ch = std::chrono;
+  const auto ns = ch::duration_cast<ch::nanoseconds>(clock_type::now() - t0);
+  return ns.count() / 1.0e6;
+}
+
+// ===========================================================================
+// op_dump.txt parser (subset of cck_bottleneck_bench.cpp -- structure only)
+// ===========================================================================
+
+struct InnerRangeKey {  // bounds of one inner range listed for a tile
+  std::vector<long> lo, hi;  // parsed from inner_lo=[...] / inner_hi=[...]
+};
+
+struct TileSpec {  // one dumped tile (an "ord=" line and what follows it)
+  long outer_vol = 0;  // parsed from outer_vol=; sizes cell_labels
+  std::vector<InnerRangeKey> distinct;  // inner ranges in order of appearance
+  std::vector<int> cell_labels;  // size outer_vol; -1=empty else index distinct
+};
+
+struct ArraySpec {  // structure of one operand (L, R or C) of a dumped op
+  int outer_rank = 0;  // parsed from the "<X>.outer_rank=" line
+  std::vector<std::vector<long>> dim_tile_bounds;  // per outer dim
+  std::map<std::size_t, TileSpec> tiles;           // ord -> tile
+};
+
+struct DumpEntry {  // one "===== OP DUMP id=" record
+  std::string annot_left, annot_right, annot_out;  // text after annot_*=
+  double t_einsum = 0;  // value after "t_einsum=" on the record header, if any
+  ArraySpec L, R, C;    // left operand, right operand, result
+};
+
+// Parses a bracketed integer list such as "[0, 4,9]" into {0,4,9}. Spaces and
+// '[' are skipped, ',' ends a number, and parsing stops at the first ']'.
+static std::vector<long> parse_long_list(std::string s) {
+  std::vector<long> values;
+  std::string digits;
+  auto flush = [&] {  // convert the pending characters, if any
+    if (!digits.empty()) values.push_back(std::stol(digits));
+    digits.clear();
+  };
+  for (const char ch : s) {
+    if (ch == ']') break;  // the pending number is flushed after the loop
+    if (ch == ',') flush();
+    else if (ch != '[' && ch != ' ') digits.push_back(ch);
+  }
+  flush();
+  return values;
+}
+
+static std::string get_kv(std::string const& line, std::string const& key) {
+  // Value token after the first "key=" (ends at space/tab/newline); "" if absent.
+  const std::string tag = key + "=";
+  const auto first = line.find(tag);
+  if (first == std::string::npos) return {};
+  const auto from = first + tag.size(), to = line.find_first_of(" \t\n", from);
+  return line.substr(from, to == std::string::npos ? to : to - from);
+}
+
+static std::string get_kv_bracket(std::string const& line,
+                                  std::string const& key) {
+  // "[...]" text (brackets kept) after the first "key="; "" if absent/unclosed.
+  const auto hit = line.find(key + "=[");
+  if (hit == std::string::npos) return {};
+  const auto open = hit + key.size() + 1;  // index of '['
+  const auto close = line.find(']', open);
+  return close == std::string::npos ? std::string() : line.substr(open, close - open + 1);
+}
+
+struct CellRun {  // one "[lo..hi:label]" group of a cells_rle line
+  std::size_t lo, hi;  // inclusive cell range (parse_dump loops k <= hi)
+  int label;  // -1 = E
+};
+
+// Parses a run-length cell list made of "[lo..hi:label]" groups into CellRun
+// entries. The label "E" becomes -1; any other label has its first character
+// dropped and the remainder parsed as an int. Groups without ".." or ':' are
+// skipped, and an unterminated '[' ends the scan. Text between groups is
+// ignored.
+static std::vector<CellRun> parse_cells_rle(std::string const& s) {
+  std::vector<CellRun> runs;
+  std::size_t open = s.find('[');
+  while (open != std::string::npos) {
+    const auto close = s.find(']', open);
+    if (close == std::string::npos) break;
+    const std::string group = s.substr(open + 1, close - open - 1);
+    const auto dots = group.find("..");
+    const auto colon = group.find(':');
+    if (dots != std::string::npos && colon != std::string::npos) {
+      CellRun run;
+      run.lo = std::stoull(group.substr(0, dots));
+      run.hi = std::stoull(group.substr(dots + 2, colon - dots - 2));
+      const std::string tag = group.substr(colon + 1);
+      // "E" marks an empty run; otherwise skip the one-character prefix.
+      run.label = (tag == "E") ? -1 : std::stoi(tag.substr(1));
+      runs.push_back(run);
+    }
+    open = s.find('[', close + 1);  // next group, if any
+  }
+  return runs;
+}
+
+static std::vector<DumpEntry> parse_dump(std::string const& path) {  // exits if unreadable
+  std::ifstream f(path);
+  if (!f) {
+    std::fprintf(stderr, "ERROR: cannot open %s\n", path.c_str());
+    std::exit(1);
+  }
+  std::vector<DumpEntry> entries;
+  DumpEntry* cur = nullptr;  // record being filled (points into entries)
+  ArraySpec* arr = nullptr;  // operand (L/R/C) of cur being filled
+  TileSpec* tile = nullptr;  // tile of arr being filled
+  std::string line;
+  bool in_tiles = false;  // between .tiles_BEGIN and .tiles_END
+
+  while (std::getline(f, line)) {
+    if (line.rfind("===== OP DUMP id=", 0) == 0) {  // rfind(x, 0) == 0: prefix test
+      entries.emplace_back();
+      cur = &entries.back();
+      auto t = line.find("t_einsum=");
+      if (t != std::string::npos) cur->t_einsum = std::stod(line.substr(t + 9));
+      arr = nullptr;  // pointers into the previous record are dropped here
+      tile = nullptr;
+      in_tiles = false;
+    } else if (!cur) {  // ignore everything before the first record header
+      continue;
+    } else if (line.rfind("annot_left=", 0) == 0) {
+      cur->annot_left = line.substr(11);
+    } else if (line.rfind("annot_right=", 0) == 0) {
+      cur->annot_right = line.substr(12);
+    } else if (line.rfind("annot_out=", 0) == 0) {
+      cur->annot_out = line.substr(10);
+    } else if (line.rfind("L.outer_rank=", 0) == 0) {  // starts the L operand
+      arr = &cur->L;
+      arr->outer_rank = std::stoi(line.substr(13));
+      in_tiles = false;
+    } else if (line.rfind("R.outer_rank=", 0) == 0) {  // starts the R operand
+      arr = &cur->R;
+      arr->outer_rank = std::stoi(line.substr(13));
+      in_tiles = false;
+    } else if (line.rfind("C.outer_rank=", 0) == 0) {  // starts the C operand
+      arr = &cur->C;
+      arr->outer_rank = std::stoi(line.substr(13));
+      in_tiles = false;
+    } else if (arr && line.find(".dim[") != std::string::npos &&
+               line.find("tile_bounds=") != std::string::npos) {
+      arr->dim_tile_bounds.push_back(
+          parse_long_list(get_kv_bracket(line, "tile_bounds")));
+    } else if (arr && line.find(".tiles_BEGIN") != std::string::npos) {
+      in_tiles = true;
+      tile = nullptr;
+    } else if (arr && line.find(".tiles_END") != std::string::npos) {
+      in_tiles = false;
+      tile = nullptr;
+    } else if (in_tiles && arr && line.rfind("  ord=", 0) == 0) {  // new tile
+      std::size_t ord = std::stoull(get_kv(line, "ord"));
+      auto& ts = arr->tiles[ord];
+      ts.outer_vol = std::stol(get_kv(line, "outer_vol"));
+      ts.cell_labels.assign(ts.outer_vol, -1);  // all cells start as empty
+      tile = &ts;
+    } else if (in_tiles && tile && line.rfind("    range[", 0) == 0) {
+      InnerRangeKey key;
+      key.lo = parse_long_list(get_kv_bracket(line, "inner_lo"));
+      key.hi = parse_long_list(get_kv_bracket(line, "inner_hi"));
+      tile->distinct.push_back(std::move(key));
+    } else if (in_tiles && tile && line.rfind("    cells_rle=", 0) == 0) {
+      for (auto& r : parse_cells_rle(line.substr(14)))
+        for (std::size_t k = r.lo; k <= r.hi && k < tile->cell_labels.size();
+             ++k)  // runs are inclusive and clipped to the tile size
+          tile->cell_labels[k] = r.label;
+    }
+  }
+  // Tiles with no labelled cell but at least one range are treated as uniform.
+  for (auto& e : entries)
+    for (auto* a : {&e.L, &e.R, &e.C})
+      for (auto& kv : a->tiles) {
+        auto& ts = kv.second;
+        bool any = false;  // does any cell carry a label >= 0?
+        for (int l : ts.cell_labels)
+          if (l >= 0) {
+            any = true;
+            break;
+          }
+        // NOTE(review): also fires for a tile whose RLE marked every cell E, and
+        // for tiles with several ranges -- confirm the dump never emits those.
+        if (!any && !ts.distinct.empty())
+          std::fill(ts.cell_labels.begin(), ts.cell_labels.end(), 0);
+      }
+  return entries;
+}
+
+// ===========================================================================
+// Fast cell-presence / inner-range queries over an ArraySpec
+// ===========================================================================
+
+// Flattened tiled range of one outer dim, built from the dump's tile_bounds:
+// an element -> tile lookup plus the tile boundaries themselves.
+struct DimTiling {
+  std::vector<long> bounds;       // tile boundaries; ntiles+1 entries
+  std::vector<int> elem_to_tile;  // tile index of each element; n_elem entries
+  long extent() const { if (bounds.empty()) return 0; return bounds.back(); }
+  int ntiles() const { const auto n = bounds.size(); return static_cast<int>(n) - 1; }
+};
+
+static DimTiling make_dim_tiling(std::vector<long> const& bounds) {
+  // Element e in [bounds[t], bounds[t+1]) maps to tile t; bounds.back() entries.
+  DimTiling tiling;
+  tiling.bounds = bounds;
+  tiling.elem_to_tile.assign(bounds.empty() ? 0 : bounds.back(), 0);
+  for (int t = 1; t < static_cast<int>(bounds.size()); ++t)
+    for (long e = bounds[t - 1]; e < bounds[t]; ++e) tiling.elem_to_tile[e] = t - 1;
+  return tiling;
+}
+
+// Indexable view of one operand: maps a full element multi-index (in this
+// operand's dim order) to (present?, inner-range hi extents).
+struct OperandIndex {
+  ArraySpec const* spec = nullptr;        // set by build(); not owned
+  std::vector<DimTiling> dims;            // one tiling per parsed outer dim
+  std::vector<std::size_t> tile_strides;  // row-major over tile grid
+
+  void build(ArraySpec const& s) {
+    spec = &s;
+    dims.clear();
+    for (auto const& b : s.dim_tile_bounds) dims.push_back(make_dim_tiling(b));
+    const int rank = s.outer_rank;  // strides stay 1 if tilings are missing
+    tile_strides.assign(rank, 1);
+    for (int d = rank - 2; d >= 0 && rank <= static_cast<int>(dims.size()); --d)
+      tile_strides[d] = tile_strides[d + 1] * dims[d + 1].ntiles();
+  }
+
+  int rank() const { return spec->outer_rank; }
+  long dim_extent(int d) const { return dims[d].extent(); }
+
+  // Returns the cell label (>=0 present, -1 empty/absent) and, if present and
+  // out_hi != nullptr, fills the inner-range upper bounds. Also returns -1 for
+  // an index outside the operand, for idx/dims shorter than the rank, and for
+  // a label with no matching inner range. Uses no per-call scratch vectors.
+  int query(std::vector<long> const& idx,
+            std::vector<long>* out_hi = nullptr) const {
+    const int rank = spec->outer_rank;
+    if (static_cast<int>(std::min(idx.size(), dims.size())) < rank) return -1;
+    std::size_t tord = 0;  // tile ordinal
+    std::size_t k = 0;     // within-tile row-major ordinal
+    // Both ordinals are accumulated in one pass over the dims.
+    for (int d = 0; d < rank; ++d) {
+      if (idx[d] < 0 || idx[d] >= dims[d].extent()) return -1;  // outside
+      const int t = dims[d].elem_to_tile[idx[d]];
+      const long lo = dims[d].bounds[t];
+      tord += static_cast<std::size_t>(t) * tile_strides[d];
+      k = k * static_cast<std::size_t>(dims[d].bounds[t + 1] - lo) + (idx[d] - lo);
+    }
+    auto it = spec->tiles.find(tord);
+    if (it == spec->tiles.end()) return -1;
+    auto const& ts = it->second;
+    if (k >= ts.cell_labels.size()) return -1;
+    const int lbl = ts.cell_labels[k];
+    if (lbl >= static_cast<int>(ts.distinct.size())) return -1;  // no such range
+    if (lbl >= 0 && out_hi) *out_hi = ts.distinct[lbl].hi;
+    return lbl;
+  }
+};
+
+// ===========================================================================
+// Annotation role classification
+// ===========================================================================
+
+// Split "a,b,c;x,y" -> outer={a,b,c}, inner={x,y}. Without a ';' every index
+// is outer. Empty tokens (from ",," or a leading/trailing ',') are dropped.
+// Both output vectors are cleared before anything is appended.
+static void split_annot(std::string const& annot, std::vector<std::string>& outer,
+                        std::vector<std::string>& inner) {
+  // Appends the non-empty comma-separated pieces of `text` to `out`.
+  auto tokenize = [](std::string const& text, std::vector<std::string>& out) {
+    std::size_t from = 0;
+    while (from <= text.size()) {
+      std::size_t to = text.find(',', from);
+      if (to == std::string::npos) to = text.size();
+      if (to > from) out.push_back(text.substr(from, to - from));
+      from = to + 1;
+    }
+  };
+  outer.clear();
+  inner.clear();
+  const auto semi = annot.find(';');
+  const bool has_inner = semi != std::string::npos;
+  tokenize(annot.substr(0, semi), outer);
+  tokenize(has_inner ? annot.substr(semi + 1) : std::string(), inner);
+}
+
+static int find_idx(std::vector<std::string> const& v, std::string const& s) {
+  // Position of the first element equal to s, or -1 when s is not in v.
+  const auto hit = std::find(v.begin(), v.end(), s);
+  return hit == v.end() ? -1 : static_cast<int>(hit - v.begin());
+}
+
+// ===========================================================================
+// Reconstructed work unit
+// ===========================================================================
+
+struct WorkUnit {  // shape of one result cell's work
+  long P, Q, J;  // inner extents P, Q and number J of contracted cells
+};
+
+// ===========================================================================
+// Arena slab operand for one work unit (built once, ready-to-fuse)
+// ===========================================================================
+
+struct CellOperand {  // filled by build_cell_operand()
+  Outer Lslab;   // outer {J}, inner {P}: J cells of size P, μ̃-fastest
+  Outer Rslab;   // outer {J}, inner {Q}
+  Outer Ccell;   // outer {1}, inner {P,Q}
+  long P, Q, J;  // copied from the WorkUnit
+  std::ptrdiff_t sL, sR;  // inter-cell strides (elements); ld for strided GEMM
+  double* Cdata;  // data pointer of the single cell of Ccell
+  std::vector<double> ref;  // golden P*Q reference
+};
+
+static void build_cell_operand(CellOperand& op, WorkUnit const& wu,
+                               std::mt19937_64& rng) {  // rng state is advanced
+  op.P = wu.P;
+  op.Q = wu.Q;
+  op.J = wu.J;
+  const long P = wu.P, Q = wu.Q, J = wu.J;
+  op.Lslab = TA::detail::arena_outer_init<Outer>(
+      TA::Range{static_cast<std::size_t>(J)}, 1,
+      [P](std::size_t) { return InnerRange{P}; }, /*zero_init=*/false);
+  op.Rslab = TA::detail::arena_outer_init<Outer>(
+      TA::Range{static_cast<std::size_t>(J)}, 1,
+      [Q](std::size_t) { return InnerRange{Q}; }, /*zero_init=*/false);
+  op.Ccell = TA::detail::arena_outer_init<Outer>(
+      TA::Range{1}, 1,
+      [P, Q](std::size_t) { return InnerRange{P, Q}; }, /*zero_init=*/true);
+
+  std::uniform_real_distribution<double> dist(-1.0, 1.0);
+  for (long j = 0; j < J; ++j) {  // draw order: L_j (P values), then R_j (Q values)
+    double* l = op.Lslab.data()[j].data();
+    for (long p = 0; p < P; ++p) l[p] = dist(rng);
+    double* r = op.Rslab.data()[j].data();
+    for (long q = 0; q < Q; ++q) r[q] = dist(rng);
+  }
+  op.Cdata = op.Ccell.data()[0].data();
+  // Stride = pointer distance between cells 0 and 1; with one cell, P (resp. Q).
+  op.sL = (J > 1) ? (op.Lslab.data()[1].data() - op.Lslab.data()[0].data()) : P;
+  op.sR = (J > 1) ? (op.Rslab.data()[1].data() - op.Rslab.data()[0].data()) : Q;
+
+  // Golden reference: C[p,q] = sum_j L_j[p] * R_j[q].
+  op.ref.assign(static_cast<std::size_t>(P) * Q, 0.0);
+  for (long j = 0; j < J; ++j) {
+    const double* l = op.Lslab.data()[j].data();
+    const double* r = op.Rslab.data()[j].data();
+    for (long p = 0; p < P; ++p)
+      for (long q = 0; q < Q; ++q) op.ref[p * Q + q] += l[p] * r[q];
+  }
+}
+
+// ===========================================================================
+// The three evaluation paths (one result cell)
+// ===========================================================================
+
+namespace tamb = TiledArray::math::blas;  // TA's row-major gemm wrapper
+using integer = tamb::integer;  // type used for gemm dimensions / leading dims
+
+static void zero_C(CellOperand& op) {  // reset the P x Q result block to 0.0
+  std::fill_n(op.Cdata, op.P * op.Q, 0.0);
+}
+
+// SEQ-gemm: J tiny PxQ GEMMs with K=1 and beta=1 (C pre-zeroed by caller).
+static void eval_seq_gemm(CellOperand& op) {
+  const integer m = op.P, n = op.Q;
+  for (long cell = 0; cell < op.J; ++cell) {
+    // C(P x Q) += L_cell(P x 1) * R_cell(1 x Q)
+    tamb::gemm(tamb::Op::NoTrans, tamb::Op::NoTrans, /*M=*/m, /*N=*/n, /*K=*/1,
+               1.0, /*A=*/op.Lslab.data()[cell].data(), /*lda=*/1,
+               /*B=*/op.Rslab.data()[cell].data(), /*ldb=*/n, /*beta=*/1.0,
+               /*C=*/op.Cdata, /*ldc=*/n);
+  }
+}
+
+// SEQ-ger: J BLAS dger rank-1 updates, C += L_j (x) R_j for each contracted
+// cell j. C must have been zeroed by the caller.
+static void eval_seq_ger(CellOperand& op) {
+  const integer m = op.P, n = op.Q;
+  for (long cell = 0; cell < op.J; ++cell)
+    ::blas::ger(::blas::Layout::RowMajor, m, n, /*alpha=*/1.0,
+                /*x=*/op.Lslab.data()[cell].data(), 1,
+                /*y=*/op.Rslab.data()[cell].data(), 1, op.Cdata, /*lda=*/n);
+}
+
+// FUSED path: a single PxQ GEMM with K=J and beta=0 forms the whole sum at once.
+// The contracted index rides BLAS K by using the inter-cell slab strides as
+// leading dimensions, C(PxQ) = A(PxJ) . B(JxQ):
+//   A: vectors L_j stored sL apart = row-major JxP (lda=sL), passed as Trans
+//   B: vectors R_j stored sR apart = row-major JxQ (ldb=sR), passed as NoTrans
+static void eval_strided(CellOperand& op) {
+  const integer m = op.P, n = op.Q, k = op.J;
+  tamb::gemm(/*op_a=*/tamb::Op::Trans, /*op_b=*/tamb::Op::NoTrans, m, n, k,
+             /*alpha=*/1.0, op.Lslab.data()[0].data(), /*lda=*/op.sL,
+             op.Rslab.data()[0].data(), /*ldb=*/op.sR, /*beta=*/0.0, op.Cdata,
+             /*ldc=*/n);
+}
+
+static double max_abs_diff_ref(CellOperand const& op) {  // max |C - ref|
+  double d = 0;  // becomes +inf (and stays there) once any error is NaN
+  for (std::size_t e = 0; e < op.ref.size(); ++e)
+    if (const double err = std::abs(op.Cdata[e] - op.ref[e]); !(err <= d))
+      d = std::isnan(err) ? HUGE_VAL : err;  // std::max(d, NaN) would drop it
+  return d;
+}
+// ===========================================================================
+// CLI
+// ===========================================================================
+
+struct Cli {
+  // Path of the op_dump.txt to replay. Defaults to a file in the current
+  // working directory (no machine-specific absolute path); set with --dump.
+  std::string dump = "op_dump.txt";
+  long max_cells = 20000;  // timed-pool cap; 0 = all
+  int repeats = 5;         // timed repetitions per path
+  int warmup = 1;          // untimed warmup repetitions per path
+  int seed = 42;           // RNG seed for the synthetic operand pool
+  std::string mode = "all";  // seq_gemm | seq_ger | strided | all
+  bool check = true;         // false with --no_check: skip correctness check
+};
+
+static void usage() {  // writes the flag summary to stderr
+  std::fputs(
+      "opb_strided_arena_dgemm\n"
+      "  --dump PATH        op_dump.txt path\n"
+      "  --max_cells N      timed-pool cap (0=all)  (default 20000)\n"
+      "  --repeats R        timed reps             (default 5)\n"
+      "  --warmup W         untimed warmup reps     (default 1)\n"
+      "  --seed S           RNG seed                (default 42)\n"
+      "  --mode M           seq_gemm|seq_ger|strided|all (default all)\n"
+      "  --no_check         skip correctness check\n", stderr);
+}
+
+static Cli parse_cli(int argc, char** argv) {
+  Cli cfg;
+  for (int pos = 1; pos < argc; ++pos) {
+    const std::string flag = argv[pos];
+    auto value = [&]() -> std::string {  // next argv entry; usage error if none
+      if (pos + 1 < argc) return argv[++pos];
+      usage();
+      std::exit(1);
+    };
+    // NOTE(review): std::stol/std::stoi throw on non-numeric or out-of-range
+    // text and are not caught here; parsed values get no range check either.
+    if (flag == "--dump")
+      cfg.dump = value();
+    else if (flag == "--max_cells")
+      cfg.max_cells = std::stol(value());
+    else if (flag == "--repeats")
+      cfg.repeats = std::stoi(value());
+    else if (flag == "--warmup")
+      cfg.warmup = std::stoi(value());
+    else if (flag == "--seed")
+      cfg.seed = std::stoi(value());
+    else if (flag == "--mode")
+      cfg.mode = value();
+    else if (flag == "--no_check")
+      cfg.check = false;
+    else if (flag == "-h" || flag == "--help") {
+      usage();
+      std::exit(0);
+    } else {
+      std::fprintf(stderr, "unknown flag: %s\n", flag.c_str());
+      usage();
+      std::exit(1);
+    }
+  }
+  return cfg;
+}
+
+// ===========================================================================
+// main: rebuild op_B work units from the dump, verify, then time 3 paths
+// ===========================================================================
+
+int main(int argc, char** argv) {
+  Cli cli = parse_cli(argc, argv);
+  auto& world = TA_SCOPED_INITIALIZE(argc, argv);
+  (void)world;
+
+  std::printf("=== op_B strided-vs-sequential arena DGEMM bench ===\n");
+  std::printf("dump=%s\n", cli.dump.c_str());
+
+  auto dump = parse_dump(cli.dump);
+
+  // Select op_B: first entry whose result annotation has 2 inner indices.
+  auto n_inner = [](std::string const& annot) {  // #names after the ';'
+    auto p = annot.find(';');
+    if (p == std::string::npos) return 0;
+    auto in = annot.substr(p + 1);
+    if (in.empty()) return 0;
+    int n = 1;
+    for (char c : in)
+      if (c == ',') ++n;
+    return n;
+  };
+  DumpEntry const* op = nullptr;
+  for (auto const& e : dump)
+    if (n_inner(e.annot_out) == 2) {
+      op = &e;
+      break;
+    }
+  if (!op) {
+    std::fprintf(stderr, "ERROR: no op_B (2 inner indices) in dump\n");
+    return 1;
+  }
+
+  // --- classify outer index roles from annotations ---
+  std::vector<std::string> Lo, Li, Ro, Ri, Co, Ci;
+  split_annot(op->annot_left, Lo, Li);
+  split_annot(op->annot_right, Ro, Ri);
+  split_annot(op->annot_out, Co, Ci);
+
+  std::vector<std::string> hadamard, contracted, ext_left, ext_right;
+  for (auto const& s : Lo) {
+    bool inR = find_idx(Ro, s) >= 0, inC = find_idx(Co, s) >= 0;
+    if (inR && inC)  // in left, right and result
+      hadamard.push_back(s);
+    else if (inR && !inC)  // in both operands, absent from the result
+      contracted.push_back(s);
+    else if (!inR && inC)  // left-only index that survives into the result
+      ext_left.push_back(s);
+  }
+  for (auto const& s : Ro)
+    if (find_idx(Lo, s) < 0 && find_idx(Co, s) >= 0) ext_right.push_back(s);
+
+  auto join = [](std::vector<std::string> const& v) {  // comma-separated list
+    std::string s;
+    for (std::size_t i = 0; i < v.size(); ++i)
+      s += (i ? "," : "") + v[i];
+    return s;
+  };
+  std::printf("  annot_left  = %s\n", op->annot_left.c_str());
+  std::printf("  annot_right = %s\n", op->annot_right.c_str());
+  std::printf("  annot_out   = %s\n", op->annot_out.c_str());
+  std::printf("  hadamard={%s} contracted={%s} ext_left={%s} ext_right={%s}\n",
+              join(hadamard).c_str(), join(contracted).c_str(),
+              join(ext_left).c_str(), join(ext_right).c_str());
+
+  if (contracted.size() != 1) {
+    std::fprintf(stderr,
+                 "ERROR: this bench handles exactly one contracted outer index "
+                 "(got %zu)\n",
+                 contracted.size());
+    return 1;
+  }
+  if (!ext_right.empty()) {
+    std::fprintf(stderr,
+                 "ERROR: this bench assumes no pure right-external outer index "
+                 "(got %zu)\n",
+                 ext_right.size());
+    return 1;
+  }
+
+  OperandIndex Lidx, Ridx, Cidx;
+  Lidx.build(op->L);
+  Ridx.build(op->R);
+  Cidx.build(op->C);
+
+  const std::string mu = contracted[0];
+  const int muL = find_idx(Lo, mu);  // contracted index position in L dim order
+  const int muR = find_idx(Ro, mu);  // contracted index position in R dim order
+  const long Jext = Lidx.dim_extent(muL);
+  std::printf("  contracted '%s' extent J=%ld\n", mu.c_str(), Jext);
+
+  // Maps from index name -> dim position for assembling cross-operand queries.
+  std::unordered_map<std::string, int> Cpos;
+  for (int d = 0; d < (int)Co.size(); ++d) Cpos[Co[d]] = d;
+
+  // --- faithful work-unit reconstruction: one per nonzero C cell ---
+  std::printf("\nreconstructing work units from dump ...\n");
+  auto t_recon = clock_type::now();
+  std::vector<WorkUnit> units;
+  long total_cells = 0, total_calls_seq = 0;
+  double total_flops = 0;
+  std::map<long, long> J_hist;  // J -> count
+  std::map<long, long> PQ_hist; // P -> count (only P is binned; assumes P==Q)
+
+  std::vector<long> cidx(Co.size()), lq(Lo.size()), rq(Ro.size());
+  for (auto const& kv : op->C.tiles) {
+    std::size_t ord = kv.first;
+    TileSpec const& ts = kv.second;
+    // decode tile ord -> tile multi-index over C tile grid
+    std::vector<long> tmi(Cidx.rank());
+    {
+      std::size_t rem = ord;
+      for (int d = 0; d < Cidx.rank(); ++d) {
+        tmi[d] = rem / Cidx.tile_strides[d];
+        rem %= Cidx.tile_strides[d];
+      }
+    }
+    // per-dimension lower bound and extent of this tile, from the tile bounds
+    std::vector<long> tlo(Cidx.rank()), text(Cidx.rank());
+    for (int d = 0; d < Cidx.rank(); ++d) {
+      tlo[d] = Cidx.dims[d].bounds[tmi[d]];
+      text[d] = Cidx.dims[d].bounds[tmi[d] + 1] - tlo[d];
+    }
+    for (long k = 0; k < ts.outer_vol; ++k) {
+      int lbl = ts.cell_labels[k];
+      if (lbl < 0) continue;  // negative label: cell is skipped
+      // within-tile k -> element multi-index (last dimension varies fastest)
+      std::size_t rem = k;
+      for (int d = Cidx.rank() - 1; d >= 0; --d) {
+        cidx[d] = tlo[d] + (rem % text[d]);
+        rem /= text[d];
+      }
+      const auto& cr = ts.distinct[lbl].hi;  // [P, Q]
+      const long P = cr[0], Q = cr.size() > 1 ? cr[1] : cr[0];
+      // assemble L and R query templates from this C cell's hadamard+ext values
+      for (int d = 0; d < (int)Lo.size(); ++d) {
+        auto it = Cpos.find(Lo[d]);
+        if (it != Cpos.end()) lq[d] = cidx[it->second];
+      }
+      for (int d = 0; d < (int)Ro.size(); ++d) {
+        auto it = Cpos.find(Ro[d]);
+        if (it != Cpos.end()) rq[d] = cidx[it->second];
+      }
+      long J = 0;  // contracted values for which both operand queries succeed
+      for (long m = 0; m < Jext; ++m) {
+        lq[muL] = m;
+        rq[muR] = m;
+        if (Lidx.query(lq) >= 0 && Ridx.query(rq) >= 0) ++J;
+      }
+      if (J == 0) continue;
+      units.push_back({P, Q, J});
+      ++total_cells;
+      total_calls_seq += J;
+      total_flops += 2.0 * P * Q * J;
+      ++J_hist[J];
+      ++PQ_hist[P];
+    }
+  }
+  std::printf("  reconstructed %ld nonzero result cells in %.1f ms\n",
+              total_cells, ms_since(t_recon));
+  if (units.empty()) {
+    std::fprintf(stderr, "ERROR: no work units\n");
+    return 1;
+  }
+
+  // --- faithful totals (whole op) ---
+  std::printf("\n--- FAITHFUL whole-op work (from dump) ---\n");
+  std::printf("  result cells          : %ld\n", total_cells);
+  std::printf("  BLAS calls  SEQ (=ΣJ) : %ld\n", total_calls_seq);
+  std::printf("  BLAS calls  FUSED     : %ld   (reduction %.1fx)\n",
+              total_cells, double(total_calls_seq) / double(total_cells));
+  std::printf("  total flops           : %.3e\n", total_flops);
+  std::printf("  J distribution        : ");
+  for (auto const& kv : J_hist) std::printf("J=%ld:%ld  ", kv.first, kv.second);
+  std::printf("\n  P(=Q) distribution    : ");
+  for (auto const& kv : PQ_hist) std::printf("P=%ld:%ld  ", kv.first, kv.second);
+  std::printf("\n");
+
+  // --- bounded timed sample ---
+  long n_sample =
+      (cli.max_cells <= 0) ? total_cells : std::min(cli.max_cells, total_cells);
+  // even, deterministic stride sample so the timed subset matches the full
+  // (P,Q,J) distribution rather than just the first tile.
+  std::vector<WorkUnit> sample;
+  sample.reserve(n_sample);
+  double sample_flops = 0;
+  long sample_calls_seq = 0;
+  {
+    double step = double(total_cells) / double(n_sample);
+    for (long s = 0; s < n_sample; ++s) {
+      long i = std::min<long>(total_cells - 1, (long)std::llround(s * step));
+      sample.push_back(units[i]);
+      sample_flops += 2.0 * units[i].P * units[i].Q * units[i].J;
+      sample_calls_seq += units[i].J;
+    }
+  }
+  const double extrap = double(total_cells) / double(n_sample);
+  std::printf("\n--- timed sample ---\n");
+  std::printf("  sampling %ld / %ld cells (extrapolation x%.2f)\n", n_sample,
+              total_cells, extrap);
+
+  // build arena operand pool (untimed)
+  std::mt19937_64 rng(cli.seed);
+  std::printf("  building arena operand pool ...\n");
+  auto t_pool = clock_type::now();
+  std::vector<CellOperand> pool(n_sample);
+  for (long s = 0; s < n_sample; ++s) build_cell_operand(pool[s], sample[s], rng);
+  std::printf("  pool built in %.1f ms\n", ms_since(t_pool));
+
+  // correctness: every path must match the golden reference to within 1e-9
+  if (cli.check) {
+    auto check_mode = [&](const char* name, void (*fn)(CellOperand&)) {
+      double d = 0;
+      for (auto& op2 : pool) {
+        zero_C(op2);
+        fn(op2);
+        d = std::max(d, max_abs_diff_ref(op2));
+      }
+      std::printf("  check %-9s max_abs_diff=%.3e  %s\n", name, d,
+                  d < 1e-9 ? "pass" : "FAIL");
+      return d < 1e-9;
+    };
+    bool ok = true;
+    ok &= check_mode("seq_gemm", eval_seq_gemm);
+    ok &= check_mode("seq_ger", eval_seq_ger);
+    ok &= check_mode("strided", eval_strided);
+    if (!ok) {
+      std::fprintf(stderr, "correctness FAILED -- aborting timing\n");
+      return 2;
+    }
+  }
+
+  cli.repeats = std::max(cli.repeats, 1);  // timing needs >=1 rep
+  std::printf("\nresults (min/median over %d reps; sample of %ld cells)\n",
+              cli.repeats, n_sample);
+  static double g_seq_gemm = 0, g_seq_ger = 0, g_strided = 0;
+  auto run_capture = [&](const char* name, void (*fn)(CellOperand&), long calls,
+                         double& slot) {
+    // NOTE: an unrecognized --mode matches no path, so nothing gets timed.
+    if (cli.mode != "all" && cli.mode != name) return;
+    for (int w = 0; w < cli.warmup; ++w)
+      for (auto& op2 : pool) {
+        zero_C(op2);
+        fn(op2);
+      }
+    std::vector<double> times;
+    for (int r = 0; r < cli.repeats; ++r) {
+      for (auto& op2 : pool) zero_C(op2);  // zeroing stays outside the timer
+      auto t0 = clock_type::now();
+      for (auto& op2 : pool) fn(op2);
+      times.push_back(ms_since(t0));
+    }
+    std::sort(times.begin(), times.end());
+    double mn = times.front(), md = times[times.size() / 2];
+    double gf = (sample_flops / 1e9) / (mn / 1e3);
+    slot = mn;
+    std::printf(
+        "  %-9s  min=%8.2f ms  median=%8.2f ms  %7.2f GFLOPS  calls=%ld   "
+        "(whole-op est: %8.1f ms)\n",
+        name, mn, md, gf, calls, mn * extrap);
+  };
+  run_capture("seq_gemm", eval_seq_gemm, sample_calls_seq, g_seq_gemm);
+  run_capture("seq_ger", eval_seq_ger, sample_calls_seq, g_seq_ger);
+  run_capture("strided", eval_strided, n_sample, g_strided);
+
+  if (cli.mode == "all") {
+    std::printf("\n--- speedups (min time) ---\n");
+    if (g_strided > 0) {
+      std::printf("  strided vs seq_gemm : %.2fx\n", g_seq_gemm / g_strided);
+      std::printf("  strided vs seq_ger  : %.2fx\n", g_seq_ger / g_strided);
+    }
+  }
+  std::printf("\n");
+  return 0;
+}
diff --git a/examples/tot_bench/regime_a_hce_e_strided_bench.cpp b/examples/tot_bench/regime_a_hce_e_strided_bench.cpp
new file mode 100644
index 0000000000..42d0306ffa
--- /dev/null
+++ b/examples/tot_bench/regime_a_hce_e_strided_bench.cpp
@@ -0,0 +1,276 @@
+// regime_a_hce_e_strided_bench.cpp
+// ---------------------------------------------------------------------------
+// End-to-end einsum-DSL benchmark for the regime-A `hc+e` ToT product on
+// ArenaTensor inner cells. Unlike the sibling tile/BLAS-level benches
+// (op[ab]_strided_arena_dgemm.cpp, which call BLAS directly off an op_dump),
+// this driver times the SAME `einsum(...)` over the SAME arena operands under
+// the two states of the runtime kill switch
+// `TiledArray::detail::regime_a_strided_disabled()`:
+//
+//   strided   (disabled=false): the ce+e core fuses the outer-contraction k
+//             into ONE strided DGEMM per result cell (M=N=1, K=tile-volume).
+//   per-cell  (disabled=true) : the legacy per-cell rank-1 `dger` loop.
+//
+// The ONLY variable between the two timings is that toggle, so the measured
+// ratio isolates the kernel swap (rank-1 dger loop -> one strided DGEMM). It
+// is NOT an arena-vs-owning comparison: both paths read the identical arena
+// data and pay the identical einsum driver / scheduling overhead (which
+// compresses the ratio -- that is honest and expected).
+//
+// The einsum:
+//   c("h,i; a1,a2") = a("h,i,k; a1") * b("h,i,k; a2")
+//     h,i = Hadamard outer (both operands + result)
+//     k   = outer-contracted (both operands, NOT in result)
+//     a1 (left-only) x a2 (right-only) = inner OUTER-PRODUCT
+//
+// Problem is sized to mimic C6H14: large outer-contraction k (multi-tile,
+// a few hundred cells), moderate inner a1/a2, small Hadamard folding to
+// nbatch >= 2.
+// ---------------------------------------------------------------------------
+
+#include <tiledarray.h>
+#include <TiledArray/math/blas.h>
+
+#include <TiledArray/expressions/einsum.h>
+#include <TiledArray/tensor/arena_einsum.h>
+#include <TiledArray/tensor/arena_tensor.h>
+
+#include <algorithm>
+#include <chrono>
+#include <cmath>
+#include <cstdio>
+#include <cstdlib>
+#include <string>
+#include <vector>
+
+namespace TA = TiledArray;
+
+using clock_type = std::chrono::steady_clock;
+static double ms_since(clock_type::time_point t0) {
+  // Steady-clock time elapsed since t0 in milliseconds (nanosecond ticks / 1e6).
+  using std::chrono::nanoseconds;
+  const auto elapsed = clock_type::now() - t0;
+  return std::chrono::duration_cast<nanoseconds>(elapsed).count() / 1.0e6;
+}
+
+// ===========================================================================
+// CLI
+// ===========================================================================
+
+struct Cli {
+  int reps{20};     // timed repetitions of each path
+  int warmup{3};    // untimed warmup repetitions of each path
+  int k_tiles{8};   // tile count along the contracted index k
+  int k_tile{32};   // extent of every k tile
+  int inner{24};    // common extent of the inner indices a1 and a2
+};
+
+static void usage() {  // writes the flag summary to stderr
+  std::fputs(
+      "regime_a_hce_e_strided_bench\n"
+      "  --reps R       timed reps per path        (default 20)\n"
+      "  --warmup W      untimed warmup reps        (default 3)\n"
+      "  --k_tiles N    number of k tiles          (default 8)\n"
+      "  --k_tile E     extent of each k tile       (default 32)\n"
+      "  --inner P      |a1| = |a2|                 (default 24)\n", stderr);
+}
+
+static Cli parse_cli(int argc, char** argv) {
+  Cli cfg;
+  for (int pos = 1; pos < argc; ++pos) {
+    const std::string flag = argv[pos];
+    auto value = [&]() -> std::string {  // next argv entry; usage error if none
+      if (pos + 1 < argc) return argv[++pos];
+      usage();
+      std::exit(1);
+    };
+    // NOTE(review): std::stoi throws on non-numeric or out-of-range text and is
+    // not caught here; the parsed values are stored without any range check.
+    if (flag == "--reps")
+      cfg.reps = std::stoi(value());
+    else if (flag == "--warmup")
+      cfg.warmup = std::stoi(value());
+    else if (flag == "--k_tiles")
+      cfg.k_tiles = std::stoi(value());
+    else if (flag == "--k_tile")
+      cfg.k_tile = std::stoi(value());
+    else if (flag == "--inner")
+      cfg.inner = std::stoi(value());
+    else if (flag == "-h" || flag == "--help") {
+      usage();
+      std::exit(0);
+    } else {
+      std::fprintf(stderr, "unknown flag: %s\n", flag.c_str());
+      usage();
+      std::exit(1);
+    }
+  }
+  return cfg;
+}
+
+// ===========================================================================
+// main: time the same einsum with the strided kill switch off, then on
+// ===========================================================================
+
+int main(int argc, char** argv) {
+  Cli cli = parse_cli(argc, argv);
+  auto& world = TA_SCOPED_INITIALIZE(argc, argv);
+  cli.reps = std::max(cli.reps, 1);  // min_element below needs >=1 sample
+  using ArenaInner = TA::ArenaTensor<double, TA::Range>;
+  using ArenaOuter = TA::Tensor<ArenaInner>;
+  using ArenaArr = TA::DistArray<ArenaOuter, TA::DensePolicy>;
+
+  const long H = 2, I = 2;             // Hadamard outer (fold to nbatch = H*I = 4)
+  const long P = cli.inner;            // |a1|
+  const long Q = cli.inner;            // |a2|
+  const long nbatch = H * I;
+  const long Kcells = static_cast<long>(cli.k_tiles) * cli.k_tile;
+
+  // k as an explicit multi-boundary TiledRange1: k_tiles tiles of k_tile each.
+  std::vector<long> kbounds;  // NOTE(review): k_tiles/k_tile are not validated
+  kbounds.reserve(cli.k_tiles + 1);
+  for (int t = 0; t <= cli.k_tiles; ++t)
+    kbounds.push_back(static_cast<long>(t) * cli.k_tile);
+  TA::TiledRange1 ktr1(kbounds.begin(), kbounds.end());
+
+  // outer (h,i,k): h one tile extent H, i one tile extent I, k multi-tile.
+  TA::TiledRange a_trange{TA::TiledRange1{0l, H}, TA::TiledRange1{0l, I}, ktr1};
+  TA::TiledRange b_trange{TA::TiledRange1{0l, H}, TA::TiledRange1{0l, I}, ktr1};
+
+  // Smooth deterministic fill (a function of global coordinates).
+  auto a_val = [](long h, long i, long k, long a1) {
+    return 1.0 + 0.5 * i + 0.25 * std::sin(0.13 * k) + 0.125 * a1 + 0.0625 * h;
+  };
+  auto b_val = [](long h, long i, long k, long a2) {
+    return 2.0 - 0.3 * std::cos(0.07 * k) + 0.2 * i + 0.05 * a2 + 0.03 * h;
+  };
+
+  // ---- Construct a, b once (arena ToT) ----
+  ArenaArr a(world, a_trange);
+  a.init_tiles([&](const TA::Range& tr) {
+    ArenaOuter t = TA::detail::arena_outer_init<ArenaOuter>(
+        tr, 1, [&](std::size_t /*ord*/) { return TA::Range{P}; });
+    for (std::size_t o = 0; o < t.range().volume(); ++o) {
+      ArenaInner& c = t.data()[o];
+      if (!c) continue;  // skip cells that test false
+      const auto idx = tr.idx(o);
+      const long h = static_cast<long>(idx[0]);
+      const long i = static_cast<long>(idx[1]);
+      const long k = static_cast<long>(idx[2]);
+      for (long a1 = 0; a1 < P; ++a1) c.data()[a1] = a_val(h, i, k, a1);
+    }
+    return t;
+  });
+  ArenaArr b(world, b_trange);
+  b.init_tiles([&](const TA::Range& tr) {
+    ArenaOuter t = TA::detail::arena_outer_init<ArenaOuter>(
+        tr, 1, [&](std::size_t /*ord*/) { return TA::Range{Q}; });
+    for (std::size_t o = 0; o < t.range().volume(); ++o) {
+      ArenaInner& c = t.data()[o];
+      if (!c) continue;  // skip cells that test false
+      const auto idx = tr.idx(o);
+      const long h = static_cast<long>(idx[0]);
+      const long i = static_cast<long>(idx[1]);
+      const long k = static_cast<long>(idx[2]);
+      for (long a2 = 0; a2 < Q; ++a2) c.data()[a2] = b_val(h, i, k, a2);
+    }
+    return t;
+  });
+  world.gop.fence();
+
+  std::printf("=== regime-A hc+e einsum strided-vs-per-cell bench ===\n");
+  std::printf(
+      "shape: h=%ld(x1 tile) i=%ld(x1 tile) k=%ld (%d tiles x %d)  "
+      "a1=%ld a2=%ld  nbatch=%ld\n",
+      H, I, Kcells, cli.k_tiles, cli.k_tile, P, Q, nbatch);
+  std::printf("reps=%d warmup=%d\n", cli.reps, cli.warmup);
+
+  // ---- Warmup both paths (untimed: JIT / page-fault / threadpool warmup) ----
+  for (int w = 0; w < cli.warmup; ++w) {
+    TA::detail::regime_a_strided_disabled() = false;
+    {
+      auto c = einsum(a("h,i,k;a1"), b("h,i,k;a2"), "h,i;a1,a2");
+      c.world().gop.fence();
+    }
+    TA::detail::regime_a_strided_disabled() = true;
+    {
+      auto c = einsum(a("h,i,k;a1"), b("h,i,k;a2"), "h,i;a1,a2");
+      c.world().gop.fence();
+    }
+  }
+  TA::detail::regime_a_strided_disabled() = false;
+  world.gop.fence();
+
+  // Median of a copy of v; 0.0 for an empty vector, mean of the middle pair
+  // when the size is even.
+  auto median = [](std::vector<double> v) -> double {
+    std::sort(v.begin(), v.end());
+    const std::size_t n = v.size();
+    if (n == 0) return 0.0;
+    return (n % 2) ? v[n / 2] : 0.5 * (v[n / 2 - 1] + v[n / 2]);
+  };
+  // ---- Timed: strided path (disabled = false) ----
+  TA::detail::regime_a_strided_disabled() = false;
+#ifdef TA_STRIDED_DGEMM_COUNT
+  TA::detail::g_strided_dgemm_ce_e_calls.store(0);
+#endif
+  std::vector<double> strided_ms;
+  strided_ms.reserve(cli.reps);
+  for (int r = 0; r < cli.reps; ++r) {
+    auto t0 = clock_type::now();
+    auto c = einsum(a("h,i,k;a1"), b("h,i,k;a2"), "h,i;a1,a2");
+    c.world().gop.fence();
+    strided_ms.push_back(ms_since(t0));
+  }
+#ifdef TA_STRIDED_DGEMM_COUNT
+  const std::size_t strided_calls =
+      TA::detail::g_strided_dgemm_ce_e_calls.load();
+#endif
+  const double t_strided_min =
+      *std::min_element(strided_ms.begin(), strided_ms.end());
+  const double t_strided_med = median(strided_ms);
+
+  // ---- Timed: per-cell path (disabled = true) ----
+  TA::detail::regime_a_strided_disabled() = true;
+  std::vector<double> percell_ms;
+  percell_ms.reserve(cli.reps);
+  for (int r = 0; r < cli.reps; ++r) {
+    auto t0 = clock_type::now();
+    auto c = einsum(a("h,i,k;a1"), b("h,i,k;a2"), "h,i;a1,a2");
+    c.world().gop.fence();
+    percell_ms.push_back(ms_since(t0));
+  }
+  const double t_percell_min =
+      *std::min_element(percell_ms.begin(), percell_ms.end());
+  const double t_percell_med = median(percell_ms);
+
+  // ---- Restore production default ----
+  TA::detail::regime_a_strided_disabled() = false;
+
+  // ---- Report ----
+  std::printf("\n--- results (per einsum call, ms) ---\n");
+  std::printf("t_percell : min=%8.4f ms  median=%8.4f ms\n", t_percell_min,
+              t_percell_med);
+  std::printf("t_strided : min=%8.4f ms  median=%8.4f ms\n", t_strided_min,
+              t_strided_med);
+  const double speedup_min =
+      t_strided_min > 0.0 ? t_percell_min / t_strided_min : 0.0;
+  const double speedup_med =
+      t_strided_med > 0.0 ? t_percell_med / t_strided_med : 0.0;
+  std::printf("speedup   : min=%6.3fx  median=%6.3fx  (t_percell / t_strided)\n",
+              speedup_min, speedup_med);
+
+#ifdef TA_STRIDED_DGEMM_COUNT
+  std::printf("\n--- firing witness (TA_STRIDED_DGEMM_COUNT) ---\n");
+  std::printf("g_strided_dgemm_ce_e_calls = %zu  (over %d strided reps)\n",
+              strided_calls, cli.reps);
+  if (strided_calls == 0) {
+    std::fprintf(stderr,
+                 "ERROR: strided DGEMM never fired -- reported numbers would "
+                 "reflect a silent fallback, not the strided path.\n");
+    std::abort();
+  }
+  std::printf("OK: strided DGEMM fired (counter > 0).\n");
+#endif
+
+  return 0;
+}
diff --git a/src/TiledArray/expressions/cont_engine.h b/src/TiledArray/expressions/cont_engine.h
index 7597479d74..76be6fc3c1 100644
--- a/src/TiledArray/expressions/cont_engine.h
+++ b/src/TiledArray/expressions/cont_engine.h
@@ -136,6 +136,31 @@ class ContEngine : public BinaryEngine<Derived> {
                                 ///< (view-inner-cell) ToT tiles, where a
                                 ///< value-returning per-cell op cannot be
                                 ///< used; null otherwise
+  std::function<void(result_tile_type&, const left_tile_type&,
+                     const right_tile_type&, const math::GemmHelper&)>
+      arena_strided_dgemm_ce_e_tile_op_;  ///< whole-tile ce+e strided DGEMM op
+                                          ///< (arena inner OUTER-PRODUCT under an
+                                          ///< outer contraction); null otherwise
+  std::function<void(result_tile_type&, const left_tile_type&,
+                     const right_tile_type&, const math::GemmHelper&)>
+      arena_strided_dgemm_ce_ce_right_tile_op_;  ///< whole-tile ce+ce strided DGEMM op
+                                           ///< (arena inner CONTRACTION under an
+                                           ///< outer contraction; right-external
+                                           ///< rides BLAS M, left-external rides
+                                           ///< an outer loop); null otherwise.
+                                           ///< Mutually exclusive with
+                                           ///< arena_strided_dgemm_ce_e_tile_op_
+                                           ///< (disjoint num_contract_ranks()
+                                           ///< gates)
+  std::function<void(result_tile_type&, const left_tile_type&,
+                     const right_tile_type&, const math::GemmHelper&)>
+      arena_strided_dgemm_ce_ce_left_tile_op_;  ///< whole-tile ce+ce strided
+                                                ///< DGEMM op, LEFT-clean mirror:
+                                                ///< left-external rides BLAS M,
+                                                ///< right-external rides an
+                                                ///< outer loop. Mutually
+                                                ///< exclusive with the ce_e and
+                                                ///< ce_ce_right ops.
   using arena_plan_storage_t =
       TiledArray::detail::arena_plan_storage_t<result_tile_type, left_tile_type,
                                                right_tile_type>;
@@ -326,6 +351,18 @@ class ContEngine : public BinaryEngine<Derived> {
                       outer_size(left_indices_), outer_size(right_indices_),
                       total_perm, this->element_nonreturn_op_,
                       std::move(this->arena_plan_));
+        if constexpr (TiledArray::detail::is_tensor_of_tensor_v<value_type>) {
+          // ce+e, ce+ce_right and ce+ce_left are mutually exclusive (ce+e gates
+          // on num_contract_ranks()==0; the two ce+ce orientations on disjoint
+          // right-/left-clean inner structure), so at most one is non-null and
+          // only one install fires.
+          if (this->arena_strided_dgemm_ce_e_tile_op_)
+            op_.set_strided_oprod_op(this->arena_strided_dgemm_ce_e_tile_op_);
+          if (this->arena_strided_dgemm_ce_ce_right_tile_op_)
+            op_.set_strided_oprod_op(this->arena_strided_dgemm_ce_ce_right_tile_op_);
+          if (this->arena_strided_dgemm_ce_ce_left_tile_op_)
+            op_.set_strided_oprod_op(this->arena_strided_dgemm_ce_ce_left_tile_op_);
+        }
         // Plan ownership transferred to op_; mark carrier slot empty so any
         // later use of arena_plan_ reads as "no plan" rather than moved-from.
         if constexpr (!std::is_same_v<arena_plan_storage_t, std::monostate>) {
@@ -371,6 +408,18 @@ class ContEngine : public BinaryEngine<Derived> {
                       outer_size(left_indices_), outer_size(right_indices_),
                       total_perm, this->element_nonreturn_op_,
                       std::move(this->arena_plan_));
+        if constexpr (TiledArray::detail::is_tensor_of_tensor_v<value_type>) {
+          // ce+e, ce+ce_right and ce+ce_left are mutually exclusive (ce+e gates
+          // on num_contract_ranks()==0; the two ce+ce orientations on disjoint
+          // right-/left-clean inner structure), so at most one is non-null and
+          // only one install fires.
+          if (this->arena_strided_dgemm_ce_e_tile_op_)
+            op_.set_strided_oprod_op(this->arena_strided_dgemm_ce_e_tile_op_);
+          if (this->arena_strided_dgemm_ce_ce_right_tile_op_)
+            op_.set_strided_oprod_op(this->arena_strided_dgemm_ce_ce_right_tile_op_);
+          if (this->arena_strided_dgemm_ce_ce_left_tile_op_)
+            op_.set_strided_oprod_op(this->arena_strided_dgemm_ce_ce_left_tile_op_);
+        }
         // Plan ownership transferred to op_; mark carrier slot empty so any
         // later use of arena_plan_ reads as "no plan" rather than moved-from.
         if constexpr (!std::is_same_v<arena_plan_storage_t, std::monostate>) {
@@ -730,6 +779,252 @@ class ContEngine : public BinaryEngine<Derived> {
                   TA_EXCEPTION(
                       "nested contraction on view inner tiles: the arena fast "
                       "path was inactive (arena disabled)");
+                // ce+e (hce+e): inner OUTER product (no inner contraction)
+                // under outer contraction on arena view cells -> one strided
+                // DGEMM per result cell (ride the contracted index into BLAS
+                // K). Only the canonical perm-free layout is fused; a
+                // non-identity inner result perm is applied downstream and left
+                // to the per-cell path here.
+                // The strided kernel is specialized to view (arena) inner cells
+                // with double storage, and its static_assert requires that of
+                // ALL THREE operands (result, left, right). Gate on the same
+                // 3-operand predicate so a mixed-operand contraction (e.g. a
+                // view/double result with a non-view or non-double operand, or
+                // float/complex inner) stays on the generic per-cell path and
+                // never instantiates the double-view-only kernel (which would be
+                // a hard compile error rather than a graceful fallback).
+                if constexpr (TiledArray::is_tensor_view_v<
+                                  result_tile_element_type> &&
+                              TiledArray::is_tensor_view_v<
+                                  left_tile_element_type> &&
+                              TiledArray::is_tensor_view_v<
+                                  right_tile_element_type> &&
+                              std::is_same_v<typename result_tile_element_type::
+                                                 numeric_type,
+                                             double> &&
+                              std::is_same_v<
+                                  typename left_tile_element_type::numeric_type,
+                                  double> &&
+                              std::is_same_v<
+                                  typename right_tile_element_type::numeric_type,
+                                  double>) {
+                  if (contrreduce_op.gemm_helper().num_contract_ranks() == 0 &&
+                      !bool(inner(this->perm_))) {
+                    const scalar_type factor = this->factor_;
+                    this->arena_strided_dgemm_ce_e_tile_op_ =
+                        [factor](result_tile_type& Cc, const left_tile_type& Lt,
+                                 const right_tile_type& Rt,
+                                 const math::GemmHelper& gh) {
+                          using integer = TiledArray::math::blas::integer;
+                          integer M, N, K;
+                          gh.compute_matrix_sizes(M, N, K, Lt.range(),
+                                                  Rt.range());
+                          TiledArray::detail::arena_strided_dgemm_ce_e(
+                              Cc, Lt, Rt, static_cast<std::size_t>(M),
+                              static_cast<std::size_t>(N),
+                              static_cast<std::size_t>(K), gh.left_op(),
+                              gh.right_op(), double(factor));
+                        };
+                  }
+                  // ce+ce (hce+ce): inner CONTRACTION (num_contract_ranks() >=
+                  // 1) under outer contraction. One operand inner must be a pure
+                  // contraction vector; that side's outer-external rides BLAS M
+                  // with one strided DGEMM per (batch, other-external,
+                  // outer-contraction) cell. Two orientations (right-clean ->
+                  // ce_ce_right, left-clean -> ce_ce_left); see the either-side
+                  // rule below. Sibling of the ce+e arm above (disjoint
+                  // num_contract_ranks gate) so at most one strided op installs.
+                  const auto& inner_gh = contrreduce_op.gemm_helper();
+                  const bool inner_contraction =
+                      inner_gh.num_contract_ranks() >= 1;
+                  // STRIDED-APPLICABILITY (EITHER-SIDE) RULE for ce+ce.
+                  // An inner contraction is strided-castable iff at least ONE
+                  // operand inner is a pure contraction vector, i.e. carries NO
+                  // inner external:
+                  //  * right-clean -> ce_ce_right: the right inner cell is the
+                  //    pure contraction vector R[k,μ̃](a4); ride the
+                  //    right-external into BLAS M.
+                  //  * left-clean -> ce_ce_left: the mirror image; ride the
+                  //    left-external into BLAS M.
+                  // An operand's inner-external rank is its rank -
+                  // num_contract_ranks; clean == 0. If both arms qualify, the
+                  // right arm is installed (`if` / `else if` below).
+                  //
+                  // MATRIX x MATRIX EXCLUSION. When BOTH operand inners carry
+                  // an external (a genuine inner matrix x matrix, e.g.
+                  // C(m,n;μ,ν) = A(m,k;μ,κ) * B(k,n;κ,ν)), riding μ̃ into BLAS M
+                  // would need a two-level stride the kernel cannot represent:
+                  // the per-cell `clean` probe fails and the GEMV fallback then
+                  // silently contributes nothing (in the right-clean kernel the
+                  // result cell volume P*Q no longer matches the left cell).
+                  // Neither arm is therefore installed for such shapes and they
+                  // take the generic per-cell contraction path.
+                  const bool right_inner_clean =
+                      inner_gh.right_rank() == inner_gh.num_contract_ranks();
+                  const bool left_inner_clean =
+                      inner_gh.left_rank() == inner_gh.num_contract_ranks();
+                  // Derive the outer-contracted rank `oc` from the outer index
+                  // sizes (same helper used by the outer op when building op_).
+                  const auto oc = (outer_size(this->left_indices_) +
+                                   outer_size(this->right_indices_) -
+                                   outer_size(this->indices_)) /
+                                  2;
+                  // the ridden operand must carry an outer external to ride.
+                  const bool right_has_ext =
+                      outer_size(this->right_indices_) > oc;
+                  const bool left_has_ext = outer_size(this->left_indices_) > oc;
+                  // STRICT canonical inner orientation (identity == "no inner
+                  // transpose"): right core assumes L=(a1,a4), R=(a4); left
+                  // core assumes L=(a4), R=(a4,b1); no inner result perm.
+                  // NOT the install gate: right_arm_ok / left_arm_ok below also
+                  // admit matrix_transpose. Only the verbose log reads this.
+                  const bool inner_canonical =
+                      this->left_inner_permtype_ ==
+                          TiledArray::expressions::PermutationType::identity &&
+                      this->right_inner_permtype_ ==
+                          TiledArray::expressions::PermutationType::identity &&
+                      !bool(inner(this->perm_));
+                  // RELAXED gate. The strided kernel can fold a matrix_transpose
+                  // of the EXTERNAL-carrying operand into the inner GEMM op flag
+                  // (zero-copy), because matrix_transpose is a contiguous
+                  // two-block swap (permopt) so the cell still flattens cleanly.
+                  // The CLEAN (pure contraction vector) side must stay identity,
+                  // the result inner must not be permuted, and a `general` inner
+                  // perm still falls back. right arm: left carries the external
+                  // (may be T), right is the vector (id). left arm: mirror.
+                  auto inner_pt_ok =
+                      [](TiledArray::expressions::PermutationType p) {
+                        return p == TiledArray::expressions::PermutationType::
+                                        identity ||
+                               p == TiledArray::expressions::PermutationType::
+                                        matrix_transpose;
+                      };
+                  const bool no_result_inner_perm = !bool(inner(this->perm_));
+                  const bool right_arm_ok =
+                      inner_contraction && no_result_inner_perm &&
+                      right_inner_clean && right_has_ext &&
+                      this->right_inner_permtype_ ==
+                          TiledArray::expressions::PermutationType::identity &&
+                      inner_pt_ok(this->left_inner_permtype_);
+                  const bool left_arm_ok =
+                      inner_contraction && no_result_inner_perm &&
+                      left_inner_clean && left_has_ext &&
+                      this->left_inner_permtype_ ==
+                          TiledArray::expressions::PermutationType::identity &&
+                      inner_pt_ok(this->right_inner_permtype_);
+                  if (right_arm_ok) {
+                    const scalar_type factor = this->factor_;
+                    const bool left_inner_T =
+                        this->left_inner_permtype_ ==
+                        TiledArray::expressions::PermutationType::matrix_transpose;
+                    this->arena_strided_dgemm_ce_ce_right_tile_op_ =
+                        [factor, left_inner_T](
+                            result_tile_type& Cc, const left_tile_type& Lt,
+                            const right_tile_type& Rt,
+                            const math::GemmHelper& gh) {
+                          math::blas::integer Mo = 0, No = 0, Ko = 0;
+                          gh.compute_matrix_sizes(Mo, No, Ko, Lt.range(),
+                                                  Rt.range());
+                          TiledArray::detail::arena_strided_dgemm_ce_ce_right(
+                              Cc, Lt, Rt, static_cast<std::size_t>(Mo),
+                              static_cast<std::size_t>(No),
+                              static_cast<std::size_t>(Ko), gh.left_op(),
+                              gh.right_op(), double(factor), left_inner_T);
+                        };
+                  } else if (left_arm_ok) {
+                    const scalar_type factor = this->factor_;
+                    const bool right_inner_T =
+                        this->right_inner_permtype_ ==
+                        TiledArray::expressions::PermutationType::matrix_transpose;
+                    this->arena_strided_dgemm_ce_ce_left_tile_op_ =
+                        [factor, right_inner_T](
+                            result_tile_type& Cc, const left_tile_type& Lt,
+                            const right_tile_type& Rt,
+                            const math::GemmHelper& gh) {
+                          math::blas::integer Mo = 0, No = 0, Ko = 0;
+                          gh.compute_matrix_sizes(Mo, No, Ko, Lt.range(),
+                                                  Rt.range());
+                          TiledArray::detail::arena_strided_dgemm_ce_ce_left(
+                              Cc, Lt, Rt, static_cast<std::size_t>(Mo),
+                              static_cast<std::size_t>(No),
+                              static_cast<std::size_t>(Ko), gh.left_op(),
+                              gh.right_op(), double(factor), right_inner_T);
+                        };
+                  }
+                  // [strided-dgemm] install-decision instrumentation. For each
+                  // ToT contraction reaching this double-view path, report
+                  // whether a strided-DGEMM regime (hce+e / hc+e / hce+ce)
+                  // FIRED or the contraction REVERTED to the generic by-cell
+                  // evaluation path (with the blocking guard). Gated by
+                  // TA_STRIDED_DGEMM_VERBOSE; a no-op otherwise.
+                  if (TiledArray::detail::strided_dgemm_verbose()) {
+                    if (this->arena_strided_dgemm_ce_e_tile_op_) {
+                      TiledArray::detail::strided_dgemm_log(
+                          left_has_ext ? "hce+e  FIRES (ce+e)"
+                                       : "hc+e   FIRES (ce+e)");
+                    } else if (this->arena_strided_dgemm_ce_ce_right_tile_op_) {
+                      TiledArray::detail::strided_dgemm_log(
+                          "hce+ce FIRES (ce+ce right)");
+                    } else if (this->arena_strided_dgemm_ce_ce_left_tile_op_) {
+                      TiledArray::detail::strided_dgemm_log(
+                          "hce+ce FIRES (ce+ce left)");
+                    } else if (!inner_contraction) {
+                      // ce+e candidate (no inner contraction); the only guard
+                      // that can block its install is a non-identity inner
+                      // result perm.
+                      TiledArray::detail::strided_dgemm_log(
+                          left_has_ext
+                              ? "hce+e  REVERTED -> by-cell (inner result perm)"
+                              : "hc+e   REVERTED -> by-cell (inner result perm)");
+                    } else if (!inner_canonical) {
+                      // ce+ce candidate with a non-canonical inner perm. Report
+                      // WHICH operand/result perm is non-identity and the inner
+                      // ranks, so free transposes ('T') can be told apart from
+                      // general perms ('gen'). NOTE: a 'T' accepted by the
+                      // relaxed gate also lands here (real blocker elsewhere).
+                      auto pt = [](TiledArray::expressions::PermutationType p)
+                          -> const char* {
+                        switch (p) {
+                          case TiledArray::expressions::PermutationType::
+                              identity:
+                            return "id";
+                          case TiledArray::expressions::PermutationType::
+                              matrix_transpose:
+                            return "T";
+                          case TiledArray::expressions::PermutationType::
+                              general:
+                            return "gen";
+                        }
+                        return "?";
+                      };
+                      std::string msg =
+                          "hce+ce REVERTED -> by-cell (non-canonical inner "
+                          "perm: L=";
+                      msg += pt(this->left_inner_permtype_);
+                      msg += " R=";
+                      msg += pt(this->right_inner_permtype_);
+                      msg += " resInner=";
+                      msg += (bool(inner(this->perm_)) ? "perm" : "id");
+                      msg += "; innerRank L/R/contract=";
+                      msg += std::to_string(inner_gh.left_rank());
+                      msg += "/";
+                      msg += std::to_string(inner_gh.right_rank());
+                      msg += "/";
+                      msg += std::to_string(inner_gh.num_contract_ranks());
+                      msg += ")";
+                      TiledArray::detail::strided_dgemm_log(msg.c_str());
+                    } else {
+                      // ce+ce candidate, canonical inner, but no clean side /
+                      // no outer external to ride.
+                      TiledArray::detail::strided_dgemm_log(
+                          !(right_inner_clean || left_inner_clean)
+                              ? "hce+ce REVERTED -> by-cell (matrix x matrix, "
+                                "no clean inner side)"
+                              : "hce+ce REVERTED -> by-cell (no outer external "
+                                "to ride)");
+                    }
+                  }
+                }
               } else {
                 // outer Hadamard: MultEngine builds a binary tile op, which
                 // cannot use a value-returning per-cell op. Supply a whole-tile
diff --git a/src/TiledArray/tensor/arena_einsum.h b/src/TiledArray/tensor/arena_einsum.h
index 8d7b3a578a..f49b259c4b 100644
--- a/src/TiledArray/tensor/arena_einsum.h
+++ b/src/TiledArray/tensor/arena_einsum.h
@@ -12,7 +12,19 @@
 #include "TiledArray/tensor/type_traits.h"
 #include "TiledArray/util/annotation.h"
 
+#include <algorithm>
+#include <atomic>
+#include <chrono>
+#include <cstdint>
+#include <cstdlib>
+#include <iomanip>
+#include <iostream>
+#include <map>
+#include <mutex>
 #include <optional>
+#include <unordered_map>
+#include <vector>
+#include <string>
 #include <type_traits>
 #include <utility>
 #include <variant>
@@ -25,6 +37,816 @@
 
 namespace TiledArray::detail {
 
+/// Toggle for the strided-DGEMM install logger, controlled by the environment
+/// variable TA_STRIDED_DGEMM_VERBOSE (read once, on the first call, then
+/// cached). \return true iff it is set, non-empty and not exactly "0"; the
+/// ContEngine then logs per ToT contraction whether a regime FIRES or REVERTS.
+inline bool strided_dgemm_verbose() {
+  static const bool enabled = [] {
+    const char* e = std::getenv("TA_STRIDED_DGEMM_VERBOSE");
+    return e != nullptr && e[0] != '\0' && !(e[0] == '0' && e[1] == '\0');
+  }();
+  return enabled;
+}
+
+/// One-line install-decision logger: writes "[strided-dgemm] <msg>" and a
+/// newline to std::cerr. No-op unless strided_dgemm_verbose() returns true.
+inline void strided_dgemm_log(const char* msg) {
+  if (strided_dgemm_verbose()) std::cerr << "[strided-dgemm] " << msg << '\n';
+}
+
+/// ===========================================================================
+/// GEMM-vs-op timing instrumentation (Amdahl profiling).
+///
+/// Accumulates the wall-clock nanoseconds spent INSIDE blas::gemm in the
+/// strided-DGEMM regimes, separated by regime (ce+e = inner outer-product,
+/// ce+ce = inner contraction). The inner-cell loops in each kernel are serial
+/// within a single Product op, so summing per-call durations across all ops is
+/// directly comparable to the summed per-op "Eval | Product | <ns>ns" trace
+/// time (same aggregation across MADNESS task threads). The ratio
+/// gemm_ns / product_ns is the Amdahl compute (kernel) fraction.
+///
+/// Master switch: true iff the environment variable TA_GEMM_TIMING is set,
+/// non-empty and not exactly "0" (read once, then cached). When false the
+/// timers sample no clock and touch no atomics. Totals go to stderr at exit.
+inline bool gemm_timing_enabled() {
+  static const bool enabled = [] {
+    const char* e = std::getenv("TA_GEMM_TIMING");
+    return e != nullptr && e[0] != '\0' && !(e[0] == '0' && e[1] == '\0');
+  }();
+  return enabled;
+}
+
+inline std::atomic<std::uint64_t> g_gemm_ns_ce_e{0};      // ce+e  gemm wall ns
+inline std::atomic<std::uint64_t> g_gemm_ns_ce_ce{0};     // ce+ce gemm wall ns
+inline std::atomic<std::uint64_t> g_gemm_calls_ce_e{0};   // ce+e  gemm calls
+inline std::atomic<std::uint64_t> g_gemm_calls_ce_ce{0};  // ce+ce gemm calls
+
+/// RAII timer scoping a single blas::gemm: on destruction adds the elapsed
+/// steady_clock ns to `ns_acc` and 1 to `call_acc`. Inert w/o TA_GEMM_TIMING.
+class ScopedGemmTimer {
+ public:
+  ScopedGemmTimer(std::atomic<std::uint64_t>& ns_acc,
+                  std::atomic<std::uint64_t>& call_acc)
+      : ns_(gemm_timing_enabled() ? &ns_acc : nullptr), calls_(&call_acc) {
+    if (ns_) t0_ = std::chrono::steady_clock::now();
+  }
+  ~ScopedGemmTimer() {
+    if (!ns_) return;  // timing disabled: no clock read, no atomic touch
+    const auto dt = std::chrono::duration_cast<std::chrono::nanoseconds>(
+                        std::chrono::steady_clock::now() - t0_)
+                        .count();
+    ns_->fetch_add(static_cast<std::uint64_t>(dt), std::memory_order_relaxed);
+    calls_->fetch_add(1, std::memory_order_relaxed);
+  }
+  ScopedGemmTimer(const ScopedGemmTimer&) = delete;
+  ScopedGemmTimer& operator=(const ScopedGemmTimer&) = delete;
+
+ private:
+  std::atomic<std::uint64_t>* ns_;     // nullptr <=> timing disabled
+  std::atomic<std::uint64_t>* calls_;  // dereferenced only when ns_ is set
+  std::chrono::steady_clock::time_point t0_;  // start; assigned iff ns_ is set
+};
+
+/// ---------------------------------------------------------------------------
+/// Per-shape GEMM histogram (one registry per regime: ce+e and ce+ce).
+///
+/// Each timed strided GEMM is bucketed by its exact (M,N,K) triple, tallying
+/// call count + wall ns per bucket. For ce+e a GEMM is a (P x Q) result
+/// accumulated over K, i.e. M=P, N=Q, K=K. This shows whether the inner
+/// extents in a given molecule are large enough to feed an efficient DGEMM
+/// (the bench used K~256; CSV/PNO domains in small molecules may be smaller).
+///
+/// To stay lock-free in the hot loop, each thread accumulates into its own
+/// heap-allocated map (intentionally leaked so the pointer survives MADNESS
+/// worker-thread teardown); the exit dumper merges all registered maps.
+struct GemmShapeMap {
+  // key = pack_shape(M,N,K) (21 bits each); value = {call count, total ns}.
+  std::unordered_map<std::uint64_t, std::pair<std::uint64_t, std::uint64_t>> m;
+};
+
+/// A per-regime shape registry: the list of per-thread maps (merged at exit)
+/// plus the mutex taken while appending to it. Two instances: ce+e, ce+ce.
+struct ShapeRegistry {
+  std::mutex mtx;                   // held around maps.push_back (tls_shapes)
+  std::vector<GemmShapeMap*> maps;  // one leaked map per registering thread
+};
+inline ShapeRegistry g_ce_e_shapes;   // ce+e regime
+inline ShapeRegistry g_ce_ce_shapes;  // ce+ce regime
+
+/// Returns the calling thread's bucket map for `reg`, creating and registering
+/// it (under reg.mtx) on first use. Maps are heap-allocated + leaked so they
+/// survive MADNESS worker-thread teardown; the scan covers <= 2 registries.
+inline GemmShapeMap& tls_shapes(ShapeRegistry& reg) {
+  thread_local std::vector<std::pair<ShapeRegistry*, GemmShapeMap*>> mine;
+  for (auto& p : mine)
+    if (p.first == &reg) return *p.second;  // already registered by this thread
+  auto* a = new GemmShapeMap();  // deliberately never deleted
+  {
+    std::lock_guard<std::mutex> lk(reg.mtx);
+    reg.maps.push_back(a);
+  }
+  mine.emplace_back(&reg, a);
+  return *a;
+}
+
+/// Packs (M,N,K) into one key, 21 bits each; extents > 2,097,151 saturate.
+inline std::uint64_t pack_shape(std::size_t M, std::size_t N, std::size_t K) {
+  constexpr std::uint64_t MASK = (std::uint64_t{1} << 21) - 1;
+  const auto sat = [&](std::uint64_t v) { return std::min(v, MASK); };
+  return (sat(M) << 42) | (sat(N) << 21) | sat(K);
+}
+
+/// Tallies one (M,N,K) GEMM that took `ns` ns in this thread's map for `reg`.
+inline void record_shape(ShapeRegistry& reg, std::size_t M, std::size_t N,
+                         std::size_t K, std::uint64_t ns) {
+  auto& e = tls_shapes(reg).m[pack_shape(M, N, K)];
+  e.first += 1;    // call count for this shape
+  e.second += ns;  // accumulated wall ns for this shape
+}
+
+/// RAII timer wrapping one blas::gemm of shape (M,N,K): on destruction adds the
+/// elapsed wall ns to `ns_acc`, 1 to `call_acc`, and tallies the shape in `reg`.
+/// No-op (no clock read, no map touch) unless TA_GEMM_TIMING is set.
+class ScopedShapedGemmTimer {
+ public:
+  ScopedShapedGemmTimer(std::atomic<std::uint64_t>& ns_acc,
+                        std::atomic<std::uint64_t>& call_acc, ShapeRegistry& reg,
+                        std::size_t M, std::size_t N, std::size_t K)
+      : on_(gemm_timing_enabled()),
+        ns_(&ns_acc),
+        calls_(&call_acc),
+        reg_(&reg),
+        M_(M),
+        N_(N),
+        K_(K) {
+    if (on_) t0_ = std::chrono::steady_clock::now();
+  }
+  ~ScopedShapedGemmTimer() {
+    if (!on_) return;  // timing disabled: nothing was sampled
+    const auto dt = static_cast<std::uint64_t>(
+        std::chrono::duration_cast<std::chrono::nanoseconds>(
+            std::chrono::steady_clock::now() - t0_)
+            .count());
+    ns_->fetch_add(dt, std::memory_order_relaxed);
+    calls_->fetch_add(1, std::memory_order_relaxed);
+    record_shape(*reg_, M_, N_, K_, dt);  // per-thread histogram bucket
+  }
+  ScopedShapedGemmTimer(const ScopedShapedGemmTimer&) = delete;
+  ScopedShapedGemmTimer& operator=(const ScopedShapedGemmTimer&) = delete;
+
+ private:
+  bool on_;  // gemm_timing_enabled() sampled at construction
+  std::atomic<std::uint64_t>* ns_;
+  std::atomic<std::uint64_t>* calls_;
+  ShapeRegistry* reg_;
+  std::size_t M_, N_, K_;
+  std::chrono::steady_clock::time_point t0_;  // start; assigned iff on_
+};
+
+/// ---------------------------------------------------------------------------
+/// Phase decomposition of the ce+ce kernel time (where does the non-GEMM
+/// overhead go?). All gated by TA_GEMM_TIMING; printed at exit.
+///   kernel_total = whole arena_strided_dgemm_ce_ce_{right,left} body
+///   gemm         = g_gemm_ns_ce_ce (the blas::gemm calls, timed elsewhere)
+///   check        = the per-(b,m/n) presence+stride cleanliness verification
+///   fallback     = the per-cell GEMV scalar path taken when a run is not clean
+///   loop residual= kernel_total - gemm - check - fallback (cell iteration,
+///                  offset math, result-pointer setup)
+/// Separately, dispatch/scheduling OUTSIDE the kernel is the per-op eval-trace
+/// "Product" time minus kernel_total (computed post-hoc from the trace).
+inline std::atomic<std::uint64_t> g_kernel_ns_ce_ce{0};    // kernel_total
+inline std::atomic<std::uint64_t> g_check_ns_ce_ce{0};     // check
+inline std::atomic<std::uint64_t> g_fallback_ns_ce_ce{0};  // fallback
+
+/// Scoped timer: on destruction adds the steady_clock ns elapsed since its
+/// construction to `acc`. No-op (no clock read) unless TA_GEMM_TIMING is set.
+class ScopedPhaseTimer {
+ public:
+  explicit ScopedPhaseTimer(std::atomic<std::uint64_t>& acc)
+      : acc_(gemm_timing_enabled() ? &acc : nullptr) {
+    if (acc_) t0_ = std::chrono::steady_clock::now();
+  }
+  ~ScopedPhaseTimer() {
+    if (!acc_) return;  // timing disabled
+    acc_->fetch_add(static_cast<std::uint64_t>(
+                        std::chrono::duration_cast<std::chrono::nanoseconds>(
+                            std::chrono::steady_clock::now() - t0_)
+                            .count()),
+                    std::memory_order_relaxed);
+  }
+  ScopedPhaseTimer(const ScopedPhaseTimer&) = delete;
+  ScopedPhaseTimer& operator=(const ScopedPhaseTimer&) = delete;
+
+ private:
+  std::atomic<std::uint64_t>* acc_;  // nullptr <=> timing disabled
+  std::chrono::steady_clock::time_point t0_;  // start; assigned iff acc_ set
+};
+
+/// Manual start/stop variant for a region that can't be lexically scoped (the
+/// cleanliness check writes locals consumed after it). Pair with phase_stop().
+inline std::chrono::steady_clock::time_point phase_start() {
+  return gemm_timing_enabled() ? std::chrono::steady_clock::now()
+                               : std::chrono::steady_clock::time_point{};
+}
+inline void phase_stop(std::atomic<std::uint64_t>& acc,
+                       std::chrono::steady_clock::time_point t0) {
+  if (!gemm_timing_enabled()) return;  // t0 is a dummy when timing is off
+  acc.fetch_add(static_cast<std::uint64_t>(
+                    std::chrono::duration_cast<std::chrono::nanoseconds>(
+                        std::chrono::steady_clock::now() - t0)
+                        .count()),
+                std::memory_order_relaxed);  // ns since phase_start()
+}
+
+/// ---------------------------------------------------------------------------
+/// Why did a ce+ce run fall back to scalar GEMV? Diagnose the rejected run by
+/// re-walking it (gate order: presence -> uniform size -> constant stride).
+///   1 = absent   : a cell in the run is missing (sparsity / screened out)
+///   2 = nonuniform: present, but inner-cell sizes differ along the run
+///   3 = stride    : present + uniform, but cells are NOT at a constant
+///                   page-jump-free stride (the strided-DGEMM precondition)
+///   0 = clean: rejection came from the OTHER operand run (codes 13..17)
+inline std::atomic<std::uint64_t> g_fall_runs_ce_ce{0};
+inline std::atomic<std::uint64_t> g_fall_res_absent_ce_ce{0};      // 1
+inline std::atomic<std::uint64_t> g_fall_res_nonuniform_ce_ce{0};  // 2
+inline std::atomic<std::uint64_t> g_fall_res_stride_ce_ce{0};      // 3
+inline std::atomic<std::uint64_t> g_fall_op_absent_ce_ce{0};       // 13
+inline std::atomic<std::uint64_t> g_fall_op_nonuniform_ce_ce{0};   // 14
+inline std::atomic<std::uint64_t> g_fall_op_stride_ce_ce{0};       // 15
+inline std::atomic<std::uint64_t> g_fall_op_acrossk_ce_ce{0};      // 16
+inline std::atomic<std::uint64_t> g_fall_both_clean_ce_ce{0};      // 17
+
+/// Classify run getcell(0..n-1): 1=absent, 2=nonuniform, 3=bad stride, 0=clean.
+template <typename GetCell>
+inline int classify_run(GetCell getcell, std::size_t n) {
+  if (n == 0) return 0;  // empty run: nothing can be wrong
+  long s0 = -1;  // size of the first cell (-1 until seen)
+  const double* base = nullptr;  // data pointer of the first cell
+  for (std::size_t i = 0; i < n; ++i) {
+    const auto& c = getcell(i);
+    if (!c) return 1;  // absent
+    const long sz = static_cast<long>(c.size());
+    if (s0 < 0) {
+      s0 = sz;
+      base = c.data();
+    } else if (sz != s0) {
+      return 2;  // nonuniform
+    }
+  }
+  if (n <= 1 || s0 <= 0) return 0;  // single cell: no stride to violate
+  const long st = static_cast<long>(getcell(1).data() - base);
+  if (st < s0) return 3;  // stride < cell size: page-jump / overlap
+  for (std::size_t i = 0; i < n; ++i)
+    if (getcell(i).data() != base + static_cast<std::ptrdiff_t>(i) * st)
+      return 3;  // non-constant stride
+  return 0;
+}
+
+/// Diagnose the OPERAND side (called when the result run is clean). getR(k,i)
+/// is the strided operand run (length `nrun`, per outer-contraction k); getL(k)
+/// is the per-k single (non-strided) operand cell, expected size P*Q. Returns
+/// 13=absent/size, 14=nonuniform, 15=bad stride within a k, 16=stride varies
+/// across k, 17=clean (gate rejected a run this re-check finds valid).
+template <typename GetR, typename GetL>
+inline int classify_operand(GetR getR, GetL getL, std::size_t nrun,
+                            std::size_t nK, long P) {
+  if (nrun == 0 || nK == 0) return 17;  // empty input: clean, touch no cell
+  const auto& r00 = getR(0, 0);
+  const long Q = !r00 ? 0 : static_cast<long>(r00.size());  // reference size
+  if (Q <= 0) return 13;  // first strided cell absent or empty
+  long sR = -1;  // strided-run stride observed at k == 0
+  for (std::size_t k = 0; k < nK; ++k) {
+    const auto& lk = getL(k);
+    if (!lk || static_cast<long>(lk.size()) != P * Q) return 13;  // single-cell
+    long s0 = -1;
+    const double* base = nullptr;
+    for (std::size_t i = 0; i < nrun; ++i) {
+      const auto& c = getR(k, i);
+      if (!c) return 13;
+      const long sz = static_cast<long>(c.size());
+      if (s0 < 0) {
+        s0 = sz;
+        base = c.data();
+      } else if (sz != s0) {
+        return 14;
+      }
+    }
+    if (s0 != Q) return 14;  // run size != Q (cross-k size mismatch)
+    if (nrun > 1) {
+      const long sk = static_cast<long>(getR(k, 1).data() - base);
+      if (sk < Q) return 15;
+      for (std::size_t i = 0; i < nrun; ++i)
+        if (getR(k, i).data() != base + static_cast<std::ptrdiff_t>(i) * sk)
+          return 15;
+      if (k == 0)
+        sR = sk;
+      else if (sk != sR)
+        return 16;  // stride varies across k
+    }
+  }
+  return 17;  // both runs look clean to this re-check
+}
+
+// How many rejected runs would become a valid strided GEMM if we simply
+// SKIPPED the absent outer-contraction (k) slabs (correct under beta=1)? A
+// (non-empty) run is k-skip-rescuable when the result strided run is clean AND
+// every k whose single-cell operand is present has a clean strided operand run
+// -- i.e. the only defect is absent-k, not a hole in the strided (μ̃/m) dim.
+inline std::atomic<std::uint64_t> g_fall_kskip_ce_ce{0};
+inline std::atomic<std::uint64_t> g_fall_kskip_present_ce_ce{0};  // present-k count
+
+template <typename GetC, typename GetR, typename GetL>
+inline bool kskip_rescuable(GetC getC, GetR getR, GetL getL, std::size_t nrun,
+                            std::size_t nK, long P, std::size_t& present_k) {
+  present_k = 0;  // out: number of k slabs that are present and clean
+  if (nrun == 0 || classify_run(getC, nrun) != 0) return false;  // empty / hole
+  long Q = -1;  // common strided-cell size across the present k (-1 = unset)
+  for (std::size_t k = 0; k < nK; ++k) {
+    const auto& sc = getL(k);
+    if (!sc) continue;  // absent single-cell operand -> skip this k (beta=1)
+    const long scsz = static_cast<long>(sc.size());
+    auto runk = [&](std::size_t i) -> decltype(getR(k, i)) {
+      return getR(k, i);
+    };
+    if (classify_run(runk, nrun) != 0) return false;  // strided operand hole
+    const auto& r0 = getR(k, 0);
+    const long q = static_cast<long>(r0.size());
+    if (q <= 0 || scsz != P * q) return false;  // size mismatch
+    if (Q < 0) Q = q;
+    else if (q != Q) return false;
+    ++present_k;
+  }
+  return present_k > 0;
+}
+
+// Stronger test: can we recover the run by GATHERING the present strided
+// indices (allowing μ̃-holes) into ONE strided GEMM? Requires: (a) the present
+// result cells are at a uniform packed stride, and (b) for every present-k the
+// operand slab has those SAME present indices, also uniform-stride, size Q.
+// This is exactly the "result & operand empties are aligned" case -- the
+// sparsity is shared (driven by the domain), so a single gather handles both.
+inline std::atomic<std::uint64_t> g_fall_gather_ce_ce{0};  // reported as "rescuable by gather"
+inline std::atomic<std::uint64_t> g_fall_gather_misalign_ce_ce{0};  // holes, but
+                                                                    // unaligned
+
+template <typename GetC, typename GetR, typename GetL>
+inline int gather_rescuable(GetC getC, GetR getR, GetL getL, std::size_t nrun,
+                            std::size_t nK, long P) {
+  // returns 1 = gatherable (aligned), 0 = not (misaligned/irregular).
+  if (nrun == 0 || nrun > 1024) return 0;  // pres[] below holds at most 1024 indices
+  std::size_t pres[1024];  // run indices i whose result cell getC(i) is present
+  std::size_t np = 0;      // number of entries used in pres[]
+  const double* rbase = nullptr;  // data() of the first present result cell
+  for (std::size_t i = 0; i < nrun; ++i) {
+    const auto& c = getC(i);
+    if (!c) continue;  // result hole: simply not gathered
+    if (static_cast<long>(c.size()) != P) return 0;  // result inner nonuniform
+    if (np == 0) rbase = c.data();
+    pres[np++] = i;
+  }
+  if (np == 0) return 0;  // no present result cell at all
+  long rstride = -1;  // element stride between consecutive PRESENT result cells
+  if (np > 1) {
+    rstride = static_cast<long>(getC(pres[1]).data() - rbase);
+    if (rstride < P) return 0;  // stride smaller than one cell (or negative)
+    for (std::size_t j = 0; j < np; ++j)
+      if (getC(pres[j]).data() != rbase + static_cast<std::ptrdiff_t>(j) * rstride)
+        return 0;  // present result cells not at uniform packed stride
+  }
+  long Q = -1;         // operand inner size; fixed by the first operand cell seen
+  bool any_k = false;  // becomes true once one present k passes every check
+  for (std::size_t k = 0; k < nK; ++k) {
+    if (!getL(k)) continue;  // absent single-cell -> skip k (β=1)
+    const double* ob = nullptr;  // data() of the first gathered operand cell of slab k
+    long os = -1;                // operand stride within slab k, measured at j == 1
+    for (std::size_t j = 0; j < np; ++j) {
+      const auto& oc = getR(k, pres[j]);
+      if (!oc) return 0;  // operand hole at a result-present index -> misaligned
+      const long q = static_cast<long>(oc.size());
+      if (Q < 0) Q = q;
+      else if (q != Q) return 0;  // operand inner size must be the same everywhere
+      if (j == 0) ob = oc.data();
+      else if (j == 1) {
+        os = static_cast<long>(oc.data() - ob);
+        if (os < Q) return 0;  // stride smaller than one cell (or negative)
+      }
+      if (os >= 0 &&  // only meaningful once the stride is known (j >= 1)
+          oc.data() != ob + static_cast<std::ptrdiff_t>(j) * os)
+        return 0;
+    }
+    if (Q <= 0 || static_cast<long>(getL(k).size()) != P * Q) return 0;  // L must be P*Q
+    any_k = true;
+  }
+  return any_k ? 1 : 0;
+}
+
+// Simulate the per-k segmented strided GEMM: for each present k, walk the
+// strided axis and count maximal contiguous (present + uniform-stride) segments.
+// Reports how many segment-GEMMs the scheme issues and their length (=BLAS M)
+// distribution -- the make-or-break metric (M>1 GEMM vs M=1 GEMV).
+inline std::atomic<std::uint64_t> g_seg_calls_ce_ce{0};   // # segment GEMMs
+inline std::atomic<std::uint64_t> g_seg_cells_ce_ce{0};   // Σ segment lengths
+inline std::atomic<std::uint64_t> g_seg_len1_ce_ce{0};    // length-1 (GEMV)
+inline std::atomic<std::uint64_t> g_seg_len2_ce_ce{0};    // length 2
+inline std::atomic<std::uint64_t> g_seg_len3_4_ce_ce{0};  // length 3-4
+inline std::atomic<std::uint64_t> g_seg_len5_8_ce_ce{0};  // length 5-8
+inline std::atomic<std::uint64_t> g_seg_len9p_ce_ce{0};   // length >= 9
+
+template <typename GetC, typename GetR, typename GetL>
+inline void measure_segments(GetC getC, GetR getR, GetL getL, std::size_t nrun,
+                             std::size_t nK, long P) {
+  for (std::size_t k = 0; k < nK; ++k) {
+    const auto& sc = getL(k);
+    if (!sc) continue;  // skip absent-k
+    std::size_t mu = 0;  // start index of the next candidate segment
+    while (mu < nrun) {
+      const auto& c0 = getC(mu);
+      const auto& r0 = getR(k, mu);
+      if (!c0 || !r0 || static_cast<long>(c0.size()) != P) {
+        ++mu;  // this cell cannot start a segment; it is not counted at all
+        continue;
+      }
+      const long Q = static_cast<long>(r0.size());
+      if (Q <= 0 || static_cast<long>(sc.size()) != P * Q) {
+        ++mu;  // operand sizes inconsistent with P x Q; not counted either
+        continue;
+      }
+      const double* cb = c0.data();  // base pointers of the segment's first cells
+      const double* rb = r0.data();
+      std::size_t end = mu + 1;  // one past the last cell accepted so far
+      long sC = -1, sR = -1;     // result/operand strides, fixed by the 2nd cell
+      while (end < nrun) {
+        const auto& ce = getC(end);
+        const auto& re = getR(k, end);
+        if (!ce || !re) break;  // hole ends the segment
+        if (static_cast<long>(ce.size()) != P ||
+            static_cast<long>(re.size()) != Q)
+          break;  // size change ends the segment
+        const long dc = static_cast<long>(ce.data() - cb);
+        const long dr = static_cast<long>(re.data() - rb);
+        const long off = static_cast<long>(end - mu);
+        if (off == 1) {
+          sC = dc;
+          sR = dr;
+          if (sC < P || sR < Q) break;  // stride smaller than a cell: stay at len 1
+        } else if (dc != off * sC || dr != off * sR) {
+          break;  // cell is off the uniform-stride lattice: segment ends here
+        }
+        ++end;
+      }
+      const std::size_t len = end - mu;  // always >= 1
+      g_seg_calls_ce_ce.fetch_add(1, std::memory_order_relaxed);
+      g_seg_cells_ce_ce.fetch_add(len, std::memory_order_relaxed);
+      if (len == 1) g_seg_len1_ce_ce.fetch_add(1, std::memory_order_relaxed);
+      else if (len == 2) g_seg_len2_ce_ce.fetch_add(1, std::memory_order_relaxed);
+      else if (len <= 4) g_seg_len3_4_ce_ce.fetch_add(1, std::memory_order_relaxed);
+      else if (len <= 8) g_seg_len5_8_ce_ce.fetch_add(1, std::memory_order_relaxed);
+      else g_seg_len9p_ce_ce.fetch_add(1, std::memory_order_relaxed);
+      mu = end;  // resume at the cell that broke the segment (it may start a new one)
+    }
+  }
+}
+
+inline void record_ce_ce_fallback(int why) {
+  if (!gemm_timing_enabled()) return;  // counters are maintained in timing mode only
+  std::atomic<std::uint64_t>* reason = &g_fall_both_clean_ce_ce;  // 17 / any other code
+  if (why == 1) reason = &g_fall_res_absent_ce_ce;
+  else if (why == 2) reason = &g_fall_res_nonuniform_ce_ce;
+  else if (why == 3) reason = &g_fall_res_stride_ce_ce;
+  else if (why == 13) reason = &g_fall_op_absent_ce_ce;
+  else if (why == 14) reason = &g_fall_op_nonuniform_ce_ce;
+  else if (why == 15) reason = &g_fall_op_stride_ce_ce;
+  else if (why == 16) reason = &g_fall_op_acrossk_ce_ce;
+  // One tick for the rejected-run total, then one for the chosen reason bucket.
+  g_fall_runs_ce_ce.fetch_add(1, std::memory_order_relaxed);
+  reason->fetch_add(1, std::memory_order_relaxed);
+}
+
+// ---------------------------------------------------------------------------
+// ce+e phase timers + fallback diagnosis (mirror of the ce+ce instrumentation).
+// In ce+e a "run" is one result cell (m,n); the clean check is over the k-slabs
+// of L (stride ldA) and R (stride ldB). Fallback reasons classify those k-runs:
+//   L-run: 1 absent / 2 nonuniform / 3 stride;  R-run: 11 / 12 / 13;  17 clean.
+inline std::atomic<std::uint64_t> g_kernel_ns_ce_e{0};    // whole-kernel phase timer
+inline std::atomic<std::uint64_t> g_check_ns_ce_e{0};     // clean-check phase timer
+inline std::atomic<std::uint64_t> g_fallback_ns_ce_e{0};  // scalar-fallback phase timer
+inline std::atomic<std::uint64_t> g_e_fall_runs{0};       // all recorded fallbacks
+inline std::atomic<std::uint64_t> g_e_l_absent{0};        // reason 1
+inline std::atomic<std::uint64_t> g_e_l_nonuniform{0};    // reason 2
+inline std::atomic<std::uint64_t> g_e_l_stride{0};        // reason 3
+inline std::atomic<std::uint64_t> g_e_r_absent{0};        // reason 11
+inline std::atomic<std::uint64_t> g_e_r_nonuniform{0};    // reason 12
+inline std::atomic<std::uint64_t> g_e_r_stride{0};        // reason 13
+inline std::atomic<std::uint64_t> g_e_both_clean{0};      // reason 17 / any other code
+
+inline void record_ce_e_fallback(int why) {
+  if (!gemm_timing_enabled()) return;  // counters are maintained in timing mode only
+  std::atomic<std::uint64_t>* reason = &g_e_both_clean;  // 17 / any other code
+  if (why == 1) reason = &g_e_l_absent;
+  else if (why == 2) reason = &g_e_l_nonuniform;
+  else if (why == 3) reason = &g_e_l_stride;
+  else if (why == 11) reason = &g_e_r_absent;
+  else if (why == 12) reason = &g_e_r_nonuniform;
+  else if (why == 13) reason = &g_e_r_stride;
+  // One tick for the rejected-cell total, then one for the chosen reason bucket.
+  g_e_fall_runs.fetch_add(1, std::memory_order_relaxed);
+  reason->fetch_add(1, std::memory_order_relaxed);
+}
+
+// ---------------------------------------------------------------------------
+// Coverage: clean (strided-GEMM) vs fallback (scalar) work. Clean FLOPs and
+// time come from the shape histogram + g_gemm_ns_*; here we also count clean
+// runs (ce+ce, where one run issues nK gemms) and accumulate the scalar
+// fallback FLOPs so we can report exactly what fraction of each regime's
+// arithmetic the strided path captures today.
+inline std::atomic<std::uint64_t> g_clean_runs_ce_ce{0};  // printed as ce+ce clean "runs"
+inline std::atomic<std::uint64_t> g_fall_flops_ce_e{0};   // ce+e scalar-fallback FLOPs
+inline std::atomic<std::uint64_t> g_fall_flops_ce_ce{0};  // printed as ce+ce fallback FLOPs
+
+/// Dumps the per-regime GEMM-time totals at process exit when TA_GEMM_TIMING
+/// is set. The single inline instance is constructed after <iostream>'s static
+/// init, hence destroyed before std::cerr.
+struct GemmTimingDumper {
+  ~GemmTimingDumper() {  // emits the whole report to std::cerr
+    if (!gemm_timing_enabled()) return;
+    const auto ce_e = g_gemm_ns_ce_e.load(std::memory_order_relaxed);
+    const auto ce_ce = g_gemm_ns_ce_ce.load(std::memory_order_relaxed);
+    std::cerr << "[gemm-timing] ce+e  GEMM: " << (ce_e / 1e9) << " s  ("
+              << g_gemm_calls_ce_e.load(std::memory_order_relaxed)
+              << " gemm calls)\n";
+    std::cerr << "[gemm-timing] ce+ce GEMM: " << (ce_ce / 1e9) << " s  ("
+              << g_gemm_calls_ce_ce.load(std::memory_order_relaxed)
+              << " gemm calls)\n";
+    std::cerr << "[gemm-timing] total GEMM: " << ((ce_e + ce_ce) / 1e9)
+              << " s\n";
+    auto L = [](std::atomic<std::uint64_t>& a) {  // relaxed-load shorthand
+      return a.load(std::memory_order_relaxed);
+    };
+
+    // ---- kernel-internal phase decomposition (per regime) ----
+    auto dump_phases = [](const char* tag, std::uint64_t kn, std::uint64_t gm,
+                          std::uint64_t ck, std::uint64_t fbt) {
+      if (kn == 0) return;  // regime never ran (also avoids dividing by 0 in pct)
+      const auto resid =  // kernel time not attributed to any phase, clamped at 0
+          kn > (gm + ck + fbt) ? kn - (gm + ck + fbt) : std::uint64_t{0};
+      auto pct = [&](std::uint64_t x) { return 100.0 * x / kn; };
+      std::cerr << "[" << tag << "-phases] kernel total   : " << (kn / 1e9)
+                << " s\n";
+      std::cerr << "[" << tag << "-phases]   gemm         : " << (gm / 1e9)
+                << " s  (" << pct(gm) << "%)\n";
+      std::cerr << "[" << tag << "-phases]   clean-check  : " << (ck / 1e9)
+                << " s  (" << pct(ck) << "%)\n";
+      std::cerr << "[" << tag << "-phases]   fallback     : " << (fbt / 1e9)
+                << " s  (" << pct(fbt) << "%)\n";
+      std::cerr << "[" << tag << "-phases]   loop residual: " << (resid / 1e9)
+                << " s  (" << pct(resid) << "%)\n";
+    };
+    dump_phases("ce+e", L(g_kernel_ns_ce_e), ce_e, L(g_check_ns_ce_e),
+                L(g_fallback_ns_ce_e));
+    dump_phases("ce+ce", L(g_kernel_ns_ce_ce), ce_ce, L(g_check_ns_ce_ce),
+                L(g_fallback_ns_ce_ce));
+
+    // ---- shape histograms (return clean strided-GEMM FLOPs per regime) ----
+    const double clean_flops_e = dump_shapes("ce+e", g_ce_e_shapes);
+    const double clean_flops_ce = dump_shapes("ce+ce", g_ce_ce_shapes);
+
+    // ---- coverage: what fraction of each regime's arithmetic & time the
+    //      strided GEMM captures today, vs the scalar fallback ----
+    auto cov = [](const char* tag, double clean_flops, std::uint64_t clean_ns,
+                  std::uint64_t clean_runs, std::uint64_t fall_flops_u,
+                  std::uint64_t fall_ns, std::uint64_t fall_runs) {
+      const double ff = static_cast<double>(fall_flops_u);
+      auto p = [](double a, double b) { return b > 0 ? 100.0 * a / b : 0.0; };
+      std::cerr << "[coverage " << tag << "] clean GEMM: " << std::fixed
+                << std::setprecision(2) << (clean_flops / 1e9) << " GFLOP, "
+                << (clean_ns / 1e9) << " s, " << clean_runs << " runs\n";
+      std::cerr << "[coverage " << tag << "] fallback  : " << (ff / 1e9)
+                << " GFLOP, " << (fall_ns / 1e9) << " s, " << fall_runs
+                << " runs\n";
+      std::cerr << "[coverage " << tag
+                << "] FLOP coverage = " << p(clean_flops, clean_flops + ff)
+                << "%   time coverage = " << p(clean_ns, clean_ns + fall_ns)
+                << "%\n"
+                << std::defaultfloat << std::setprecision(6);  // precision is sticky
+    };
+    cov("ce+e", clean_flops_e, ce_e, L(g_gemm_calls_ce_e), L(g_fall_flops_ce_e),
+        L(g_fallback_ns_ce_e), L(g_e_fall_runs));
+    cov("ce+ce", clean_flops_ce, ce_ce, L(g_clean_runs_ce_ce),
+        L(g_fall_flops_ce_ce), L(g_fallback_ns_ce_ce), L(g_fall_runs_ce_ce));
+
+    // ---- ce+e fallback reasons (which k-run failed the strided gate) ----
+    const auto efr = L(g_e_fall_runs);
+    if (efr > 0) {
+      auto fp = [&](std::uint64_t x) { return 100.0 * x / efr; };
+      std::cerr << "[ce+e-fallback] total rejected cells: " << efr << "\n";
+      std::cerr << "[ce+e-fallback]   L-run absent     : " << L(g_e_l_absent)
+                << "  (" << fp(L(g_e_l_absent)) << "%)\n";
+      std::cerr << "[ce+e-fallback]   L-run nonuniform : " << L(g_e_l_nonuniform)
+                << "  (" << fp(L(g_e_l_nonuniform)) << "%)\n";
+      std::cerr << "[ce+e-fallback]   L-run bad stride : " << L(g_e_l_stride)
+                << "  (" << fp(L(g_e_l_stride)) << "%)\n";
+      std::cerr << "[ce+e-fallback]   R-run absent     : " << L(g_e_r_absent)
+                << "  (" << fp(L(g_e_r_absent)) << "%)\n";
+      std::cerr << "[ce+e-fallback]   R-run nonuniform : " << L(g_e_r_nonuniform)
+                << "  (" << fp(L(g_e_r_nonuniform)) << "%)\n";
+      std::cerr << "[ce+e-fallback]   R-run bad stride : " << L(g_e_r_stride)
+                << "  (" << fp(L(g_e_r_stride)) << "%)\n";
+      std::cerr << "[ce+e-fallback]   both runs clean  : " << L(g_e_both_clean)
+                << "  (" << fp(L(g_e_both_clean)) << "%)\n";
+    }
+
+    // ---- ce+ce fallback reasons (result-run then operand-run diagnosis) ----
+    const auto fruns = L(g_fall_runs_ce_ce);
+    if (fruns > 0) {
+      auto fp = [&](std::uint64_t x) { return 100.0 * x / fruns; };
+      std::cerr << "[ce+ce-fallback] total rejected runs: " << fruns << "\n";
+      std::cerr << "[ce+ce-fallback]   result absent (sparse): "
+                << L(g_fall_res_absent_ce_ce) << "  ("
+                << fp(L(g_fall_res_absent_ce_ce)) << "%)\n";
+      std::cerr << "[ce+ce-fallback]   result nonuniform     : "
+                << L(g_fall_res_nonuniform_ce_ce) << "  ("
+                << fp(L(g_fall_res_nonuniform_ce_ce)) << "%)\n";
+      std::cerr << "[ce+ce-fallback]   result bad stride     : "
+                << L(g_fall_res_stride_ce_ce) << "  ("
+                << fp(L(g_fall_res_stride_ce_ce)) << "%)\n";
+      std::cerr << "[ce+ce-fallback]   operand absent/size   : "
+                << L(g_fall_op_absent_ce_ce) << "  ("
+                << fp(L(g_fall_op_absent_ce_ce)) << "%)\n";
+      std::cerr << "[ce+ce-fallback]   operand nonuniform    : "
+                << L(g_fall_op_nonuniform_ce_ce) << "  ("
+                << fp(L(g_fall_op_nonuniform_ce_ce)) << "%)\n";
+      std::cerr << "[ce+ce-fallback]   operand bad stride    : "
+                << L(g_fall_op_stride_ce_ce) << "  ("
+                << fp(L(g_fall_op_stride_ce_ce)) << "%)\n";
+      std::cerr << "[ce+ce-fallback]   operand stride X k    : "
+                << L(g_fall_op_acrossk_ce_ce) << "  ("
+                << fp(L(g_fall_op_acrossk_ce_ce)) << "%)\n";
+      std::cerr << "[ce+ce-fallback]   both runs clean (!)   : "
+                << L(g_fall_both_clean_ce_ce) << "  ("
+                << fp(L(g_fall_both_clean_ce_ce)) << "%)\n";
+      const auto ks = L(g_fall_kskip_ce_ce);
+      const auto kp = L(g_fall_kskip_present_ce_ce);
+      const auto gr = L(g_fall_gather_ce_ce);
+      std::cerr << "[ce+ce-fallback]   >> rescuable by k-skip (no μ̃ holes): "
+                << ks << "  (" << fp(ks) << "%); "
+                << (ks ? static_cast<double>(kp) / ks : 0.0)
+                << " present-k/run avg\n";
+      std::cerr << "[ce+ce-fallback]   >> rescuable by μ̃-gather (aligned holes): "
+                << gr << "  (" << fp(gr) << "% of rejected runs) -- present "
+                   "result & operand μ̃ aligned at uniform stride => ONE "
+                   "strided GEMM over gathered cells\n";
+    }
+    // Per-k segmented strided GEMM (the contiguous-sub-run scheme): how many
+    // segment-GEMMs would it issue on the fallback runs, and how long (BLAS M)?
+    const auto sc = L(g_seg_calls_ce_ce);
+    if (sc > 0) {
+      const auto sl = L(g_seg_cells_ce_ce);
+      auto sp = [&](std::uint64_t x) { return 100.0 * x / sc; };
+      std::cerr << "[ce+ce-segment] segment-GEMMs the scheme would issue: " << sc
+                << "  (covering " << sl << " present cells, mean M="
+                << (sc ? static_cast<double>(sl) / sc : 0.0) << ")\n";
+      std::cerr << "[ce+ce-segment]   length distribution (M=segment len):\n";
+      std::cerr << "[ce+ce-segment]     M=1 (GEMV) : " << L(g_seg_len1_ce_ce)
+                << "  (" << sp(L(g_seg_len1_ce_ce)) << "%)\n";
+      std::cerr << "[ce+ce-segment]     M=2        : " << L(g_seg_len2_ce_ce)
+                << "  (" << sp(L(g_seg_len2_ce_ce)) << "%)\n";
+      std::cerr << "[ce+ce-segment]     M=3-4      : " << L(g_seg_len3_4_ce_ce)
+                << "  (" << sp(L(g_seg_len3_4_ce_ce)) << "%)\n";
+      std::cerr << "[ce+ce-segment]     M=5-8      : " << L(g_seg_len5_8_ce_ce)
+                << "  (" << sp(L(g_seg_len5_8_ce_ce)) << "%)\n";
+      std::cerr << "[ce+ce-segment]     M>=9       : " << L(g_seg_len9p_ce_ce)
+                << "  (" << sp(L(g_seg_len9p_ce_ce)) << "%)\n";
+      std::cerr << "[ce+ce-segment]   (today's clean path issues "
+                << L(g_gemm_calls_ce_ce)
+                << " full-run GEMMs; this would ADD the above on fallback runs)\n";
+    }
+  }
+
+  // Per-call-weighted distribution of a single GEMM dimension (M, N, or K),
+  // sorted by total time (descending) so the dominant values come first.
+  static void dump_dim_dist(
+      const char* tag, const char* dim,
+      const std::map<std::size_t, std::pair<std::uint64_t, std::uint64_t>>& by,
+      std::uint64_t tot_ns) {
+    struct E {
+      std::size_t val;
+      std::uint64_t calls, ns;
+    };
+    std::vector<E> v;
+    v.reserve(by.size());
+    for (const auto& kv : by)
+      v.push_back(E{kv.first, kv.second.first, kv.second.second});
+    std::sort(v.begin(), v.end(),
+              [](const E& a, const E& b) { return a.ns > b.ns; });
+    std::cerr << "[gemm-shapes " << tag << "] " << dim
+              << " distribution (by %time desc; val: calls, ns_total, %time):\n";
+    for (const auto& e : v) {
+      const double pct = tot_ns > 0 ? 100.0 * e.ns / tot_ns : 0.0;
+      std::cerr << "[gemm-shapes " << tag << "]   " << dim << "=" << std::setw(4)
+                << e.val << "  " << std::setw(10) << e.calls << " calls  "
+                << std::setw(13) << e.ns << "  " << std::fixed
+                << std::setprecision(1) << std::setw(5) << pct << "%\n"
+                << std::defaultfloat << std::setprecision(6);  // precision is sticky
+    }
+  }
+
+  static double dump_shapes(const char* tag, ShapeRegistry& reg) {
+    // Merge all per-thread maps.
+    std::unordered_map<std::uint64_t, std::pair<std::uint64_t, std::uint64_t>>
+        merged;
+    {
+      std::lock_guard<std::mutex> lk(reg.mtx);
+      for (auto* a : reg.maps)
+        for (const auto& kv : a->m) {
+          auto& e = merged[kv.first];
+          e.first += kv.second.first;
+          e.second += kv.second.second;
+        }
+    }
+    if (merged.empty()) return 0.0;
+    constexpr std::uint64_t MASK = (std::uint64_t{1} << 21) - 1;  // 21-bit field
+    struct Row {
+      std::size_t M, N, K;
+      std::uint64_t calls, ns;
+    };
+    std::vector<Row> rows;
+    rows.reserve(merged.size());
+    for (const auto& kv : merged) {
+      const std::uint64_t k = kv.first;  // key = M<<42 | N<<21 | K
+      rows.push_back(Row{static_cast<std::size_t>((k >> 42) & MASK),
+                         static_cast<std::size_t>((k >> 21) & MASK),
+                         static_cast<std::size_t>(k & MASK), kv.second.first,
+                         kv.second.second});
+    }
+    // Sort by total ns descending (dominant shapes first).
+    std::sort(rows.begin(), rows.end(),
+              [](const Row& a, const Row& b) { return a.ns > b.ns; });
+    std::cerr << "[gemm-shapes " << tag << "] distinct (M,N,K) shapes: "
+              << rows.size() << "\n";
+    std::cerr << "[gemm-shapes " << tag << "] "
+              << "M       N       K     calls       ns_total      "
+                 "ns/call   GFLOP/s   %time\n";
+    std::uint64_t tot_ns = 0;
+    for (const auto& r : rows) tot_ns += r.ns;
+    std::size_t shown = 0;
+    for (const auto& r : rows) {
+      const double flops = 2.0 * r.M * r.N * r.K * static_cast<double>(r.calls);
+      const double gflops = r.ns > 0 ? flops / r.ns : 0.0;  // 2MNK*calls / ns
+      const double pct = tot_ns > 0 ? 100.0 * r.ns / tot_ns : 0.0;
+      const std::uint64_t ns_call = r.calls ? r.ns / r.calls : 0;  // no div by 0
+      if (shown < 40)
+        std::cerr << "[gemm-shapes " << tag << "] " << std::setw(6) << r.M
+                  << "  " << std::setw(6) << r.N << "  " << std::setw(5) << r.K
+                  << "  " << std::setw(9) << r.calls << "  " << std::setw(13)
+                  << r.ns << "  " << std::setw(8) << ns_call << "  "
+                  << std::setw(7) << std::fixed << std::setprecision(2) << gflops
+                  << "  " << std::setw(5) << std::setprecision(1) << pct
+                  << "%\n" << std::defaultfloat << std::setprecision(6);
+      else if (shown == 40)
+        std::cerr << "[gemm-shapes " << tag << "] ... (" << (rows.size() - 40)
+                  << " more shapes omitted)\n";
+      ++shown;
+    }
+
+    // ---- aggregate summaries over ALL shapes ----
+    std::uint64_t tot_calls = 0;
+    double tot_flops = 0.0;
+    double sum_M = 0.0, sum_N = 0.0, sum_K = 0.0;  // call-weighted
+    std::map<std::size_t, std::pair<std::uint64_t, std::uint64_t>> byK, byM, byN;
+    for (const auto& r : rows) {
+      tot_calls += r.calls;
+      tot_flops += 2.0 * r.M * r.N * r.K * static_cast<double>(r.calls);
+      sum_M += static_cast<double>(r.M) * r.calls;
+      sum_N += static_cast<double>(r.N) * r.calls;
+      sum_K += static_cast<double>(r.K) * r.calls;
+      byK[r.K].first += r.calls;
+      byK[r.K].second += r.ns;
+      byM[r.M].first += r.calls;
+      byM[r.M].second += r.ns;
+      byN[r.N].first += r.calls;
+      byN[r.N].second += r.ns;
+    }
+    const double eff_gflops = tot_ns > 0 ? tot_flops / tot_ns : 0.0;
+    std::cerr << "[gemm-shapes " << tag << "] ---- aggregate over all "
+              << rows.size() << " shapes ----\n";
+    std::cerr << "[gemm-shapes " << tag << "] total calls=" << tot_calls
+              << "  total GFLOP=" << std::fixed << std::setprecision(2)
+              << (tot_flops / 1e9) << "  total ns=" << tot_ns
+              << "  effective GFLOP/s=" << eff_gflops << std::defaultfloat
+              << std::setprecision(6) << "\n";  // else the means below get 2 digits
+    std::cerr << "[gemm-shapes " << tag << "] call-weighted mean: M="
+              << (tot_calls ? sum_M / tot_calls : 0.0)
+              << "  N=" << (tot_calls ? sum_N / tot_calls : 0.0)
+              << "  K=" << (tot_calls ? sum_K / tot_calls : 0.0) << "\n";
+    dump_dim_dist(tag, "K", byK, tot_ns);
+    dump_dim_dist(tag, "M", byM, tot_ns);
+    dump_dim_dist(tag, "N", byN, tot_ns);
+    return tot_flops;  // total clean-GEMM FLOPs for this regime (2*M*N*K*calls)
+  }
+};
+inline GemmTimingDumper g_gemm_timing_dumper;
+
 /// Specifies how an inner-cell range is derived from operand inner cells.
 enum class ArenaInnerShapeKind {
   left_range,         // Hadamard inner; Scale tot_x_t
@@ -332,6 +1154,650 @@ void fused_scale_t_x_tot_inplace(Result& result, const Scalar& s,
       result, right);
 }
 
+#ifdef TA_STRIDED_DGEMM_COUNT
+inline std::atomic<std::size_t> g_strided_dgemm_ce_e_calls{0};  // +1 per clean-path DGEMM
+#endif  // TA_STRIDED_DGEMM_COUNT
+
+/// ce+e strided-DGEMM core (inner OUTER-PRODUCT), looped over the Hadamard-
+/// folded nbatch. For each batch b and result cell (m,n):
+///   C[m,n](p,q) += factor * sum_k L[m,k](p) * R[k,n](q)
+/// as ONE P x Q DGEMM riding the outer-contracted k into BLAS K via the
+/// inter-cell slab stride (zero-copy) when the k-run is "clean" (all cells
+/// present, uniform inner size, single constant stride); else an inline per-k
+/// rank-1 fallback for THAT cell only. Orientation-aware (left_op/right_op pick
+/// per-(m,n,k) offsets). M=left-external, N=right-external, K=outer-contracted.
+template <typename ResultOuter, typename LeftOuter, typename RightOuter>
+void arena_strided_dgemm_ce_e(ResultOuter& C, const LeftOuter& L,
+                              const RightOuter& R, std::size_t M, std::size_t N,
+                              std::size_t K, math::blas::Op left_op,
+                              math::blas::Op right_op, double factor) {
+  namespace blas = TiledArray::math::blas;
+  using integer = blas::integer;
+  static_assert(is_tensor_view_v<typename ResultOuter::value_type> &&
+                    is_tensor_view_v<typename LeftOuter::value_type> &&
+                    is_tensor_view_v<typename RightOuter::value_type>,
+                "arena_strided_dgemm_ce_e: arena (view) inner cells only");
+  static_assert(
+      std::is_same_v<typename ResultOuter::value_type::numeric_type, double> &&
+          std::is_same_v<typename LeftOuter::value_type::numeric_type, double> &&
+          std::is_same_v<typename RightOuter::value_type::numeric_type, double>,
+      "arena_strided_dgemm_ce_e: double inner storage only");
+  if (M == 0 || N == 0 || K == 0) return;  // empty outer problem
+  const std::size_t nbatch = static_cast<std::size_t>(C.nbatch());
+  if (nbatch == 0) return;
+  const bool shape_ok =  // outer volumes and batch counts must agree with M, N, K
+      (C.range().volume() == M * N && L.range().volume() == M * K &&
+       R.range().volume() == K * N &&
+       static_cast<std::size_t>(L.nbatch()) == nbatch &&
+       static_cast<std::size_t>(R.nbatch()) == nbatch);
+  TA_ASSERT(shape_ok);
+  if (!shape_ok) return;  // still bail out when TA_ASSERT does not stop execution
+  ScopedPhaseTimer _kernel_timer(g_kernel_ns_ce_e);
+  const std::size_t lda = (left_op == blas::NoTranspose) ? K : M;   // L outer row length
+  const std::size_t ldb = (right_op == blas::NoTranspose) ? N : K;  // R outer row length
+  auto a_off = [&](std::size_t m, std::size_t k) {  // cell index of L[m,k] in a batch
+    return (left_op == blas::NoTranspose) ? m * lda + k : k * lda + m;
+  };
+  auto b_off = [&](std::size_t k, std::size_t n) {  // cell index of R[k,n] in a batch
+    return (right_op == blas::NoTranspose) ? k * ldb + n : n * ldb + k;
+  };
+  const auto* lc = L.data();
+  const auto* rc = R.data();
+  auto* cc = C.data();
+  for (std::size_t b = 0; b < nbatch; ++b) {
+    const std::size_t cbase = b * M * N;  // first cell of batch b in C / L / R
+    const std::size_t lbase = b * M * K;
+    const std::size_t rbase = b * K * N;
+    for (std::size_t m = 0; m < M; ++m) {
+      for (std::size_t n = 0; n < N; ++n) {
+        auto& Cc = cc[cbase + m * N + n];
+        if (!Cc) continue;  // absent result cell: nothing to accumulate into
+        const auto _check_t0 = phase_start();
+        const auto& l0 = lc[lbase + a_off(m, 0)];
+        const auto& r0 = rc[rbase + b_off(0, n)];
+        long P = l0 ? static_cast<long>(l0.size()) : -1;  // -1 if the k=0 cell is absent
+        long Q = r0 ? static_cast<long>(r0.size()) : -1;
+        bool clean = (P > 0 && Q > 0 && static_cast<long>(Cc.size()) == P * Q);
+        // presence-first: verify every k-cell present + uniform size BEFORE
+        // any .data() pointer subtraction.
+        for (std::size_t k = 0; clean && k < K; ++k) {
+          const auto& lk = lc[lbase + a_off(m, k)];
+          const auto& rk = rc[rbase + b_off(k, n)];
+          if (!lk || static_cast<long>(lk.size()) != P) clean = false;
+          else if (!rk || static_cast<long>(rk.size()) != Q) clean = false;
+        }
+        long ldA = P, ldB = Q;  // used as-is when K == 1 (no second cell to measure)
+        if (clean && K > 1) {
+          ldA = static_cast<long>(lc[lbase + a_off(m, 1)].data() - l0.data());
+          ldB = static_cast<long>(rc[rbase + b_off(1, n)].data() - r0.data());
+          if (ldA < P || ldB < Q) clean = false;  // stride smaller than a cell
+          for (std::size_t k = 0; clean && k < K; ++k) {  // every k on the same stride
+            if (lc[lbase + a_off(m, k)].data() !=
+                l0.data() + static_cast<std::ptrdiff_t>(k) * ldA)
+              clean = false;
+            else if (rc[rbase + b_off(k, n)].data() !=
+                     r0.data() + static_cast<std::ptrdiff_t>(k) * ldB)
+              clean = false;
+          }
+        }
+        phase_stop(g_check_ns_ce_e, _check_t0);
+        if (clean) {
+          // C(P x Q) += factor * Lmat(P x K) . Rmat^T... realized as
+          // gemm(Transpose, NoTranspose): A=K x P slab, B=K x Q slab.
+          {
+            ScopedShapedGemmTimer _gt(g_gemm_ns_ce_e, g_gemm_calls_ce_e,
+                                      g_ce_e_shapes, P, Q, K);
+            blas::gemm(blas::Transpose, blas::NoTranspose,
+                       /*M=*/static_cast<integer>(P),
+                       /*N=*/static_cast<integer>(Q),
+                       /*K=*/static_cast<integer>(K), factor,
+                       /*A=*/l0.data(), /*lda=*/static_cast<integer>(ldA),
+                       /*B=*/r0.data(), /*ldb=*/static_cast<integer>(ldB),
+                       /*beta=*/1.0,
+                       /*C=*/Cc.data(), /*ldc=*/static_cast<integer>(Q));
+          }
+#ifdef TA_STRIDED_DGEMM_COUNT
+          g_strided_dgemm_ce_e_calls.fetch_add(1, std::memory_order_relaxed);
+#endif
+        } else {
+          ScopedPhaseTimer _fb_timer(g_fallback_ns_ce_e);
+          if (gemm_timing_enabled()) {  // diagnosis only; does not affect the result
+            int why = classify_run(
+                [&](std::size_t k) -> const typename LeftOuter::value_type& {
+                  return lc[lbase + a_off(m, k)];
+                },
+                K);
+            if (why == 0) {  // L-run looks fine, so inspect the R-run
+              const int wr = classify_run(
+                  [&](std::size_t k) -> const typename RightOuter::value_type& {
+                    return rc[rbase + b_off(k, n)];
+                  },
+                  K);
+              why = (wr == 0) ? 17 : 10 + wr;  // R codes are offset by 10; 17 = clean
+            }
+            record_ce_e_fallback(why);
+          }
+          // inline per-k rank-1 fallback for THIS cell (computed once)
+          double* c = Cc.data();
+          std::uint64_t _fl = 0;  // FLOPs performed by this cell's fallback
+          for (std::size_t k = 0; k < K; ++k) {
+            const auto& lk = lc[lbase + a_off(m, k)];
+            const auto& rk = rc[rbase + b_off(k, n)];
+            if (!lk || !rk) continue;  // absent operand cell contributes nothing
+            const std::size_t pp = lk.size(), qq = rk.size();
+            if (static_cast<long>(Cc.size()) != static_cast<long>(pp * qq))
+              continue;  // inner sizes do not match this result cell: skip this k
+            const double* lp = lk.data();
+            const double* rp = rk.data();
+            _fl += 2ull * pp * qq;  // one multiply-add per (p,q)
+            for (std::size_t p = 0; p < pp; ++p)
+              for (std::size_t q = 0; q < qq; ++q)
+                c[p * qq + q] += factor * lp[p] * rp[q];
+          }
+          if (gemm_timing_enabled())
+            g_fall_flops_ce_e.fetch_add(_fl, std::memory_order_relaxed);
+        }
+      }
+    }
+  }
+}
+
+#ifdef TA_STRIDED_DGEMM_COUNT
+inline std::atomic<std::size_t> g_strided_dgemm_ce_ce_right_calls{0};  // presumably the _right kernel's DGEMM count; TODO confirm
+#endif  // TA_STRIDED_DGEMM_COUNT
+
+/// Kill switch for the ce+ce (hce+ce) per-k segmented strided-DGEMM path: when
+/// true, arena_strided_dgemm_ce_ce_right/_left route EVERY present cell through
+/// the per-cell scalar GEMV loop (the legacy "revert to per-cell" behavior)
+/// instead of the segment walker. Test/bench hook only -- production default is
+/// false (segmented on). Mirrors regime_a_strided_disabled().
+inline bool& ce_ce_strided_disabled() {
+  static bool flag = false;  // single function-local flag; plain bool, not atomic
+  return flag;  // returned by reference so callers can both read and assign it
+}
+
+/// ce+ce strided-DGEMM core (inner CONTRACTION; ride right-external μ̃ into BLAS
+/// M). ORIENTATION-AWARE (offsets derived from left_op/right_op of the OUTER
+/// GemmHelper, exactly as arena_strided_dgemm_ce_e) and Hadamard-agnostic:
+/// Hadamard is carried as nbatch by the einsum driver, so a thin nbatch loop
+/// wraps a fixed-Hadamard ce+ce core and serves both hce+ce (nbatch>1) and the
+/// no-Hadamard ce+ce (nbatch==1). Do NOT assert nbatch==1.
+///
+/// Outer GemmHelper mapping: Mo = left outer-external, No = right outer-external
+/// = Mμ, Ko = outer-contracted = nK. The left-external `m` (Mo) is an OUTER loop
+/// around the per-(b) body; R (a function of k,μ̃ only) is reused across m.
+/// For each batch b, each left-external m, and each outer-contraction cell Κ=k:
+///   C̃[m,μ̃, a_1] += factor * Σ_{a_4} R[k,μ̃](a_4) · L[m,k](a_1,a_4)
+/// realized as one M=Mseg × N=a_1 × K=a_4 DGEMM per maximal segment of
+/// present, size-matched, uniformly-strided μ̃ cells (stride measured per
+/// segment, zero-copy), looping k with beta=1; holes are skipped. A k with
+/// L[m,k].size() != P*Q, or any k while ce_ce_strided_disabled() is set, takes
+/// the per-cell GEMV path for that k only. C must be pre-shaped (a_1-major);
+/// the result outer is (m, μ̃) row-major (left-then-right concatenation,
+/// matching make_result_range). Accumulates into C (beta=1).
+template <typename ResultOuter, typename LeftOuter, typename RightOuter>
+void arena_strided_dgemm_ce_ce_right(ResultOuter& C, const LeftOuter& L,
+                               const RightOuter& R, std::size_t Mo,
+                               std::size_t No, std::size_t Ko,
+                               math::blas::Op left_op, math::blas::Op right_op,
+                               double factor,
+                               bool left_inner_transposed = false) {
+  // left_inner_transposed: the external-carrying LEFT inner cell is stored
+  // (a4,a1)=Q x P (matrix_transpose) instead of canonical (a1,a4)=P x Q. Folded
+  // into the inner GEMM via transb (zero-copy); the right contraction-vector
+  // side must remain canonical (gated upstream).
+  namespace blas = TiledArray::math::blas;
+  using integer = blas::integer;
+  static_assert(is_tensor_view_v<typename ResultOuter::value_type> &&
+                    is_tensor_view_v<typename LeftOuter::value_type> &&
+                    is_tensor_view_v<typename RightOuter::value_type>,
+                "arena_strided_dgemm_ce_ce_right: arena (view) inner cells only");
+  static_assert(
+      std::is_same_v<typename ResultOuter::value_type::numeric_type, double> &&
+          std::is_same_v<typename LeftOuter::value_type::numeric_type, double> &&
+          std::is_same_v<typename RightOuter::value_type::numeric_type, double>,
+      "arena_strided_dgemm_ce_ce_right: double inner storage only");
+  const std::size_t Mmu = No;  // right outer-external rides BLAS M
+  const std::size_t nK = Ko;   // outer-contracted is looped with beta=1
+  const std::size_t nbatch = static_cast<std::size_t>(C.nbatch());
+  if (nbatch == 0 || Mmu == 0 || nK == 0 || Mo == 0) return;
+  // structural + self-defense (a mis-gated shape falls back / no-ops, never
+  // miscomputes). The left-external Mo>=1 is supported (rides as an outer loop).
+  const bool shape_ok =
+      (C.range().volume() == Mo * Mmu && L.range().volume() == Mo * nK &&
+       R.range().volume() == Mmu * nK &&
+       static_cast<std::size_t>(L.nbatch()) == nbatch &&
+       static_cast<std::size_t>(R.nbatch()) == nbatch);
+  // If the structural invariant is violated, do nothing rather than form
+  // out-of-bounds cell offsets below. This is only reachable via a mis-gate.
+  TA_ASSERT(shape_ok);
+  if (!shape_ok) return;
+  ScopedPhaseTimer _kernel_timer(g_kernel_ns_ce_ce);
+  // orientation-aware outer offsets (mirror arena_strided_dgemm_ce_e a_off/b_off)
+  const std::size_t ldb_o = (right_op == blas::NoTranspose) ? No : Ko;
+  auto r_off = [&](std::size_t k, std::size_t mu) {
+    return (right_op == blas::NoTranspose) ? k * ldb_o + mu : mu * ldb_o + k;
+  };
+  // L: 2-D (m,k) offset (orientation-aware). l_off(k)==k only held for Mo==1.
+  const std::size_t lda_o = (left_op == blas::NoTranspose) ? Ko : Mo;
+  auto l_off = [&](std::size_t m, std::size_t k) {
+    return (left_op == blas::NoTranspose) ? m * lda_o + k : k * lda_o + m;
+  };
+  // result outer (Mo x Mμ) row-major: (m, μ̃) = m*Mmu + mu.
+  auto c_off = [&](std::size_t m, std::size_t mu) { return m * Mmu + mu; };
+  const auto* lc = L.data();
+  const auto* rc = R.data();
+  auto* cc = C.data();
+  for (std::size_t b = 0; b < nbatch; ++b) {
+    const std::size_t cbase = b * Mo * Mmu;
+    const std::size_t rbase = b * Mmu * nK;
+    const std::size_t lbase = b * Mo * nK;
+    for (std::size_t m = 0; m < Mo; ++m) {
+      // Per-k segment walker (replaces the old all-or-nothing clean gate). For
+      // each present left single-cell operand L[m,k], walk the μ̃ axis and emit
+      // one strided GEMM per maximal contiguous segment of present, size-matched
+      // (C.size==P, R.size==Q), uniformly-strided cells; skip holes. β=1
+      // accumulates across k AND across segments. A fully-dense run yields one
+      // full-run segment == the old clean GEMM (T1 regression). A genuine size
+      // mismatch L[m,k] != P*Q drops to a tiny scalar path for that k only.
+      const auto _check_t0 = phase_start();
+      // Result inner free index P from the FIRST PRESENT C[m,μ̃] (the run's
+      // leading cell may be a hole); Q (operand contraction) is discovered per k
+      // from the first present R[k,μ̃]. P is uniform across present result cells
+      // by construction (each present segment re-checks C.size==P below).
+      long P = -1;
+      for (std::size_t mu = 0; mu < Mmu; ++mu) {
+        const auto& cmu = cc[cbase + c_off(m, mu)];
+        if (cmu) {
+          P = static_cast<long>(cmu.size());
+          break;
+        }
+      }
+      phase_stop(g_check_ns_ce_ce, _check_t0);
+      if (P <= 0) continue;  // result run entirely absent: nothing to write
+
+      // Gated diagnosis: log the segment-length distribution this (b,m) would
+      // issue (there is no separate scalar fallback path to attribute now).
+      if (gemm_timing_enabled()) {
+        auto getC = [&](std::size_t mu) -> const typename ResultOuter::value_type& {
+          return cc[cbase + c_off(m, mu)];
+        };
+        auto getR = [&](std::size_t k, std::size_t mu)
+            -> const typename RightOuter::value_type& {
+          return rc[rbase + r_off(k, mu)];
+        };
+        auto getL = [&](std::size_t k) -> const typename LeftOuter::value_type& {
+          return lc[lbase + l_off(m, k)];
+        };
+        int why = classify_run(getC, Mmu);
+        if (why == 0) why = classify_operand(getR, getL, Mmu, nK, P);
+        if (why != 0) record_ce_ce_fallback(why);
+        measure_segments(getC, getR, getL, Mmu, nK, P);
+      }
+
+      // Per-cell scalar evaluation of one k-slab: each present (μ̃) result cell
+      // gets C[μ̃] += factor * op(L) · R[k,μ̃] as one length-Q GEMV. This is the
+      // legacy per-cell path -- the route for a genuine size mismatch AND the
+      // body the ce_ce_strided_disabled() bench switch forces, to isolate the
+      // segmented-vs-per-cell speedup. Identical math to the segment walker.
+      auto percell_k = [&](std::size_t k,
+                           const typename LeftOuter::value_type& lk) {
+        ScopedPhaseTimer _fb_timer(g_fallback_ns_ce_ce);
+        std::uint64_t _fl = 0;
+        const double* l = lk.data();
+        for (std::size_t mu = 0; mu < Mmu; ++mu) {
+          auto& Cc = cc[cbase + c_off(m, mu)];
+          const auto& rk = rc[rbase + r_off(k, mu)];
+          if (!Cc || !rk) continue;
+          const long Pl = static_cast<long>(Cc.size());
+          const long Ql = static_cast<long>(rk.size());
+          if (Ql == 0 || static_cast<long>(lk.size()) != Pl * Ql) continue;
+          _fl += 2ull * static_cast<std::uint64_t>(Pl) * Ql;
+          double* c = Cc.data();
+          const double* rr = rk.data();
+          for (long a1 = 0; a1 < Pl; ++a1) {
+            double acc = 0;
+            if (left_inner_transposed) {
+              for (long a4 = 0; a4 < Ql; ++a4) acc += l[a4 * Pl + a1] * rr[a4];
+            } else {
+              const double* lr = l + a1 * Ql;
+              for (long a4 = 0; a4 < Ql; ++a4) acc += lr[a4] * rr[a4];
+            }
+            c[a1] += factor * acc;
+          }
+        }
+        if (gemm_timing_enabled())
+          g_fall_flops_ce_ce.fetch_add(_fl, std::memory_order_relaxed);
+      };
+
+      for (std::size_t k = 0; k < nK; ++k) {
+        const auto& lk = lc[lbase + l_off(m, k)];
+        if (!lk) continue;  // absent left single-cell operand: skip k (β=1)
+        // Discover Q from the first present R[k,μ̃] for this k.
+        long Q = -1;
+        for (std::size_t mu = 0; mu < Mmu; ++mu) {
+          const auto& rmu = rc[rbase + r_off(k, mu)];
+          if (rmu) {
+            Q = static_cast<long>(rmu.size());
+            break;
+          }
+        }
+        if (Q <= 0) continue;  // no present operand cell for this k
+
+        // Bench/forced per-cell, OR genuine size mismatch L[m,k] != P*Q (a
+        // strided segment GEMM would be ill-shaped): take the per-cell path.
+        if (ce_ce_strided_disabled() ||
+            static_cast<long>(lk.size()) != P * Q) {
+          percell_k(k, lk);
+          continue;
+        }
+
+        const double* Lk = lk.data();  // P x Q (or Q x P if transposed)
+        std::size_t mu = 0;
+        while (mu < Mmu) {
+          const auto& rc0 = rc[rbase + r_off(k, mu)];
+          auto& cc0 = cc[cbase + c_off(m, mu)];
+          // skip holes / size-mismatched cells (cannot join a P/Q segment).
+          if (!rc0 || !cc0 || static_cast<long>(cc0.size()) != P ||
+              static_cast<long>(rc0.size()) != Q) {
+            ++mu;
+            continue;
+          }
+          const double* rstart = rc0.data();  // segment μ̃-run base on R, stride sR
+          double* cstart = cc0.data();        // segment μ̃-run base on C, stride sC
+          // Grow the maximal segment, recomputing the strides locally (never
+          // reuse a run-wide stale stride).
+          std::size_t end = mu + 1;
+          long sR = -1, sC = -1;
+          while (end < Mmu) {
+            const auto& rce = rc[rbase + r_off(k, end)];
+            const auto& cce = cc[cbase + c_off(m, end)];
+            if (!rce || !cce) break;
+            if (static_cast<long>(cce.size()) != P ||
+                static_cast<long>(rce.size()) != Q)
+              break;
+            const long dR = static_cast<long>(rce.data() - rstart);
+            const long dC = static_cast<long>(cce.data() - cstart);
+            const long off = static_cast<long>(end - mu);
+            if (off == 1) {
+              sR = dR;
+              sC = dC;
+              if (sR < Q || sC < P) break;  // page-jump / overlap
+            } else if (dR != off * sR || dC != off * sC) {
+              break;
+            }
+            ++end;
+          }
+          const std::size_t Mseg = end - mu;
+          const long ldR = (Mseg > 1) ? sR : Q;
+          const long ldC = (Mseg > 1) ? sC : P;
+          // C(Mseg x P) += factor * R̃(Mseg x Q) · op(L) ; contract a_4(=Q).
+          // op(L) is (a4,a1)=Q x P: canonical L is P x Q used transposed
+          // (transb=T, ldb=Q); a matrix_transpose left inner is already Q x P,
+          // fed transb=N with ldb=P (zero-copy). Threaded identically per
+          // segment to the old clean GEMM.
+          {
+            ScopedShapedGemmTimer _gt(g_gemm_ns_ce_ce, g_gemm_calls_ce_ce,
+                                      g_ce_ce_shapes, Mseg, P, Q);
+            blas::gemm(
+                blas::NoTranspose,
+                left_inner_transposed ? blas::NoTranspose : blas::Transpose,
+                /*M=*/static_cast<integer>(Mseg),
+                /*N=*/static_cast<integer>(P),
+                /*K=*/static_cast<integer>(Q), factor,
+                /*A=*/rstart, /*lda=*/static_cast<integer>(ldR),
+                /*B=*/Lk,
+                /*ldb=*/static_cast<integer>(left_inner_transposed ? P : Q),
+                /*beta=*/1.0,
+                /*C=*/cstart, /*ldc=*/static_cast<integer>(ldC));
+          }
+#ifdef TA_STRIDED_DGEMM_COUNT
+          g_strided_dgemm_ce_ce_right_calls.fetch_add(1,
+                                                      std::memory_order_relaxed);
+#endif
+          mu = end;
+        }
+      }
+    }
+  }
+}
+
+#ifdef TA_STRIDED_DGEMM_COUNT
+inline std::atomic<std::size_t> g_strided_dgemm_ce_ce_left_calls{0};
+#endif  // TA_STRIDED_DGEMM_COUNT
+
+/// ce+ce strided-DGEMM core, LEFT-clean mirror of
+/// arena_strided_dgemm_ce_ce_right. Here the LEFT operand inner cell is the pure
+/// contraction vector L[m,k](a4) (no inner external) and the RIGHT operand
+/// carries the inner external R[k,n](a4,b1); the result inner is the right
+/// inner-external b1. Rides the LEFT outer-external `m` (Mo) into BLAS M and
+/// loops the RIGHT outer-external `n` (No) as an OUTER loop; L (a function of
+/// m,k) supplies the strided BLAS-M rows. For each batch b, each right-external
+/// n, and each outer-contraction cell k:
+///   C[m,n](b1) += factor * Σ_{a4} L[m,k](a4) · R[k,n](a4,b1)
+/// realized as one M=Mseg × N=P(=b1) × K=Q(=a4) DGEMM per maximal segment of
+/// present, size-matched, uniformly-strided m cells (zero-copy), looping k with
+/// beta=1; holes are skipped. A k with R[k,n].size() != P*Q, or any k while
+/// ce_ce_strided_disabled() is set, takes the per-cell path for that k only.
+/// Orientation-aware (l_off/r_off follow left_op/right_op of the OUTER
+/// GemmHelper). C must be pre-shaped; result outer is (m, n) row-major.
+template <typename ResultOuter, typename LeftOuter, typename RightOuter>
+void arena_strided_dgemm_ce_ce_left(ResultOuter& C, const LeftOuter& L,
+                                    const RightOuter& R, std::size_t Mo,
+                                    std::size_t No, std::size_t Ko,
+                                    math::blas::Op left_op,
+                                    math::blas::Op right_op, double factor,
+                                    bool right_inner_transposed = false) {
+  // right_inner_transposed: the external-carrying RIGHT inner cell is stored
+  // (b1,a4)=P x Q (matrix_transpose) instead of canonical (a4,b1)=Q x P. Folded
+  // into the inner GEMM via transb (zero-copy); the left contraction-vector
+  // side must remain canonical (gated upstream).
+  namespace blas = TiledArray::math::blas;
+  using integer = blas::integer;
+  static_assert(is_tensor_view_v<typename ResultOuter::value_type> &&
+                    is_tensor_view_v<typename LeftOuter::value_type> &&
+                    is_tensor_view_v<typename RightOuter::value_type>,
+                "arena_strided_dgemm_ce_ce_left: arena (view) inner cells only");
+  static_assert(
+      std::is_same_v<typename ResultOuter::value_type::numeric_type, double> &&
+          std::is_same_v<typename LeftOuter::value_type::numeric_type, double> &&
+          std::is_same_v<typename RightOuter::value_type::numeric_type, double>,
+      "arena_strided_dgemm_ce_ce_left: double inner storage only");
+  const std::size_t nK = Ko;  // outer-contracted, looped with beta=1
+  const std::size_t nbatch = static_cast<std::size_t>(C.nbatch());
+  if (nbatch == 0 || Mo == 0 || nK == 0 || No == 0) return;
+  const bool shape_ok =
+      (C.range().volume() == Mo * No && L.range().volume() == Mo * nK &&
+       R.range().volume() == No * nK &&
+       static_cast<std::size_t>(L.nbatch()) == nbatch &&
+       static_cast<std::size_t>(R.nbatch()) == nbatch);
+  TA_ASSERT(shape_ok);
+  if (!shape_ok) return;
+  ScopedPhaseTimer _kernel_timer(g_kernel_ns_ce_ce);
+  // orientation-aware outer offsets (mirror arena_strided_dgemm_ce_ce_right)
+  const std::size_t lda_o = (left_op == blas::NoTranspose) ? Ko : Mo;
+  auto l_off = [&](std::size_t m, std::size_t k) {
+    return (left_op == blas::NoTranspose) ? m * lda_o + k : k * lda_o + m;
+  };
+  const std::size_t ldb_o = (right_op == blas::NoTranspose) ? No : Ko;
+  auto r_off = [&](std::size_t k, std::size_t n) {
+    return (right_op == blas::NoTranspose) ? k * ldb_o + n : n * ldb_o + k;
+  };
+  // result outer (Mo x No) row-major: (m, n) = m*No + n.
+  auto c_off = [&](std::size_t m, std::size_t n) { return m * No + n; };
+  const auto* lc = L.data();
+  const auto* rc = R.data();
+  auto* cc = C.data();
+  for (std::size_t b = 0; b < nbatch; ++b) {
+    const std::size_t cbase = b * Mo * No;
+    const std::size_t lbase = b * Mo * nK;
+    const std::size_t rbase = b * No * nK;
+    for (std::size_t n = 0; n < No; ++n) {  // right-external outer loop
+      // Per-k segment walker (mirror of arena_strided_dgemm_ce_ce_right; strided
+      // axis is m, single-cell operand is R[k,n], strided operands are L[m,k]
+      // (the BLAS-M rows) and C[m,n]). For each present R[k,n], walk the m axis
+      // and emit one strided GEMM per maximal contiguous segment of present,
+      // size-matched (C.size==P, L.size==Q), uniformly-strided cells; skip holes.
+      // beta=1 accumulates across k AND segments. A fully-dense run yields one
+      // full-run segment == the old clean GEMM. A genuine size mismatch
+      // R[k,n] != P*Q drops to a tiny scalar path for that k only.
+      const auto _check_t0 = phase_start();
+      // Result inner free index P(=b1) from the FIRST PRESENT C[m,n].
+      long P = -1;
+      for (std::size_t m = 0; m < Mo; ++m) {
+        const auto& cm = cc[cbase + c_off(m, n)];
+        if (cm) {
+          P = static_cast<long>(cm.size());
+          break;
+        }
+      }
+      phase_stop(g_check_ns_ce_ce, _check_t0);
+      if (P <= 0) continue;  // result run entirely absent: nothing to write
+
+      // Gated diagnosis: strided operand is L (m-run); single-cell is R[k,n].
+      if (gemm_timing_enabled()) {
+        auto getC =
+            [&](std::size_t m) -> const typename ResultOuter::value_type& {
+          return cc[cbase + c_off(m, n)];
+        };
+        auto getL = [&](std::size_t k, std::size_t m)
+            -> const typename LeftOuter::value_type& {
+          return lc[lbase + l_off(m, k)];
+        };
+        auto getR =
+            [&](std::size_t k) -> const typename RightOuter::value_type& {
+          return rc[rbase + r_off(k, n)];
+        };
+        int why = classify_run(getC, Mo);
+        if (why == 0) why = classify_operand(getL, getR, Mo, nK, P);
+        if (why != 0) record_ce_ce_fallback(why);
+        measure_segments(getC, getL, getR, Mo, nK, P);
+      }
+
+      // Per-cell scalar evaluation of one k-slab (left orientation): each
+      // present (m) result cell gets C[m,n] += factor * L[m,k] · op(R[k,n]).
+      // Legacy per-cell path; forced by ce_ce_strided_disabled() for the bench.
+      auto percell_k = [&](std::size_t k,
+                           const typename RightOuter::value_type& rk) {
+        ScopedPhaseTimer _fb_timer(g_fallback_ns_ce_ce);
+        std::uint64_t _fl = 0;
+        const double* bd = rk.data();  // Q x P (or P x Q if transposed)
+        for (std::size_t m = 0; m < Mo; ++m) {
+          auto& Cc = cc[cbase + c_off(m, n)];
+          const auto& lk = lc[lbase + l_off(m, k)];
+          if (!Cc || !lk) continue;
+          const long Pl = static_cast<long>(Cc.size());
+          const long Ql = static_cast<long>(lk.size());
+          if (Ql == 0 || static_cast<long>(rk.size()) != Ql * Pl) continue;
+          _fl += 2ull * static_cast<std::uint64_t>(Pl) * Ql;
+          double* c = Cc.data();
+          const double* a = lk.data();  // Ql vector
+          for (long a4 = 0; a4 < Ql; ++a4) {
+            const double av = a[a4];
+            if (right_inner_transposed) {
+              for (long p = 0; p < Pl; ++p) c[p] += factor * av * bd[p * Ql + a4];
+            } else {
+              const double* br = bd + a4 * Pl;
+              for (long p = 0; p < Pl; ++p) c[p] += factor * av * br[p];
+            }
+          }
+        }
+        if (gemm_timing_enabled())
+          g_fall_flops_ce_ce.fetch_add(_fl, std::memory_order_relaxed);
+      };
+
+      for (std::size_t k = 0; k < nK; ++k) {
+        const auto& rk = rc[rbase + r_off(k, n)];
+        if (!rk) continue;  // absent right single-cell operand: skip k (beta=1)
+        // Discover Q from the first present L[m,k] for this k.
+        long Q = -1;
+        for (std::size_t m = 0; m < Mo; ++m) {
+          const auto& lm = lc[lbase + l_off(m, k)];
+          if (lm) {
+            Q = static_cast<long>(lm.size());
+            break;
+          }
+        }
+        if (Q <= 0) continue;  // no present operand cell for this k
+
+        // Bench/forced per-cell, OR genuine size mismatch R[k,n] != P*Q (a
+        // strided segment GEMM would be ill-shaped): take the per-cell path.
+        if (ce_ce_strided_disabled() ||
+            static_cast<long>(rk.size()) != P * Q) {
+          percell_k(k, rk);
+          continue;
+        }
+
+        const double* Rk = rk.data();  // Q x P (or P x Q if transposed)
+        std::size_t m = 0;
+        while (m < Mo) {
+          const auto& lc0 = lc[lbase + l_off(m, k)];
+          auto& cc0 = cc[cbase + c_off(m, n)];
+          // skip holes / size-mismatched cells (cannot join a P/Q segment).
+          if (!lc0 || !cc0 || static_cast<long>(cc0.size()) != P ||
+              static_cast<long>(lc0.size()) != Q) {
+            ++m;
+            continue;
+          }
+          const double* lstart = lc0.data();  // segment m-run base on L, stride sA
+          double* cstart = cc0.data();        // segment m-run base on C, stride sC
+          // Grow the maximal segment, recomputing the strides locally (never
+          // reuse a run-wide stale stride).
+          std::size_t end = m + 1;
+          long sA = -1, sC = -1;
+          while (end < Mo) {
+            const auto& lce = lc[lbase + l_off(end, k)];
+            const auto& cce = cc[cbase + c_off(end, n)];
+            if (!lce || !cce) break;
+            if (static_cast<long>(cce.size()) != P ||
+                static_cast<long>(lce.size()) != Q)
+              break;
+            const long dA = static_cast<long>(lce.data() - lstart);
+            const long dC = static_cast<long>(cce.data() - cstart);
+            const long off = static_cast<long>(end - m);
+            if (off == 1) {
+              sA = dA;
+              sC = dC;
+              if (sA < Q || sC < P) break;  // page-jump / overlap
+            } else if (dA != off * sA || dC != off * sC) {
+              break;
+            }
+            ++end;
+          }
+          const std::size_t Mseg = end - m;
+          const long ldA = (Mseg > 1) ? sA : Q;
+          const long ldC = (Mseg > 1) ? sC : P;
+          // C(Mseg x P) += factor * L_tilde(Mseg x Q) . op(R) ; contract a4(=Q).
+          // op(R) is (a4,b1)=Q x P: canonical R is Q x P used directly
+          // (transb=N, ldb=P); a matrix_transpose right inner is stored
+          // (b1,a4)=P x Q, fed transb=T with ldb=Q (zero-copy). Threaded
+          // identically per segment to the old clean GEMM.
+          {
+            ScopedShapedGemmTimer _gt(g_gemm_ns_ce_ce, g_gemm_calls_ce_ce,
+                                      g_ce_ce_shapes, Mseg, P, Q);
+            blas::gemm(
+                blas::NoTranspose,
+                right_inner_transposed ? blas::Transpose : blas::NoTranspose,
+                /*M=*/static_cast<integer>(Mseg),
+                /*N=*/static_cast<integer>(P),
+                /*K=*/static_cast<integer>(Q), factor,
+                /*A=*/lstart, /*lda=*/static_cast<integer>(ldA),
+                /*B=*/Rk,
+                /*ldb=*/static_cast<integer>(right_inner_transposed ? Q : P),
+                /*beta=*/1.0,
+                /*C=*/cstart, /*ldc=*/static_cast<integer>(ldC));
+          }
+#ifdef TA_STRIDED_DGEMM_COUNT
+          g_strided_dgemm_ce_ce_left_calls.fetch_add(1,
+                                                     std::memory_order_relaxed);
+#endif
+          m = end;
+        }
+      }
+    }
+  }
+}
+
 /// Creates a fused contraction callback.
 template <typename Result, typename Left, typename Right, typename Op>
 auto make_fused_contraction_lambda(Op contrreduce_op) {
@@ -678,6 +2144,16 @@ auto make_regime_a_arena_plan(const A& a, const B& b, const Inner& inner,
   }
 }
 
+/// Kill switch for the regime-A hc+e strided-DGEMM reuse path. Returns a
+/// mutable reference to a function-local static flag (default false, i.e.
+/// strided on); while true, run_regime_a_arena keeps the legacy per-cell
+/// accumulate. Test/bench hook for the strided-vs-per-cell differential
+/// (correctness) and perf measurement. The flag is a plain, unsynchronized bool.
+inline bool& regime_a_strided_disabled() {
+  static bool flag = false;
+  return flag;
+}
+
 /// Runs the arena regime-A path for one H-slice when the plan is active.
 template <typename Plan, typename HIndex, typename TermA, typename TermB,
           typename TermC, typename LocalTiles, typename Tiles, typename Trange>
@@ -719,6 +2195,24 @@ bool run_regime_a_arena(const Plan& plan, const HIndex& h, std::size_t batch,
 
     if constexpr (a_is_tot && b_is_tot) {
       using IIndex = ::Einsum::index::Index<std::size_t>;
+      // hc+e reuse gate: the result/operand inner cells must be the kernel's
+      // (view + double) inner type; mirror arena_strided_dgemm_ce_e's
+      // static_assert so non-view / non-double ToT keep the per-cell path.
+      using LInnerT = typename ArrayA_t::value_type::value_type;
+      using RInnerT = typename ArrayB_t::value_type::value_type;
+      constexpr bool ce_e_kernel_ok =
+          is_tensor_view_v<InnerT> && is_tensor_view_v<LInnerT> &&
+          is_tensor_view_v<RInnerT> &&
+          std::is_same_v<typename InnerT::numeric_type, double> &&
+          std::is_same_v<typename LInnerT::numeric_type, double> &&
+          std::is_same_v<typename RInnerT::numeric_type, double>;
+      // Inner OUTER-PRODUCT (K_inner==0) is the strided-reusable shape; any
+      // inner contraction (hc+ce) stays per-cell (two-level stride). The
+      // runtime toggle lets tests/benches force the per-cell path.
+      const bool hce_e_strided =
+          plan.kind == RegimeAInnerKind::contraction && plan.c_plan &&
+          plan.c_plan->gemm_helper.num_contract_ranks() == 0 &&
+          !regime_a_strided_disabled();
       auto range_for = [&](std::size_t k) -> InnerRange {
         if (k >= batch) return InnerRange{};
         for (IIndex i : tiles) {
@@ -778,6 +2272,23 @@ bool run_regime_a_arena(const Plan& plan, const HIndex& h, std::size_t batch,
         auto shape = trange.tile(i);
         ai = ai.reshape(shape, batch);
         bi = bi.reshape(shape, batch);
+        if constexpr (ce_e_kernel_ok) {
+          if (hce_e_strided) {
+            // hc+e: ride the within-tile contraction cells into BLAS K via the
+            // landed ce+e core. M=N=1, K=vol; kernel nbatch == Hadamard batch.
+            // cview shares tile's data_ (storage-aliasing reshape), so the
+            // kernel's beta=1 writes accumulate into tile across the i-loop.
+            namespace blas = TiledArray::math::blas;
+            const std::size_t Kvol =
+                static_cast<std::size_t>(trange.tile(i).volume());
+            auto cview = tile.reshape(TiledArray::Range{1}, batch);
+            arena_strided_dgemm_ce_e(cview, ai, bi, /*M=*/std::size_t{1},
+                                     /*N=*/std::size_t{1}, /*K=*/Kvol,
+                                     blas::NoTranspose, blas::NoTranspose,
+                                     /*factor=*/1.0);
+            continue;  // tile-i contribution complete
+          }
+        }
         for (std::size_t k = 0; k < batch; ++k) {
           auto& cell = tile({k});
           if (cell.empty()) continue;
diff --git a/src/TiledArray/tensor/tensor.h b/src/TiledArray/tensor/tensor.h
index 921490add8..d7d28220f3 100644
--- a/src/TiledArray/tensor/tensor.h
+++ b/src/TiledArray/tensor/tensor.h
@@ -38,6 +38,12 @@
 
 #include <umpire_cxx_allocator.hpp>
 
+#include <atomic>
+#include <chrono>
+#include <cstdint>
+#include <cstdlib>
+#include <iostream>
+
 namespace TiledArray {
 
 namespace detail {
@@ -84,6 +90,164 @@ To clone_or_cast(From&& f) {
   }
 }
 
+/// ---------------------------------------------------------------------------
+/// Master switch of the timing probe for the strided-GEMM ToT "scale"
+/// outer-contraction path (Tensor::gemm, commit 266f0a48). The probe reports
+/// how much scale work ran through the strided BLAS GEMM and how much fell back
+/// to the per-cell AXPY loop, for two regimes tracked separately:
+///   [0] tot_x_t : left ToT x plain scalar, "m,k;a * k,n -> m,n;a" (per-row m)
+///   [1] t_x_tot : plain scalar x right ToT, "m,k * k,n;a -> m,n;a" (per-col n)
+/// Per-regime totals are printed to stderr at process exit. The switch is the
+/// environment variable TA_GEMM_TIMING, read once on the first call and cached.
+/// \return true iff TA_GEMM_TIMING is set, non-empty, and not exactly "0";
+///         when false the probe samples no clock and updates no counter.
+inline bool scale_gemm_timing_enabled() {
+  static const bool is_on = []() -> bool {
+    const char* raw = std::getenv("TA_GEMM_TIMING");
+    if (raw == nullptr || raw[0] == '\0') return false;  // unset or empty
+    return raw[0] != '0' || raw[1] != '\0';  // anything except exactly "0"
+  }();
+  return is_on;
+}
+
+/// Counters for one scale regime (one g_scale entry); `{0}` zero-inits each.
+struct ScaleRegimeCounters {
+  std::atomic<std::uint64_t> gemm_ns{0};   // wall ns spent inside strided GEMM calls
+  std::atomic<std::uint64_t> fb_ns{0};     // wall ns spent inside the AXPY fallback
+  std::atomic<std::uint64_t> gemm_runs{0}; // rows/cols handled by one strided GEMM
+  std::atomic<std::uint64_t> fb_runs{0};   // rows/cols that fell back to AXPY
+  std::atomic<std::uint64_t> gemm_flop{0}; // sum of 2*K*N*A ([0]) / 2*M*K*A ([1])
+  std::atomic<std::uint64_t> fb_flop{0};   // sum of 2*K*size() over result cells
+  std::atomic<std::uint64_t> fb_absent{0}; // fallback reason: an empty cell
+  std::atomic<std::uint64_t> fb_ragged{0}; // fallback reason: ragged inner size
+  std::atomic<std::uint64_t> fb_stride{0}; // fallback reason: neither of the above
+  // --- phase breakdown of the per-(b,m) [0] / per-(b,n) [1] loop, in wall ns ---
+  std::atomic<std::uint64_t> kernel_ns{0};    // whole loop nest of one gemm call
+  std::atomic<std::uint64_t> check_pres_ns{0};// per-row/col presence + size scan
+  std::atomic<std::uint64_t> check_str_ns{0}; // per-row/col constant-stride walk
+  // beta-eligibility: counts Tensor::gemm CALLS whose result tile was empty on
+  // entry (this->empty(), so beta=0 would be valid) separately from calls that
+  // accumulate into an existing tile (beta=1 required for correctness).
+  std::atomic<std::uint64_t> calls_firstwrite{0};
+  std::atomic<std::uint64_t> calls_accum{0};
+  // Derived, not stored: loop-residual := kernel_ns - check_pres_ns -
+  //   check_str_ns - gemm_ns - fb_ns (setup, loop control, A<=0 skips, probe cost)
+};
+inline ScaleRegimeCounters g_scale[2];  // [0]=tot_x_t, [1]=t_x_tot
+
+/// Opens a manually closed phase measurement (close it with scale_phase_stop),
+/// for regions whose locals are used afterwards and so cannot sit in a scope.
+/// \return the current steady_clock time, or a default time_point when off.
+inline std::chrono::steady_clock::time_point scale_phase_start() {
+  using clock_type = std::chrono::steady_clock;
+  return scale_gemm_timing_enabled() ? clock_type::now() : clock_type::time_point{};
+}
+inline void scale_phase_stop(std::atomic<std::uint64_t>& acc,
+                             std::chrono::steady_clock::time_point t0) {
+  if (!scale_gemm_timing_enabled()) return;  // probe off: leave acc untouched
+  namespace chr = std::chrono;
+  const auto elapsed =
+      chr::duration_cast<chr::nanoseconds>(chr::steady_clock::now() - t0);
+  acc.fetch_add(static_cast<std::uint64_t>(elapsed.count()),
+                std::memory_order_relaxed);  // adds the ns elapsed since t0
+}
+
+/// Scope guard that adds its own wall-clock lifetime, in ns, to `acc`. With
+/// TA_GEMM_TIMING off it holds no target: no clock read, no atomic update.
+class ScopedScaleTimer {
+ public:
+  explicit ScopedScaleTimer(std::atomic<std::uint64_t>& acc)
+      : acc_(scale_gemm_timing_enabled() ? &acc : nullptr) {
+    if (acc_ != nullptr) t0_ = clock_type::now();
+  }
+  ScopedScaleTimer(const ScopedScaleTimer&) = delete;
+  ScopedScaleTimer& operator=(const ScopedScaleTimer&) = delete;
+  ~ScopedScaleTimer() {
+    if (acc_ == nullptr) return;  // probe was off at construction
+    const auto elapsed = std::chrono::duration_cast<std::chrono::nanoseconds>(
+        clock_type::now() - t0_);
+    acc_->fetch_add(static_cast<std::uint64_t>(elapsed.count()),
+                    std::memory_order_relaxed);
+  }
+
+ private:
+  using clock_type = std::chrono::steady_clock;
+  std::atomic<std::uint64_t>* acc_;  // null when the probe is off
+  clock_type::time_point t0_;        // construction time; set only if enabled
+};
+
+/// On destruction, writes the scale-probe report to std::cerr; no-op when off.
+struct ScaleGemmTimingDumper {
+  ~ScaleGemmTimingDumper() {
+    if (!scale_gemm_timing_enabled()) return;  // probe off: print nothing
+    auto L = [](std::atomic<std::uint64_t>& a) {  // relaxed read of one counter
+      return a.load(std::memory_order_relaxed);
+    };
+    const char* names[2] = {"tot_x_t (left ToT x scalar, per-row)",
+                            "t_x_tot (scalar x right ToT, per-col)"};
+    std::uint64_t tg_ns = 0, tf_ns = 0, tg_fl = 0, tf_fl = 0;  // cross-regime sums
+    for (int r = 0; r < 2; ++r) {
+      const auto gns = L(g_scale[r].gemm_ns), fns = L(g_scale[r].fb_ns);
+      const auto gr = L(g_scale[r].gemm_runs), fr = L(g_scale[r].fb_runs);
+      const auto gf = L(g_scale[r].gemm_flop), ff = L(g_scale[r].fb_flop);
+      tg_ns += gns; tf_ns += fns; tg_fl += gf; tf_fl += ff;
+      const double tt = static_cast<double>(gns + fns);  // GEMM + fallback ns
+      const double ftot = static_cast<double>(gf + ff);  // GEMM + fallback FLOP
+      std::cerr << "[scale-timing] " << names[r] << ":\n";
+      std::cerr << "[scale-timing]   strided GEMM : " << gns / 1e9 << " s  ("
+                << gr << " runs)\n";
+      std::cerr << "[scale-timing]   fallback AXPY: " << fns / 1e9 << " s  ("
+                << fr << " runs)\n";
+      std::cerr << "[scale-timing]   time coverage (GEMM / total) : "
+                << (tt > 0 ? 100.0 * gns / tt : 0.0) << "%\n";
+      std::cerr << "[scale-timing]   FLOP coverage (GEMM / total) : "
+                << (ftot > 0 ? 100.0 * gf / ftot : 0.0) << "%  ("
+                << gf / 1e9 << " GFLOP gemm / " << ftot / 1e9 << " GFLOP)\n";
+      std::cerr << "[scale-timing]   GFLOP/s strided="
+                << (gns > 0 ? gf / static_cast<double>(gns) : 0.0)  // FLOP/ns
+                << "  fallback="
+                << (fns > 0 ? ff / static_cast<double>(fns) : 0.0) << "\n";
+      std::cerr << "[scale-timing]   fallback runs by reason: absent="
+                << L(g_scale[r].fb_absent) << " ragged=" << L(g_scale[r].fb_ragged)
+                << " multipage-stride=" << L(g_scale[r].fb_stride) << "\n";
+      // Phase breakdown of kernel_ns: GEMM, fallback, the two checks, residual.
+      const auto kn = L(g_scale[r].kernel_ns);
+      const auto cp = L(g_scale[r].check_pres_ns);
+      const auto cs = L(g_scale[r].check_str_ns);
+      const auto resid =
+          (kn > gns + fns + cp + cs) ? kn - gns - fns - cp - cs : 0;  // clamp >= 0
+      auto pc = [kn](std::uint64_t x) {  // x as a percentage of kernel_ns
+        return kn > 0 ? 100.0 * static_cast<double>(x) / static_cast<double>(kn)
+                      : 0.0;
+      };
+      std::cerr << "[scale-phases] kernel total (for-b/for-m): " << kn / 1e9
+                << " s\n";
+      std::cerr << "[scale-phases]   strided GEMM        : " << gns / 1e9
+                << " s  (" << pc(gns) << "%)\n";
+      std::cerr << "[scale-phases]   fallback AXPY       : " << fns / 1e9
+                << " s  (" << pc(fns) << "%)\n";
+      std::cerr << "[scale-phases]   clean-check presence: " << cp / 1e9
+                << " s  (" << pc(cp) << "%)\n";
+      std::cerr << "[scale-phases]   clean-check STRIDE walk: " << cs / 1e9
+                << " s  (" << pc(cs) << "%)\n";
+      std::cerr << "[scale-phases]   loop residual       : " << resid / 1e9
+                << " s  (" << pc(resid) << "%)\n";
+      const auto fw = L(g_scale[r].calls_firstwrite);  // result empty on entry
+      const auto ac = L(g_scale[r].calls_accum);       // result already existed
+      std::cerr << "[scale-beta] gemm CALLS: first-write (beta=0 ok)=" << fw
+                << "  accumulate (beta=1 needed)=" << ac << "  ("
+                << (fw + ac > 0 ? 100.0 * fw / (fw + ac) : 0.0)
+                << "% beta=0-eligible)\n";
+    }
+    const double allt = static_cast<double>(tg_ns + tf_ns);  // both regimes, ns
+    const double allf = static_cast<double>(tg_fl + tf_fl);  // both regimes, FLOP
+    std::cerr << "[scale-timing] SCALE TOTAL: strided GEMM " << tg_ns / 1e9
+              << " s, fallback AXPY " << tf_ns / 1e9 << " s, time coverage "
+              << (allt > 0 ? 100.0 * tg_ns / allt : 0.0) << "%, FLOP coverage "
+              << (allf > 0 ? 100.0 * tg_fl / allf : 0.0) << "%\n";
+  }
+};
+inline ScaleGemmTimingDumper g_scale_gemm_timing_dumper;  // report fires from its destructor
+
 }  // namespace detail
 
 /// An N-dimensional tensor object
@@ -3054,6 +3218,9 @@ class Tensor {
               gemm_helper.left_right_congruent(left.range().upbound_data(),
                                                right.range().upbound_data()));
 
+    // beta-eligibility probe: a fresh (empty) result tile could use beta=0
+    // (and skip zero-init); an existing one needs beta=1 to accumulate.
+    [[maybe_unused]] const bool _scale_was_empty = this->empty();
     if (this->empty()) {  // initialize, if empty
       *this = Tensor(gemm_helper.make_result_range<range_type>(left.range(),
                                                                right.range()),
@@ -3110,6 +3277,14 @@ class Tensor {
       if constexpr (std::is_same_v<std::remove_cv_t<V>, Real>) {
         if (gemm_helper.left_op() == TiledArray::math::blas::NoTranspose &&
             gemm_helper.right_op() == TiledArray::math::blas::NoTranspose) {
+          // kernel-total timer: destroyed at `return *this;` below, so it
+          // captures the whole for-b/for-m loop. loop-residual is derived from
+          // it minus the sub-phases.
+          detail::ScopedScaleTimer _scale_kt(detail::g_scale[0].kernel_ns);
+          if (detail::scale_gemm_timing_enabled())
+            (_scale_was_empty ? detail::g_scale[0].calls_firstwrite
+                              : detail::g_scale[0].calls_accum)
+                .fetch_add(1, std::memory_order_relaxed);
           for (integer b = 0; b != nbatch(); ++b) {
             auto this_data = this->batch_data(b);
             auto left_data = left.batch_data(b);
@@ -3123,6 +3298,7 @@ class Tensor {
               // AXPY.
               long A = -1;
               bool clean = true;
+              const auto _scale_tcp = detail::scale_phase_start();
               for (integer k = 0; k != K && clean; ++k) {
                 const auto& c = lc0[k];
                 if (c.empty()) {
@@ -3147,6 +3323,8 @@ class Tensor {
                 else if (A != s)
                   clean = false;
               }
+              detail::scale_phase_stop(detail::g_scale[0].check_pres_ns,
+                                       _scale_tcp);
               // Arena cells are SIMD-padded, so the per-row inter-cell stride
               // is the padded inner size (>= A). The strided GEMM requires the
               // row's cells to be ONE contiguous run at constant stride -- only
@@ -3154,6 +3332,7 @@ class Tensor {
               // compacted) ToT tile may span multiple pages, where the stride
               // jumps at a page boundary; verify constant stride across ALL
               // cells (so multi-page tiles fall back to the AXPY loop).
+              const auto _scale_tcs = detail::scale_phase_start();
               integer ldb = static_cast<integer>(A);
               integer ldc = static_cast<integer>(A);
               if (clean && A > 0) {
@@ -3168,6 +3347,8 @@ class Tensor {
                 for (integer n = 0; clean && n != N; ++n)
                   if (rc0[n].data() != rc0[0].data() + n * sc) clean = false;
               }
+              detail::scale_phase_stop(detail::g_scale[0].check_str_ns,
+                                       _scale_tcs);
               if (A <= 0) continue;  // empty row -> nothing to do
               if (clean) {
                 // result[m,n][a] += sum_k left[m,k][a] * right[k,n].
@@ -3175,7 +3356,17 @@ class Tensor {
                 // where L2 = left row-m slab (K x A, ld=ldb), C2 = result row-m
                 // slab (N x A, ld=ldc), right is K x N (ld=N). ldb/ldc carry
                 // padding.
+                if (detail::scale_gemm_timing_enabled()) {
+                  detail::g_scale[0].gemm_runs.fetch_add(
+                      1, std::memory_order_relaxed);
+                  detail::g_scale[0].gemm_flop.fetch_add(
+                      2ull * static_cast<std::uint64_t>(K) *
+                          static_cast<std::uint64_t>(N) *
+                          static_cast<std::uint64_t>(A),
+                      std::memory_order_relaxed);
+                }
                 const integer Ai = static_cast<integer>(A);
+                detail::ScopedScaleTimer _scale_gt(detail::g_scale[0].gemm_ns);
                 TiledArray::math::blas::gemm(
                     TiledArray::math::blas::Transpose,
                     TiledArray::math::blas::NoTranspose,
@@ -3184,6 +3375,38 @@ class Tensor {
                     /*B=*/lc0[0].data(), /*ldb=*/ldb, Real(1),
                     /*C=*/rc0[0].data(), /*ldc=*/ldc);
               } else {  // per-cell AXPY fallback for this row
+                if (detail::scale_gemm_timing_enabled()) {
+                  // classify fallback reason (re-scan; observation only, does
+                  // not affect the decision above) + exact fallback FLOPs.
+                  bool absent = false, ragged = false;
+                  long a0 = -1;
+                  for (integer k = 0; k != K; ++k) {
+                    const auto& c = lc0[k];
+                    if (c.empty()) { absent = true; break; }
+                    long s = static_cast<long>(c.size());
+                    if (a0 < 0) a0 = s; else if (a0 != s) ragged = true;
+                  }
+                  if (!absent)
+                    for (integer n = 0; n != N; ++n) {
+                      const auto& c = rc0[n];
+                      if (c.empty()) { absent = true; break; }
+                      long s = static_cast<long>(c.size());
+                      if (a0 < 0) a0 = s; else if (a0 != s) ragged = true;
+                    }
+                  std::uint64_t fl = 0;
+                  for (integer n = 0; n != N; ++n)
+                    fl += 2ull * static_cast<std::uint64_t>(K) *
+                          static_cast<std::uint64_t>(rc0[n].size());
+                  detail::g_scale[0].fb_runs.fetch_add(
+                      1, std::memory_order_relaxed);
+                  detail::g_scale[0].fb_flop.fetch_add(
+                      fl, std::memory_order_relaxed);
+                  (absent ? detail::g_scale[0].fb_absent
+                          : ragged ? detail::g_scale[0].fb_ragged
+                                   : detail::g_scale[0].fb_stride)
+                      .fetch_add(1, std::memory_order_relaxed);
+                }
+                detail::ScopedScaleTimer _scale_fb(detail::g_scale[0].fb_ns);
                 for (integer n = 0; n != N; ++n) {
                   auto c_offset = m * N + n;
                   for (integer k = 0; k != K; ++k)
@@ -3210,6 +3433,12 @@ class Tensor {
       if constexpr (std::is_same_v<std::remove_cv_t<U>, Real>) {
         if (gemm_helper.left_op() == TiledArray::math::blas::NoTranspose &&
             gemm_helper.right_op() == TiledArray::math::blas::NoTranspose) {
+          // kernel-total timer (see tot_x_t block); destroyed at `return`.
+          detail::ScopedScaleTimer _scale_kt(detail::g_scale[1].kernel_ns);
+          if (detail::scale_gemm_timing_enabled())
+            (_scale_was_empty ? detail::g_scale[1].calls_firstwrite
+                              : detail::g_scale[1].calls_accum)
+                .fetch_add(1, std::memory_order_relaxed);
           for (integer b = 0; b != nbatch(); ++b) {
             auto this_data = this->batch_data(b);
             auto left_data = left.batch_data(b);    // M x K row-major scalars
@@ -3217,6 +3446,7 @@ class Tensor {
             for (integer n = 0; n != N; ++n) {
               long A = -1;
               bool clean = true;
+              const auto _scale_tcp = detail::scale_phase_start();
               for (integer k = 0; k != K && clean; ++k) {
                 const auto& c = right_data[k * N + n];
                 if (c.empty()) {
@@ -3241,6 +3471,9 @@ class Tensor {
                 else if (A != s)
                   clean = false;
               }
+              detail::scale_phase_stop(detail::g_scale[1].check_pres_ns,
+                                       _scale_tcp);
+              const auto _scale_tcs = detail::scale_phase_start();
               integer ldb = static_cast<integer>(A);  // k-stride, right col n
               integer ldc = static_cast<integer>(A);  // m-stride, result col n
               if (clean && A > 0) {
@@ -3261,10 +3494,22 @@ class Tensor {
                       this_data[n].data() + m * sc)
                     clean = false;
               }
+              detail::scale_phase_stop(detail::g_scale[1].check_str_ns,
+                                       _scale_tcs);
               if (A <= 0) continue;
               if (clean) {
                 // C_n(M x A) += left(M x K) * B_n(K x A). Row-major gemm.
+                if (detail::scale_gemm_timing_enabled()) {
+                  detail::g_scale[1].gemm_runs.fetch_add(
+                      1, std::memory_order_relaxed);
+                  detail::g_scale[1].gemm_flop.fetch_add(
+                      2ull * static_cast<std::uint64_t>(M) *
+                          static_cast<std::uint64_t>(K) *
+                          static_cast<std::uint64_t>(A),
+                      std::memory_order_relaxed);
+                }
                 const integer Ai = static_cast<integer>(A);
+                detail::ScopedScaleTimer _scale_gt(detail::g_scale[1].gemm_ns);
                 TiledArray::math::blas::gemm(
                     TiledArray::math::blas::NoTranspose,
                     TiledArray::math::blas::NoTranspose,
@@ -3273,6 +3518,38 @@ class Tensor {
                     /*B=*/right_data[n].data(), /*ldb=*/ldb, Real(1),
                     /*C=*/this_data[n].data(), /*ldc=*/ldc);
               } else {  // per-cell AXPY fallback for this column
+                if (detail::scale_gemm_timing_enabled()) {
+                  // classify fallback reason (re-scan; observation only) +
+                  // exact fallback FLOPs.
+                  bool absent = false, ragged = false;
+                  long a0 = -1;
+                  for (integer k = 0; k != K; ++k) {
+                    const auto& c = right_data[k * N + n];
+                    if (c.empty()) { absent = true; break; }
+                    long s = static_cast<long>(c.size());
+                    if (a0 < 0) a0 = s; else if (a0 != s) ragged = true;
+                  }
+                  if (!absent)
+                    for (integer m = 0; m != M; ++m) {
+                      const auto& c = this_data[m * N + n];
+                      if (c.empty()) { absent = true; break; }
+                      long s = static_cast<long>(c.size());
+                      if (a0 < 0) a0 = s; else if (a0 != s) ragged = true;
+                    }
+                  std::uint64_t fl = 0;
+                  for (integer m = 0; m != M; ++m)
+                    fl += 2ull * static_cast<std::uint64_t>(K) *
+                          static_cast<std::uint64_t>(this_data[m * N + n].size());
+                  detail::g_scale[1].fb_runs.fetch_add(
+                      1, std::memory_order_relaxed);
+                  detail::g_scale[1].fb_flop.fetch_add(
+                      fl, std::memory_order_relaxed);
+                  (absent ? detail::g_scale[1].fb_absent
+                          : ragged ? detail::g_scale[1].fb_ragged
+                                   : detail::g_scale[1].fb_stride)
+                      .fetch_add(1, std::memory_order_relaxed);
+                }
+                detail::ScopedScaleTimer _scale_fb(detail::g_scale[1].fb_ns);
                 for (integer m = 0; m != M; ++m) {
                   auto c_offset = m * N + n;
                   for (integer k = 0; k != K; ++k)
diff --git a/src/TiledArray/tile_op/contract_reduce.h b/src/TiledArray/tile_op/contract_reduce.h
index e599110d49..3e7b562f96 100644
--- a/src/TiledArray/tile_op/contract_reduce.h
+++ b/src/TiledArray/tile_op/contract_reduce.h
@@ -128,6 +128,16 @@ class ContractReduceBase {
     /// \note the lifetime is managed by the callee!
     TiledArray::function_ref<elem_muladd_op_type> elem_muladd_op_;
 
+    /// whole-tile strided-DGEMM outer-contraction op for arena ToT. When set,
+    /// ContractReduce::operator() delegates the (pre-shaped) result fill to it
+    /// instead of the generic gemm. Carries factor itself (alpha_ is forced to
+    /// 1 for ToT).
+    /// \note lifetime managed by the callee (a ContEngine std::function member).
+    using strided_oprod_op_type = void(result_type&, const left_tile_type&,
+                                       const right_tile_type&,
+                                       const math::GemmHelper&);
+    TiledArray::function_ref<strided_oprod_op_type> strided_oprod_op_{};
+
     TA_NO_UNIQUE_ADDRESS arena_plan_storage_t arena_plan_;
   };
 
@@ -222,6 +232,18 @@ class ContractReduceBase {
     return pimpl_->arena_plan_;
   }
 
+  /// Strided-DGEMM op accessor.
+  /// \return A const reference to the strided-DGEMM op function_ref
+  const TiledArray::function_ref<typename Impl::strided_oprod_op_type>&
+  strided_oprod_op() const {
+    return pimpl_->strided_oprod_op_;
+  }
+  /// Strided-DGEMM op mutator. \param op function_ref stored in the pimpl
+  void set_strided_oprod_op(
+      TiledArray::function_ref<typename Impl::strided_oprod_op_type> op) {
+    pimpl_->strided_oprod_op_ = op;
+  }
+
   //-------------- these are only used for unit tests -----------------
 
   /// Compute the number of contracted ranks
@@ -400,6 +422,11 @@ class ContractReduce : public ContractReduceBase<Result, Left, Right, Scalar> {
             this->arena_plan()->grow_to_cover(result, left, right,
                                               this->gemm_helper());
         }
+        if (this->strided_oprod_op()) {
+          this->strided_oprod_op()(result, left, right,
+                                   ContractReduceBase_::gemm_helper());
+          return;
+        }
       }
       gemm(result, left, right, ContractReduceBase_::gemm_helper(),
            this->elem_muladd_op());
diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt
index 308a3dec1e..b217e22deb 100644
--- a/tests/CMakeLists.txt
+++ b/tests/CMakeLists.txt
@@ -110,6 +110,7 @@ set(ta_test_src_files  ta_test.cpp
     arena_sizeof_invariant_suite.cpp
     arena_tensor.cpp
     arena_tensor_kernels.cpp
+    arena_strided_dgemm.cpp
     tot_construction.cpp
 )
 
diff --git a/tests/arena_strided_dgemm.cpp b/tests/arena_strided_dgemm.cpp
new file mode 100644
index 0000000000..bd99b12f43
--- /dev/null
+++ b/tests/arena_strided_dgemm.cpp
@@ -0,0 +1,2112 @@
+// tests/arena_strided_dgemm.cpp
+#include "TiledArray/tensor/arena_einsum.h"
+#include "TiledArray/tensor/arena_kernels.h"
+#include "TiledArray/tensor.h"
+#include "TiledArray/math/blas.h"
+#include "tiledarray.h"
+#include "unit_test_config.h"
+#include <functional>
+#include <memory>
+#include <vector>
+
+namespace TA = TiledArray;
+using Inner = TA::ArenaTensor<double, TA::Range>;
+using Outer = TA::Tensor<Inner>;
+
+namespace {
+// Build a one-batch arena ToT tile over outer range r; cell o has inner range
+// shape_fn(o), and element e of every non-null cell o is base + 0.01*o + e.
+Outer make_filled(const TA::Range& r,
+                  const std::function<TA::Range(std::size_t)>& shape_fn,
+                  double base) {
+  Outer tile = TA::detail::arena_outer_init<Outer>(r, /*nbatch=*/1, shape_fn);
+  for (std::size_t ord = 0; ord != tile.range().volume(); ++ord) {
+    Inner& cell = tile.data()[ord];
+    for (std::size_t e = 0, ne = !cell ? 0 : cell.size(); e != ne; ++e)
+      cell.data()[e] = base + 0.01 * ord + e;  // null cell => ne == 0
+  }
+  return tile;
+}
+
+std::vector<double> ref_ce_e(const Outer& L, const Outer& R, std::size_t m,
+                             std::size_t n, std::size_t K, std::size_t P,
+                             std::size_t Q, double factor) {
+  std::vector<double> acc(P * Q, 0.0);  // row-major P x Q, acc[p * Q + q]
+  for (std::size_t k = 0; k != K; ++k) {
+    const double* lhs = L.data()[m * K + k].data();  // cell m*K+k, indexed by p
+    const double* rhs = R.data()[n * K + k].data();  // cell n*K+k, indexed by q
+    for (std::size_t pq = 0; pq != P * Q; ++pq)
+      acc[pq] += factor * lhs[pq / Q] * rhs[pq % Q];  // p = pq/Q, q = pq%Q
+  }
+  return acc;
+}
+
+// Dense reference, Mo==1: C[mu](a1) = factor * sum_k sum_{a4} L[k](a1,a4) * R[mu,k](a4).
+// L outer {nK} inner {P,Q}; R outer {Mmu,nK} (mu slow, k fast) inner {Q};
+// C outer {Mmu} inner {P}, returned flattened as result[mu * P + a1].
+std::vector<double> ref_ce_ce(const Outer& L, const Outer& R, std::size_t Mmu,
+                              std::size_t nK, std::size_t P, std::size_t Q,
+                              double factor) {
+  std::vector<double> result(Mmu * P, 0.0);
+  // Each result element still accumulates its k terms in increasing k.
+  for (std::size_t mu = 0; mu != Mmu; ++mu) {
+    for (std::size_t k = 0; k != nK; ++k) {
+      const double* lm = L.data()[k].data();            // P x Q row-major
+      const double* rv = R.data()[mu * nK + k].data();  // Q  (mu slow, k fast)
+      for (std::size_t a1 = 0; a1 != P; ++a1) {
+        double dot = 0;
+        for (std::size_t a4 = 0; a4 != Q; ++a4) dot += lm[a1 * Q + a4] * rv[a4];
+        result[mu * P + a1] += factor * dot;
+      }
+    }
+  }
+  return result; }
+
+// ---------------------------------------------------------------------------
+// Sparsity-aware helpers for the per-k segmented strided-DGEMM tests (T1-T15).
+// make_sparse: make_filled generalized to nbatch >= 1 batches and to holes. A
+// cell with is_hole(o) is requested with a zero-volume TA::Range{}, which
+// arena_outer_init leaves NULL; other cells (flat index o) get base+0.01*o+e.
+Outer make_sparse(const TA::Range& outer_range, std::size_t nbatch,
+                  const std::function<TA::Range(std::size_t)>& dense_shape,
+                  const std::function<bool(std::size_t)>& is_hole, double base) {
+  auto shape_of = [&](std::size_t ord) {
+    return is_hole(ord) ? TA::Range{} : dense_shape(ord);
+  };
+  Outer tile = TA::detail::arena_outer_init<Outer>(outer_range, nbatch, shape_of);
+  for (std::size_t ord = 0; ord != tile.range().volume() * nbatch; ++ord) {
+    Inner& cell = tile.data()[ord];
+    for (std::size_t e = 0, ne = !cell ? 0 : cell.size(); e != ne; ++e)
+      cell.data()[e] = base + 0.01 * ord + e;  // hole => ne == 0
+  }
+  return tile;
+}
+
+// Sparsity-aware reference for arena_strided_dgemm_ce_ce_right in the SAME
+// canonical convention as ref_ce_ce (L=strided P x Q matrix, R=single Q-vector),
+// but generalized to Mo>=1 outer rows, nbatch batches, holes, and the left-inner
+// transpose. Per the kernel: for result cell C[m,mu] (length P),
+//   C[m,mu](a1) = factor * sum_k present L[m,k] (P x Q) * present R[mu,k] (Q),
+// skipping any k where L[m,k] or R[mu,k] is absent or size-mismatched.
+//   L outer (Mo x nK), canonical index b*Mo*nK + m*nK + k, inner {P,Q}
+//   R outer (Mmu x nK), canonical (mu slow, k fast) b*Mmu*nK + mu*nK + k, inner {Q}
+//   C outer (Mo x Mmu), index b*Mo*Mmu + m*Mmu + mu, inner {P}
+// out[(b*Mo+m)*Mmu+mu] is the expected length-P vector (empty == expect absent).
+// lt mirrors left_inner_transposed: lt=false L stored P x Q (l[a1*Q+a4]),
+// lt=true L stored Q x P (l[a4*P+a1]).
+std::vector<std::vector<double>> ref_ce_ce_right_sparse(
+    const Outer& L, const Outer& R, std::size_t Mo, std::size_t Mmu,
+    std::size_t nK, std::size_t P, double factor, std::size_t nbatch = 1,
+    bool lt = false) {
+  std::vector<std::vector<double>> out(nbatch * Mo * Mmu);
+  for (std::size_t b = 0; b < nbatch; ++b)
+    for (std::size_t m = 0; m < Mo; ++m)
+      for (std::size_t mu = 0; mu < Mmu; ++mu) {
+        std::vector<double> c(P, 0.0);  // accumulator for C[m,mu]
+        bool any = false;               // true once some k contributed
+        for (std::size_t k = 0; k < nK; ++k) {
+          const Inner& lk = L.data()[b * Mo * nK + m * nK + k];
+          const Inner& rk = R.data()[b * Mmu * nK + mu * nK + k];
+          if (!lk || !rk) continue;  // hole on either side: skip this k
+          const std::size_t Q = rk.size();  // contraction length taken from R
+          if (Q == 0 || lk.size() != P * Q) continue;  // size mismatch: skip k
+          const double* l = lk.data();
+          const double* r = rk.data();
+          for (std::size_t a1 = 0; a1 < P; ++a1) {
+            double acc = 0.0;
+            for (std::size_t a4 = 0; a4 < Q; ++a4)
+              acc += (lt ? l[a4 * P + a1] : l[a1 * Q + a4]) * r[a4];
+            c[a1] += factor * acc;
+          }
+          any = true;
+        }
+        out[(b * Mo + m) * Mmu + mu] = any ? c : std::vector<double>{};  // empty == absent
+      }
+  return out;
+}
+
+// Sparsity-aware reference for arena_strided_dgemm_ce_ce_left. Here m (Mo) is
+// the strided axis, n (No) the fixed result column; the RIGHT operand cell
+// R[k,n] (the P x Q matrix) is the single non-strided operand and L[m,k] (the
+// length-Q contraction vector) is the strided run. Per the kernel:
+//   C[m,n](b1) = factor * sum_k present L[m,k] (Q) * present R[k,n] (Q x P),
+// where result inner length P = b1. R canonical (a4,b1)=Q x P row-major
+// (rt=false: r[a4*P+b1]); rt mirrors right_inner_transposed (P x Q, r[b1*Q+a4]).
+//   L outer (Mo x nK), index b*Mo*nK + m*nK + k, inner {Q}
+//   R outer (nK x No), canonical (k slow, n fast) b*nK*No + k*No + n, inner {Q,P}
+//   C outer (Mo x No), index b*Mo*No + m*No + n, inner {P}; empty == no k contributed
+std::vector<std::vector<double>> ref_ce_ce_left_sparse(
+    const Outer& L, const Outer& R, std::size_t Mo, std::size_t No,
+    std::size_t nK, std::size_t P, double factor, std::size_t nbatch = 1,
+    bool rt = false) {
+  std::vector<std::vector<double>> out(nbatch * Mo * No);
+  for (std::size_t b = 0; b < nbatch; ++b)
+    for (std::size_t m = 0; m < Mo; ++m)
+      for (std::size_t n = 0; n < No; ++n) {
+        std::vector<double> c(P, 0.0);  // accumulator for C[m,n]
+        bool any = false;               // true once some k contributed
+        for (std::size_t k = 0; k < nK; ++k) {
+          const Inner& lk = L.data()[b * Mo * nK + m * nK + k];
+          const Inner& rk = R.data()[b * nK * No + k * No + n];
+          if (!lk || !rk) continue;  // hole on either side: skip this k
+          const std::size_t Q = lk.size();  // contraction length taken from L
+          if (Q == 0 || rk.size() != P * Q) continue;  // size mismatch: skip k
+          const double* l = lk.data();
+          const double* r = rk.data();
+          for (std::size_t b1 = 0; b1 < P; ++b1) {
+            double acc = 0.0;
+            for (std::size_t a4 = 0; a4 < Q; ++a4)
+              acc += l[a4] * (rt ? r[b1 * Q + a4] : r[a4 * P + b1]);
+            c[b1] += factor * acc;
+          }
+          any = true;
+        }
+        out[(b * Mo + m) * No + n] = any ? c : std::vector<double>{};  // empty == absent
+      }
+  return out;
+}
+
+// Assemble an Outer whose outer-cell views point at the cells of `src` in the
+// order given by `phys[ord]` (assembled.data()[ord] aliases the SAME Cell as
+// src.data()[phys[ord]]). Because ArenaTensor is a non-owning view, the
+// assembled tile shares `src`'s arena slab without copying element data; the
+// deleter keeps `src` alive. This is the only in-harness lever that yields a
+// UNIFORM-SIZE run with a NON-CONSTANT inter-cell .data() stride.
+Outer assemble_aliased(const Outer& src, const TA::Range& outer_range,
+                       const std::vector<std::size_t>& phys) {
+  const std::size_t n = outer_range.volume();
+  TA_ASSERT(phys.size() == n);  // exactly one source ordinal per assembled cell
+  std::allocator<Inner> alloc;
+  Inner* raw = alloc.allocate(n);  // raw storage; views are constructed below
+  for (std::size_t ord = 0; ord < n; ++ord)
+    ::new (raw + ord) Inner(src.data()[phys[ord]]);  // shallow rebind; NOTE(review): raw leaks if this throws
+  auto deleter = [alloc, src, n](Inner* p) mutable {  // src copy pins the slab
+    for (std::size_t i = 0; i < n; ++i) (p + i)->~Inner();
+    alloc.deallocate(p, n);
+    (void)src;  // captured only for lifetime; presumably silences unused warnings
+  };
+  std::shared_ptr<Inner[]> data(raw, std::move(deleter));  // takes ownership of raw
+  return Outer(outer_range, /*nbatch=*/1, std::move(data));
+}
+}  // namespace
+
+BOOST_AUTO_TEST_SUITE(arena_strided_dgemm_suite, TA_UT_LABEL_SERIAL)
+
+// FACT A: uniform cells -> single constant inter-cell stride (>= cell size).
+BOOST_AUTO_TEST_CASE(fact_uniform_constant_stride) {
+  const std::size_t A = 8;
+  Outer t = make_filled(TA::Range{5}, [&](std::size_t) { return TA::Range{A}; }, 1.0);
+  const std::ptrdiff_t s = t.data()[1].data() - t.data()[0].data();
+  BOOST_REQUIRE_GE(s, static_cast<std::ptrdiff_t>(A));  // fatal: offsets below need a sane s
+  for (std::ptrdiff_t k = 0; k < 5; ++k)  // signed index: no unsigned wrap in k * s
+    BOOST_CHECK_EQUAL(t.data()[k].data(), t.data()[0].data() + k * s);
+}
+
+// FACT B: ragged sizes whose padded cell allocations land in the SAME stride
+// bucket alias to the SAME inter-cell stride, so a stride-only fusability check
+// is unsafe -> the kernel guard must ALSO check size(). Each cell consumes
+// arena_align_up(cell_size(vol), kArenaCachelineAlign) bytes; the per-element
+// step (sizeof(double)=8B) is far smaller than the 128B bucket, so most
+// adjacent volumes share a bucket -- we search for one such pair rather than
+// hardcoding sizes (a hardcoded pair like 8/9 can straddle a bucket boundary
+// and NOT alias, which says nothing about the hazard the kernel must guard).
+BOOST_AUTO_TEST_CASE(fact_padding_aliases_stride) {
+  auto padded = [](std::size_t n) {
+    return TA::detail::arena_align_up(Inner::cell_size(n),
+                                      TA::detail::kArenaCachelineAlign);
+  };
+  // smallest n>0 whose padded allocation equals that of n+1 (same bucket)
+  std::size_t n = 1;
+  while (n < 4096 && padded(n) != padded(n + 1)) ++n;
+  BOOST_REQUIRE_LT(n, 4096u);  // such an aliasing pair must exist
+  // Cells [n, n+1, n]: with cells laid out back to back, step 0->1 spans the n cell
+  // and step 1->2 the (n+1) cell, so equal steps truly witness the shared bucket.
+  Outer t = make_filled(TA::Range{3}, [&](std::size_t o) {
+    return TA::Range{o == 1 ? n + 1 : n};
+  }, 1.0);
+  const std::ptrdiff_t s01 = t.data()[1].data() - t.data()[0].data();
+  const std::ptrdiff_t s12 = t.data()[2].data() - t.data()[1].data();
+  BOOST_CHECK_EQUAL(s01, s12);                               // strides alias...
+  BOOST_CHECK_NE(t.data()[0].size(), t.data()[1].size());    // ...but sizes differ
+}
+
+// ce+e kernel checked cell by cell against ref_ce_e (uniform cells, factor 1).
+BOOST_AUTO_TEST_CASE(ce_e_matches_reference) {
+  namespace blas = TA::math::blas;
+  const std::size_t M = 2, N = 2, K = 3, P = 4, Q = 5;
+  Outer lhs = make_filled(TA::Range{M, K}, [&](std::size_t) { return TA::Range{P}; }, 1.0);
+  Outer rhs = make_filled(TA::Range{N, K}, [&](std::size_t) { return TA::Range{Q}; }, 2.0);
+  Outer res = TA::detail::arena_outer_init<Outer>(  // zero-init
+      TA::Range{M, N}, 1, [&](std::size_t) { return TA::Range{P, Q}; });
+  TA::detail::arena_strided_dgemm_ce_e(res, lhs, rhs, M, N, K, blas::NoTranspose,
+                                       blas::Transpose, /*factor=*/1.0);
+  for (std::size_t cell = 0; cell != M * N; ++cell) {  // cell == m * N + n
+    auto expected = ref_ce_e(lhs, rhs, cell / N, cell % N, K, P, Q, 1.0);
+    const double* actual = res.data()[cell].data();
+    for (std::size_t i = 0; i != P * Q; ++i) BOOST_CHECK_CLOSE(actual[i], expected[i], 1e-12);
+  }
+}
+
+BOOST_AUTO_TEST_CASE(ce_e_ragged_cell_still_correct_inline) {
+  namespace blas = TA::math::blas;
+  const std::size_t M = 2, N = 1, K = 2, P = 3, Q = 3;
+  // m=0: clean P across k; m=1: ragged (P then P+1) -> cell (1,0) falls back.
+  Outer L = TA::detail::arena_outer_init<Outer>(
+      TA::Range{M, K}, 1, [&](std::size_t o){
+        return (o < K) ? TA::Range{P} : TA::Range{P + (o % 2)}; });
+  for (std::size_t o = 0; o < L.range().volume(); ++o)
+    for (std::size_t e = 0; e < L.data()[o].size(); ++e) L.data()[o].data()[e] = 1.0 + e;
+  Outer R = make_filled(TA::Range{N, K}, [&](std::size_t){return TA::Range{Q};}, 2.0);
+  // result cell sizes: (0,0) -> P*Q ; (1,0) -> only well-defined when P uniform;
+  // for the ragged row use P from k=0 so the fallback's pp*qq guard holds.
+  Outer C = TA::detail::arena_outer_init<Outer>(
+      TA::Range{M, N}, 1, [&](std::size_t){ return TA::Range{P, Q}; });
+  TA::detail::arena_strided_dgemm_ce_e(C, L, R, M, N, K, blas::NoTranspose,
+                                       blas::Transpose, 1.0);
+  // (0,0): clean GEMM path
+  {
+    auto ref = ref_ce_e(L, R, 0, 0, K, P, Q, 1.0);
+    const double* got = C.data()[0].data();
+    for (std::size_t e = 0; e < P * Q; ++e) BOOST_CHECK_CLOSE(got[e], ref[e], 1e-12);
+  }
+  // (1,0): fallback path; only the k whose L cell holds P elements contributes
+  // under the guard. Reference: same formula, restricted to those k.
+  {
+    std::vector<double> ref(P * Q, 0.0);
+    for (std::size_t k = 0; k < K; ++k) {
+      const auto& lk = L.data()[1 * K + k];
+      if (lk.size() != P) continue;
+      const double* lp = lk.data();
+      const double* rp = R.data()[0 * K + k].data();
+      for (std::size_t p = 0; p < P; ++p)
+        for (std::size_t q = 0; q < Q; ++q) ref[p * Q + q] += lp[p] * rp[q];
+    }
+    const double* got = C.data()[1].data();
+    for (std::size_t e = 0; e < P * Q; ++e) BOOST_CHECK_CLOSE(got[e], ref[e], 1e-12);
+  }
+}
+
+BOOST_AUTO_TEST_CASE(ce_e_applies_factor) {
+  namespace blas = TA::math::blas;
+  const std::size_t M = 1, N = 1, K = 4, P = 3, Q = 3;
+  Outer lhs = make_filled(TA::Range{M, K}, [&](std::size_t) { return TA::Range{P}; }, 1.0);
+  Outer rhs = make_filled(TA::Range{N, K}, [&](std::size_t) { return TA::Range{Q}; }, 2.0);
+  Outer res = TA::detail::arena_outer_init<Outer>(
+      TA::Range{M, N}, 1, [&](std::size_t) { return TA::Range{P, Q}; });
+  TA::detail::arena_strided_dgemm_ce_e(res, lhs, rhs, M, N, K, blas::NoTranspose,
+                                       blas::Transpose, 0.5);  // non-unit factor
+  auto expected = ref_ce_e(lhs, rhs, 0, 0, K, P, Q, 0.5);  // reference scaled the same way
+  const double* actual = res.data()[0].data();
+  for (std::size_t i = 0; i != P * Q; ++i) BOOST_CHECK_CLOSE(actual[i], expected[i], 1e-12);
+}
+
+// Batched ce+e: the kernel is looped over the Hadamard-folded nbatch, and each
+// batch b independently gets C_b[m,n](p,q) += sum_k L_b[m,k](p) * R_b[k,n](q).
+BOOST_AUTO_TEST_CASE(ce_e_multi_batch) {
+  namespace blas = TiledArray::math::blas;
+  const std::size_t M = 2, N = 2, K = 3, P = 3, Q = 4, NB = 2;
+  // lhs: outer (M,K), inner {P}; rhs: outer (K,N), inner {Q}; res: outer (M,N), inner {P,Q}.
+  Outer lhs = TA::detail::arena_outer_init<Outer>(
+      TA::Range{M, K}, NB, [&](std::size_t) { return TA::Range{P}; });
+  Outer rhs = TA::detail::arena_outer_init<Outer>(
+      TA::Range{K, N}, NB, [&](std::size_t) { return TA::Range{Q}; });
+  Outer res = TA::detail::arena_outer_init<Outer>(
+      TA::Range{M, N}, NB, [&](std::size_t) { return TA::Range{P, Q}; });
+  for (std::size_t cell = 0; cell != NB * M * K; ++cell)
+    for (std::size_t i = 0; i != lhs.data()[cell].size(); ++i)
+      lhs.data()[cell].data()[i] = 1.0 + 0.01 * cell + i;
+  for (std::size_t cell = 0; cell != NB * K * N; ++cell)
+    for (std::size_t i = 0; i != rhs.data()[cell].size(); ++i)
+      rhs.data()[cell].data()[i] = 2.0 + 0.01 * cell + i;
+  // Canonical orientation: M = left external, N = right external, K = contracted.
+  TA::detail::arena_strided_dgemm_ce_e(res, lhs, rhs, M, N, K,
+                                       blas::NoTranspose, blas::NoTranspose, 1.0);
+  for (std::size_t b = 0; b != NB; ++b)
+    for (std::size_t mn = 0; mn != M * N; ++mn) {
+      const std::size_t m = mn / N, n = mn % N;
+      std::vector<double> expected(P * Q, 0.0);
+      for (std::size_t k = 0; k != K; ++k) {
+        const double* lp = lhs.data()[b * M * K + m * K + k].data();  // P values
+        const double* rp = rhs.data()[b * K * N + k * N + n].data();  // Q values
+        for (std::size_t p = 0; p != P; ++p)
+          for (std::size_t q = 0; q != Q; ++q) expected[p * Q + q] += lp[p] * rp[q];
+      }
+      const double* actual = res.data()[b * M * N + mn].data();
+      for (std::size_t i = 0; i != P * Q; ++i)
+        BOOST_CHECK_CLOSE(actual[i], expected[i], 1e-12);
+    }
+}
+
+// Multi-index inner externals on BOTH operands flatten into the inner
+// outer-product: left inner {a1,a2} gives P=a1*a2, right inner {a3,a4} gives
+// Q=a3*a4, and the result inner {a1,a2,a3,a4} holds P*Q values. Single batch,
+// compared with the independent ref_ce_e reference; under
+// TA_STRIDED_DGEMM_COUNT the per-cell DGEMM must fire once per result cell (M*N).
+BOOST_AUTO_TEST_CASE(ce_e_multi_external_inner) {
+  namespace blas = TiledArray::math::blas;
+  const std::size_t M = 2, N = 2, K = 3;
+  const std::size_t a1 = 2, a2 = 3, a3 = 2, a4 = 2;
+  const std::size_t P = a1 * a2, Q = a3 * a4;
+  Outer lhs = make_filled(TA::Range{M, K},
+                          [&](std::size_t) { return TA::Range{a1, a2}; }, 1.0);
+  Outer rhs = make_filled(TA::Range{N, K},
+                          [&](std::size_t) { return TA::Range{a3, a4}; }, 2.0);
+  Outer res = TA::detail::arena_outer_init<Outer>(
+      TA::Range{M, N}, 1, [&](std::size_t) { return TA::Range{a1, a2, a3, a4}; });
+#ifdef TA_STRIDED_DGEMM_COUNT
+  TA::detail::g_strided_dgemm_ce_e_calls.store(0);
+#endif
+  TA::detail::arena_strided_dgemm_ce_e(res, lhs, rhs, M, N, K, blas::NoTranspose,
+                                       blas::Transpose, 1.0);
+  for (std::size_t cell = 0; cell != M * N; ++cell) {
+    const std::size_t m = cell / N, n = cell % N;
+    auto expected = ref_ce_e(lhs, rhs, m, n, K, P, Q, 1.0);  // flat P*Q
+    const double* actual = res.data()[cell].data();
+    for (std::size_t i = 0; i != P * Q; ++i)
+      BOOST_CHECK_CLOSE(actual[i], expected[i], 1e-12);
+  }
+#ifdef TA_STRIDED_DGEMM_COUNT
+  BOOST_CHECK_EQUAL(TA::detail::g_strided_dgemm_ce_e_calls.load(), M * N);
+#endif
+}
+
+#ifdef TA_STRIDED_DGEMM_COUNT
+BOOST_AUTO_TEST_CASE(ce_e_fires_clean_path) {
+  namespace blas = TiledArray::math::blas;
+  const std::size_t M = 2, N = 2, K = 3, P = 3, Q = 4, NB = 2;
+  // Uniform sizes in all NB batches => every result cell clean => one DGEMM each.
+  Outer lhs = TA::detail::arena_outer_init<Outer>(
+      TA::Range{M, K}, NB, [&](std::size_t) { return TA::Range{P}; });
+  Outer rhs = TA::detail::arena_outer_init<Outer>(
+      TA::Range{K, N}, NB, [&](std::size_t) { return TA::Range{Q}; });
+  Outer res = TA::detail::arena_outer_init<Outer>(
+      TA::Range{M, N}, NB, [&](std::size_t) { return TA::Range{P, Q}; });
+  for (std::size_t cell = 0; cell != NB * M * K; ++cell)
+    for (std::size_t i = 0; i != lhs.data()[cell].size(); ++i)
+      lhs.data()[cell].data()[i] = 1.0 + 0.01 * cell + i;
+  for (std::size_t cell = 0; cell != NB * K * N; ++cell)
+    for (std::size_t i = 0; i != rhs.data()[cell].size(); ++i)
+      rhs.data()[cell].data()[i] = 2.0 + 0.01 * cell + i;
+  TA::detail::g_strided_dgemm_ce_e_calls.store(0);
+  TA::detail::arena_strided_dgemm_ce_e(res, lhs, rhs, M, N, K, blas::NoTranspose,
+                                       blas::NoTranspose, 1.0);
+  // Expected count: one DGEMM per result cell per batch.
+  BOOST_CHECK_EQUAL(TA::detail::g_strided_dgemm_ce_e_calls.load(), NB * M * N);
+}
+#endif
+
+// Regime-A hc+e calling convention: M=N=1, K=vol, and the kernel's nbatch is
+// the Hadamard batch. The result tile is (Range{batch}, nbatch=1) and is handed
+// to the kernel as (Range{1}, nbatch=batch) via a storage-aliasing reshape. Two
+// operand "tiles" accumulate into the SAME result with beta=1 (cross-tile sum).
+BOOST_AUTO_TEST_CASE(regime_a_oprod_mapping_two_tiles) {
+  namespace blas = TiledArray::math::blas;
+  const std::size_t NB = 2;            // Hadamard-folded batch
+  const std::size_t P = 3, Q = 4;      // inner outer-product extents (left, right)
+  const std::size_t VOL0 = 3, VOL1 = 2;  // within-tile contraction cells per tile
+
+  // Result: outer Range{NB}, single batch; each cell inner {P,Q}, zero-filled.
+  Outer res = TA::detail::arena_outer_init<Outer>(
+      TA::Range{NB}, /*batch=*/1, [&](std::size_t) { return TA::Range{P, Q}; });
+  // Operand tile t: outer Range{VOL_t}, nbatch=NB; left cells {P}, right {Q}.
+  auto make_left = [&](std::size_t vol, double seed) {
+    Outer tile = TA::detail::arena_outer_init<Outer>(
+        TA::Range{vol}, NB, [&](std::size_t) { return TA::Range{P}; });
+    for (std::size_t cell = 0; cell != NB * vol; ++cell)
+      for (std::size_t i = 0; i != tile.data()[cell].size(); ++i)
+        tile.data()[cell].data()[i] = seed + 0.01 * cell + i;
+    return tile;
+  };
+  auto make_right = [&](std::size_t vol, double seed) {
+    Outer tile = TA::detail::arena_outer_init<Outer>(
+        TA::Range{vol}, NB, [&](std::size_t) { return TA::Range{Q}; });
+    for (std::size_t cell = 0; cell != NB * vol; ++cell)
+      for (std::size_t i = 0; i != tile.data()[cell].size(); ++i)
+        tile.data()[cell].data()[i] = seed + 0.02 * cell + i;
+    return tile;
+  };
+  Outer left0 = make_left(VOL0, 1.0), right0 = make_right(VOL0, 2.0);
+  Outer left1 = make_left(VOL1, 5.0), right1 = make_right(VOL1, 7.0);
+
+#ifdef TA_STRIDED_DGEMM_COUNT
+  TA::detail::g_strided_dgemm_ce_e_calls.store(0);
+#endif
+  // Regime-A call per tile: res presented as (Range{1}, nbatch=NB), M=N=1, K=vol.
+  auto view0 = res.reshape(TA::Range{1}, NB);
+  TA::detail::arena_strided_dgemm_ce_e(view0, left0, right0, /*M=*/std::size_t{1},
+                                       /*N=*/std::size_t{1}, /*K=*/VOL0,
+                                       blas::NoTranspose, blas::NoTranspose, 1.0);
+  auto view1 = res.reshape(TA::Range{1}, NB);
+  TA::detail::arena_strided_dgemm_ce_e(view1, left1, right1, /*M=*/std::size_t{1},
+                                       /*N=*/std::size_t{1}, /*K=*/VOL1,
+                                       blas::NoTranspose, blas::NoTranspose, 1.0);
+
+  // Reference: for each batch b, sum the rank-1 outer products over BOTH tiles.
+  for (std::size_t b = 0; b != NB; ++b) {
+    std::vector<double> expected(P * Q, 0.0);
+    auto accumulate = [&](const Outer& lt, const Outer& rt, std::size_t vol) {
+      for (std::size_t k = 0; k != vol; ++k) {
+        const double* lp = lt.data()[b * vol + k].data();  // P values
+        const double* rp = rt.data()[b * vol + k].data();  // Q values
+        for (std::size_t p = 0; p != P; ++p)
+          for (std::size_t q = 0; q != Q; ++q) expected[p * Q + q] += lp[p] * rp[q];
+      }
+    };
+    accumulate(left0, right0, VOL0);
+    accumulate(left1, right1, VOL1);
+    const double* actual = res.data()[b].data();  // res still (Range{NB}, nbatch=1)
+    for (std::size_t i = 0; i != P * Q; ++i)
+      BOOST_CHECK_CLOSE(actual[i], expected[i], 1e-12);
+  }
+#ifdef TA_STRIDED_DGEMM_COUNT
+  // clean operands => one DGEMM per (tile, batch); both tiles clean.
+  BOOST_CHECK_EQUAL(TA::detail::g_strided_dgemm_ce_e_calls.load(),
+                    std::size_t{2} * NB);
+#endif
+}
+
+// Task 3.2 (tile-level fallback): the regime-A hc+e convention (M=N=1,
+// K=vol>1) is driven with a RAGGED left operand, so the kernel's clean-check
+// rejects and the inline per-k fallback handles the (single) result cell.
+//
+// WHY TILE-LEVEL: the einsum driver lays arena cells out as clean contiguous
+// slabs (uniform-size, constant-stride), so end to end the kernel's
+// clean-check always passes and the inline fallback is not reliably reachable
+// from the einsum entry point. The fallback is therefore exercised directly at
+// the kernel call, modeled on ce_e_ragged_cell_still_correct_inline, but in
+// the regime-A reshape(Range{1}, NB) / M=N=1 / K=vol presentation from R1.
+//
+// The reference applies the SAME guard as the kernel: only k-cells whose LEFT
+// inner size equals the result-cell P (==P0, the k=0 size) contribute; the
+// ragged k-cell is skipped. (See ce_e_ragged_cell_still_correct_inline.)
+BOOST_AUTO_TEST_CASE(regime_a_oprod_mapping_scattered_falls_back) {
+  namespace blas = TiledArray::math::blas;
+  const std::size_t NB = 2;            // Hadamard-folded batch
+  const std::size_t P0 = 3, Q = 4;     // left P (from k=0), right Q
+  const std::size_t VOL = 3;           // within-tile contraction cells (K)
+  // Left operand: outer Range{VOL}, nbatch=NB, inner {P0}, EXCEPT that the
+  // k==1 cell of EVERY batch gets inner {P0+1} -> the kernel's clean-check
+  // rejects (non-uniform left size) and the result cell goes through the inline
+  // per-k fallback. (The ragged cell is then skipped under the size guard.)
+  Outer lhs = TA::detail::arena_outer_init<Outer>(
+      TA::Range{VOL}, NB, [&](std::size_t flat) {
+        const bool ragged = (flat % VOL) == 1;  // flat runs batch-major over (b,k)
+        return TA::Range{ragged ? P0 + 1 : P0};
+      });
+  for (std::size_t cell = 0; cell != NB * VOL; ++cell)
+    for (std::size_t i = 0; i != lhs.data()[cell].size(); ++i)
+      lhs.data()[cell].data()[i] = 1.0 + 0.01 * cell + i;
+  // Right operand: clean, inner {Q}.
+  Outer rhs = TA::detail::arena_outer_init<Outer>(
+      TA::Range{VOL}, NB, [&](std::size_t) { return TA::Range{Q}; });
+  for (std::size_t cell = 0; cell != NB * VOL; ++cell)
+    for (std::size_t i = 0; i != rhs.data()[cell].size(); ++i)
+      rhs.data()[cell].data()[i] = 2.0 + 0.02 * cell + i;
+  // Result: outer Range{NB}, single batch; each cell inner {P0,Q}, zero-init.
+  Outer res = TA::detail::arena_outer_init<Outer>(
+      TA::Range{NB}, /*batch=*/1, [&](std::size_t) { return TA::Range{P0, Q}; });
+
+  auto view = res.reshape(TA::Range{1}, NB);
+  TA::detail::arena_strided_dgemm_ce_e(view, lhs, rhs, /*M=*/std::size_t{1},
+                                       /*N=*/std::size_t{1}, /*K=*/VOL,
+                                       blas::NoTranspose, blas::NoTranspose, 1.0);
+
+  // Reference: per batch b, sum rank-1 outer products over k, but ONLY for k
+  // whose left inner size == P0 (the kernel's fallback guard skips the ragged k).
+  for (std::size_t b = 0; b != NB; ++b) {
+    std::vector<double> expected(P0 * Q, 0.0);
+    for (std::size_t k = 0; k != VOL; ++k) {
+      const auto& left_cell = lhs.data()[b * VOL + k];
+      if (left_cell.size() != P0) continue;  // guard: ragged k-cell does not contribute
+      const double* lp = left_cell.data();  // P0 values
+      const double* rp = rhs.data()[b * VOL + k].data();  // Q values
+      for (std::size_t p = 0; p != P0; ++p)
+        for (std::size_t q = 0; q != Q; ++q) expected[p * Q + q] += lp[p] * rp[q];
+    }
+    const double* actual = res.data()[b].data();
+    for (std::size_t i = 0; i != P0 * Q; ++i)
+      BOOST_CHECK_CLOSE(actual[i], expected[i], 1e-12);
+  }
+  // The DGEMM counter is intentionally NOT asserted: the ragged cell takes the
+  // inline fallback (no clean DGEMM), so firing is not the property under test
+  // here -- correctness of the fallback is.
+}
+
+// Task 3.3 Step 2a: VOL=1 edge, i.e. K=1 -> one rank-1 outer product per batch.
+// A single operand tile (one strided call). Checked against a hand reference;
+// in a count build exactly NB DGEMMs are expected (one per batch, clean tile).
+BOOST_AUTO_TEST_CASE(regime_a_oprod_mapping_vol1) {
+  namespace blas = TiledArray::math::blas;
+  const std::size_t NB = 2;
+  const std::size_t P = 3, Q = 4;
+  const std::size_t VOL = 1;  // single contraction cell -> K=1
+
+  Outer res = TA::detail::arena_outer_init<Outer>(
+      TA::Range{NB}, /*batch=*/1, [&](std::size_t) { return TA::Range{P, Q}; });
+  Outer lhs = TA::detail::arena_outer_init<Outer>(
+      TA::Range{VOL}, NB, [&](std::size_t) { return TA::Range{P}; });
+  for (std::size_t cell = 0; cell != NB * VOL; ++cell)
+    for (std::size_t i = 0; i != lhs.data()[cell].size(); ++i)
+      lhs.data()[cell].data()[i] = 1.0 + 0.01 * cell + i;
+  Outer rhs = TA::detail::arena_outer_init<Outer>(
+      TA::Range{VOL}, NB, [&](std::size_t) { return TA::Range{Q}; });
+  for (std::size_t cell = 0; cell != NB * VOL; ++cell)
+    for (std::size_t i = 0; i != rhs.data()[cell].size(); ++i)
+      rhs.data()[cell].data()[i] = 2.0 + 0.02 * cell + i;
+
+#ifdef TA_STRIDED_DGEMM_COUNT
+  TA::detail::g_strided_dgemm_ce_e_calls.store(0);
+#endif
+  auto view = res.reshape(TA::Range{1}, NB);
+  TA::detail::arena_strided_dgemm_ce_e(view, lhs, rhs, /*M=*/std::size_t{1},
+                                       /*N=*/std::size_t{1}, /*K=*/VOL,
+                                       blas::NoTranspose, blas::NoTranspose, 1.0);
+  for (std::size_t b = 0; b != NB; ++b) {
+    std::vector<double> expected(P * Q, 0.0);
+    const double* lp = lhs.data()[b * VOL + 0].data();  // P values
+    const double* rp = rhs.data()[b * VOL + 0].data();  // Q values
+    for (std::size_t p = 0; p != P; ++p)
+      for (std::size_t q = 0; q != Q; ++q) expected[p * Q + q] += lp[p] * rp[q];
+    const double* actual = res.data()[b].data();
+    for (std::size_t i = 0; i != P * Q; ++i)
+      BOOST_CHECK_CLOSE(actual[i], expected[i], 1e-12);
+  }
+#ifdef TA_STRIDED_DGEMM_COUNT
+  // one DGEMM per batch, one (clean) tile.
+  BOOST_CHECK_EQUAL(TA::detail::g_strided_dgemm_ce_e_calls.load(), NB);
+#endif
+}
+
+// Task 3.3 Step 2b: P=1 and Q=1 inner-extent edges, where the rank-1 outer
+// product degenerates to scalar*scalar per k. Two operand tiles (like R1) with
+// cross-tile beta=1; a count build expects 2*NB DGEMMs (per tile, per batch).
+BOOST_AUTO_TEST_CASE(regime_a_oprod_mapping_p1_q1) {
+  namespace blas = TiledArray::math::blas;
+  const std::size_t NB = 2;
+  const std::size_t P = 1, Q = 1;  // degenerate inner extents
+  const std::size_t VOL0 = 3, VOL1 = 2;
+
+  Outer res = TA::detail::arena_outer_init<Outer>(
+      TA::Range{NB}, /*batch=*/1, [&](std::size_t) { return TA::Range{P, Q}; });
+  auto make_left = [&](std::size_t vol, double seed) {
+    Outer tile = TA::detail::arena_outer_init<Outer>(
+        TA::Range{vol}, NB, [&](std::size_t) { return TA::Range{P}; });
+    for (std::size_t cell = 0; cell != NB * vol; ++cell)
+      for (std::size_t i = 0; i != tile.data()[cell].size(); ++i)
+        tile.data()[cell].data()[i] = seed + 0.01 * cell + i;
+    return tile;
+  };
+  auto make_right = [&](std::size_t vol, double seed) {
+    Outer tile = TA::detail::arena_outer_init<Outer>(
+        TA::Range{vol}, NB, [&](std::size_t) { return TA::Range{Q}; });
+    for (std::size_t cell = 0; cell != NB * vol; ++cell)
+      for (std::size_t i = 0; i != tile.data()[cell].size(); ++i)
+        tile.data()[cell].data()[i] = seed + 0.02 * cell + i;
+    return tile;
+  };
+  Outer left0 = make_left(VOL0, 1.0), right0 = make_right(VOL0, 2.0);
+  Outer left1 = make_left(VOL1, 5.0), right1 = make_right(VOL1, 7.0);
+
+#ifdef TA_STRIDED_DGEMM_COUNT
+  TA::detail::g_strided_dgemm_ce_e_calls.store(0);
+#endif
+  auto view0 = res.reshape(TA::Range{1}, NB);
+  TA::detail::arena_strided_dgemm_ce_e(view0, left0, right0, /*M=*/std::size_t{1},
+                                       /*N=*/std::size_t{1}, /*K=*/VOL0,
+                                       blas::NoTranspose, blas::NoTranspose, 1.0);
+  auto view1 = res.reshape(TA::Range{1}, NB);
+  TA::detail::arena_strided_dgemm_ce_e(view1, left1, right1, /*M=*/std::size_t{1},
+                                       /*N=*/std::size_t{1}, /*K=*/VOL1,
+                                       blas::NoTranspose, blas::NoTranspose, 1.0);
+
+  for (std::size_t b = 0; b != NB; ++b) {
+    std::vector<double> expected(P * Q, 0.0);
+    auto accumulate = [&](const Outer& lt, const Outer& rt, std::size_t vol) {
+      for (std::size_t k = 0; k != vol; ++k) {
+        const double* lp = lt.data()[b * vol + k].data();  // P==1
+        const double* rp = rt.data()[b * vol + k].data();  // Q==1
+        expected[0] += lp[0] * rp[0];  // scalar*scalar
+      }
+    };
+    accumulate(left0, right0, VOL0);
+    accumulate(left1, right1, VOL1);
+    const double* actual = res.data()[b].data();
+    BOOST_CHECK_CLOSE(actual[0], expected[0], 1e-12);
+  }
+#ifdef TA_STRIDED_DGEMM_COUNT
+  BOOST_CHECK_EQUAL(TA::detail::g_strided_dgemm_ce_e_calls.load(),
+                    std::size_t{2} * NB);
+#endif
+}
+
+// ------------------------------- ce+ce ------------------------------------
+
+// ce+ce kernel vs. ref_ce_ce with the right operand in canonical storage.
+BOOST_AUTO_TEST_CASE(ce_ce_matches_reference_canonical) {
+  const std::size_t Mmu = 3, nK = 2, P = 4, Q = 5;
+  Outer lhs = make_filled(TA::Range{nK}, [&](std::size_t) { return TA::Range{P, Q}; }, 1.0);
+  // Canonical (right_op==Transpose) storage: rhs outer is (mu,k), mu slow, k fast.
+  Outer rhs = TA::detail::arena_outer_init<Outer>(
+      TA::Range{Mmu, nK}, 1, [&](std::size_t) { return TA::Range{Q}; });
+  for (std::size_t cell = 0; cell != Mmu * nK; ++cell) {  // cell == mu * nK + k
+    double* dst = rhs.data()[cell].data();
+    for (std::size_t i = 0; i != Q; ++i) dst[i] = 2.0 + 0.01 * cell + i;
+  }
+  Outer res = TA::detail::arena_outer_init<Outer>(
+      TA::Range{Mmu}, 1, [&](std::size_t) { return TA::Range{P}; });  // zero-init
+  namespace blas = TiledArray::math::blas;
+  // Mo=1 (no left external), No=Mmu (mu), Ko=nK (k); canonical right_op=Transpose.
+  TA::detail::arena_strided_dgemm_ce_ce_right(res, lhs, rhs, /*Mo=*/1, /*No=*/Mmu, /*Ko=*/nK,
+                                        blas::NoTranspose, blas::Transpose, 1.0);
+  auto expected = ref_ce_ce(lhs, rhs, Mmu, nK, P, Q, 1.0);
+  for (std::size_t mu = 0; mu != Mmu; ++mu) {
+    const double* actual = res.data()[mu].data();
+    for (std::size_t a1 = 0; a1 != P; ++a1)
+      BOOST_CHECK_CLOSE(actual[a1], expected[mu * P + a1], 1e-12);
+  }
+}
+
+BOOST_AUTO_TEST_CASE(ce_ce_orientation_aware_no_transpose) {
+  const std::size_t Mmu = 3, nK = 2, P = 4, Q = 5;
+  Outer lhs = make_filled(TA::Range{nK}, [&](std::size_t) { return TA::Range{P, Q}; }, 1.0);
+  // Non-canonical layout: rhs outer is (k,mu), k slow, mu fast => right_op == NoTranspose.
+  Outer rhs = TA::detail::arena_outer_init<Outer>(
+      TA::Range{nK, Mmu}, 1, [&](std::size_t) { return TA::Range{Q}; });
+  for (std::size_t k = 0; k != nK; ++k)
+    for (std::size_t mu = 0; mu != Mmu; ++mu) {
+      double* dst = rhs.data()[k * Mmu + mu].data();
+      for (std::size_t i = 0; i != Q; ++i) dst[i] = 2.0 + 0.01 * (mu * nK + k) + i;
+    }
+  // Canonical mirror (index mu*nK+k) of the SAME (k,mu) data, used by the reference.
+  Outer rhs_canon = TA::detail::arena_outer_init<Outer>(
+      TA::Range{Mmu, nK}, 1, [&](std::size_t) { return TA::Range{Q}; });
+  for (std::size_t k = 0; k != nK; ++k)
+    for (std::size_t mu = 0; mu != Mmu; ++mu) {
+      const double* from = rhs.data()[k * Mmu + mu].data();
+      double* to = rhs_canon.data()[mu * nK + k].data();
+      for (std::size_t i = 0; i != Q; ++i) to[i] = from[i];
+    }
+  Outer res = TA::detail::arena_outer_init<Outer>(
+      TA::Range{Mmu}, 1, [&](std::size_t) { return TA::Range{P}; });
+  namespace blas = TiledArray::math::blas;
+  TA::detail::arena_strided_dgemm_ce_ce_right(res, lhs, rhs, 1, Mmu, nK,
+                                        blas::NoTranspose, blas::NoTranspose, 1.0);
+  auto expected = ref_ce_ce(lhs, rhs_canon, Mmu, nK, P, Q, 1.0);
+  for (std::size_t mu = 0; mu != Mmu; ++mu) {
+    const double* actual = res.data()[mu].data();
+    for (std::size_t a1 = 0; a1 != P; ++a1)
+      BOOST_CHECK_CLOSE(actual[a1], expected[mu * P + a1], 1e-12);
+  }
+}
+
+BOOST_AUTO_TEST_CASE(ce_ce_multi_batch) {
+  const std::size_t Mmu = 2, nK = 3, P = 3, Q = 4, NB = 2;
+  Outer lhs = TA::detail::arena_outer_init<Outer>(
+      TA::Range{nK}, NB, [&](std::size_t) { return TA::Range{P, Q}; });
+  Outer rhs = TA::detail::arena_outer_init<Outer>(  // canonical (mu,k)
+      TA::Range{Mmu, nK}, NB, [&](std::size_t) { return TA::Range{Q}; });
+  Outer res = TA::detail::arena_outer_init<Outer>(
+      TA::Range{Mmu}, NB, [&](std::size_t) { return TA::Range{P}; });
+  for (std::size_t cell = 0; cell != NB * nK; ++cell)
+    for (std::size_t i = 0; i != lhs.data()[cell].size(); ++i)
+      lhs.data()[cell].data()[i] = 1.0 + 0.01 * cell + i;
+  for (std::size_t cell = 0; cell != NB * Mmu * nK; ++cell)
+    for (std::size_t i = 0; i != rhs.data()[cell].size(); ++i)
+      rhs.data()[cell].data()[i] = 2.0 + 0.01 * cell + i;
+  namespace blas = TiledArray::math::blas;
+  TA::detail::arena_strided_dgemm_ce_ce_right(res, lhs, rhs, 1, Mmu, nK,
+                                        blas::NoTranspose, blas::Transpose, 1.0);
+  for (std::size_t b = 0; b != NB; ++b) {
+    std::vector<double> expected(Mmu * P, 0.0);  // per-batch hand reference
+    for (std::size_t k = 0; k != nK; ++k) {
+      const double* lp = lhs.data()[b * nK + k].data();
+      for (std::size_t mu = 0; mu != Mmu; ++mu) {
+        const double* rp = rhs.data()[b * Mmu * nK + mu * nK + k].data();
+        for (std::size_t a1 = 0; a1 != P; ++a1) {
+          double dot = 0;
+          for (std::size_t a4 = 0; a4 != Q; ++a4) dot += lp[a1 * Q + a4] * rp[a4];
+          expected[mu * P + a1] += dot;
+        }
+      }
+    }
+    for (std::size_t mu = 0; mu != Mmu; ++mu) {
+      const double* actual = res.data()[b * Mmu + mu].data();
+      for (std::size_t a1 = 0; a1 != P; ++a1)
+        BOOST_CHECK_CLOSE(actual[a1], expected[mu * P + a1], 1e-12);
+    }
+  }
+}
+
+BOOST_AUTO_TEST_CASE(ce_ce_ragged_batch_falls_back_inline) {
+  // Batch 0 is clean; batch 1 has ragged Q across mu -> inline fallback for batch 1.
+  const std::size_t Mmu = 2, nK = 2, P = 3, Q0 = 4, Q1 = 5, NB = 2;
+  Outer lhs = TA::detail::arena_outer_init<Outer>(
+      TA::Range{nK}, NB, [&](std::size_t) { return TA::Range{P, Q0}; });
+  for (std::size_t cell = 0; cell != NB * nK; ++cell)
+    for (std::size_t i = 0; i != lhs.data()[cell].size(); ++i)
+      lhs.data()[cell].data()[i] = 1.0 + 0.01 * cell + i;
+  Outer rhs = TA::detail::arena_outer_init<Outer>(
+      TA::Range{Mmu, nK}, NB, [&](std::size_t flat) {
+        const std::size_t per_batch = Mmu * nK;
+        const std::size_t b = flat / per_batch;
+        const std::size_t mu = (flat % per_batch) / nK;
+        // only the mu==1 cells of batch 1 get the odd inner extent Q1
+        return TA::Range{(b == 1 && mu == 1) ? Q1 : Q0};
+      });
+  for (std::size_t cell = 0; cell != NB * Mmu * nK; ++cell)
+    for (std::size_t i = 0; i != rhs.data()[cell].size(); ++i)
+      rhs.data()[cell].data()[i] = 2.0 + 0.01 * cell + i;
+  Outer res = TA::detail::arena_outer_init<Outer>(
+      TA::Range{Mmu}, NB, [&](std::size_t) { return TA::Range{P}; });
+  namespace blas = TiledArray::math::blas;
+  TA::detail::arena_strided_dgemm_ce_ce_right(res, lhs, rhs, 1, Mmu, nK,
+                                        blas::NoTranspose, blas::Transpose, 1.0);
+  for (std::size_t b = 0; b != NB; ++b) {
+    std::vector<double> expected(Mmu * P, 0.0);
+    for (std::size_t k = 0; k != nK; ++k) {
+      const auto& left_cell = lhs.data()[b * nK + k];
+      const double* lp = left_cell.data();
+      for (std::size_t mu = 0; mu != Mmu; ++mu) {
+        const auto& right_cell = rhs.data()[b * Mmu * nK + mu * nK + k];
+        const std::size_t q_here = right_cell.size();
+        if (left_cell.size() != P * q_here) continue;  // guard
+        const double* rp = right_cell.data();
+        for (std::size_t a1 = 0; a1 != P; ++a1) {
+          double dot = 0;
+          for (std::size_t a4 = 0; a4 != q_here; ++a4) dot += lp[a1 * q_here + a4] * rp[a4];
+          expected[mu * P + a1] += dot;
+        }
+      }
+    }
+    for (std::size_t mu = 0; mu != Mmu; ++mu) {
+      const double* actual = res.data()[b * Mmu + mu].data();
+      for (std::size_t a1 = 0; a1 != P; ++a1)
+        BOOST_CHECK_CLOSE(actual[a1], expected[mu * P + a1], 1e-12);
+    }
+  }
+}
+
+// ce+ce with a non-unit factor (0.5), mirrored in the ref_ce_ce reference.
+BOOST_AUTO_TEST_CASE(ce_ce_applies_factor) {
+  const std::size_t Mmu = 2, nK = 4, P = 3, Q = 3;  // nK=4 -> multi-step beta=1
+  Outer lhs = make_filled(TA::Range{nK}, [&](std::size_t) { return TA::Range{P, Q}; }, 1.0);
+  Outer rhs = TA::detail::arena_outer_init<Outer>(
+      TA::Range{Mmu, nK}, 1, [&](std::size_t) { return TA::Range{Q}; });
+  for (std::size_t cell = 0; cell != Mmu * nK; ++cell) {  // cell == mu * nK + k
+    double* dst = rhs.data()[cell].data();
+    for (std::size_t i = 0; i != Q; ++i) dst[i] = 2.0 + 0.01 * cell + i;
+  }
+  Outer res = TA::detail::arena_outer_init<Outer>(
+      TA::Range{Mmu}, 1, [&](std::size_t) { return TA::Range{P}; });
+  namespace blas = TiledArray::math::blas;
+  TA::detail::arena_strided_dgemm_ce_ce_right(res, lhs, rhs, 1, Mmu, nK,
+                                        blas::NoTranspose, blas::Transpose, 0.5);
+  auto expected = ref_ce_ce(lhs, rhs, Mmu, nK, P, Q, 0.5);
+  for (std::size_t mu = 0; mu != Mmu; ++mu) {
+    const double* actual = res.data()[mu].data();
+    for (std::size_t a1 = 0; a1 != P; ++a1)
+      BOOST_CHECK_CLOSE(actual[a1], expected[mu * P + a1], 1e-12);
+  }
+}
+
+// Presence-first clean-check (no stride-probe UB): a genuinely EMPTY R cell in
+// mid-run must trip the presence guard BEFORE any cell-0->1 pointer subtraction.
+BOOST_AUTO_TEST_CASE(ce_ce_empty_mid_run_falls_back) {
+  const std::size_t Mmu = 3, nK = 2, P = 4, Q = 5;
+  Outer lhs = make_filled(TA::Range{nK}, [&](std::size_t) { return TA::Range{P, Q}; }, 1.0);
+  Outer rhs = TA::detail::arena_outer_init<Outer>(
+      TA::Range{Mmu, nK}, 1, [&](std::size_t flat) {
+        const bool hole = (flat / nK == 1) && (flat % nK == 0);  // (mu,k) == (1,0)
+        return hole ? TA::Range{} : TA::Range{Q};
+      });
+  for (std::size_t cell = 0; cell != rhs.range().volume(); ++cell) {
+    Inner& view = rhs.data()[cell];
+    if (!view) continue;  // nothing to fill in the empty cell
+    for (std::size_t i = 0; i != view.size(); ++i) view.data()[i] = 2.0 + 0.01 * cell + i;
+  }
+  BOOST_REQUIRE(!rhs.data()[1 * nK + 0]);
+  Outer res = TA::detail::arena_outer_init<Outer>(
+      TA::Range{Mmu}, 1, [&](std::size_t) { return TA::Range{P}; });
+  namespace blas = TiledArray::math::blas;
+  TA::detail::arena_strided_dgemm_ce_ce_right(res, lhs, rhs, /*Mo=*/1, /*No=*/Mmu, /*Ko=*/nK,
+                                        blas::NoTranspose, blas::Transpose, 1.0);
+  std::vector<double> expected(Mmu * P, 0.0);
+  for (std::size_t k = 0; k != nK; ++k) {
+    const double* lp = lhs.data()[k].data();
+    for (std::size_t mu = 0; mu != Mmu; ++mu) {
+      const auto& right_cell = rhs.data()[mu * nK + k];
+      if (!right_cell) continue;  // the empty cell contributes nothing
+      const double* rp = right_cell.data();
+      for (std::size_t a1 = 0; a1 != P; ++a1) {
+        double dot = 0;
+        for (std::size_t a4 = 0; a4 != Q; ++a4) dot += lp[a1 * Q + a4] * rp[a4];
+        expected[mu * P + a1] += dot;
+      }
+    }
+  }
+  for (std::size_t mu = 0; mu != Mmu; ++mu) {
+    const double* actual = res.data()[mu].data();
+    for (std::size_t a1 = 0; a1 != P; ++a1)
+      BOOST_CHECK_CLOSE(actual[a1], expected[mu * P + a1], 1e-12);
+  }
+}
+
+// Addition B: left-external Mo>1 (single batch). The left-external m rides as
+// an outer loop; R (function of k,mu only) is reused across m. Independent
+// reference: c[m,mu](a1) = sum_k sum_{a4} L[m,k](a1,a4) * R[mu,k](a4).
+BOOST_AUTO_TEST_CASE(ce_ce_left_external) {  // Mo=2 left-external blocks, batch argument 1
+  const std::size_t Mo = 2, Mmu = 3, nK = 2, P = 4, Q = 5;
+  // L outer (Mo,nK) row-major (m slow, k fast), inner {P,Q}.
+  Outer L = TA::detail::arena_outer_init<Outer>(
+      TA::Range{Mo, nK}, 1, [&](std::size_t){return TA::Range{P, Q};});
+  for (std::size_t o = 0; o < Mo * nK; ++o)
+    for (std::size_t e = 0; e < L.data()[o].size(); ++e)
+      L.data()[o].data()[e] = 1.0 + 0.01 * o + e;  // distinct value per (cell ordinal, element)
+  // R outer (Mmu,nK) canonical (mu slow, k fast), inner {Q}.
+  Outer R = TA::detail::arena_outer_init<Outer>(
+      TA::Range{Mmu, nK}, 1, [&](std::size_t){return TA::Range{Q};});
+  for (std::size_t o = 0; o < Mmu * nK; ++o)
+    for (std::size_t e = 0; e < R.data()[o].size(); ++e)
+      R.data()[o].data()[e] = 2.0 + 0.01 * o + e;
+  // C outer (Mo,Mmu) row-major (m slow, mu fast), inner {P}.
+  Outer C = TA::detail::arena_outer_init<Outer>(
+      TA::Range{Mo, Mmu}, 1, [&](std::size_t){return TA::Range{P};});
+  namespace blas = TiledArray::math::blas;
+  TA::detail::arena_strided_dgemm_ce_ce_right(C, L, R, /*Mo=*/Mo, /*No=*/Mmu,
+                                        /*Ko=*/nK, blas::NoTranspose,
+                                        blas::Transpose, 1.0);
+  for (std::size_t m = 0; m < Mo; ++m) {
+    std::vector<double> ref(Mmu * P, 0.0);  // fresh reference for each left-external m
+    for (std::size_t k = 0; k < nK; ++k) {
+      const double* l = L.data()[m * nK + k].data();  // P x Q row-major
+      for (std::size_t mu = 0; mu < Mmu; ++mu) {
+        const double* r = R.data()[mu * nK + k].data();  // same R cell for every m
+        for (std::size_t a1 = 0; a1 < P; ++a1) {
+          double acc = 0;
+          for (std::size_t a4 = 0; a4 < Q; ++a4) acc += l[a1 * Q + a4] * r[a4];
+          ref[mu * P + a1] += acc;
+        }
+      }
+    }
+    for (std::size_t mu = 0; mu < Mmu; ++mu) {
+      const double* got = C.data()[m * Mmu + mu].data();  // result cell (m,mu)
+      for (std::size_t a1 = 0; a1 < P; ++a1)
+        BOOST_CHECK_CLOSE(got[a1], ref[mu * P + a1], 1e-12);  // tolerance argument is a percentage
+    }
+  }
+}
+
+// Addition B: Mo>1 AND nbatch>1.
+BOOST_AUTO_TEST_CASE(ce_ce_left_external_multi_batch) {  // same check as the single-batch case, with NB=2
+  const std::size_t Mo = 2, Mmu = 2, nK = 3, P = 3, Q = 4, NB = 2;
+  Outer L = TA::detail::arena_outer_init<Outer>(
+      TA::Range{Mo, nK}, NB, [&](std::size_t){return TA::Range{P, Q};});  // NB replaces the 1 used by single-batch tests
+  Outer R = TA::detail::arena_outer_init<Outer>(
+      TA::Range{Mmu, nK}, NB, [&](std::size_t){return TA::Range{Q};});
+  Outer C = TA::detail::arena_outer_init<Outer>(
+      TA::Range{Mo, Mmu}, NB, [&](std::size_t){return TA::Range{P};});
+  for (std::size_t o = 0; o < NB * Mo * nK; ++o)  // fill all NB*Mo*nK cells of L
+    for (std::size_t e = 0; e < L.data()[o].size(); ++e)
+      L.data()[o].data()[e] = 1.0 + 0.01 * o + e;
+  for (std::size_t o = 0; o < NB * Mmu * nK; ++o)  // fill all NB*Mmu*nK cells of R
+    for (std::size_t e = 0; e < R.data()[o].size(); ++e)
+      R.data()[o].data()[e] = 2.0 + 0.01 * o + e;
+  namespace blas = TiledArray::math::blas;
+  TA::detail::arena_strided_dgemm_ce_ce_right(C, L, R, Mo, Mmu, nK,
+                                        blas::NoTranspose, blas::Transpose, 1.0);
+  for (std::size_t b = 0; b < NB; ++b)
+    for (std::size_t m = 0; m < Mo; ++m) {
+      std::vector<double> ref(Mmu * P, 0.0);  // fresh reference for each (b,m)
+      for (std::size_t k = 0; k < nK; ++k) {
+        const double* l = L.data()[b * Mo * nK + m * nK + k].data();  // batch index b is slowest
+        for (std::size_t mu = 0; mu < Mmu; ++mu) {
+          const double* r = R.data()[b * Mmu * nK + mu * nK + k].data();  // R cell from the same batch b
+          for (std::size_t a1 = 0; a1 < P; ++a1) {
+            double acc = 0;
+            for (std::size_t a4 = 0; a4 < Q; ++a4) acc += l[a1 * Q + a4] * r[a4];
+            ref[mu * P + a1] += acc;
+          }
+        }
+      }
+      for (std::size_t mu = 0; mu < Mmu; ++mu) {
+        const double* got = C.data()[b * Mo * Mmu + m * Mmu + mu].data();  // result cell (b,m,mu)
+        for (std::size_t a1 = 0; a1 < P; ++a1)
+          BOOST_CHECK_CLOSE(got[a1], ref[mu * P + a1], 1e-12);  // tolerance argument is a percentage
+      }
+    }
+}
+
+// Addition B coverage: Mo>1 with the non-canonical right orientation
+// (right_op == NoTranspose, R outer (k,mu)). Mirrors
+// ce_ce_orientation_aware_no_transpose but with a left-external loop, so the
+// orientation-aware r_off is exercised together with the m-loop.
+BOOST_AUTO_TEST_CASE(ce_ce_left_external_orientation_no_transpose) {  // R stored (k,mu); kernel told NoTranspose
+  const std::size_t Mo = 2, Mmu = 3, nK = 2, P = 4, Q = 5;
+  Outer L = TA::detail::arena_outer_init<Outer>(
+      TA::Range{Mo, nK}, 1, [&](std::size_t){return TA::Range{P, Q};});
+  for (std::size_t o = 0; o < Mo * nK; ++o)
+    for (std::size_t e = 0; e < L.data()[o].size(); ++e)
+      L.data()[o].data()[e] = 1.0 + 0.01 * o + e;
+  // non-canonical: R outer (k,mu) -> k slow, mu fast => right_op==NoTranspose.
+  Outer R = TA::detail::arena_outer_init<Outer>(
+      TA::Range{nK, Mmu}, 1, [&](std::size_t){return TA::Range{Q};});
+  for (std::size_t k = 0; k < nK; ++k)
+    for (std::size_t mu = 0; mu < Mmu; ++mu) {
+      double* r = R.data()[k * Mmu + mu].data();  // cell (k,mu): k slow, mu fast
+      for (std::size_t e = 0; e < Q; ++e) r[e] = 2.0 + 0.01 * (k * Mmu + mu) + e;
+    }
+  Outer C = TA::detail::arena_outer_init<Outer>(
+      TA::Range{Mo, Mmu}, 1, [&](std::size_t){return TA::Range{P};});
+  namespace blas = TiledArray::math::blas;
+  TA::detail::arena_strided_dgemm_ce_ce_right(C, L, R, Mo, Mmu, nK, blas::NoTranspose,
+                                        blas::NoTranspose, 1.0);  // second op argument matches the (k,mu) R layout
+  for (std::size_t m = 0; m < Mo; ++m) {
+    std::vector<double> ref(Mmu * P, 0.0);  // fresh reference for each left-external m
+    for (std::size_t k = 0; k < nK; ++k) {
+      const double* l = L.data()[m * nK + k].data();
+      for (std::size_t mu = 0; mu < Mmu; ++mu) {
+        const double* r = R.data()[k * Mmu + mu].data();  // (k,mu) layout
+        for (std::size_t a1 = 0; a1 < P; ++a1) {
+          double acc = 0;
+          for (std::size_t a4 = 0; a4 < Q; ++a4) acc += l[a1 * Q + a4] * r[a4];
+          ref[mu * P + a1] += acc;
+        }
+      }
+    }
+    for (std::size_t mu = 0; mu < Mmu; ++mu) {
+      const double* got = C.data()[m * Mmu + mu].data();  // result stays (m,mu): m slow, mu fast
+      for (std::size_t a1 = 0; a1 < P; ++a1)
+        BOOST_CHECK_CLOSE(got[a1], ref[mu * P + a1], 1e-12);  // tolerance argument is a percentage
+    }
+  }
+}
+
+// Addition B coverage: ragged inner size under Mo>1. One R mu-cell is ragged,
+// so the strided clean-check declines (per-m) and the inline per-cell fallback
+// runs for EACH left-external block (each cell computed exactly once, the
+// size-mismatched (mu,k) skipped just as the kernel skips it).
+BOOST_AUTO_TEST_CASE(ce_ce_left_external_ragged_falls_back) {  // one R cell has inner size Q1 instead of Q0
+  const std::size_t Mo = 2, Mmu = 2, nK = 2, P = 3, Q0 = 4, Q1 = 5;
+  Outer L = TA::detail::arena_outer_init<Outer>(
+      TA::Range{Mo, nK}, 1, [&](std::size_t){return TA::Range{P, Q0};});
+  for (std::size_t o = 0; o < Mo * nK; ++o)
+    for (std::size_t e = 0; e < L.data()[o].size(); ++e)
+      L.data()[o].data()[e] = 1.0 + 0.01 * o + e;
+  // R[mu=1,k=0] ragged (Q1) -> mu-run not uniform -> clean-check fails (both m).
+  Outer R = TA::detail::arena_outer_init<Outer>(
+      TA::Range{Mmu, nK}, 1, [&](std::size_t o){
+        const std::size_t mu = o / nK, k = o % nK;  // decode ordinal: mu slow, k fast
+        return TA::Range{(mu == 1 && k == 0) ? Q1 : Q0};  // only cell (mu=1,k=0) gets Q1
+      });
+  for (std::size_t o = 0; o < Mmu * nK; ++o)
+    for (std::size_t e = 0; e < R.data()[o].size(); ++e)
+      R.data()[o].data()[e] = 2.0 + 0.01 * o + e;
+  Outer C = TA::detail::arena_outer_init<Outer>(
+      TA::Range{Mo, Mmu}, 1, [&](std::size_t){return TA::Range{P};});
+  namespace blas = TiledArray::math::blas;
+  TA::detail::arena_strided_dgemm_ce_ce_right(C, L, R, Mo, Mmu, nK, blas::NoTranspose,
+                                        blas::Transpose, 1.0);
+  for (std::size_t m = 0; m < Mo; ++m) {
+    std::vector<double> ref(Mmu * P, 0.0);  // fresh reference for each left-external m
+    for (std::size_t k = 0; k < nK; ++k) {
+      const auto& lk = L.data()[m * nK + k];
+      const double* l = lk.data();
+      for (std::size_t mu = 0; mu < Mmu; ++mu) {
+        const auto& rk = R.data()[mu * nK + k];
+        const std::size_t Ql = rk.size();  // actual inner length of this R cell
+        if (lk.size() != P * Ql) continue;  // mirror kernel fallback skip
+        const double* r = rk.data();
+        for (std::size_t a1 = 0; a1 < P; ++a1) {
+          double acc = 0;
+          for (std::size_t a4 = 0; a4 < Ql; ++a4) acc += l[a1 * Ql + a4] * r[a4];
+          ref[mu * P + a1] += acc;
+        }
+      }
+    }
+    for (std::size_t mu = 0; mu < Mmu; ++mu) {
+      const double* got = C.data()[m * Mmu + mu].data();
+      for (std::size_t a1 = 0; a1 < P; ++a1)
+        BOOST_CHECK_CLOSE(got[a1], ref[mu * P + a1], 1e-12);  // tolerance argument is a percentage
+    }
+  }
+}
+
+// LEFT-clean mirror core: the LEFT operand inner is the pure contraction vector
+// L[m,k](a4); the RIGHT operand carries the inner external R[k,n](a4,b1); result
+// inner is the right inner-external b1. The kernel rides the LEFT-external m into
+// BLAS M and loops the RIGHT-external n as an outer loop. Independent reference:
+//   c[m,n](p) = sum_k sum_{a4} L[m,k](a4) * R[k,n](a4,p).
+BOOST_AUTO_TEST_CASE(ce_ce_left_clean_core) {  // exercises the _left kernel: L cells are vectors, R cells are matrices
+  const std::size_t Mo = 2, No = 3, nK = 2, P = 4, Q = 5;
+  // L (clean) outer (Mo,nK) row-major (m slow, k fast), inner {Q}.
+  Outer L = TA::detail::arena_outer_init<Outer>(
+      TA::Range{Mo, nK}, 1, [&](std::size_t) { return TA::Range{Q}; });
+  for (std::size_t o = 0; o < Mo * nK; ++o)
+    for (std::size_t e = 0; e < L.data()[o].size(); ++e)
+      L.data()[o].data()[e] = 1.0 + 0.01 * o + e;
+  // R (matrix) outer (nK,No) canonical (k slow, n fast), inner {Q,P} row-major.
+  Outer R = TA::detail::arena_outer_init<Outer>(
+      TA::Range{nK, No}, 1, [&](std::size_t) { return TA::Range{Q, P}; });
+  for (std::size_t o = 0; o < nK * No; ++o)
+    for (std::size_t e = 0; e < R.data()[o].size(); ++e)
+      R.data()[o].data()[e] = 2.0 + 0.01 * o + e;
+  // C outer (Mo,No) row-major (m slow, n fast), inner {P}.
+  Outer C = TA::detail::arena_outer_init<Outer>(
+      TA::Range{Mo, No}, 1, [&](std::size_t) { return TA::Range{P}; });
+  namespace blas = TiledArray::math::blas;
+  TA::detail::arena_strided_dgemm_ce_ce_left(C, L, R, /*Mo=*/Mo, /*No=*/No,
+                                             /*Ko=*/nK, blas::NoTranspose,
+                                             blas::NoTranspose, 1.0);
+  for (std::size_t m = 0; m < Mo; ++m)
+    for (std::size_t n = 0; n < No; ++n) {
+      std::vector<double> ref(P, 0.0);  // reference for the single result cell (m,n)
+      for (std::size_t k = 0; k < nK; ++k) {
+        const double* l = L.data()[m * nK + k].data();  // Q vector
+        const double* r = R.data()[k * No + n].data();  // Q x P row-major
+        for (std::size_t p = 0; p < P; ++p) {
+          double acc = 0;
+          for (std::size_t a4 = 0; a4 < Q; ++a4) acc += l[a4] * r[a4 * P + p];  // contract over a4
+          ref[p] += acc;
+        }
+      }
+      const double* got = C.data()[m * No + n].data();
+      for (std::size_t p = 0; p < P; ++p)
+        BOOST_CHECK_CLOSE(got[p], ref[p], 1e-12);  // tolerance argument is a percentage
+    }
+}
+
+namespace {
+// Compare every result cell against the sparsity-aware reference: a present
+// reference vector must match to 1e-10. An empty reference vector means the
+// cell received no contribution: an ABSENT (hole) result cell must stay absent
+// (invariant 2 / "no writes to holes"), and a PRESENT result cell (e.g. a dense
+// C whose only k was size-mismatched) must remain exactly its zero-init value.
+// ordinal of C[b,row,col] = (b*nrow+row)*ncol + col.
+void check_ce_ce(const Outer& C, const std::vector<std::vector<double>>& ref,  // ref is indexed by the same ordinal as C
+                 std::size_t nbatch, std::size_t nrow, std::size_t ncol,  // extents of the (b,row,col) walk
+                 std::size_t P) {  // P: required size of every cell that has a reference
+  for (std::size_t b = 0; b < nbatch; ++b)
+    for (std::size_t r = 0; r < nrow; ++r)
+      for (std::size_t cc = 0; cc < ncol; ++cc) {
+        const std::size_t ord = (b * nrow + r) * ncol + cc;
+        const Inner& cell = C.data()[ord];
+        const std::vector<double>& want = ref[ord];  // unchecked index: ref must hold nbatch*nrow*ncol entries
+        if (want.empty()) {
+          // No contribution: a hole stays absent; a present cell stays zero
+          // (the kernel must not have written anything to it).
+          if (cell)
+            for (std::size_t a1 = 0; a1 < cell.size(); ++a1)
+              BOOST_CHECK_SMALL(cell.data()[a1], 1e-12);  // absolute bound, not a percentage
+          continue;
+        }
+        BOOST_REQUIRE(bool(cell));  // a cell with a reference must be present
+        BOOST_REQUIRE_EQUAL(cell.size(), P);  // REQUIRE: stop before reading past a short cell
+        for (std::size_t a1 = 0; a1 < P; ++a1)
+          BOOST_CHECK_CLOSE(cell.data()[a1], want[a1], 1e-10);  // tolerance argument is a percentage
+      }
+}
+
+// Zero-fill the present cells of an already-built result tile (the kernel
+// accumulates with beta=1, so C must start at zero).
+void zero_result(Outer& C, std::size_t ncells) {  // ncells: number of leading cells of C.data() to visit
+  for (std::size_t o = 0; o < ncells; ++o) {
+    Inner& c = C.data()[o];
+    if (!c) continue;  // absent cell: nothing to zero
+    for (std::size_t e = 0; e < c.size(); ++e) c.data()[e] = 0.0;  // overwrite every element of a present cell
+  }
+}
+}  // namespace
+
+#ifdef TA_STRIDED_DGEMM_COUNT
+BOOST_AUTO_TEST_CASE(ce_ce_fires_clean_path) {  // checks only the call counter, not the numeric result
+  const std::size_t Mmu = 3, nK = 2, P = 4, Q = 5, NB = 2;
+  // NB batches, all uniform => every (b,k) DGEMM fires => count == NB*nK (Mo==1).
+  Outer L = TA::detail::arena_outer_init<Outer>(
+      TA::Range{nK}, NB, [&](std::size_t){return TA::Range{P, Q};});
+  Outer R = TA::detail::arena_outer_init<Outer>(
+      TA::Range{Mmu, nK}, NB, [&](std::size_t){return TA::Range{Q};});
+  Outer C = TA::detail::arena_outer_init<Outer>(
+      TA::Range{Mmu}, NB, [&](std::size_t){return TA::Range{P};});
+  for (std::size_t o = 0; o < NB * nK; ++o)
+    for (std::size_t e = 0; e < L.data()[o].size(); ++e)
+      L.data()[o].data()[e] = 1.0 + 0.01 * o + e;
+  for (std::size_t o = 0; o < NB * Mmu * nK; ++o)
+    for (std::size_t e = 0; e < R.data()[o].size(); ++e)
+      R.data()[o].data()[e] = 2.0 + 0.01 * o + e;
+  namespace blas = TiledArray::math::blas;
+  TA::detail::g_strided_dgemm_ce_ce_right_calls.store(0);  // reset the counter so only this call is counted
+  TA::detail::arena_strided_dgemm_ce_ce_right(C, L, R, 1, Mmu, nK,  // positional Mo=1, No=Mmu, Ko=nK
+                                        blas::NoTranspose, blas::Transpose, 1.0);
+  // per-DGEMM count: Mo(==1) * nK per batch
+  BOOST_CHECK_EQUAL(TA::detail::g_strided_dgemm_ce_ce_right_calls.load(), NB * 1 * nK);
+}
+
+// Addition B firing count: Mo>1, nbatch>1 -> clean count == NB*Mo*nK.
+BOOST_AUTO_TEST_CASE(ce_ce_left_external_fires_clean_path) {  // checks only the call counter, not the numeric result
+  const std::size_t Mo = 2, Mmu = 2, nK = 3, P = 3, Q = 4, NB = 2;
+  Outer L = TA::detail::arena_outer_init<Outer>(
+      TA::Range{Mo, nK}, NB, [&](std::size_t){return TA::Range{P, Q};});
+  Outer R = TA::detail::arena_outer_init<Outer>(
+      TA::Range{Mmu, nK}, NB, [&](std::size_t){return TA::Range{Q};});
+  Outer C = TA::detail::arena_outer_init<Outer>(
+      TA::Range{Mo, Mmu}, NB, [&](std::size_t){return TA::Range{P};});
+  for (std::size_t o = 0; o < NB * Mo * nK; ++o)
+    for (std::size_t e = 0; e < L.data()[o].size(); ++e)
+      L.data()[o].data()[e] = 1.0 + 0.01 * o + e;
+  for (std::size_t o = 0; o < NB * Mmu * nK; ++o)
+    for (std::size_t e = 0; e < R.data()[o].size(); ++e)
+      R.data()[o].data()[e] = 2.0 + 0.01 * o + e;
+  namespace blas = TiledArray::math::blas;
+  TA::detail::g_strided_dgemm_ce_ce_right_calls.store(0);  // reset the counter so only this call is counted
+  TA::detail::arena_strided_dgemm_ce_ce_right(C, L, R, Mo, Mmu, nK,
+                                        blas::NoTranspose, blas::Transpose, 1.0);
+  BOOST_CHECK_EQUAL(TA::detail::g_strided_dgemm_ce_ce_right_calls.load(), NB * Mo * nK);  // 2*2*3 = 12 expected
+}
+
+// Was "does_not_fire": under per-k segmentation a size-mismatched mid-run cell
+// no longer drops the whole run to scalar -- the run splits into per-k
+// contiguous segments that still fire as strided GEMMs. Correctness is unchanged;
+// the count reflects the segments issued (the Q1-sized mu=1 row breaks each run).
+BOOST_AUTO_TEST_CASE(ce_ce_scattered_run_segments_and_is_correct) {
+  const std::size_t Mmu = 3, nK = 2, P = 4, Q0 = 5, Q1 = 6;
+  Outer L = make_filled(TA::Range{nK}, [&](std::size_t){return TA::Range{P, Q0};}, 1.0);
+  Outer R = TA::detail::arena_outer_init<Outer>(
+      TA::Range{Mmu, nK}, 1, [&](std::size_t o){
+        const std::size_t mu = o / nK;  // decode ordinal: mu slow, k fast
+        return TA::Range{mu == 1 ? Q1 : Q0};  // the whole mu=1 row (every k) gets Q1
+      });
+  for (std::size_t o = 0; o < R.range().volume(); ++o)
+    for (std::size_t e = 0; e < R.data()[o].size(); ++e)
+      R.data()[o].data()[e] = 2.0 + 0.01 * o + e;
+  Outer C = TA::detail::arena_outer_init<Outer>(
+      TA::Range{Mmu}, 1, [&](std::size_t){return TA::Range{P};});
+  namespace blas = TiledArray::math::blas;
+  TA::detail::g_strided_dgemm_ce_ce_right_calls.store(0);  // reset the counter so only this call is counted
+  TA::detail::arena_strided_dgemm_ce_ce_right(C, L, R, /*Mo=*/1, /*No=*/Mmu, /*Ko=*/nK,
+                                        blas::NoTranspose, blas::Transpose, 1.0);
+  // mu=1 has a mismatched size, so per k the walker emits segments {0} and {2}
+  // (the size-mismatched mu=1 cannot join either) => 2 segments x nK(=2) = 4.
+  BOOST_CHECK_EQUAL(TA::detail::g_strided_dgemm_ce_ce_right_calls.load(), 4u);
+  std::vector<double> ref(Mmu * P, 0.0);  // naive reference, indexed [mu*P + a1]
+  for (std::size_t k = 0; k < nK; ++k) {
+    const auto& lk = L.data()[k];
+    const double* l = lk.data();
+    for (std::size_t mu = 0; mu < Mmu; ++mu) {
+      const auto& rk = R.data()[mu * nK + k];
+      const std::size_t Ql = rk.size();  // actual inner length of this R cell
+      if (lk.size() != P * Ql) continue;  // size-mismatched pair contributes nothing (ref[mu=1] stays 0)
+      const double* r = rk.data();
+      for (std::size_t a1 = 0; a1 < P; ++a1) {
+        double acc = 0;
+        for (std::size_t a4 = 0; a4 < Ql; ++a4) acc += l[a1 * Ql + a4] * r[a4];
+        ref[mu * P + a1] += acc;
+      }
+    }
+  }
+  for (std::size_t mu = 0; mu < Mmu; ++mu) {
+    const double* got = C.data()[mu].data();
+    for (std::size_t a1 = 0; a1 < P; ++a1)
+      BOOST_CHECK_CLOSE(got[a1], ref[mu * P + a1], 1e-12);  // tolerance argument is a percentage
+  }
+}
+
+// Page-jump / constant-stride guard in isolation: every R cell has the SAME
+// size Q (uniform-size guard passes), but the per-k mu-run is laid at a
+// NON-CONSTANT stride. Fully-independent reference.
+// Was "does_not_fire": a page-jump (non-constant inter-cell stride) no longer
+// drops the whole run -- the walker ends a segment at the stride break and
+// starts a new strided GEMM, so it fires (correctly) while staying exact.
+BOOST_AUTO_TEST_CASE(ce_ce_page_jump_run_segments_and_is_correct) {
+  const std::size_t Mmu = 3, nK = 2, P = 4, Q = 5;
+  Outer L = make_filled(TA::Range{nK}, [&](std::size_t){return TA::Range{P, Q};}, 1.0);
+  Outer Rsrc = TA::detail::arena_outer_init<Outer>(
+      TA::Range{Mmu, nK}, 1, [&](std::size_t){return TA::Range{Q};});
+  for (std::size_t o = 0; o < Rsrc.range().volume(); ++o)
+    for (std::size_t e = 0; e < Rsrc.data()[o].size(); ++e)
+      Rsrc.data()[o].data()[e] = 2.0 + 0.01 * o + e;
+  const std::ptrdiff_t S = Rsrc.data()[1].data() - Rsrc.data()[0].data();  // element distance between cells 0 and 1
+  BOOST_REQUIRE_GT(S, 0);
+  for (std::size_t o = 0; o < Rsrc.range().volume(); ++o)  // precondition: source cells are evenly spaced by S
+    BOOST_REQUIRE_EQUAL(Rsrc.data()[o].data(),
+                        Rsrc.data()[0].data() + static_cast<std::ptrdiff_t>(o) * S);
+  const std::size_t perm[Mmu] = {0, 2, 1};  // non-monotone -> non-constant stride
+  std::vector<std::size_t> phys(Mmu * nK);  // phys[logical ordinal] = source ordinal
+  for (std::size_t mu = 0; mu < Mmu; ++mu)
+    for (std::size_t k = 0; k < nK; ++k)
+      phys[mu * nK + k] = perm[mu] * nK + k;  // permute mu only; k is unchanged
+  Outer R = assemble_aliased(Rsrc, TA::Range{Mmu, nK}, phys);  // helper defined outside this chunk; Rsrc stays in scope
+  for (std::size_t o = 0; o < R.range().volume(); ++o)
+    BOOST_REQUIRE_EQUAL(R.data()[o].size(), Q);  // precondition: sizes are uniform
+  for (std::size_t k = 0; k < nK; ++k) {  // precondition: for every k the mu-stride is not constant
+    const std::ptrdiff_t d01 =
+        R.data()[1 * nK + k].data() - R.data()[0 * nK + k].data();
+    const std::ptrdiff_t d12 =
+        R.data()[2 * nK + k].data() - R.data()[1 * nK + k].data();
+    BOOST_REQUIRE_NE(d01, d12);
+  }
+  Outer C = TA::detail::arena_outer_init<Outer>(
+      TA::Range{Mmu}, 1, [&](std::size_t){return TA::Range{P};});
+  namespace blas = TiledArray::math::blas;
+  TA::detail::g_strided_dgemm_ce_ce_right_calls.store(0);  // reset the counter so only this call is counted
+  TA::detail::arena_strided_dgemm_ce_ce_right(C, L, R, /*Mo=*/1, /*No=*/Mmu, /*Ko=*/nK,
+                                        blas::NoTranspose, blas::Transpose, 1.0);
+  // perm {0,2,1} makes the inter-cell stride non-constant, so per k the walker
+  // breaks the run into constant-stride segments {0,1} and {2} => 2 x nK(=2) = 4.
+  BOOST_CHECK_EQUAL(TA::detail::g_strided_dgemm_ce_ce_right_calls.load(), 4u);
+  auto ref = ref_ce_ce(L, R, Mmu, nK, P, Q, 1.0);  // reference reads the aliased R, i.e. the logical order
+  for (std::size_t mu = 0; mu < Mmu; ++mu) {
+    const double* got = C.data()[mu].data();
+    for (std::size_t a1 = 0; a1 < P; ++a1)
+      BOOST_CHECK_CLOSE(got[a1], ref[mu * P + a1], 1e-12);  // tolerance argument is a percentage
+  }
+}
+
+// LEFT-clean firing count: Mo>1, No>1, single batch -> one DGEMM per (n,k)
+// (the left-external m is ridden into BLAS M) => count == No*nK.
+BOOST_AUTO_TEST_CASE(ce_ce_left_clean_fires_clean_path) {  // checks only the _left call counter, not the numeric result
+  const std::size_t Mo = 2, No = 3, nK = 2, P = 4, Q = 5;
+  Outer L = TA::detail::arena_outer_init<Outer>(
+      TA::Range{Mo, nK}, 1, [&](std::size_t) { return TA::Range{Q}; });
+  Outer R = TA::detail::arena_outer_init<Outer>(
+      TA::Range{nK, No}, 1, [&](std::size_t) { return TA::Range{Q, P}; });
+  Outer C = TA::detail::arena_outer_init<Outer>(
+      TA::Range{Mo, No}, 1, [&](std::size_t) { return TA::Range{P}; });
+  for (std::size_t o = 0; o < Mo * nK; ++o)
+    for (std::size_t e = 0; e < L.data()[o].size(); ++e)
+      L.data()[o].data()[e] = 1.0 + 0.01 * o + e;
+  for (std::size_t o = 0; o < nK * No; ++o)
+    for (std::size_t e = 0; e < R.data()[o].size(); ++e)
+      R.data()[o].data()[e] = 2.0 + 0.01 * o + e;
+  namespace blas = TiledArray::math::blas;
+  TA::detail::g_strided_dgemm_ce_ce_left_calls.store(0);  // reset the counter so only this call is counted
+  TA::detail::arena_strided_dgemm_ce_ce_left(C, L, R, Mo, No, nK,
+                                             blas::NoTranspose, blas::NoTranspose,
+                                             1.0);
+  BOOST_CHECK_EQUAL(TA::detail::g_strided_dgemm_ce_ce_left_calls.load(),
+                    No * nK);  // 3*2 = 6 expected; Mo does not multiply the count
+}
+
+// LEFT page-jump: a non-constant stride on the strided LEFT operand L (along m)
+// must break the m-run into constant-stride segments. perm {0,2,1} over Mo=3
+// makes L[1],L[2] non-uniformly spaced => per (n,k) the walker emits segments
+// {0,1} and {2} => 2 x No(=1) x nK(=2) = 4 segment GEMMs. Result stays exact.
+BOOST_AUTO_TEST_CASE(ce_ce_left_page_jump_run_segments_and_is_correct) {
+  const std::size_t Mo = 3, No = 1, nK = 2, P = 4, Q = 5;
+  Outer Lsrc = TA::detail::arena_outer_init<Outer>(
+      TA::Range{Mo, nK}, 1, [&](std::size_t){ return TA::Range{Q}; });
+  for (std::size_t o = 0; o < Lsrc.range().volume(); ++o)
+    for (std::size_t e = 0; e < Lsrc.data()[o].size(); ++e)
+      Lsrc.data()[o].data()[e] = 1.0 + 0.01 * o + e;
+  // Alias L cells in a non-monotone m order (k stays fast): logical (m,k) ->
+  // physical (perm[m],k). This yields a uniform-size run with non-constant stride.
+  const std::size_t perm[Mo] = {0, 2, 1};
+  std::vector<std::size_t> phys(Mo * nK);  // phys[logical ordinal] = source ordinal
+  for (std::size_t m = 0; m < Mo; ++m)
+    for (std::size_t k = 0; k < nK; ++k)
+      phys[m * nK + k] = perm[m] * nK + k;
+  Outer L = assemble_aliased(Lsrc, TA::Range{Mo, nK}, phys);  // helper defined outside this chunk; Lsrc stays in scope
+  // Confirm the stride really is non-constant for every k.
+  for (std::size_t k = 0; k < nK; ++k) {
+    const std::ptrdiff_t d01 =
+        L.data()[1 * nK + k].data() - L.data()[0 * nK + k].data();
+    const std::ptrdiff_t d12 =
+        L.data()[2 * nK + k].data() - L.data()[1 * nK + k].data();
+    BOOST_REQUIRE_NE(d01, d12);
+  }
+  Outer R = make_filled(TA::Range{nK, No},
+                        [&](std::size_t){ return TA::Range{Q, P}; }, 2.0);
+  Outer C = TA::detail::arena_outer_init<Outer>(
+      TA::Range{Mo, No}, 1, [&](std::size_t){ return TA::Range{P}; });
+  zero_result(C, C.range().volume());  // start every present result cell at 0.0
+  namespace blas = TiledArray::math::blas;
+  TA::detail::g_strided_dgemm_ce_ce_left_calls.store(0);  // reset the counter so only this call is counted
+  TA::detail::arena_strided_dgemm_ce_ce_left(C, L, R, Mo, No, nK,
+      blas::NoTranspose, blas::NoTranspose, 1.0);
+  BOOST_CHECK_EQUAL(TA::detail::g_strided_dgemm_ce_ce_left_calls.load(), 4u);  // 2 segments x No(=1) x nK(=2)
+  auto ref = ref_ce_ce_left_sparse(L, R, Mo, No, nK, P, 1.0);
+  check_ce_ce(C, ref, 1, Mo, No, P);  // nbatch=1, rows=Mo, cols=No
+}
+
+// RIGHT result-C page-jump: a non-constant stride on the RESULT C cells (along
+// μ̃) must also break segments (the sC guard, distinct from the sR operand
+// guard). perm {0,2,1} over Mmu=3 aliases C in non-monotone order. The kernel
+// writes to logical C[μ̃]; with non-constant C stride it segments {0,1},{2}
+// per k => 2 x nK(=2) = 4 segment GEMMs, still exact.
+BOOST_AUTO_TEST_CASE(ce_ce_right_result_page_jump_segments_and_is_correct) {
+  const std::size_t Mmu = 3, nK = 2, P = 4, Q = 5;
+  Outer L = make_filled(TA::Range{nK}, [&](std::size_t){return TA::Range{P, Q};}, 1.0);
+  Outer R = TA::detail::arena_outer_init<Outer>(
+      TA::Range{Mmu, nK}, 1, [&](std::size_t){return TA::Range{Q};});
+  for (std::size_t o = 0; o < R.range().volume(); ++o)
+    for (std::size_t e = 0; e < R.data()[o].size(); ++e)
+      R.data()[o].data()[e] = 2.0 + 0.01 * o + e;
+  // Build a dense C, then alias its cells in non-monotone μ̃ order so the result
+  // run has a non-constant .data() stride. C is 1-D over μ̃ (Mo=1).
+  Outer Csrc = TA::detail::arena_outer_init<Outer>(
+      TA::Range{Mmu}, 1, [&](std::size_t){return TA::Range{P};});
+  zero_result(Csrc, Csrc.range().volume());  // start every source result cell at 0.0
+  const std::size_t perm[Mmu] = {0, 2, 1};
+  std::vector<std::size_t> phys(Mmu);  // phys[logical mu] = source mu
+  for (std::size_t mu = 0; mu < Mmu; ++mu) phys[mu] = perm[mu];
+  Outer C = assemble_aliased(Csrc, TA::Range{Mmu}, phys);  // helper defined outside this chunk; Csrc stays in scope
+  const std::ptrdiff_t d01 = C.data()[1].data() - C.data()[0].data();
+  const std::ptrdiff_t d12 = C.data()[2].data() - C.data()[1].data();
+  BOOST_REQUIRE_NE(d01, d12);  // non-constant result stride
+  namespace blas = TiledArray::math::blas;
+  TA::detail::g_strided_dgemm_ce_ce_right_calls.store(0);  // reset the counter so only this call is counted
+  TA::detail::arena_strided_dgemm_ce_ce_right(C, L, R, /*Mo=*/1, /*No=*/Mmu, /*Ko=*/nK,
+      blas::NoTranspose, blas::Transpose, 1.0);
+  BOOST_CHECK_EQUAL(TA::detail::g_strided_dgemm_ce_ce_right_calls.load(), 4u);  // 2 segments x nK(=2)
+  auto ref = ref_ce_ce(L, R, Mmu, nK, P, Q, 1.0);
+  for (std::size_t mu = 0; mu < Mmu; ++mu) {
+    const double* got = C.data()[mu].data();  // read through the aliased C, i.e. in logical order
+    for (std::size_t a1 = 0; a1 < P; ++a1)
+      BOOST_CHECK_CLOSE(got[a1], ref[mu * P + a1], 1e-12);  // tolerance argument is a percentage
+  }
+}
+#endif
+
+// ===========================================================================
+// Per-k segmented strided-DGEMM tests (T1-T15). The kernel walks each present k
+// and emits one strided GEMM per maximal contiguous (present + uniform-stride +
+// size-matched) segment along the strided axis, skipping holes, accumulating
+// with beta=1 across k and across segments, never touching absent result cells.
+// All use the canonical right convention (right_op=Transpose: R outer mu slow,
+// k fast). Result/operand outer layouts match ref_ce_ce_right/left_sparse.
+// ===========================================================================
+
+// T1: dense run, no holes -> one full-run segment per k == today's clean path.
+// Pins golden values (also matches the original dense ref_ce_ce).
+BOOST_AUTO_TEST_CASE(ce_ce_seg_dense_regression) {  // three checks: sparse reference, naive oracle, frozen constants
+  namespace blas = TA::math::blas;
+  const std::size_t Mo = 1, Mmu = 6, nK = 2, P = 3, Q = 4;
+  Outer L = make_filled(TA::Range{nK}, [&](std::size_t){return TA::Range{P, Q};}, 1.0);
+  Outer R = TA::detail::arena_outer_init<Outer>(
+      TA::Range{Mmu, nK}, 1, [&](std::size_t){ return TA::Range{Q}; });
+  for (std::size_t o = 0; o < R.range().volume(); ++o)
+    for (std::size_t e = 0; e < R.data()[o].size(); ++e)
+      R.data()[o].data()[e] = 2.0 + 0.01 * o + e;
+  Outer C = TA::detail::arena_outer_init<Outer>(
+      TA::Range{Mmu}, 1, [&](std::size_t){ return TA::Range{P}; });
+  TA::detail::arena_strided_dgemm_ce_ce_right(C, L, R, Mo, Mmu, nK,
+      blas::NoTranspose, blas::Transpose, 1.0);
+  auto ref = ref_ce_ce_right_sparse(L, R, Mo, Mmu, nK, P, 1.0);
+  check_ce_ce(C, ref, 1, Mo, Mmu, P);  // nbatch=1, rows=Mo, cols=Mmu
+  // Independent naive-oracle cross-check.
+  auto ref0 = ref_ce_ce(L, R, Mmu, nK, P, Q, 1.0);
+  for (std::size_t mu = 0; mu < Mmu; ++mu)
+    for (std::size_t a1 = 0; a1 < P; ++a1)
+      BOOST_CHECK_CLOSE(C.data()[mu].data()[a1], ref0[mu * P + a1], 1e-10);  // tolerance argument is a percentage
+  // Frozen golden baseline: the verified dense-path output for exactly this
+  // Mmu=6,nK=2,P=3,Q=4 dense fill, cross-validated by the independent naive
+  // ref_ce_ce oracle (checked above at 1e-10). The segmented kernel on a dense
+  // run must reproduce these values within the 1e-10 % tolerance below; a drift
+  // here flags a regression the recomputed references could not (invariant 4 / T1 golden).
+  static const double T1_GOLD[18] = {  // Mmu*P entries, indexed [mu*P + a1]: one row per mu
+      80.2404000000, 192.4004000000, 304.5604000000,
+      80.6412000000, 193.4412000000, 306.2412000000,
+      81.0420000000, 194.4820000000, 307.9220000000,
+      81.4428000000, 195.5228000000, 309.6028000000,
+      81.8436000000, 196.5636000000, 311.2836000000,
+      82.2444000000, 197.6044000000, 312.9644000000,
+  };
+  for (std::size_t mu = 0; mu < Mmu; ++mu)
+    for (std::size_t a1 = 0; a1 < P; ++a1)
+      BOOST_CHECK_CLOSE(C.data()[mu].data()[a1], T1_GOLD[mu * P + a1], 1e-10);
+}
+
+// T2: single interior hole at mu=4 in BOTH C and R (all k) -> two segments
+// [0,4),[5,8); C[4] stays absent; rest exact.
+BOOST_AUTO_TEST_CASE(ce_ce_seg_single_interior_hole) {
+  namespace blas = TA::math::blas;
+  const std::size_t Mo = 1, Mmu = 8, nK = 2, P = 3, Q = 4;
+  auto rhole = [&](std::size_t o) { return (o / nK) == 4; };  // R[mu=4,*]
+  auto chole = [&](std::size_t o) { return o == 4; };          // C[mu=4]
+  Outer L = make_filled(TA::Range{nK}, [&](std::size_t){return TA::Range{P, Q};}, 1.0);
+  Outer R = make_sparse(TA::Range{Mmu, nK}, 1,  // make_sparse is defined outside this chunk
+                        [&](std::size_t){ return TA::Range{Q}; }, rhole, 2.0);  // presumably rhole(o)==true leaves cell o absent
+  Outer C = make_sparse(TA::Range{Mmu}, 1,
+                        [&](std::size_t){ return TA::Range{P}; }, chole, 0.0);
+  zero_result(C, Mmu);  // start every present result cell at 0.0
+  TA::detail::arena_strided_dgemm_ce_ce_right(C, L, R, Mo, Mmu, nK,
+      blas::NoTranspose, blas::Transpose, 1.0);
+  auto ref = ref_ce_ce_right_sparse(L, R, Mo, Mmu, nK, P, 1.0);
+  check_ce_ce(C, ref, 1, Mo, Mmu, P);  // nbatch=1, rows=Mo, cols=Mmu
+}
+
+// Invariant 2 (no writes to absent result cells): a result run with holes must
+// leave those hole cells ABSENT (null) after the kernel runs -- the segmenter
+// only ever writes into pre-existing present cells, never allocates a tile in a
+// hole. Strictly pins what check_ce_ce's empty-ref branch only checks leniently.
+BOOST_AUTO_TEST_CASE(ce_ce_seg_holes_stay_absent) {
+  namespace blas = TA::math::blas;
+  const std::size_t Mo = 1, Mmu = 5, nK = 1, P = 3, Q = 4;
+  auto rhole = [&](std::size_t o) { return o == 1 || o == 3; };  // with nK=1 the R ordinal equals mu
+  auto chole = [&](std::size_t o) { return o == 1 || o == 3; };  // same two positions in C
+  Outer L = make_filled(TA::Range{nK}, [&](std::size_t){return TA::Range{P, Q};}, 1.0);
+  Outer R = make_sparse(TA::Range{Mmu, nK}, 1,
+                        [&](std::size_t){ return TA::Range{Q}; }, rhole, 2.0);
+  Outer C = make_sparse(TA::Range{Mmu}, 1,
+                        [&](std::size_t){ return TA::Range{P}; }, chole, 0.0);
+  zero_result(C, Mmu);  // start every present result cell at 0.0
+  TA::detail::arena_strided_dgemm_ce_ce_right(C, L, R, Mo, Mmu, nK,
+      blas::NoTranspose, blas::Transpose, 1.0);
+  // hole cells stay absent; present cells stay present.
+  for (std::size_t mu = 0; mu < Mmu; ++mu) {
+    if (chole(mu))
+      BOOST_CHECK(!C.data()[mu]);  // strict: the cell must test false after the kernel ran
+    else
+      BOOST_CHECK(bool(C.data()[mu]));
+  }
+  auto ref = ref_ce_ce_right_sparse(L, R, Mo, Mmu, nK, P, 1.0);
+  check_ce_ce(C, ref, 1, Mo, Mmu, P);  // then the usual value check on the present cells
+}
+
+// T3: holes at both run edges (mu=0, mu=Mmu-1; all k; R and C) -> one interior segment.
+BOOST_AUTO_TEST_CASE(ce_ce_seg_edge_holes) {
+  namespace blas = TA::math::blas;
+  const std::size_t Mo = 1, Mmu = 6, nK = 2, P = 3, Q = 4;
+  auto rhole = [&](std::size_t o) {  // R ordinal o = mu*nK + k
+    const std::size_t mu = o / nK;
+    return mu == 0 || mu == Mmu - 1;
+  };
+  auto chole = [&](std::size_t o) { return o == 0 || o == Mmu - 1; };
+  Outer L = make_filled(TA::Range{nK}, [&](std::size_t){return TA::Range{P, Q};}, 1.0);
+  Outer R = make_sparse(TA::Range{Mmu, nK}, 1,
+                        [&](std::size_t){ return TA::Range{Q}; }, rhole, 2.0);
+  Outer C = make_sparse(TA::Range{Mmu}, 1,
+                        [&](std::size_t){ return TA::Range{P}; }, chole, 0.0);
+  zero_result(C, Mmu);
+  TA::detail::arena_strided_dgemm_ce_ce_right(C, L, R, Mo, Mmu, nK,
+      blas::NoTranspose, blas::Transpose, 1.0);
+  auto ref = ref_ce_ce_right_sparse(L, R, Mo, Mmu, nK, P, 1.0);
+  check_ce_ce(C, ref, 1, Mo, Mmu, P);
+}
+
+// T4: odd mu are holes in R and C, even mu present (nK=1) -> each segment has M=1 (GEMV).
+BOOST_AUTO_TEST_CASE(ce_ce_seg_alternating_gemv) {
+  namespace blas = TA::math::blas;
+  const std::size_t Mo = 1, Mmu = 6, nK = 1, P = 3, Q = 4;
+  auto rhole = [&](std::size_t o) { return ((o / nK) % 2) == 1; };
+  auto chole = [&](std::size_t o) { return (o % 2) == 1; };
+  Outer L = make_filled(TA::Range{nK}, [&](std::size_t){return TA::Range{P, Q};}, 1.0);
+  Outer R = make_sparse(TA::Range{Mmu, nK}, 1,
+                        [&](std::size_t){ return TA::Range{Q}; }, rhole, 2.0);
+  Outer C = make_sparse(TA::Range{Mmu}, 1,
+                        [&](std::size_t){ return TA::Range{P}; }, chole, 0.0);
+  zero_result(C, Mmu);
+  TA::detail::arena_strided_dgemm_ce_ce_right(C, L, R, Mo, Mmu, nK,
+      blas::NoTranspose, blas::Transpose, 1.0);
+  auto ref = ref_ce_ce_right_sparse(L, R, Mo, Mmu, nK, P, 1.0);
+  check_ce_ce(C, ref, 1, Mo, Mmu, P);
+}
+
+// T5 (crux): holes in R differ per k. Mo=1, Mmu=3, nK=2; R ordinal = mu*nK + k.
+//   Holes are ordinals 2 (mu=1,k=0) and 1 (mu=0,k=1), so k=0 is present at
+//   mu={0,2} and k=1 at mu={1,2}. C is built with no hole predicate ({0,1,2}).
+//   Intended per-k segments: k=0 -> {0},{2}; k=1 -> {1,2}; C[2] sums both k.
+BOOST_AUTO_TEST_CASE(ce_ce_seg_per_k_misaligned) {
+  namespace blas = TA::math::blas;
+  const std::size_t Mo = 1, Mmu = 3, nK = 2, P = 3, Q = 4;
+  auto rhole = [&](std::size_t o) { return o == 2 || o == 1; };  // (mu=1,k=0),(mu=0,k=1)
+  Outer L = make_filled(TA::Range{nK}, [&](std::size_t){return TA::Range{P, Q};}, 1.0);
+  Outer R = make_sparse(TA::Range{Mmu, nK}, 1,
+                        [&](std::size_t){ return TA::Range{Q}; }, rhole, 2.0);
+  Outer C = TA::detail::arena_outer_init<Outer>(
+      TA::Range{Mmu}, 1, [&](std::size_t){ return TA::Range{P}; });
+  zero_result(C, Mmu);
+  TA::detail::arena_strided_dgemm_ce_ce_right(C, L, R, Mo, Mmu, nK,
+      blas::NoTranspose, blas::Transpose, 1.0);
+  auto ref = ref_ce_ce_right_sparse(L, R, Mo, Mmu, nK, P, 1.0);
+  check_ce_ce(C, ref, 1, Mo, Mmu, P);
+}
+
+// T6: L[k=1] is a hole (Mo=1, nK=3); R, C built without holes -> k=1 skipped, k=0,2 contribute.
+BOOST_AUTO_TEST_CASE(ce_ce_seg_absent_k) {
+  namespace blas = TA::math::blas;
+  const std::size_t Mo = 1, Mmu = 5, nK = 3, P = 3, Q = 4;
+  auto lhole = [&](std::size_t o) { return o == 1; };  // L[k=1] absent
+  Outer L = make_sparse(TA::Range{nK}, 1,
+                        [&](std::size_t){ return TA::Range{P, Q}; }, lhole, 1.0);
+  Outer R = TA::detail::arena_outer_init<Outer>(
+      TA::Range{Mmu, nK}, 1, [&](std::size_t){ return TA::Range{Q}; });
+  for (std::size_t o = 0; o < R.range().volume(); ++o)
+    for (std::size_t e = 0; e < R.data()[o].size(); ++e)
+      R.data()[o].data()[e] = 2.0 + 0.01 * o + e;  // distinct value per (cell, element)
+  Outer C = TA::detail::arena_outer_init<Outer>(
+      TA::Range{Mmu}, 1, [&](std::size_t){ return TA::Range{P}; });
+  zero_result(C, Mmu);
+  TA::detail::arena_strided_dgemm_ce_ce_right(C, L, R, Mo, Mmu, nK,
+      blas::NoTranspose, blas::Transpose, 1.0);
+  auto ref = ref_ce_ce_right_sparse(L, R, Mo, Mmu, nK, P, 1.0);
+  check_ce_ce(C, ref, 1, Mo, Mmu, P);
+}
+
+// T7: passes left_inner_transposed=true together with an interior hole at mu=3
+// (R and C). L's inner cells are built with range {Q, P}; the reference gets lt=true.
+BOOST_AUTO_TEST_CASE(ce_ce_seg_transposed_inner) {
+  namespace blas = TA::math::blas;
+  const std::size_t Mo = 1, Mmu = 6, nK = 2, P = 3, Q = 4;
+  auto rhole = [&](std::size_t o) { return (o / nK) == 3; };
+  auto chole = [&](std::size_t o) { return o == 3; };
+  // Inner range is {Q, P} here, whereas the non-transposed tests use {P, Q}.
+  Outer L = make_filled(TA::Range{nK}, [&](std::size_t){return TA::Range{Q, P};}, 1.0);
+  Outer R = make_sparse(TA::Range{Mmu, nK}, 1,
+                        [&](std::size_t){ return TA::Range{Q}; }, rhole, 2.0);
+  Outer C = make_sparse(TA::Range{Mmu}, 1,
+                        [&](std::size_t){ return TA::Range{P}; }, chole, 0.0);
+  zero_result(C, Mmu);
+  TA::detail::arena_strided_dgemm_ce_ce_right(C, L, R, Mo, Mmu, nK,
+      blas::NoTranspose, blas::Transpose, 1.0, /*left_inner_transposed=*/true);
+  auto ref = ref_ce_ce_right_sparse(L, R, Mo, Mmu, nK, P, 1.0, 1, /*lt=*/true);
+  check_ce_ce(C, ref, 1, Mo, Mmu, P);
+}
+
+// T8: NB=2 batches; batch 0 has no R holes, batch 1 has T5's holes (in-batch offsets 1, 2).
+BOOST_AUTO_TEST_CASE(ce_ce_seg_multi_batch_sparse) {
+  namespace blas = TA::math::blas;
+  const std::size_t Mo = 1, Mmu = 3, nK = 2, P = 3, Q = 4, NB = 2;
+  // o / per is the batch, o % per the in-batch ordinal mu*nK + k; holes only in batch 1.
+  auto rhole = [&](std::size_t o) {
+    const std::size_t per = Mmu * nK;
+    return (o / per) == 1 && ((o % per) == 2 || (o % per) == 1);
+  };
+  // NB batches of L (the kernel indexes L per-batch).
+  Outer Lb = TA::detail::arena_outer_init<Outer>(
+      TA::Range{nK}, NB, [&](std::size_t){ return TA::Range{P, Q}; });
+  for (std::size_t o = 0; o < NB * nK; ++o)
+    for (std::size_t e = 0; e < Lb.data()[o].size(); ++e)
+      Lb.data()[o].data()[e] = 1.0 + 0.01 * o + e;
+  Outer R = make_sparse(TA::Range{Mmu, nK}, NB,
+                        [&](std::size_t){ return TA::Range{Q}; }, rhole, 2.0);
+  Outer C = TA::detail::arena_outer_init<Outer>(
+      TA::Range{Mmu}, NB, [&](std::size_t){ return TA::Range{P}; });
+  zero_result(C, NB * Mmu);
+  TA::detail::arena_strided_dgemm_ce_ce_right(C, Lb, R, Mo, Mmu, nK,
+      blas::NoTranspose, blas::Transpose, 1.0);
+  auto ref = ref_ce_ce_right_sparse(Lb, R, Mo, Mmu, nK, P, 1.0, NB);
+  check_ce_ce(C, ref, NB, Mo, Mmu, P);
+}
+
+// T9: T2's hole pattern (mu=4) with factor=2.5 given to both the kernel and the reference.
+BOOST_AUTO_TEST_CASE(ce_ce_seg_applies_factor) {
+  namespace blas = TA::math::blas;
+  const std::size_t Mo = 1, Mmu = 8, nK = 2, P = 3, Q = 4;
+  auto rhole = [&](std::size_t o) { return (o / nK) == 4; };
+  auto chole = [&](std::size_t o) { return o == 4; };
+  Outer L = make_filled(TA::Range{nK}, [&](std::size_t){return TA::Range{P, Q};}, 1.0);
+  Outer R = make_sparse(TA::Range{Mmu, nK}, 1,
+                        [&](std::size_t){ return TA::Range{Q}; }, rhole, 2.0);
+  Outer C = make_sparse(TA::Range{Mmu}, 1,
+                        [&](std::size_t){ return TA::Range{P}; }, chole, 0.0);
+  zero_result(C, Mmu);
+  TA::detail::arena_strided_dgemm_ce_ce_right(C, L, R, Mo, Mmu, nK,
+      blas::NoTranspose, blas::Transpose, 2.5);
+  auto ref = ref_ce_ce_right_sparse(L, R, Mo, Mmu, nK, P, 2.5);
+  check_ce_ce(C, ref, 1, Mo, Mmu, P);
+}
+
+// T10 (defensive): R[mu=2,k=0] is present but has inner size Q+1 instead of Q,
+// so it cannot join a run of size-Q cells. Expectation, per the reference (whose
+// size guards are defined outside this test): the cell is skipped and nothing crashes.
+BOOST_AUTO_TEST_CASE(ce_ce_seg_size_mismatch_defensive) {
+  namespace blas = TA::math::blas;
+  const std::size_t Mo = 1, Mmu = 5, nK = 1, P = 3, Q = 4;
+  // R[mu=2,k=0] gets size Q+1; that breaks size-match so it cannot join a Q-run.
+  Outer L = make_filled(TA::Range{nK}, [&](std::size_t){return TA::Range{P, Q};}, 1.0);
+  Outer R = TA::detail::arena_outer_init<Outer>(
+      TA::Range{Mmu, nK}, 1,
+      [&](std::size_t o){ return (o / nK) == 2 ? TA::Range{Q + 1} : TA::Range{Q}; });
+  for (std::size_t o = 0; o < R.range().volume(); ++o)
+    for (std::size_t e = 0; e < R.data()[o].size(); ++e)
+      R.data()[o].data()[e] = 2.0 + 0.01 * o + e;
+  Outer C = TA::detail::arena_outer_init<Outer>(
+      TA::Range{Mmu}, 1, [&](std::size_t){ return TA::Range{P}; });
+  zero_result(C, Mmu);
+  TA::detail::arena_strided_dgemm_ce_ce_right(C, L, R, Mo, Mmu, nK,
+      blas::NoTranspose, blas::Transpose, 1.0);
+  auto ref = ref_ce_ce_right_sparse(L, R, Mo, Mmu, nK, P, 1.0);
+  check_ce_ce(C, ref, 1, Mo, Mmu, P);
+}
+
+// T11: T5 mirrored onto the LEFT kernel (run strided over m). Mo=3, No=1, nK=2.
+//   L ordinal = m*nK + k; holes are ordinals 2 (m=1,k=0) and 1 (m=0,k=1), so
+//   k=0 is present at m={0,2} and k=1 at m={1,2}. C is built with no hole predicate.
+BOOST_AUTO_TEST_CASE(ce_ce_left_seg_per_k_misaligned) {
+  namespace blas = TA::math::blas;
+  const std::size_t Mo = 3, No = 1, nK = 2, P = 3, Q = 4;
+  auto lhole = [&](std::size_t o) { return o == 2 || o == 1; };
+  Outer L = make_sparse(TA::Range{Mo, nK}, 1,
+                        [&](std::size_t){ return TA::Range{Q}; }, lhole, 1.0);
+  // R (matrix) canonical (k slow, n fast) outer (nK,No), inner {Q,P} row-major.
+  Outer R = TA::detail::arena_outer_init<Outer>(
+      TA::Range{nK, No}, 1, [&](std::size_t){ return TA::Range{Q, P}; });
+  for (std::size_t o = 0; o < R.range().volume(); ++o)
+    for (std::size_t e = 0; e < R.data()[o].size(); ++e)
+      R.data()[o].data()[e] = 2.0 + 0.01 * o + e;
+  Outer C = TA::detail::arena_outer_init<Outer>(
+      TA::Range{Mo, No}, 1, [&](std::size_t){ return TA::Range{P}; });
+  zero_result(C, Mo * No);
+  TA::detail::arena_strided_dgemm_ce_ce_left(C, L, R, Mo, No, nK,
+      blas::NoTranspose, blas::NoTranspose, 1.0);
+  auto ref = ref_ce_ce_left_sparse(L, R, Mo, No, nK, P, 1.0);
+  check_ce_ce(C, ref, 1, Mo, No, P);
+}
+
+// T12: LEFT-kernel mirror of T2: L[m=4,k] (all k) and C[m=4] are holes; Mo=8, No=1.
+BOOST_AUTO_TEST_CASE(ce_ce_left_seg_single_interior_hole) {
+  namespace blas = TA::math::blas;
+  const std::size_t Mo = 8, No = 1, nK = 2, P = 3, Q = 4;
+  auto lhole = [&](std::size_t o) { return (o / nK) == 4; };  // L[m=4,*]
+  auto chole = [&](std::size_t o) { return (o / No) == 4; };  // C[m=4]
+  Outer L = make_sparse(TA::Range{Mo, nK}, 1,
+                        [&](std::size_t){ return TA::Range{Q}; }, lhole, 1.0);
+  Outer R = TA::detail::arena_outer_init<Outer>(
+      TA::Range{nK, No}, 1, [&](std::size_t){ return TA::Range{Q, P}; });
+  for (std::size_t o = 0; o < R.range().volume(); ++o)
+    for (std::size_t e = 0; e < R.data()[o].size(); ++e)
+      R.data()[o].data()[e] = 2.0 + 0.01 * o + e;
+  Outer C = make_sparse(TA::Range{Mo, No}, 1,
+                        [&](std::size_t){ return TA::Range{P}; }, chole, 0.0);
+  zero_result(C, Mo * No);
+  TA::detail::arena_strided_dgemm_ce_ce_left(C, L, R, Mo, No, nK,
+      blas::NoTranspose, blas::NoTranspose, 1.0);
+  auto ref = ref_ce_ce_left_sparse(L, R, Mo, No, nK, P, 1.0);
+  check_ce_ce(C, ref, 1, Mo, No, P);
+}
+
+// ---- Segment-count assertions (need -DTA_STRIDED_DGEMM_COUNT) ----
+
+// T13: no holes, Mmu=6, nK=2 -> counter must read 2 (one full-run segment-GEMM per k).
+BOOST_AUTO_TEST_CASE(ce_ce_seg_count_dense) {
+#ifdef TA_STRIDED_DGEMM_COUNT
+  namespace blas = TA::math::blas;
+  const std::size_t Mo = 1, Mmu = 6, nK = 2, P = 3, Q = 4;
+  Outer L = make_filled(TA::Range{nK}, [&](std::size_t){return TA::Range{P, Q};}, 1.0);
+  Outer R = TA::detail::arena_outer_init<Outer>(
+      TA::Range{Mmu, nK}, 1, [&](std::size_t){ return TA::Range{Q}; });
+  for (std::size_t o = 0; o < R.range().volume(); ++o)
+    for (std::size_t e = 0; e < R.data()[o].size(); ++e)
+      R.data()[o].data()[e] = 2.0 + 0.01 * o + e;
+  Outer C = TA::detail::arena_outer_init<Outer>(
+      TA::Range{Mmu}, 1, [&](std::size_t){ return TA::Range{P}; });
+  zero_result(C, Mmu);
+  TA::detail::g_strided_dgemm_ce_ce_right_calls = 0;  // count only the call below
+  TA::detail::arena_strided_dgemm_ce_ce_right(C, L, R, Mo, Mmu, nK,
+      blas::NoTranspose, blas::Transpose, 1.0);
+  BOOST_CHECK_EQUAL(
+      TA::detail::g_strided_dgemm_ce_ce_right_calls.load(), std::size_t{2});
+#else
+  BOOST_TEST_MESSAGE("ce_ce_seg_count_dense skipped (no TA_STRIDED_DGEMM_COUNT)");
+#endif
+}
+
+// T14: T5's R holes (ordinals 1, 2) -> counter must read 3 (k=0: {0},{2}; k=1: {1,2}).
+BOOST_AUTO_TEST_CASE(ce_ce_seg_count_per_k_misaligned) {
+#ifdef TA_STRIDED_DGEMM_COUNT
+  namespace blas = TA::math::blas;
+  const std::size_t Mo = 1, Mmu = 3, nK = 2, P = 3, Q = 4;
+  auto rhole = [&](std::size_t o) { return o == 2 || o == 1; };
+  Outer L = make_filled(TA::Range{nK}, [&](std::size_t){return TA::Range{P, Q};}, 1.0);
+  Outer R = make_sparse(TA::Range{Mmu, nK}, 1,
+                        [&](std::size_t){ return TA::Range{Q}; }, rhole, 2.0);
+  Outer C = TA::detail::arena_outer_init<Outer>(
+      TA::Range{Mmu}, 1, [&](std::size_t){ return TA::Range{P}; });
+  zero_result(C, Mmu);
+  TA::detail::g_strided_dgemm_ce_ce_right_calls = 0;  // count only the call below
+  TA::detail::arena_strided_dgemm_ce_ce_right(C, L, R, Mo, Mmu, nK,
+      blas::NoTranspose, blas::Transpose, 1.0);
+  BOOST_CHECK_EQUAL(
+      TA::detail::g_strided_dgemm_ce_ce_right_calls.load(), std::size_t{3});
+#else
+  BOOST_TEST_MESSAGE(
+      "ce_ce_seg_count_per_k_misaligned skipped (no TA_STRIDED_DGEMM_COUNT)");
+#endif
+}
+
+// T15: T4's pattern (odd mu holes, Mmu=6, nK=1) -> counter must read 3 (M=1 segments).
+BOOST_AUTO_TEST_CASE(ce_ce_seg_count_alternating) {
+#ifdef TA_STRIDED_DGEMM_COUNT
+  namespace blas = TA::math::blas;
+  const std::size_t Mo = 1, Mmu = 6, nK = 1, P = 3, Q = 4;
+  auto rhole = [&](std::size_t o) { return ((o / nK) % 2) == 1; };
+  auto chole = [&](std::size_t o) { return (o % 2) == 1; };
+  Outer L = make_filled(TA::Range{nK}, [&](std::size_t){return TA::Range{P, Q};}, 1.0);
+  Outer R = make_sparse(TA::Range{Mmu, nK}, 1,
+                        [&](std::size_t){ return TA::Range{Q}; }, rhole, 2.0);
+  Outer C = make_sparse(TA::Range{Mmu}, 1,
+                        [&](std::size_t){ return TA::Range{P}; }, chole, 0.0);
+  zero_result(C, Mmu);
+  TA::detail::g_strided_dgemm_ce_ce_right_calls = 0;  // count only the call below
+  TA::detail::arena_strided_dgemm_ce_ce_right(C, L, R, Mo, Mmu, nK,
+      blas::NoTranspose, blas::Transpose, 1.0);
+  BOOST_CHECK_EQUAL(
+      TA::detail::g_strided_dgemm_ce_ce_right_calls.load(), std::size_t{3});
+#else
+  BOOST_TEST_MESSAGE(
+      "ce_ce_seg_count_alternating skipped (no TA_STRIDED_DGEMM_COUNT)");
+#endif
+}
+
+
+// T16: LEFT-kernel mirror of T14. With per-k misaligned holes along m the LEFT
+// call counter must read 3 (k=0: m in {0},{2} = 2 segs; k=1: m in {1,2} = 1
+// seg), i.e. the run is segmented rather than dropped whole to a scalar fallback.
+BOOST_AUTO_TEST_CASE(ce_ce_left_seg_count_per_k_misaligned) {
+#ifdef TA_STRIDED_DGEMM_COUNT
+  namespace blas = TA::math::blas;
+  const std::size_t Mo = 3, No = 1, nK = 2, P = 3, Q = 4;
+  // L outer (Mo,nK) ordinal = m*nK + k. Present set: k=0 -> m{0,2}; k=1 -> m{1,2}.
+  // Holes: (m=1,k=0)=ord2, (m=0,k=1)=ord1.
+  auto lhole = [&](std::size_t o) { return o == 2 || o == 1; };
+  Outer L = make_sparse(TA::Range{Mo, nK}, 1,
+                        [&](std::size_t){ return TA::Range{Q}; }, lhole, 1.0);
+  Outer R = TA::detail::arena_outer_init<Outer>(
+      TA::Range{nK, No}, 1, [&](std::size_t){ return TA::Range{Q, P}; });
+  for (std::size_t o = 0; o < R.range().volume(); ++o)
+    for (std::size_t e = 0; e < R.data()[o].size(); ++e)
+      R.data()[o].data()[e] = 2.0 + 0.01 * o + e;
+  Outer C = TA::detail::arena_outer_init<Outer>(
+      TA::Range{Mo, No}, 1, [&](std::size_t){ return TA::Range{P}; });
+  zero_result(C, Mo * No);
+  TA::detail::g_strided_dgemm_ce_ce_left_calls.store(0);  // count only the call below
+  TA::detail::arena_strided_dgemm_ce_ce_left(C, L, R, Mo, No, nK,
+      blas::NoTranspose, blas::NoTranspose, 1.0);
+  BOOST_CHECK_EQUAL(TA::detail::g_strided_dgemm_ce_ce_left_calls.load(),
+                    std::size_t{3});
+  // Besides the count, the values must match the sparsity-aware reference.
+  auto ref = ref_ce_ce_left_sparse(L, R, Mo, No, nK, P, 1.0);
+  check_ce_ce(C, ref, 1, Mo, No, P);
+#else
+  BOOST_TEST_MESSAGE(
+      "ce_ce_left_seg_count_per_k_misaligned skipped (no TA_STRIDED_DGEMM_COUNT)");
+#endif
+}
+
+// LEFT-kernel mirror of T7: passes right_inner_transposed=true with a hole at
+// m=3 in L (all k) and C. R's inner cells use range {P, Q}; reference gets rt=true.
+BOOST_AUTO_TEST_CASE(ce_ce_left_seg_transposed_inner) {
+  namespace blas = TA::math::blas;
+  const std::size_t Mo = 8, No = 1, nK = 2, P = 3, Q = 4;
+  auto lhole = [&](std::size_t o) { return (o / nK) == 3; };  // L[m=3,*]
+  auto chole = [&](std::size_t o) { return (o / No) == 3; };  // C[m=3]
+  Outer L = make_sparse(TA::Range{Mo, nK}, 1,
+                        [&](std::size_t){ return TA::Range{Q}; }, lhole, 1.0);
+  // R stored P x Q (matrix_transpose layout): (b1,a4) row-major.
+  Outer R = make_filled(TA::Range{nK, No},
+                        [&](std::size_t){ return TA::Range{P, Q}; }, 2.0);
+  Outer C = make_sparse(TA::Range{Mo, No}, 1,
+                        [&](std::size_t){ return TA::Range{P}; }, chole, 0.0);
+  zero_result(C, Mo * No);
+  TA::detail::arena_strided_dgemm_ce_ce_left(
+      C, L, R, Mo, No, nK, blas::NoTranspose, blas::NoTranspose, 1.0,
+      /*right_inner_transposed=*/true);
+  auto ref = ref_ce_ce_left_sparse(L, R, Mo, No, nK, P, 1.0, 1, /*rt=*/true);
+  check_ce_ce(C, ref, 1, Mo, No, P);
+}
+
+// RIGHT kernel with left_op=Transpose: L is built with outer range {nK, Mo}, so
+// the inline reference reads it at ordinal k*Mo+m. No holes. Each C[m,mu](a1) is
+// compared via BOOST_CHECK_CLOSE (tolerance 1e-10) against the explicit sum below.
+BOOST_AUTO_TEST_CASE(ce_ce_right_seg_left_op_transpose) {
+  namespace blas = TA::math::blas;
+  const std::size_t Mo = 2, Mmu = 3, nK = 2, P = 2, Q = 3;
+  Outer L = make_filled(TA::Range{nK, Mo},
+                        [&](std::size_t){ return TA::Range{P, Q}; }, 1.0);
+  Outer R = TA::detail::arena_outer_init<Outer>(
+      TA::Range{Mmu, nK}, 1, [&](std::size_t){ return TA::Range{Q}; });
+  for (std::size_t o = 0; o < R.range().volume(); ++o)
+    for (std::size_t e = 0; e < R.data()[o].size(); ++e)
+      R.data()[o].data()[e] = 2.0 + 0.01 * o + e;
+  Outer C = TA::detail::arena_outer_init<Outer>(
+      TA::Range{Mo, Mmu}, 1, [&](std::size_t){ return TA::Range{P}; });
+  zero_result(C, Mo * Mmu);
+  TA::detail::arena_strided_dgemm_ce_ce_right(C, L, R, Mo, Mmu, nK,
+      blas::Transpose, blas::Transpose, 1.0);
+  // C[m,mu](a1) = sum_k sum_a4 L_phys[k*Mo+m](a1,a4) * R[mu,k](a4)
+  for (std::size_t m = 0; m < Mo; ++m)
+    for (std::size_t mu = 0; mu < Mmu; ++mu) {
+      const double* got = C.data()[m * Mmu + mu].data();
+      for (std::size_t a1 = 0; a1 < P; ++a1) {
+        double acc = 0.0;
+        for (std::size_t k = 0; k < nK; ++k) {
+          const double* l = L.data()[k * Mo + m].data();   // P x Q
+          const double* r = R.data()[mu * nK + k].data();  // Q
+          for (std::size_t a4 = 0; a4 < Q; ++a4) acc += l[a1 * Q + a4] * r[a4];
+        }
+        BOOST_CHECK_CLOSE(got[a1], acc, 1e-10);
+      }
+    }
+}
+
+// LEFT kernel with right_op=Transpose: R is built with outer range {No, nK}, so
+// the inline reference reads it at ordinal n*nK+k. No holes. Each C[m,n](b1) is
+// compared via BOOST_CHECK_CLOSE (tolerance 1e-10) against the explicit sum below.
+BOOST_AUTO_TEST_CASE(ce_ce_left_seg_right_op_transpose) {
+  namespace blas = TA::math::blas;
+  const std::size_t Mo = 3, No = 2, nK = 2, P = 2, Q = 3;
+  Outer L = TA::detail::arena_outer_init<Outer>(
+      TA::Range{Mo, nK}, 1, [&](std::size_t){ return TA::Range{Q}; });
+  for (std::size_t o = 0; o < L.range().volume(); ++o)
+    for (std::size_t e = 0; e < L.data()[o].size(); ++e)
+      L.data()[o].data()[e] = 1.0 + 0.01 * o + e;
+  // R physical (n slow, k fast) = n*nK+k, inner Q x P (canonical a4,b1).
+  Outer R = make_filled(TA::Range{No, nK},
+                        [&](std::size_t){ return TA::Range{Q, P}; }, 2.0);
+  Outer C = TA::detail::arena_outer_init<Outer>(
+      TA::Range{Mo, No}, 1, [&](std::size_t){ return TA::Range{P}; });
+  zero_result(C, Mo * No);
+  TA::detail::arena_strided_dgemm_ce_ce_left(C, L, R, Mo, No, nK,
+      blas::NoTranspose, blas::Transpose, 1.0);
+  // C[m,n](b1) = sum_k sum_a4 L[m*nK+k](a4) * R_phys[n*nK+k](a4,b1)
+  for (std::size_t m = 0; m < Mo; ++m)
+    for (std::size_t n = 0; n < No; ++n) {
+      const double* got = C.data()[m * No + n].data();
+      for (std::size_t b1 = 0; b1 < P; ++b1) {
+        double acc = 0.0;
+        for (std::size_t k = 0; k < nK; ++k) {
+          const double* l = L.data()[m * nK + k].data();   // Q
+          const double* r = R.data()[n * nK + k].data();   // Q x P
+          for (std::size_t a4 = 0; a4 < Q; ++a4) acc += l[a4] * r[a4 * P + b1];
+        }
+        BOOST_CHECK_CLOSE(got[b1], acc, 1e-10);
+      }
+    }
+}
+
+// L1: LEFT-kernel mirror of T2/T9: hole at m=4 in L (all k) and in C, with
+// factor=2.5 -> intended m-segments [0,4) and [5,8); values checked via check_ce_ce.
+BOOST_AUTO_TEST_CASE(ce_ce_left_seg_hole_and_factor) {
+  namespace blas = TA::math::blas;
+  const std::size_t Mo = 8, No = 1, nK = 2, P = 3, Q = 4;
+  const double factor = 2.5;
+  auto lhole = [&](std::size_t o) { return (o / nK) == 4; };  // L[m=4,*]
+  auto chole = [&](std::size_t o) { return o == 4; };         // C[m=4,n=0]
+  Outer L = make_sparse(TA::Range{Mo, nK}, 1,
+                        [&](std::size_t){ return TA::Range{Q}; }, lhole, 1.0);
+  Outer R = make_filled(TA::Range{nK, No},
+                        [&](std::size_t){ return TA::Range{Q, P}; }, 2.0);
+  Outer C = make_sparse(TA::Range{Mo, No}, 1,
+                        [&](std::size_t){ return TA::Range{P}; }, chole, 0.0);
+  zero_result(C, C.range().volume());
+  TA::detail::arena_strided_dgemm_ce_ce_left(C, L, R, Mo, No, nK,
+      blas::NoTranspose, blas::NoTranspose, factor);
+  auto ref = ref_ce_ce_left_sparse(L, R, Mo, No, nK, P, factor);
+  check_ce_ce(C, ref, 1, Mo, No, P);
+}
+
+// L2: LEFT-kernel mirror of T6. The single-cell operand R[k=1,n] is a hole for
+// every n (No=1), so k=1 is expected to be skipped (the kernel's `!rk` guard,
+// not visible here) while k=0 and k=2 contribute. Strided-operand case: see L2b.
+BOOST_AUTO_TEST_CASE(ce_ce_left_seg_absent_k) {
+  namespace blas = TA::math::blas;
+  const std::size_t Mo = 5, No = 1, nK = 3, P = 3, Q = 4;
+  // R[k=1,n] absent for all n -> k=1 skipped entirely (beta=1).
+  auto rhole = [&](std::size_t o) { return (o / No) == 1; };  // R[k=1,*]
+  Outer L = make_filled(TA::Range{Mo, nK},
+                        [&](std::size_t){ return TA::Range{Q}; }, 1.0);
+  Outer R = make_sparse(TA::Range{nK, No}, 1,
+                        [&](std::size_t){ return TA::Range{Q, P}; }, rhole, 2.0);
+  Outer C = make_filled(TA::Range{Mo, No},
+                        [&](std::size_t){ return TA::Range{P}; }, 0.0);
+  zero_result(C, C.range().volume());
+  TA::detail::arena_strided_dgemm_ce_ce_left(C, L, R, Mo, No, nK,
+      blas::NoTranspose, blas::NoTranspose, 1.0);
+  auto ref = ref_ce_ce_left_sparse(L, R, Mo, No, nK, P, 1.0);
+  check_ce_ce(C, ref, 1, Mo, No, P);
+}
+
+// L2b: the STRIDED operand is entirely absent for one k. Every L[m,k=1] is a
+// hole while R[k=1,n] stays present, so k=1 is expected to be skipped (the
+// kernel's `Q <= 0` guard, not visible here) while the other k contribute. The
+// reference (defined elsewhere) is expected to skip the same k, keeping results exact.
+BOOST_AUTO_TEST_CASE(ce_ce_left_seg_strided_operand_absent_k) {
+  namespace blas = TA::math::blas;
+  const std::size_t Mo = 5, No = 1, nK = 3, P = 3, Q = 4;
+  // L[m,k=1] absent for all m (ordinal o = m*nK + k, so k==1).
+  auto lhole = [&](std::size_t o) { return (o % nK) == 1; };
+  Outer L = make_sparse(TA::Range{Mo, nK}, 1,
+                        [&](std::size_t){ return TA::Range{Q}; }, lhole, 1.0);
+  Outer R = make_filled(TA::Range{nK, No},
+                        [&](std::size_t){ return TA::Range{Q, P}; }, 2.0);
+  Outer C = make_filled(TA::Range{Mo, No},
+                        [&](std::size_t){ return TA::Range{P}; }, 0.0);
+  zero_result(C, C.range().volume());
+  TA::detail::arena_strided_dgemm_ce_ce_left(C, L, R, Mo, No, nK,
+      blas::NoTranspose, blas::NoTranspose, 1.0);
+  auto ref = ref_ce_ce_left_sparse(L, R, Mo, No, nK, P, 1.0);
+  check_ce_ce(C, ref, 1, Mo, No, P);
+}
+
+// L3: LEFT-kernel mirror of T10 (defensive size mismatch). L[m=2,k=0] is present
+// but has inner size Q+1 instead of Q; that (m,k) is expected to be skipped, the
+// rest to stay exact, and the call not to crash.
+BOOST_AUTO_TEST_CASE(ce_ce_left_seg_size_mismatch_defensive) {
+  namespace blas = TA::math::blas;
+  const std::size_t Mo = 4, No = 1, nK = 2, P = 3, Q = 4;
+  Outer L = TA::detail::arena_outer_init<Outer>(
+      TA::Range{Mo, nK}, 1, [&](std::size_t o) {
+        // L[m=2,k=0] (ordinal 2*nK+0 = 4) has size Q+1; all others size Q.
+        return (o == 4) ? TA::Range{Q + 1} : TA::Range{Q};
+      });
+  for (std::size_t o = 0; o < L.range().volume(); ++o)
+    for (std::size_t e = 0; e < L.data()[o].size(); ++e)
+      L.data()[o].data()[e] = 1.0 + 0.01 * o + e;
+  Outer R = make_filled(TA::Range{nK, No},
+                        [&](std::size_t){ return TA::Range{Q, P}; }, 2.0);
+  Outer C = make_filled(TA::Range{Mo, No},
+                        [&](std::size_t){ return TA::Range{P}; }, 0.0);
+  zero_result(C, C.range().volume());
+  TA::detail::arena_strided_dgemm_ce_ce_left(C, L, R, Mo, No, nK,
+      blas::NoTranspose, blas::NoTranspose, 1.0);
+  // The reference (defined elsewhere) is expected to skip k where L[m,k].size() != Q.
+  auto ref = ref_ce_ce_left_sparse(L, R, Mo, No, nK, P, 1.0);
+  check_ce_ce(C, ref, 1, Mo, No, P);
+}
+
+// L4: LEFT-kernel mirror of T8 (each batch segmented independently). NB=2;
+// batch 0 has no holes, batch 1 has one interior hole at m=2 in both L and C.
+BOOST_AUTO_TEST_CASE(ce_ce_left_seg_multi_batch_sparse) {
+  namespace blas = TA::math::blas;
+  const std::size_t Mo = 4, No = 1, nK = 2, P = 3, Q = 4, NB = 2;
+  // hole at batch-1 m=2: ordinal in L is b*Mo*nK + m*nK + k; in C b*Mo*No+m*No+n.
+  auto lhole = [&](std::size_t o) {
+    const std::size_t b = o / (Mo * nK);
+    const std::size_t m = (o % (Mo * nK)) / nK;
+    return b == 1 && m == 2;
+  };
+  auto chole = [&](std::size_t o) {
+    const std::size_t b = o / (Mo * No);
+    const std::size_t m = (o % (Mo * No)) / No;
+    return b == 1 && m == 2;
+  };
+  Outer L = make_sparse(TA::Range{Mo, nK}, NB,
+                        [&](std::size_t){ return TA::Range{Q}; }, lhole, 1.0);
+  // R needs one copy per batch; build NB batches explicitly (make_filled gives nbatch=1).
+  Outer Rb = TA::detail::arena_outer_init<Outer>(
+      TA::Range{nK, No}, NB, [&](std::size_t){ return TA::Range{Q, P}; });
+  for (std::size_t o = 0; o < Rb.range().volume() * NB; ++o)
+    for (std::size_t e = 0; e < Rb.data()[o].size(); ++e)
+      Rb.data()[o].data()[e] = 2.0 + 0.01 * o + e;
+  Outer C = make_sparse(TA::Range{Mo, No}, NB,
+                        [&](std::size_t){ return TA::Range{P}; }, chole, 0.0);
+  zero_result(C, C.range().volume() * NB);
+  TA::detail::arena_strided_dgemm_ce_ce_left(C, L, Rb, Mo, No, nK,
+      blas::NoTranspose, blas::NoTranspose, 1.0);
+  auto ref = ref_ce_ce_left_sparse(L, Rb, Mo, No, nK, P, 1.0, NB);
+  check_ce_ce(C, ref, NB, Mo, No, P);
+}
+
+// L5: when every result cell is a hole, both kernels (RIGHT, then LEFT) must
+// return without throwing and leave every result cell absent.
+BOOST_AUTO_TEST_CASE(ce_ce_seg_all_absent_run_is_noop) {
+  namespace blas = TA::math::blas;
+  const std::size_t Mo = 1, Mmu = 4, No = 1, nK = 2, P = 3, Q = 4;
+  // RIGHT: every C[mu] absent.
+  Outer Lr = make_filled(TA::Range{nK}, [&](std::size_t){return TA::Range{P,Q};}, 1.0);
+  Outer Rr = make_filled(TA::Range{Mmu, nK}, [&](std::size_t){return TA::Range{Q};}, 2.0);
+  Outer Cr = make_sparse(TA::Range{Mmu}, 1,
+                         [&](std::size_t){ return TA::Range{P}; },
+                         [](std::size_t){ return true; }, 0.0);  // all holes
+  BOOST_REQUIRE_NO_THROW(TA::detail::arena_strided_dgemm_ce_ce_right(
+      Cr, Lr, Rr, Mo, Mmu, nK, blas::NoTranspose, blas::Transpose, 1.0));
+  for (std::size_t o = 0; o < Cr.range().volume(); ++o)
+    BOOST_CHECK(!Cr.data()[o]);  // all stay absent
+  // LEFT: every C[m,n] absent (Mo=No=1 here, so C has a single cell).
+  Outer Ll = make_filled(TA::Range{Mo, nK}, [&](std::size_t){return TA::Range{Q};}, 1.0);
+  Outer Rl = make_filled(TA::Range{nK, No}, [&](std::size_t){return TA::Range{Q,P};}, 2.0);
+  Outer Cl = make_sparse(TA::Range{Mo, No}, 1,
+                         [&](std::size_t){ return TA::Range{P}; },
+                         [](std::size_t){ return true; }, 0.0);
+  BOOST_REQUIRE_NO_THROW(TA::detail::arena_strided_dgemm_ce_ce_left(
+      Cl, Ll, Rl, Mo, No, nK, blas::NoTranspose, blas::NoTranspose, 1.0));
+  for (std::size_t o = 0; o < Cl.range().volume(); ++o)
+    BOOST_CHECK(!Cl.data()[o]);
+}
+
+// K1: the kill switch must not change the math. On a per-k staggered hole
+// pattern (R[mu,k] absent when (mu+k)%3==0) the RIGHT kernel is run once with
+// ce_ce_strided_disabled()=false and once with it =true. Presence flags and cell
+// sizes must match exactly; values must agree within BOOST_CHECK_CLOSE's 1e-12
+// tolerance (close, not bitwise). The switch is reset to false before comparing.
+BOOST_AUTO_TEST_CASE(ce_ce_seg_killswitch_matches_right) {
+  namespace blas = TA::math::blas;
+  const std::size_t Mo = 1, Mmu = 8, nK = 3, P = 3, Q = 4;
+  auto rhole = [&](std::size_t o) {
+    const std::size_t mu = o / nK, k = o % nK;
+    return ((mu + k) % 3) == 0;  // staggered per k
+  };
+  auto chole = [&](std::size_t o) {  // hole only if R is a hole for every k (never, nK=3)
+    for (std::size_t k = 0; k < nK; ++k)
+      if (!(((o + k) % 3) == 0)) return false;
+    return true;
+  };
+  auto build = [&]() {
+    Outer L = make_filled(TA::Range{nK},
+                          [&](std::size_t){ return TA::Range{P, Q}; }, 1.0);
+    Outer R = make_sparse(TA::Range{Mmu, nK}, 1,
+                          [&](std::size_t){ return TA::Range{Q}; }, rhole, 2.0);
+    Outer C = make_sparse(TA::Range{Mmu}, 1,
+                          [&](std::size_t){ return TA::Range{P}; }, chole, 0.0);
+    zero_result(C, C.range().volume());
+    return std::make_tuple(std::move(L), std::move(R), std::move(C));
+  };
+  auto [Ls, Rs, Cs] = build();  // switch OFF (segment walker)
+  TA::detail::ce_ce_strided_disabled() = false;
+  TA::detail::arena_strided_dgemm_ce_ce_right(Cs, Ls, Rs, Mo, Mmu, nK,
+      blas::NoTranspose, blas::Transpose, 1.0);
+  auto [Lp, Rp, Cp] = build();  // switch ON (per-cell)
+  TA::detail::ce_ce_strided_disabled() = true;
+  TA::detail::arena_strided_dgemm_ce_ce_right(Cp, Lp, Rp, Mo, Mmu, nK,
+      blas::NoTranspose, blas::Transpose, 1.0);
+  TA::detail::ce_ce_strided_disabled() = false;  // restore production default
+  for (std::size_t o = 0; o < Cs.range().volume(); ++o) {
+    BOOST_REQUIRE_EQUAL(bool(Cs.data()[o]), bool(Cp.data()[o]));
+    if (!Cs.data()[o]) continue;
+    BOOST_REQUIRE_EQUAL(Cs.data()[o].size(), Cp.data()[o].size());
+    for (std::size_t a1 = 0; a1 < Cs.data()[o].size(); ++a1)
+      BOOST_CHECK_CLOSE(Cs.data()[o].data()[a1], Cp.data()[o].data()[a1], 1e-12);
+  }
+}
+
+// K2: K1's kill-switch equivalence check for the LEFT kernel (holes staggered per k in L).
+BOOST_AUTO_TEST_CASE(ce_ce_seg_killswitch_matches_left) {
+  namespace blas = TA::math::blas;
+  const std::size_t Mo = 8, No = 1, nK = 3, P = 3, Q = 4;
+  auto lhole = [&](std::size_t o) {  // L ordinal o = m*nK + k; hole iff (m+k)%3==0
+    return ((o / nK + o % nK) % 3) == 0;
+  };
+  auto chole = [&](std::size_t o) {  // hole only if ((o + k) % 3) == 0 for every k
+    for (std::size_t k = 0; k < nK; ++k)
+      if (!(((o + k) % 3) == 0)) return false;
+    return true;
+  };
+  auto build = [&]() {  // fresh, identical operands for each of the two runs
+    Outer L = make_sparse(TA::Range{Mo, nK}, 1,
+                          [&](std::size_t){ return TA::Range{Q}; }, lhole, 1.0);
+    Outer R = make_filled(TA::Range{nK, No},
+                          [&](std::size_t){ return TA::Range{Q, P}; }, 2.0);
+    Outer C = make_sparse(TA::Range{Mo, No}, 1,
+                          [&](std::size_t){ return TA::Range{P}; }, chole, 0.0);
+    zero_result(C, C.range().volume());
+    return std::make_tuple(std::move(L), std::move(R), std::move(C));
+  };
+  auto [Ls, Rs, Cs] = build();  // switch OFF (segment walker)
+  TA::detail::ce_ce_strided_disabled() = false;
+  TA::detail::arena_strided_dgemm_ce_ce_left(Cs, Ls, Rs, Mo, No, nK,
+      blas::NoTranspose, blas::NoTranspose, 1.0);
+  auto [Lp, Rp, Cp] = build();  // switch ON (per-cell)
+  TA::detail::ce_ce_strided_disabled() = true;
+  TA::detail::arena_strided_dgemm_ce_ce_left(Cp, Lp, Rp, Mo, No, nK,
+      blas::NoTranspose, blas::NoTranspose, 1.0);
+  TA::detail::ce_ce_strided_disabled() = false;  // restore production default
+  for (std::size_t o = 0; o < Cs.range().volume(); ++o) {
+    BOOST_REQUIRE_EQUAL(bool(Cs.data()[o]), bool(Cp.data()[o]));
+    if (!Cs.data()[o]) continue;
+    BOOST_REQUIRE_EQUAL(Cs.data()[o].size(), Cp.data()[o].size());  // guards loop bound
+    for (std::size_t a1 = 0; a1 < Cs.data()[o].size(); ++a1)
+      BOOST_CHECK_CLOSE(Cs.data()[o].data()[a1], Cp.data()[o].data()[a1], 1e-12);
+  }
+}
+
+BOOST_AUTO_TEST_SUITE_END()
diff --git a/tests/einsum.cpp b/tests/einsum.cpp
index 7ef67f176d..263c2643a5 100644
--- a/tests/einsum.cpp
+++ b/tests/einsum.cpp
@@ -25,6 +25,10 @@
 
 #include "TiledArray/expressions/contraction_helpers.h"
 
+#include "TiledArray/tensor/arena_einsum.h"
+#include "TiledArray/tensor/arena_kernels.h"
+#include "TiledArray/tensor/arena_tensor.h"
+
 BOOST_AUTO_TEST_SUITE(einsum_manual)
 
 namespace {
@@ -1366,6 +1370,1846 @@ BOOST_AUTO_TEST_CASE(tensor_contract) {
 }
 #endif
 
+// --------------------------------------------------------------------------
+// strided-DGEMM ce+e (hce+e) e2e oracles
+// --------------------------------------------------------------------------
+
+// End-to-end oracle for c(i,k;p,q) = sum_j a(i,j;p) * b(k,j;q): i and k are
+// distinct outer externals, j is contracted, and the inner result is the p x q
+// OUTER product (no inner contraction). The same numeric problem is run on an
+// *arena*-backed ToT DistArray (ArenaTensor inner cells) and on an *owning*
+// ToT DistArray (TA::Tensor inner cells); the einsum results must agree.
+BOOST_AUTO_TEST_CASE(hce_e_contraction_arena_matches_owning) {
+  using ArenaInner = TA::ArenaTensor<double, TA::Range>;
+  using ArenaOuter = TA::Tensor<ArenaInner>;
+  using ArenaArr = TA::DistArray<ArenaOuter, TA::DensePolicy>;
+  using OwnInner = TA::Tensor<double>;
+  using OwnOuter = TA::Tensor<OwnInner>;
+  using OwnArr = TA::DistArray<OwnOuter, TA::DensePolicy>;
+
+  auto& world = TiledArray::get_default_world();
+
+  constexpr long I = 2, J = 2, K = 3, P = 3, Q = 4;
+
+  TA::TiledRange trange_a{{0l, I}, {0l, J}};  // outer modes (i,j)
+  TA::TiledRange trange_b{{0l, K}, {0l, J}};  // outer modes (k,j)
+
+  auto fill_a = [](long i, long j, long p) {
+    return 1.0 + 0.5 * i + 0.25 * j + 0.125 * p;
+  };
+  auto fill_b = [](long k, long j, long q) {
+    return 2.0 - 0.3 * k + 0.2 * j + 0.05 * q;
+  };
+
+  ArenaArr arena_a(world, trange_a);
+  arena_a.init_tiles([&](const TA::Range& tile_range) {
+    ArenaOuter tile = TA::detail::arena_outer_init<ArenaOuter>(
+        tile_range, 1, [](std::size_t /*ord*/) { return TA::Range{P}; });
+    for (std::size_t n = 0; n < tile.range().volume(); ++n) {
+      ArenaInner& cell = tile.data()[n];
+      if (!cell) continue;
+      const long i = static_cast<long>(n / J);  // decode (i,j) from ordinal
+      const long j = static_cast<long>(n % J);
+      for (long p = 0; p < P; ++p) cell.data()[p] = fill_a(i, j, p);
+    }
+    return tile;
+  });
+  ArenaArr arena_b(world, trange_b);
+  arena_b.init_tiles([&](const TA::Range& tile_range) {
+    ArenaOuter tile = TA::detail::arena_outer_init<ArenaOuter>(
+        tile_range, 1, [](std::size_t /*ord*/) { return TA::Range{Q}; });
+    for (std::size_t n = 0; n < tile.range().volume(); ++n) {
+      ArenaInner& cell = tile.data()[n];
+      if (!cell) continue;
+      const long k = static_cast<long>(n / J);  // decode (k,j) from ordinal
+      const long j = static_cast<long>(n % J);
+      for (long q = 0; q < Q; ++q) cell.data()[q] = fill_b(k, j, q);
+    }
+    return tile;
+  });
+  world.gop.fence();
+
+  OwnArr own_a(world, trange_a);
+  own_a.init_tiles([&](const TA::Range& tile_range) {
+    OwnOuter tile(tile_range);
+    for (std::size_t n = 0; n < tile.range().volume(); ++n) {
+      OwnInner inner(TA::Range{P});
+      const long i = static_cast<long>(n / J);
+      const long j = static_cast<long>(n % J);
+      for (long p = 0; p < P; ++p) inner.data()[p] = fill_a(i, j, p);
+      tile.data()[n] = inner;
+    }
+    return tile;
+  });
+  OwnArr own_b(world, trange_b);
+  own_b.init_tiles([&](const TA::Range& tile_range) {
+    OwnOuter tile(tile_range);
+    for (std::size_t n = 0; n < tile.range().volume(); ++n) {
+      OwnInner inner(TA::Range{Q});
+      const long k = static_cast<long>(n / J);
+      const long j = static_cast<long>(n % J);
+      for (long q = 0; q < Q; ++q) inner.data()[q] = fill_b(k, j, q);
+      tile.data()[n] = inner;
+    }
+    return tile;
+  });
+  world.gop.fence();
+
+  ArenaArr ca = einsum(arena_a("i,j;p"), arena_b("k,j;q"), "i,k;p,q");
+  OwnArr co = einsum(own_a("i,j;p"), own_b("k,j;q"), "i,k;p,q");
+  world.gop.fence();
+
+  BOOST_REQUIRE_EQUAL(ca.trange().elements_range().volume(),
+                      static_cast<std::size_t>(I * K));
+  BOOST_REQUIRE_EQUAL(co.trange().elements_range().volume(),
+                      static_cast<std::size_t>(I * K));
+
+  double max_abs_diff = 0.0;                // largest |arena - owning| seen
+  std::size_t elements_compared = 0;        // inner elements visited locally
+  std::size_t result_outer_cells_seen = 0;  // non-null result cells visited
+  const auto& tile_grid = ca.trange().tiles_range();
+  for (std::size_t tile_ord = 0; tile_ord < tile_grid.volume(); ++tile_ord) {
+    const bool a_here = ca.is_local(tile_ord) && !ca.is_zero(tile_ord);
+    const bool o_here = co.is_local(tile_ord) && !co.is_zero(tile_ord);
+    BOOST_REQUIRE_EQUAL(a_here, o_here);
+    if (!a_here) continue;  // only tiles stored on this rank are inspected
+    const ArenaOuter& at = ca.find(tile_ord).get();
+    const OwnOuter& ot = co.find(tile_ord).get();
+    BOOST_REQUIRE_EQUAL(at.range().volume(), ot.range().volume());
+    for (std::size_t n = 0; n < at.range().volume(); ++n) {
+      const ArenaInner& ac = at.data()[n];
+      const OwnInner& oc = ot.data()[n];
+      BOOST_REQUIRE_EQUAL(bool(ac), !oc.empty());
+      if (!ac) continue;
+      BOOST_REQUIRE_EQUAL(ac.size(), oc.size());
+      BOOST_REQUIRE_EQUAL(ac.size(), static_cast<std::size_t>(P * Q));
+      ++result_outer_cells_seen;
+      for (std::size_t pos = 0; pos < ac.size(); ++pos) {
+        const double gap = std::abs(ac.data()[pos] - oc.data()[pos]);
+        max_abs_diff = (gap > max_abs_diff) ? gap : max_abs_diff;
+        ++elements_compared;
+      }
+    }
+  }
+  constexpr std::size_t expected_cells = static_cast<std::size_t>(I * K);
+  constexpr std::size_t expected_elements = expected_cells * P * Q;
+  world.gop.sum(result_outer_cells_seen);  // combine per-rank tallies
+  world.gop.sum(elements_compared);
+  world.gop.max(max_abs_diff);
+  BOOST_REQUIRE_GT(elements_compared, 0u);
+  BOOST_CHECK_EQUAL(result_outer_cells_seen, expected_cells);
+  BOOST_CHECK_EQUAL(elements_compared, expected_elements);
+  BOOST_CHECK_LT(max_abs_diff, 1e-12);
+}
+
+// Regime A (NO outer-external index): outer h,i are Hadamard (in both operands
+// and the result), outer k is contracted (in both operands, NOT in the result),
+// and inner a1 (left-only) x a2 (right-only) forms an inner OUTER-PRODUCT.
+// h folds to nbatch=2; k is multi-tile {2,3}. Arena (view) must match owning.
+BOOST_AUTO_TEST_CASE(regime_a_hce_e_arena_matches_owning) {
+  using ArenaInner = TA::ArenaTensor<double, TA::Range>;
+  using ArenaOuter = TA::Tensor<ArenaInner>;
+  using ArenaArr = TA::DistArray<ArenaOuter, TA::DensePolicy>;
+  using OwnInner = TA::Tensor<double>;
+  using OwnOuter = TA::Tensor<OwnInner>;
+  using OwnArr = TA::DistArray<OwnOuter, TA::DensePolicy>;
+
+  auto& world = TiledArray::get_default_world();
+
+  constexpr long H = 2, I = 2, P = 3, Q = 4;
+  // h: a single tile of extent 2 (folds to nbatch=2); i: a single tile of
+  // extent 2 (Hadamard); k: two tiles with extents {2,3}.
+  TA::TiledRange trange_a{TA::TiledRange1{0l, H}, TA::TiledRange1{0l, I},
+                          TA::TiledRange1{0l, 2, 5}};  // outer modes (h,i,k)
+  TA::TiledRange trange_b{TA::TiledRange1{0l, H}, TA::TiledRange1{0l, I},
+                          TA::TiledRange1{0l, 2, 5}};  // outer modes (h,i,k)
+
+  auto fill_a = [](long h, long i, long k, long a1) {
+    return 1.0 + 0.5 * i + 0.25 * k + 0.125 * a1 + 0.0625 * h;
+  };
+  auto fill_b = [](long h, long i, long k, long a2) {
+    return 2.0 - 0.3 * k + 0.2 * i + 0.05 * a2 + 0.03 * h;
+  };
+
+  // Each cell is decoded to global (h,i,k) through the tile's own Range: the k
+  // tiles are sub-tiles, so global coordinates are needed, not ordinal/J.
+  ArenaArr arena_a(world, trange_a);
+  arena_a.init_tiles([&](const TA::Range& tile_range) {
+    ArenaOuter tile = TA::detail::arena_outer_init<ArenaOuter>(
+        tile_range, 1, [](std::size_t /*ord*/) { return TA::Range{P}; });
+    for (std::size_t n = 0; n < tile.range().volume(); ++n) {
+      ArenaInner& cell = tile.data()[n];
+      if (!cell) continue;
+      const auto coord = tile_range.idx(n);
+      const long h = static_cast<long>(coord[0]);
+      const long i = static_cast<long>(coord[1]);
+      const long k = static_cast<long>(coord[2]);
+      for (long a1 = 0; a1 < P; ++a1) cell.data()[a1] = fill_a(h, i, k, a1);
+    }
+    return tile;
+  });
+  ArenaArr arena_b(world, trange_b);
+  arena_b.init_tiles([&](const TA::Range& tile_range) {
+    ArenaOuter tile = TA::detail::arena_outer_init<ArenaOuter>(
+        tile_range, 1, [](std::size_t /*ord*/) { return TA::Range{Q}; });
+    for (std::size_t n = 0; n < tile.range().volume(); ++n) {
+      ArenaInner& cell = tile.data()[n];
+      if (!cell) continue;
+      const auto coord = tile_range.idx(n);
+      const long h = static_cast<long>(coord[0]);
+      const long i = static_cast<long>(coord[1]);
+      const long k = static_cast<long>(coord[2]);
+      for (long a2 = 0; a2 < Q; ++a2) cell.data()[a2] = fill_b(h, i, k, a2);
+    }
+    return tile;
+  });
+  world.gop.fence();
+
+  OwnArr own_a(world, trange_a);
+  own_a.init_tiles([&](const TA::Range& tile_range) {
+    OwnOuter tile(tile_range);
+    for (std::size_t n = 0; n < tile.range().volume(); ++n) {
+      OwnInner inner(TA::Range{P});
+      const auto coord = tile_range.idx(n);
+      const long h = static_cast<long>(coord[0]);
+      const long i = static_cast<long>(coord[1]);
+      const long k = static_cast<long>(coord[2]);
+      for (long a1 = 0; a1 < P; ++a1) inner.data()[a1] = fill_a(h, i, k, a1);
+      tile.data()[n] = inner;
+    }
+    return tile;
+  });
+  OwnArr own_b(world, trange_b);
+  own_b.init_tiles([&](const TA::Range& tile_range) {
+    OwnOuter tile(tile_range);
+    for (std::size_t n = 0; n < tile.range().volume(); ++n) {
+      OwnInner inner(TA::Range{Q});
+      const auto coord = tile_range.idx(n);
+      const long h = static_cast<long>(coord[0]);
+      const long i = static_cast<long>(coord[1]);
+      const long k = static_cast<long>(coord[2]);
+      for (long a2 = 0; a2 < Q; ++a2) inner.data()[a2] = fill_b(h, i, k, a2);
+      tile.data()[n] = inner;
+    }
+    return tile;
+  });
+  world.gop.fence();
+
+  // Outer h,i: Hadamard; outer k: contracted away; inner a1 x a2: outer product.
+  ArenaArr ca = einsum(arena_a("h,i,k;a1"), arena_b("h,i,k;a2"), "h,i;a1,a2");
+  OwnArr co = einsum(own_a("h,i,k;a1"), own_b("h,i,k;a2"), "h,i;a1,a2");
+  world.gop.fence();
+
+  // The result outer is (h,i), i.e. 2*2 = 4 outer cells (single tile each).
+  BOOST_REQUIRE_EQUAL(ca.trange().elements_range().volume(),
+                      static_cast<std::size_t>(H * I));
+  BOOST_REQUIRE_EQUAL(co.trange().elements_range().volume(),
+                      static_cast<std::size_t>(H * I));
+
+  double max_abs_diff = 0.0;                // largest |arena - owning| seen
+  std::size_t elements_compared = 0;        // inner elements visited locally
+  std::size_t result_outer_cells_seen = 0;  // non-null result cells visited
+  const auto& tile_grid = ca.trange().tiles_range();
+  for (std::size_t tile_ord = 0; tile_ord < tile_grid.volume(); ++tile_ord) {
+    const bool a_here = ca.is_local(tile_ord) && !ca.is_zero(tile_ord);
+    const bool o_here = co.is_local(tile_ord) && !co.is_zero(tile_ord);
+    BOOST_REQUIRE_EQUAL(a_here, o_here);
+    if (!a_here) continue;  // only tiles stored on this rank are inspected
+    const ArenaOuter& at = ca.find(tile_ord).get();
+    const OwnOuter& ot = co.find(tile_ord).get();
+    BOOST_REQUIRE_EQUAL(at.range().volume(), ot.range().volume());
+    for (std::size_t n = 0; n < at.range().volume(); ++n) {
+      const ArenaInner& ac = at.data()[n];
+      const OwnInner& oc = ot.data()[n];
+      BOOST_REQUIRE_EQUAL(bool(ac), !oc.empty());
+      if (!ac) continue;
+      BOOST_REQUIRE_EQUAL(ac.size(), oc.size());
+      BOOST_REQUIRE_EQUAL(ac.size(), static_cast<std::size_t>(P * Q));
+      ++result_outer_cells_seen;
+      for (std::size_t pos = 0; pos < ac.size(); ++pos) {
+        const double gap = std::abs(ac.data()[pos] - oc.data()[pos]);
+        max_abs_diff = (gap > max_abs_diff) ? gap : max_abs_diff;
+        ++elements_compared;
+      }
+    }
+  }
+  constexpr std::size_t expected_cells = static_cast<std::size_t>(H * I);
+  constexpr std::size_t expected_elements = expected_cells * P * Q;
+  world.gop.sum(result_outer_cells_seen);  // combine per-rank tallies
+  world.gop.sum(elements_compared);
+  world.gop.max(max_abs_diff);
+  BOOST_REQUIRE_GT(elements_compared, 0u);
+  BOOST_CHECK_EQUAL(result_outer_cells_seen, expected_cells);
+  BOOST_CHECK_EQUAL(elements_compared, expected_elements);
+  BOOST_CHECK_LT(max_abs_diff, 1e-12);
+}
+
+// Task 3.1 differential: on EXACTLY the same arena inputs, the strided reroute
+// and the legacy per-cell path must give identical results. ONE (a,b) arena
+// pair is built (same construction as regime_a_hce_e_arena_matches_owning),
+// the SAME einsum is run twice with regime_a_strided_disabled() flipped, and
+// the two results are compared cell-by-cell, element-by-element. The toggle
+// is set back to false before the compare loop (and by an RAII guard on scope
+// exit) so neither a failed assertion nor a throw can leak `true` onward.
+BOOST_AUTO_TEST_CASE(regime_a_hce_e_strided_equals_percell) {
+  using ArenaInner = TA::ArenaTensor<double, TA::Range>;
+  using ArenaOuter = TA::Tensor<ArenaInner>;
+  using ArenaArr = TA::DistArray<ArenaOuter, TA::DensePolicy>;
+
+  auto& world = TiledArray::get_default_world();
+
+  constexpr long H = 2, I = 2, P = 3, Q = 4;
+  TA::TiledRange trange_a{TA::TiledRange1{0l, H}, TA::TiledRange1{0l, I},
+                          TA::TiledRange1{0l, 2, 5}};  // outer modes (h,i,k)
+  TA::TiledRange trange_b{TA::TiledRange1{0l, H}, TA::TiledRange1{0l, I},
+                          TA::TiledRange1{0l, 2, 5}};  // outer modes (h,i,k)
+
+  auto fill_a = [](long h, long i, long k, long a1) {
+    return 1.0 + 0.5 * i + 0.25 * k + 0.125 * a1 + 0.0625 * h;
+  };
+  auto fill_b = [](long h, long i, long k, long a2) {
+    return 2.0 - 0.3 * k + 0.2 * i + 0.05 * a2 + 0.03 * h;
+  };
+
+  ArenaArr arena_a(world, trange_a);
+  arena_a.init_tiles([&](const TA::Range& tile_range) {
+    ArenaOuter tile = TA::detail::arena_outer_init<ArenaOuter>(
+        tile_range, 1, [](std::size_t /*ord*/) { return TA::Range{P}; });
+    for (std::size_t n = 0; n < tile.range().volume(); ++n) {
+      ArenaInner& cell = tile.data()[n];
+      if (!cell) continue;
+      const auto coord = tile_range.idx(n);  // global (h,i,k) of this cell
+      const long h = static_cast<long>(coord[0]);
+      const long i = static_cast<long>(coord[1]);
+      const long k = static_cast<long>(coord[2]);
+      for (long a1 = 0; a1 < P; ++a1) cell.data()[a1] = fill_a(h, i, k, a1);
+    }
+    return tile;
+  });
+  ArenaArr arena_b(world, trange_b);
+  arena_b.init_tiles([&](const TA::Range& tile_range) {
+    ArenaOuter tile = TA::detail::arena_outer_init<ArenaOuter>(
+        tile_range, 1, [](std::size_t /*ord*/) { return TA::Range{Q}; });
+    for (std::size_t n = 0; n < tile.range().volume(); ++n) {
+      ArenaInner& cell = tile.data()[n];
+      if (!cell) continue;
+      const auto coord = tile_range.idx(n);  // global (h,i,k) of this cell
+      const long h = static_cast<long>(coord[0]);
+      const long i = static_cast<long>(coord[1]);
+      const long k = static_cast<long>(coord[2]);
+      for (long a2 = 0; a2 < Q; ++a2) cell.data()[a2] = fill_b(h, i, k, a2);
+    }
+    return tile;
+  });
+  world.gop.fence();
+
+  namespace ta_detail = TiledArray::detail;
+  // Scope guard: puts the global toggle back to false however this test
+  // exits (including a throwing einsum); a leaked `true` would silently turn
+  // the strided path off for every later test in the binary.
+  struct RestoreStridedDefault {
+    ~RestoreStridedDefault() { ta_detail::regime_a_strided_disabled() = false; }
+  } restore_on_exit;
+
+  ta_detail::regime_a_strided_disabled() = false;  // run 1: strided path
+  auto c_strided = TiledArray::einsum(arena_a("h,i,k;a1"), arena_b("h,i,k;a2"), "h,i;a1,a2");
+  c_strided.world().gop.fence();
+  ta_detail::regime_a_strided_disabled() = true;  // run 2: per-cell path
+  auto c_percell = TiledArray::einsum(arena_a("h,i,k;a1"), arena_b("h,i,k;a2"), "h,i;a1,a2");
+  c_percell.world().gop.fence();
+  ta_detail::regime_a_strided_disabled() = false;  // back to default pre-compare
+
+  BOOST_REQUIRE_EQUAL(c_strided.trange().elements_range().volume(),
+                      static_cast<std::size_t>(H * I));
+  BOOST_REQUIRE_EQUAL(c_percell.trange().elements_range().volume(),
+                      static_cast<std::size_t>(H * I));
+
+  std::size_t elements_compared = 0;        // inner elements visited locally
+  std::size_t result_outer_cells_seen = 0;  // non-null result cells visited
+  const auto& tile_grid = c_strided.trange().tiles_range();
+  for (std::size_t tile_ord = 0; tile_ord < tile_grid.volume(); ++tile_ord) {
+    const bool s_here = c_strided.is_local(tile_ord) && !c_strided.is_zero(tile_ord);
+    const bool p_here = c_percell.is_local(tile_ord) && !c_percell.is_zero(tile_ord);
+    BOOST_REQUIRE_EQUAL(s_here, p_here);
+    if (!s_here) continue;  // only tiles stored on this rank are inspected
+    const ArenaOuter& st = c_strided.find(tile_ord).get();
+    const ArenaOuter& pt = c_percell.find(tile_ord).get();
+    BOOST_REQUIRE_EQUAL(st.range().volume(), pt.range().volume());
+    for (std::size_t n = 0; n < st.range().volume(); ++n) {
+      const ArenaInner& sc = st.data()[n];
+      const ArenaInner& pc = pt.data()[n];
+      BOOST_REQUIRE_EQUAL(bool(sc), bool(pc));
+      if (!sc) continue;
+      BOOST_REQUIRE_EQUAL(sc.size(), pc.size());
+      BOOST_REQUIRE_EQUAL(sc.size(), static_cast<std::size_t>(P * Q));
+      ++result_outer_cells_seen;
+      for (std::size_t e = 0; e < sc.size(); ++e) {
+        BOOST_CHECK_CLOSE(sc.data()[e], pc.data()[e], 1e-12);
+        ++elements_compared;
+      }
+    }
+  }
+  world.gop.sum(result_outer_cells_seen);  // combine per-rank tallies
+  world.gop.sum(elements_compared);
+  BOOST_REQUIRE_GT(elements_compared, 0u);
+  BOOST_CHECK_EQUAL(result_outer_cells_seen, static_cast<std::size_t>(H * I));
+  BOOST_CHECK_EQUAL(elements_compared,
+                    static_cast<std::size_t>(H * I) * P * Q);
+}
+
+// Task 3.3 Step 1: non-canonical result inner order (a2,a1 rather than a1,a2).
+// Operands and fills are identical to regime_a_hce_e_arena_matches_owning; only
+// the result inner order is reversed, which fires the result inner-permute
+// hoist (do_perm.C) inside the strided path. Arena and owning go through the
+// same DSL, so they must agree whatever the kernel's [P x Q] blas layout is.
+BOOST_AUTO_TEST_CASE(regime_a_hce_e_noncanonical_inner_matches_owning) {
+  using ArenaInner = TA::ArenaTensor<double, TA::Range>;
+  using ArenaOuter = TA::Tensor<ArenaInner>;
+  using ArenaArr = TA::DistArray<ArenaOuter, TA::DensePolicy>;
+  using OwnInner = TA::Tensor<double>;
+  using OwnOuter = TA::Tensor<OwnInner>;
+  using OwnArr = TA::DistArray<OwnOuter, TA::DensePolicy>;
+
+  auto& world = TiledArray::get_default_world();
+
+  constexpr long H = 2, I = 2, P = 3, Q = 4;  // inner extents: a1=P, a2=Q
+  TA::TiledRange trange_a{TA::TiledRange1{0l, H}, TA::TiledRange1{0l, I},
+                          TA::TiledRange1{0l, 2, 5}};  // outer modes (h,i,k)
+  TA::TiledRange trange_b{TA::TiledRange1{0l, H}, TA::TiledRange1{0l, I},
+                          TA::TiledRange1{0l, 2, 5}};  // outer modes (h,i,k)
+
+  auto fill_a = [](long h, long i, long k, long a1) {
+    return 1.0 + 0.5 * i + 0.25 * k + 0.125 * a1 + 0.0625 * h;
+  };
+  auto fill_b = [](long h, long i, long k, long a2) {
+    return 2.0 - 0.3 * k + 0.2 * i + 0.05 * a2 + 0.03 * h;
+  };
+
+  ArenaArr arena_a(world, trange_a);
+  arena_a.init_tiles([&](const TA::Range& tile_range) {
+    ArenaOuter tile = TA::detail::arena_outer_init<ArenaOuter>(
+        tile_range, 1, [](std::size_t /*ord*/) { return TA::Range{P}; });
+    for (std::size_t n = 0; n < tile.range().volume(); ++n) {
+      ArenaInner& cell = tile.data()[n];
+      if (!cell) continue;
+      const auto coord = tile_range.idx(n);  // global (h,i,k) of this cell
+      const long h = static_cast<long>(coord[0]);
+      const long i = static_cast<long>(coord[1]);
+      const long k = static_cast<long>(coord[2]);
+      for (long a1 = 0; a1 < P; ++a1) cell.data()[a1] = fill_a(h, i, k, a1);
+    }
+    return tile;
+  });
+  ArenaArr arena_b(world, trange_b);
+  arena_b.init_tiles([&](const TA::Range& tile_range) {
+    ArenaOuter tile = TA::detail::arena_outer_init<ArenaOuter>(
+        tile_range, 1, [](std::size_t /*ord*/) { return TA::Range{Q}; });
+    for (std::size_t n = 0; n < tile.range().volume(); ++n) {
+      ArenaInner& cell = tile.data()[n];
+      if (!cell) continue;
+      const auto coord = tile_range.idx(n);  // global (h,i,k) of this cell
+      const long h = static_cast<long>(coord[0]);
+      const long i = static_cast<long>(coord[1]);
+      const long k = static_cast<long>(coord[2]);
+      for (long a2 = 0; a2 < Q; ++a2) cell.data()[a2] = fill_b(h, i, k, a2);
+    }
+    return tile;
+  });
+  world.gop.fence();
+
+  OwnArr own_a(world, trange_a);
+  own_a.init_tiles([&](const TA::Range& tile_range) {
+    OwnOuter tile(tile_range);
+    for (std::size_t n = 0; n < tile.range().volume(); ++n) {
+      OwnInner inner(TA::Range{P});
+      const auto coord = tile_range.idx(n);
+      const long h = static_cast<long>(coord[0]);
+      const long i = static_cast<long>(coord[1]);
+      const long k = static_cast<long>(coord[2]);
+      for (long a1 = 0; a1 < P; ++a1) inner.data()[a1] = fill_a(h, i, k, a1);
+      tile.data()[n] = inner;
+    }
+    return tile;
+  });
+  OwnArr own_b(world, trange_b);
+  own_b.init_tiles([&](const TA::Range& tile_range) {
+    OwnOuter tile(tile_range);
+    for (std::size_t n = 0; n < tile.range().volume(); ++n) {
+      OwnInner inner(TA::Range{Q});
+      const auto coord = tile_range.idx(n);
+      const long h = static_cast<long>(coord[0]);
+      const long i = static_cast<long>(coord[1]);
+      const long k = static_cast<long>(coord[2]);
+      for (long a2 = 0; a2 < Q; ++a2) inner.data()[a2] = fill_b(h, i, k, a2);
+      tile.data()[n] = inner;
+    }
+    return tile;
+  });
+  world.gop.fence();
+
+  // Result inner is reversed to (a2,a1): inner extents (Q,P), same volume.
+  ArenaArr ca = einsum(arena_a("h,i,k;a1"), arena_b("h,i,k;a2"), "h,i;a2,a1");
+  OwnArr co = einsum(own_a("h,i,k;a1"), own_b("h,i,k;a2"), "h,i;a2,a1");
+  world.gop.fence();
+
+  BOOST_REQUIRE_EQUAL(ca.trange().elements_range().volume(),
+                      static_cast<std::size_t>(H * I));
+  BOOST_REQUIRE_EQUAL(co.trange().elements_range().volume(),
+                      static_cast<std::size_t>(H * I));
+
+  double max_abs_diff = 0.0;                // largest |arena - owning| seen
+  std::size_t elements_compared = 0;        // inner elements visited locally
+  std::size_t result_outer_cells_seen = 0;  // non-null result cells visited
+  const auto& tile_grid = ca.trange().tiles_range();
+  for (std::size_t tile_ord = 0; tile_ord < tile_grid.volume(); ++tile_ord) {
+    const bool a_here = ca.is_local(tile_ord) && !ca.is_zero(tile_ord);
+    const bool o_here = co.is_local(tile_ord) && !co.is_zero(tile_ord);
+    BOOST_REQUIRE_EQUAL(a_here, o_here);
+    if (!a_here) continue;  // only tiles stored on this rank are inspected
+    const ArenaOuter& at = ca.find(tile_ord).get();
+    const OwnOuter& ot = co.find(tile_ord).get();
+    BOOST_REQUIRE_EQUAL(at.range().volume(), ot.range().volume());
+    for (std::size_t n = 0; n < at.range().volume(); ++n) {
+      const ArenaInner& ac = at.data()[n];
+      const OwnInner& oc = ot.data()[n];
+      BOOST_REQUIRE_EQUAL(bool(ac), !oc.empty());
+      if (!ac) continue;
+      BOOST_REQUIRE_EQUAL(ac.size(), oc.size());
+      // The inner cell is (a2,a1) = (Q,P) here; its volume still equals P*Q.
+      BOOST_REQUIRE_EQUAL(ac.size(), static_cast<std::size_t>(Q * P));
+      ++result_outer_cells_seen;
+      for (std::size_t pos = 0; pos < ac.size(); ++pos) {
+        const double gap = std::abs(ac.data()[pos] - oc.data()[pos]);
+        max_abs_diff = (gap > max_abs_diff) ? gap : max_abs_diff;
+        ++elements_compared;
+      }
+    }
+  }
+  world.gop.sum(result_outer_cells_seen);  // combine per-rank tallies
+  world.gop.sum(elements_compared);
+  world.gop.max(max_abs_diff);
+  BOOST_REQUIRE_GT(elements_compared, 0u);
+  BOOST_CHECK_EQUAL(result_outer_cells_seen, static_cast<std::size_t>(H * I));
+  BOOST_CHECK_EQUAL(elements_compared,
+                    static_cast<std::size_t>(H * I) * Q * P);
+  BOOST_CHECK_LT(max_abs_diff, 1e-12);
+}
+
+#ifdef TA_STRIDED_DGEMM_COUNT
+// Firing witness for the ce+e strided path: both operands are plain (nbatch==1)
+// ToT tiles over the full (h,i,j)/(h,k,j) outer range; Hadamard mode h is one
+// block of 2 elements, so the einsum driver folds h into nbatch == 2.
+BOOST_AUTO_TEST_CASE(hce_e_uses_strided_path) {
+  using ArenaInner = TA::ArenaTensor<double, TA::Range>;
+  using ArenaOuter = TA::Tensor<ArenaInner>;
+  using ArenaArr = TA::DistArray<ArenaOuter, TA::DensePolicy>;
+  auto& world = TiledArray::get_default_world();
+  constexpr long H = 2, I = 2, J = 2, K = 3, P = 3, Q = 4;
+  TA::TiledRange a_trange{{0l, H}, {0l, I}, {0l, J}};  // (h,i,j)
+  TA::TiledRange b_trange{{0l, H}, {0l, K}, {0l, J}};  // (h,k,j)
+  auto a_val = [](long h, long i, long j, long p) {
+    return 1.0 + 0.5 * i + 0.25 * j + 0.125 * p + 0.0625 * h;
+  };
+  auto b_val = [](long h, long k, long j, long q) {
+    return 2.0 - 0.3 * k + 0.2 * j + 0.05 * q + 0.03 * h;
+  };
+
+  ArenaArr ah(world, a_trange);
+  ah.init_tiles([&](const TA::Range& tr) {
+    ArenaOuter t = TA::detail::arena_outer_init<ArenaOuter>(
+        tr, 1, [](std::size_t /*ord*/) { return TA::Range{P}; });
+    for (std::size_t o = 0; o < t.range().volume(); ++o) {
+      ArenaInner& c = t.data()[o];
+      if (!c) continue;
+      const long h = static_cast<long>(o / (I * J));
+      const long i = static_cast<long>((o / J) % I);
+      const long j = static_cast<long>(o % J);
+      for (long p = 0; p < P; ++p) c.data()[p] = a_val(h, i, j, p);
+    }
+    return t;
+  });
+  ArenaArr bh(world, b_trange);
+  bh.init_tiles([&](const TA::Range& tr) {
+    ArenaOuter t = TA::detail::arena_outer_init<ArenaOuter>(
+        tr, 1, [](std::size_t /*ord*/) { return TA::Range{Q}; });
+    for (std::size_t o = 0; o < t.range().volume(); ++o) {
+      ArenaInner& c = t.data()[o];
+      if (!c) continue;
+      const long h = static_cast<long>(o / (K * J));
+      const long k = static_cast<long>((o / J) % K);
+      const long j = static_cast<long>(o % J);
+      for (long q = 0; q < Q; ++q) c.data()[q] = b_val(h, k, j, q);
+    }
+    return t;
+  });
+  world.gop.fence();
+
+  TiledArray::detail::g_strided_dgemm_ce_e_calls.store(0);
+  auto ca = einsum(ah("h,i,j;p"), bh("h,k,j;q"), "h,i,k;p,q");
+  ca.world().gop.fence();
+  // The counter is per process; in a multi-rank run a rank may execute none of
+  // the kernels, so assert on the world-wide total rather than the local count.
+  std::size_t strided_calls = TiledArray::detail::g_strided_dgemm_ce_e_calls.load();
+  world.gop.sum(strided_calls);
+  BOOST_CHECK_GT(strided_calls, 0u);
+}
+
+// Regime A (hc+e) firing witness: same shape/fill as
+// regime_a_hce_e_arena_matches_owning; the within-tile contraction over k must
+// be routed through the strided ce+e core (world-wide call count > 0).
+BOOST_AUTO_TEST_CASE(regime_a_hce_e_uses_strided_path) {
+  using ArenaInner = TA::ArenaTensor<double, TA::Range>;
+  using ArenaOuter = TA::Tensor<ArenaInner>;
+  using ArenaArr = TA::DistArray<ArenaOuter, TA::DensePolicy>;
+  auto& world = TiledArray::get_default_world();
+  constexpr long H = 2, I = 2, P = 3, Q = 4;
+  TA::TiledRange a_trange{TA::TiledRange1{0l, H}, TA::TiledRange1{0l, I},
+                          TA::TiledRange1{0l, 2, 5}};  // outer (h,i,k)
+  TA::TiledRange b_trange{TA::TiledRange1{0l, H}, TA::TiledRange1{0l, I},
+                          TA::TiledRange1{0l, 2, 5}};  // outer (h,i,k)
+  auto a_val = [](long h, long i, long k, long a1) {
+    return 1.0 + 0.5 * i + 0.25 * k + 0.125 * a1 + 0.0625 * h;
+  };
+  auto b_val = [](long h, long i, long k, long a2) {
+    return 2.0 - 0.3 * k + 0.2 * i + 0.05 * a2 + 0.03 * h;
+  };
+  ArenaArr ah(world, a_trange);
+  ah.init_tiles([&](const TA::Range& tr) {
+    ArenaOuter t = TA::detail::arena_outer_init<ArenaOuter>(
+        tr, 1, [](std::size_t /*ord*/) { return TA::Range{P}; });
+    for (std::size_t o = 0; o < t.range().volume(); ++o) {
+      ArenaInner& c = t.data()[o];
+      if (!c) continue;
+      const auto idx = tr.idx(o);  // global (h,i,k) of this outer cell
+      const long h = static_cast<long>(idx[0]);
+      const long i = static_cast<long>(idx[1]);
+      const long k = static_cast<long>(idx[2]);
+      for (long a1 = 0; a1 < P; ++a1) c.data()[a1] = a_val(h, i, k, a1);
+    }
+    return t;
+  });
+  ArenaArr bh(world, b_trange);
+  bh.init_tiles([&](const TA::Range& tr) {
+    ArenaOuter t = TA::detail::arena_outer_init<ArenaOuter>(
+        tr, 1, [](std::size_t /*ord*/) { return TA::Range{Q}; });
+    for (std::size_t o = 0; o < t.range().volume(); ++o) {
+      ArenaInner& c = t.data()[o];
+      if (!c) continue;
+      const auto idx = tr.idx(o);  // global (h,i,k) of this outer cell
+      const long h = static_cast<long>(idx[0]);
+      const long i = static_cast<long>(idx[1]);
+      const long k = static_cast<long>(idx[2]);
+      for (long a2 = 0; a2 < Q; ++a2) c.data()[a2] = b_val(h, i, k, a2);
+    }
+    return t;
+  });
+  world.gop.fence();
+
+  TiledArray::detail::g_strided_dgemm_ce_e_calls.store(0);
+  auto ca = einsum(ah("h,i,k;a1"), bh("h,i,k;a2"), "h,i;a1,a2");
+  ca.world().gop.fence();
+  // The counter is per process; in a multi-rank run a rank may execute none of
+  // the kernels, so assert on the world-wide total rather than the local count.
+  std::size_t strided_calls = TiledArray::detail::g_strided_dgemm_ce_e_calls.load();
+  world.gop.sum(strided_calls);
+  BOOST_CHECK_GT(strided_calls, 0u);
+}
+
+// Guard for Codex must-fix #2: a contraction over OWNING ToT arrays (inner
+// cells are TA::Tensor, not views) has to remain on the generic per-cell path.
+// The strided op is never installed for it (is_tensor_view_v is false), so
+// neither counter may move. This pins the install-gate symmetry fix: the gate
+// requires view+double on ALL THREE operands, so a non-view operand cannot
+// instantiate the double-view-only kernel (that would be a hard compile error
+// rather than a fallback). The fact that this translation unit compiles with
+// owning ToT contractions present is the compile-time half of the guard.
+BOOST_AUTO_TEST_CASE(owning_tot_does_not_use_strided_path) {
+  using OwnInner = TA::Tensor<double>;
+  using OwnOuter = TA::Tensor<OwnInner>;
+  using OwnArr = TA::DistArray<OwnOuter, TA::DensePolicy>;
+
+  auto& world = TiledArray::get_default_world();
+  constexpr long H = 2, I = 2, J = 2, K = 3, P = 3, Q = 4;
+  TA::TiledRange trange_a{{0l, H}, {0l, I}, {0l, J}};  // outer modes (h,i,j)
+  TA::TiledRange trange_b{{0l, H}, {0l, K}, {0l, J}};  // outer modes (h,k,j)
+  auto fill_a = [](long h, long i, long j, long p) {
+    return 1.0 + 0.5 * i + 0.25 * j + 0.125 * p + 0.0625 * h;
+  };
+  auto fill_b = [](long h, long k, long j, long q) {
+    return 2.0 - 0.3 * k + 0.2 * j + 0.05 * q + 0.03 * h;
+  };
+  OwnArr own_a(world, trange_a);
+  own_a.init_tiles([&](const TA::Range& tile_range) {
+    OwnOuter tile(tile_range);
+    for (std::size_t n = 0; n < tile.range().volume(); ++n) {
+      OwnInner inner(TA::Range{P});
+      const long h = static_cast<long>(n / (I * J));  // decode (h,i,j)
+      const long i = static_cast<long>((n / J) % I);
+      const long j = static_cast<long>(n % J);
+      for (long p = 0; p < P; ++p) inner.data()[p] = fill_a(h, i, j, p);
+      tile.data()[n] = inner;
+    }
+    return tile;
+  });
+  OwnArr own_b(world, trange_b);
+  own_b.init_tiles([&](const TA::Range& tile_range) {
+    OwnOuter tile(tile_range);
+    for (std::size_t n = 0; n < tile.range().volume(); ++n) {
+      OwnInner inner(TA::Range{Q});
+      const long h = static_cast<long>(n / (K * J));  // decode (h,k,j)
+      const long k = static_cast<long>((n / J) % K);
+      const long j = static_cast<long>(n % J);
+      for (long q = 0; q < Q; ++q) inner.data()[q] = fill_b(h, k, j, q);
+      tile.data()[n] = inner;
+    }
+    return tile;
+  });
+  world.gop.fence();
+
+  TiledArray::detail::g_strided_dgemm_ce_e_calls.store(0);
+  TiledArray::detail::g_strided_dgemm_ce_ce_right_calls.store(0);
+  auto co = einsum(own_a("h,i,j;p"), own_b("h,k,j;q"), "h,i,k;p,q");
+  co.world().gop.fence();
+  // Owning inner cells must never reach the view-only strided path.
+  BOOST_CHECK_EQUAL(TiledArray::detail::g_strided_dgemm_ce_e_calls.load(), 0u);
+  BOOST_CHECK_EQUAL(TiledArray::detail::g_strided_dgemm_ce_ce_right_calls.load(), 0u);
+  // Sanity: the contraction did run and yields the expected outer shape.
+  BOOST_CHECK_EQUAL(co.trange().elements_range().volume(),
+                    static_cast<std::size_t>(H * I * K));
+}
+#endif
+
+// Addition A (no-Hadamard, multi-external inner): both operands carry 2 inner
+// external modes; the result inner cell is the flat outer product.
+// c(i,k;a1,a2,a3,a4) = sum_j a(i,j;a1,a2) * b(k,j;a3,a4). Arena vs owning.
+//
+// Runs the same einsum on arena-backed and owning tensor-of-tensor arrays
+// filled with identical values and compares the results elementwise. With
+// TA_STRIDED_DGEMM_COUNT defined it also requires the ce+e strided counter to
+// be nonzero on at least one rank.
+BOOST_AUTO_TEST_CASE(ce_e_multi_external_arena_matches_owning) {
+  using ArenaInner = TA::ArenaTensor<double, TA::Range>;
+  using ArenaOuter = TA::Tensor<ArenaInner>;
+  using ArenaArr = TA::DistArray<ArenaOuter, TA::DensePolicy>;
+  using OwnInner = TA::Tensor<double>;
+  using OwnOuter = TA::Tensor<OwnInner>;
+  using OwnArr = TA::DistArray<OwnOuter, TA::DensePolicy>;
+
+  auto& world = TiledArray::get_default_world();
+  constexpr long I = 2, J = 2, K = 2;
+  constexpr long A1 = 2, A2 = 3, A3 = 2, A4 = 2;
+  // Flat element counts of the left (a1,a2) and right (a3,a4) inner cells.
+  constexpr long P = A1 * A2, Q = A3 * A4;
+
+  // Each outer mode is a single interval, so each array has one tile.
+  TA::TiledRange a_trange{{0l, I}, {0l, J}};
+  TA::TiledRange b_trange{{0l, K}, {0l, J}};
+
+  // Deterministic fill values; e is the flat index inside an inner cell.
+  auto a_val = [](long i, long j, long e) {
+    return 1.0 + 0.5 * i + 0.25 * j + 0.1 * e;
+  };
+  auto b_val = [](long k, long j, long e) {
+    return 2.0 - 0.3 * k + 0.2 * j + 0.07 * e;
+  };
+
+  // Arena-backed operands: every outer cell is given a 2-mode inner range.
+  // NOTE(review): the second argument (1) of arena_outer_init is presumably
+  // the batch count -- confirm against its declaration.
+  ArenaArr ah(world, a_trange);
+  ah.init_tiles([&](const TA::Range& tr) {
+    ArenaOuter t = TA::detail::arena_outer_init<ArenaOuter>(
+        tr, 1, [](std::size_t) { return TA::Range{A1, A2}; });
+    for (std::size_t o = 0; o < t.range().volume(); ++o) {
+      ArenaInner& c = t.data()[o];
+      if (!c) continue;  // cell tests false: nothing to fill
+      // Decode the flat outer ordinal as (i, j), j fastest.
+      const long i = static_cast<long>(o / J), j = static_cast<long>(o % J);
+      for (long e = 0; e < P; ++e) c.data()[e] = a_val(i, j, e);
+    }
+    return t;
+  });
+  ArenaArr bh(world, b_trange);
+  bh.init_tiles([&](const TA::Range& tr) {
+    ArenaOuter t = TA::detail::arena_outer_init<ArenaOuter>(
+        tr, 1, [](std::size_t) { return TA::Range{A3, A4}; });
+    for (std::size_t o = 0; o < t.range().volume(); ++o) {
+      ArenaInner& c = t.data()[o];
+      if (!c) continue;  // cell tests false: nothing to fill
+      // Decode the flat outer ordinal as (k, j), j fastest.
+      const long k = static_cast<long>(o / J), j = static_cast<long>(o % J);
+      for (long e = 0; e < Q; ++e) c.data()[e] = b_val(k, j, e);
+    }
+    return t;
+  });
+  world.gop.fence();
+
+  // Owning operands: same ranges and values, each inner cell is a TA::Tensor.
+  OwnArr ao(world, a_trange);
+  ao.init_tiles([&](const TA::Range& tr) {
+    OwnOuter t(tr);
+    for (std::size_t o = 0; o < t.range().volume(); ++o) {
+      OwnInner cell(TA::Range{A1, A2});
+      const long i = static_cast<long>(o / J), j = static_cast<long>(o % J);
+      for (long e = 0; e < P; ++e) cell.data()[e] = a_val(i, j, e);
+      t.data()[o] = cell;
+    }
+    return t;
+  });
+  OwnArr bo(world, b_trange);
+  bo.init_tiles([&](const TA::Range& tr) {
+    OwnOuter t(tr);
+    for (std::size_t o = 0; o < t.range().volume(); ++o) {
+      OwnInner cell(TA::Range{A3, A4});
+      const long k = static_cast<long>(o / J), j = static_cast<long>(o % J);
+      for (long e = 0; e < Q; ++e) cell.data()[e] = b_val(k, j, e);
+      t.data()[o] = cell;
+    }
+    return t;
+  });
+  world.gop.fence();
+
+#ifdef TA_STRIDED_DGEMM_COUNT
+  // Start the witness from zero so earlier tests cannot satisfy it.
+  TiledArray::detail::g_strided_dgemm_ce_e_calls.store(0);
+#endif
+  ArenaArr ca = einsum(ah("i,j;a1,a2"), bh("k,j;a3,a4"), "i,k;a1,a2,a3,a4");
+  OwnArr co = einsum(ao("i,j;a1,a2"), bo("k,j;a3,a4"), "i,k;a1,a2,a3,a4");
+  world.gop.fence();
+
+  // Compare every locally held, non-null arena result cell with the owning one.
+  double max_abs_diff = 0.0;
+  std::size_t elements_compared = 0;
+  const auto& tiles = ca.trange().tiles_range();
+  for (std::size_t ord = 0; ord < tiles.volume(); ++ord) {
+    if (!(ca.is_local(ord) && !ca.is_zero(ord))) continue;
+    const ArenaOuter& at = ca.find(ord).get();
+    const OwnOuter& ot = co.find(ord).get();
+    for (std::size_t o = 0; o < at.range().volume(); ++o) {
+      const ArenaInner& ac = at.data()[o];
+      const OwnInner& oc = ot.data()[o];
+      if (!ac) continue;
+      BOOST_REQUIRE_EQUAL(ac.size(), static_cast<std::size_t>(P * Q));
+      BOOST_REQUIRE_EQUAL(ac.size(), oc.size());
+      for (std::size_t e = 0; e < ac.size(); ++e) {
+        const double d = std::abs(ac.data()[e] - oc.data()[e]);
+        if (d > max_abs_diff) max_abs_diff = d;
+        ++elements_compared;
+      }
+    }
+  }
+  // Combine the per-rank statistics before asserting on them.
+  world.gop.sum(elements_compared);
+  world.gop.max(max_abs_diff);
+  BOOST_REQUIRE_GT(elements_compared, 0u);
+  BOOST_CHECK_LT(max_abs_diff, 1e-12);
+#ifdef TA_STRIDED_DGEMM_COUNT
+  // The counter is a process-local global: reduce it like the statistics
+  // above so a rank that executed no kernel does not fail the witness.
+  std::size_t strided_calls =
+      TiledArray::detail::g_strided_dgemm_ce_e_calls.load();
+  world.gop.sum(strided_calls);
+  BOOST_CHECK_GT(strided_calls, 0u);
+#endif
+}
+
+// Addition A (Hadamard nbatch>1, multi-external inner): the case that ABORTS
+// on $SRC today (nbatch==1 assert). h is a single tile of extent 2 => folds to
+// nbatch==2. c(h,i,k;a1,a2,a3,a4) = sum_j a(h,i,j;a1,a2) * b(h,k,j;a3,a4).
+//
+// Runs the same einsum on arena-backed and owning tensor-of-tensor arrays
+// filled with identical values and compares the results elementwise. With
+// TA_STRIDED_DGEMM_COUNT defined it also requires the ce+e strided counter to
+// be nonzero on at least one rank.
+BOOST_AUTO_TEST_CASE(hce_e_multi_external_arena_matches_owning) {
+  using ArenaInner = TA::ArenaTensor<double, TA::Range>;
+  using ArenaOuter = TA::Tensor<ArenaInner>;
+  using ArenaArr = TA::DistArray<ArenaOuter, TA::DensePolicy>;
+  using OwnInner = TA::Tensor<double>;
+  using OwnOuter = TA::Tensor<OwnInner>;
+  using OwnArr = TA::DistArray<OwnOuter, TA::DensePolicy>;
+
+  auto& world = TiledArray::get_default_world();
+  constexpr long H = 2, I = 2, J = 2, K = 2;
+  constexpr long A1 = 2, A2 = 2, A3 = 2, A4 = 2;
+  // Flat element counts of the left (a1,a2) and right (a3,a4) inner cells.
+  constexpr long P = A1 * A2, Q = A3 * A4;
+
+  TA::TiledRange a_trange{{0l, H}, {0l, I}, {0l, J}};
+  TA::TiledRange b_trange{{0l, H}, {0l, K}, {0l, J}};
+
+  // Deterministic fill values; e is the flat index inside an inner cell.
+  auto a_val = [](long h, long i, long j, long e) {
+    return 1.0 + 0.5 * i + 0.25 * j + 0.1 * e + 0.0625 * h;
+  };
+  auto b_val = [](long h, long k, long j, long e) {
+    return 2.0 - 0.3 * k + 0.2 * j + 0.07 * e + 0.03 * h;
+  };
+
+  // Both operands are plain (nbatch==1) ToT tiles over the full outer range;
+  // the einsum driver folds the Hadamard mode h into nbatch==2.
+  ArenaArr ah(world, a_trange);
+  ah.init_tiles([&](const TA::Range& tr) {
+    ArenaOuter t = TA::detail::arena_outer_init<ArenaOuter>(
+        tr, 1, [](std::size_t) { return TA::Range{A1, A2}; });
+    for (std::size_t o = 0; o < t.range().volume(); ++o) {
+      ArenaInner& c = t.data()[o];
+      if (!c) continue;  // cell tests false: nothing to fill
+      // Decode the flat outer ordinal as (h, i, j), j fastest.
+      const long h = static_cast<long>(o / (I * J));
+      const long i = static_cast<long>((o / J) % I), j = static_cast<long>(o % J);
+      for (long e = 0; e < P; ++e) c.data()[e] = a_val(h, i, j, e);
+    }
+    return t;
+  });
+  ArenaArr bh(world, b_trange);
+  bh.init_tiles([&](const TA::Range& tr) {
+    ArenaOuter t = TA::detail::arena_outer_init<ArenaOuter>(
+        tr, 1, [](std::size_t) { return TA::Range{A3, A4}; });
+    for (std::size_t o = 0; o < t.range().volume(); ++o) {
+      ArenaInner& c = t.data()[o];
+      if (!c) continue;  // cell tests false: nothing to fill
+      // Decode the flat outer ordinal as (h, k, j), j fastest.
+      const long h = static_cast<long>(o / (K * J));
+      const long k = static_cast<long>((o / J) % K), j = static_cast<long>(o % J);
+      for (long e = 0; e < Q; ++e) c.data()[e] = b_val(h, k, j, e);
+    }
+    return t;
+  });
+  world.gop.fence();
+
+  // Owning operands: same ranges and values, each inner cell is a TA::Tensor.
+  OwnArr ao(world, a_trange);
+  ao.init_tiles([&](const TA::Range& tr) {
+    OwnOuter t(tr);
+    for (std::size_t o = 0; o < t.range().volume(); ++o) {
+      OwnInner cell(TA::Range{A1, A2});
+      const long h = static_cast<long>(o / (I * J));
+      const long i = static_cast<long>((o / J) % I), j = static_cast<long>(o % J);
+      for (long e = 0; e < P; ++e) cell.data()[e] = a_val(h, i, j, e);
+      t.data()[o] = cell;
+    }
+    return t;
+  });
+  OwnArr bo(world, b_trange);
+  bo.init_tiles([&](const TA::Range& tr) {
+    OwnOuter t(tr);
+    for (std::size_t o = 0; o < t.range().volume(); ++o) {
+      OwnInner cell(TA::Range{A3, A4});
+      const long h = static_cast<long>(o / (K * J));
+      const long k = static_cast<long>((o / J) % K), j = static_cast<long>(o % J);
+      for (long e = 0; e < Q; ++e) cell.data()[e] = b_val(h, k, j, e);
+      t.data()[o] = cell;
+    }
+    return t;
+  });
+  world.gop.fence();
+
+#ifdef TA_STRIDED_DGEMM_COUNT
+  // Start the witness from zero so earlier tests cannot satisfy it.
+  TiledArray::detail::g_strided_dgemm_ce_e_calls.store(0);
+#endif
+  ArenaArr ca =
+      einsum(ah("h,i,j;a1,a2"), bh("h,k,j;a3,a4"), "h,i,k;a1,a2,a3,a4");
+  OwnArr co =
+      einsum(ao("h,i,j;a1,a2"), bo("h,k,j;a3,a4"), "h,i,k;a1,a2,a3,a4");
+  world.gop.fence();
+
+  // Compare every locally held, non-null arena result cell with the owning one.
+  double max_abs_diff = 0.0;
+  std::size_t elements_compared = 0;
+  const auto& tiles = ca.trange().tiles_range();
+  for (std::size_t ord = 0; ord < tiles.volume(); ++ord) {
+    if (!(ca.is_local(ord) && !ca.is_zero(ord))) continue;
+    const ArenaOuter& at = ca.find(ord).get();
+    const OwnOuter& ot = co.find(ord).get();
+    for (std::size_t o = 0; o < at.range().volume(); ++o) {
+      const ArenaInner& ac = at.data()[o];
+      const OwnInner& oc = ot.data()[o];
+      if (!ac) continue;
+      BOOST_REQUIRE_EQUAL(ac.size(), static_cast<std::size_t>(P * Q));
+      BOOST_REQUIRE_EQUAL(ac.size(), oc.size());
+      for (std::size_t e = 0; e < ac.size(); ++e) {
+        const double d = std::abs(ac.data()[e] - oc.data()[e]);
+        if (d > max_abs_diff) max_abs_diff = d;
+        ++elements_compared;
+      }
+    }
+  }
+  // Combine the per-rank statistics before asserting on them.
+  world.gop.sum(elements_compared);
+  world.gop.max(max_abs_diff);
+  BOOST_REQUIRE_GT(elements_compared, 0u);
+  BOOST_CHECK_LT(max_abs_diff, 1e-12);
+#ifdef TA_STRIDED_DGEMM_COUNT
+  // The counter is a process-local global: reduce it like the statistics
+  // above so a rank that executed no kernel does not fail the witness.
+  std::size_t strided_calls =
+      TiledArray::detail::g_strided_dgemm_ce_e_calls.load();
+  world.gop.sum(strided_calls);
+  BOOST_CHECK_GT(strided_calls, 0u);
+#endif
+}
+
+// --------------------------------------------------------------------------
+// strided-DGEMM ce+ce (hce+ce) e2e oracles
+// --------------------------------------------------------------------------
+
+// c(i,m;a1) = sum_{k,a4} a(i,k;a1,a4) * b(i,m,k;a4): Hadamard i (single tile of
+// 2 elements => nbatch==2), right-external m, outer-contracted k, no
+// left-external (Mo==1). Arena vs owning, elementwise.
+BOOST_AUTO_TEST_CASE(hce_ce_contraction_arena_matches_owning) {
+  using ArenaInner = TA::ArenaTensor<double, TA::Range>;
+  using ArenaOuter = TA::Tensor<ArenaInner>;
+  using ArenaArr = TA::DistArray<ArenaOuter, TA::DensePolicy>;
+  using OwnInner = TA::Tensor<double>;
+  using OwnOuter = TA::Tensor<OwnInner>;
+  using OwnArr = TA::DistArray<OwnOuter, TA::DensePolicy>;
+
+  auto& world = TiledArray::get_default_world();
+  constexpr long I = 2, M = 3, K = 2, P = 4, Q = 5;  // P = |a1|, Q = |a4|
+
+  TA::TiledRange a_trange{{0l, I}, {0l, K}};           // outer (i,k)
+  TA::TiledRange b_trange{{0l, I}, {0l, M}, {0l, K}};  // outer (i,m,k)
+
+  auto a_val = [](long i, long k, long a1, long a4) {  // left fill value
+    return 1.0 + 0.5 * i + 0.25 * k + 0.125 * a1 + 0.0625 * a4;
+  };
+  auto b_val = [](long i, long m, long k, long a4) {  // right fill value
+    return 2.0 - 0.3 * i + 0.2 * m + 0.1 * k + 0.05 * a4;
+  };
+
+  ArenaArr ah(world, a_trange);  // arena-backed left operand, P x Q inner
+  ah.init_tiles([&](const TA::Range& tr) {
+    ArenaOuter t = TA::detail::arena_outer_init<ArenaOuter>(
+        tr, 1, [](std::size_t) { return TA::Range{P, Q}; });
+    for (std::size_t o = 0; o < t.range().volume(); ++o) {
+      ArenaInner& c = t.data()[o];
+      if (!c) continue;  // cell tests false: nothing to fill
+      const long i = static_cast<long>(o / K), k = static_cast<long>(o % K);
+      for (long a1 = 0; a1 < P; ++a1)
+        for (long a4 = 0; a4 < Q; ++a4)
+          c.data()[a1 * Q + a4] = a_val(i, k, a1, a4);  // a4 fastest
+    }
+    return t;
+  });
+  ArenaArr bh(world, b_trange);  // arena-backed right operand, length-Q inner
+  bh.init_tiles([&](const TA::Range& tr) {
+    ArenaOuter t = TA::detail::arena_outer_init<ArenaOuter>(
+        tr, 1, [](std::size_t) { return TA::Range{Q}; });
+    for (std::size_t o = 0; o < t.range().volume(); ++o) {
+      ArenaInner& c = t.data()[o];
+      if (!c) continue;  // cell tests false: nothing to fill
+      const long i = static_cast<long>(o / (M * K));  // ordinal -> (i,m,k)
+      const long m = static_cast<long>((o / K) % M);
+      const long k = static_cast<long>(o % K);
+      for (long a4 = 0; a4 < Q; ++a4) c.data()[a4] = b_val(i, m, k, a4);
+    }
+    return t;
+  });
+  world.gop.fence();
+
+  OwnArr ao(world, a_trange);  // owning left operand, same values as ah
+  ao.init_tiles([&](const TA::Range& tr) {
+    OwnOuter t(tr);
+    for (std::size_t o = 0; o < t.range().volume(); ++o) {
+      OwnInner cell(TA::Range{P, Q});
+      const long i = static_cast<long>(o / K), k = static_cast<long>(o % K);
+      for (long a1 = 0; a1 < P; ++a1)
+        for (long a4 = 0; a4 < Q; ++a4)
+          cell.data()[a1 * Q + a4] = a_val(i, k, a1, a4);
+      t.data()[o] = cell;
+    }
+    return t;
+  });
+  OwnArr bo(world, b_trange);  // owning right operand, same values as bh
+  bo.init_tiles([&](const TA::Range& tr) {
+    OwnOuter t(tr);
+    for (std::size_t o = 0; o < t.range().volume(); ++o) {
+      OwnInner cell(TA::Range{Q});
+      const long i = static_cast<long>(o / (M * K));
+      const long m = static_cast<long>((o / K) % M);
+      const long k = static_cast<long>(o % K);
+      for (long a4 = 0; a4 < Q; ++a4) cell.data()[a4] = b_val(i, m, k, a4);
+      t.data()[o] = cell;
+    }
+    return t;
+  });
+  world.gop.fence();
+
+  ArenaArr ca = einsum(ah("i,k;a1,a4"), bh("i,m,k;a4"), "i,m;a1");  // arena
+  OwnArr co = einsum(ao("i,k;a1,a4"), bo("i,m,k;a4"), "i,m;a1");    // oracle
+  world.gop.fence();
+
+  BOOST_REQUIRE_EQUAL(ca.trange().elements_range().volume(),
+                      static_cast<std::size_t>(I * M));  // outer shape (i,m)
+
+  double max_abs_diff = 0.0;
+  std::size_t elements_compared = 0, result_outer_cells_seen = 0;
+  const auto& tiles = ca.trange().tiles_range();
+  for (std::size_t ord = 0; ord < tiles.volume(); ++ord) {
+    const bool a_here = ca.is_local(ord) && !ca.is_zero(ord);
+    const bool o_here = co.is_local(ord) && !co.is_zero(ord);
+    BOOST_REQUIRE_EQUAL(a_here, o_here);  // both results hold the tile or not
+    if (!a_here) continue;
+    const ArenaOuter& at = ca.find(ord).get();
+    const OwnOuter& ot = co.find(ord).get();
+    for (std::size_t o = 0; o < at.range().volume(); ++o) {
+      const ArenaInner& ac = at.data()[o];
+      const OwnInner& oc = ot.data()[o];
+      BOOST_REQUIRE_EQUAL(bool(ac), !oc.empty());  // same cells are populated
+      if (!ac) continue;
+      BOOST_REQUIRE_EQUAL(ac.size(), static_cast<std::size_t>(P));
+      ++result_outer_cells_seen;
+      for (std::size_t e = 0; e < ac.size(); ++e) {
+        const double d = std::abs(ac.data()[e] - oc.data()[e]);
+        if (d > max_abs_diff) max_abs_diff = d;
+        ++elements_compared;
+      }
+    }
+  }
+  world.gop.sum(result_outer_cells_seen);  // combine per-rank tallies
+  world.gop.sum(elements_compared);
+  world.gop.max(max_abs_diff);
+  BOOST_CHECK_EQUAL(result_outer_cells_seen, static_cast<std::size_t>(I * M));
+  BOOST_CHECK_EQUAL(elements_compared, static_cast<std::size_t>(I * M * P));
+  BOOST_CHECK_LT(max_abs_diff, 1e-12);
+}
+
+#ifdef TA_STRIDED_DGEMM_COUNT
+// Witness for the arena hce+ce contraction c(i,m;a1) = sum_{k,a4}
+// a(i,k;a1,a4) * b(i,m,k;a4): the right-oriented ce+ce strided counter must be
+// nonzero on at least one rank after the einsum. Only compiled when the
+// counters exist (TA_STRIDED_DGEMM_COUNT).
+BOOST_AUTO_TEST_CASE(hce_ce_uses_strided_path) {
+  using ArenaInner = TA::ArenaTensor<double, TA::Range>;
+  using ArenaOuter = TA::Tensor<ArenaInner>;
+  using ArenaArr = TA::DistArray<ArenaOuter, TA::DensePolicy>;
+
+  auto& world = TiledArray::get_default_world();
+  constexpr long I = 2, M = 3, K = 2, P = 4, Q = 5;  // P = |a1|, Q = |a4|
+  TA::TiledRange a_trange{{0l, I}, {0l, K}};
+  TA::TiledRange b_trange{{0l, I}, {0l, M}, {0l, K}};
+
+  // Deterministic fill values for the two operands.
+  auto a_val = [](long i, long k, long a1, long a4) {
+    return 1.0 + 0.5 * i + 0.25 * k + 0.125 * a1 + 0.0625 * a4;
+  };
+  auto b_val = [](long i, long m, long k, long a4) {
+    return 2.0 - 0.3 * i + 0.2 * m + 0.1 * k + 0.05 * a4;
+  };
+
+  // Left operand: outer (i,k), inner P x Q with a4 fastest.
+  ArenaArr ah(world, a_trange);
+  ah.init_tiles([&](const TA::Range& tr) {
+    ArenaOuter t = TA::detail::arena_outer_init<ArenaOuter>(
+        tr, 1, [](std::size_t) { return TA::Range{P, Q}; });
+    for (std::size_t o = 0; o < t.range().volume(); ++o) {
+      ArenaInner& c = t.data()[o];
+      if (!c) continue;  // cell tests false: nothing to fill
+      const long i = static_cast<long>(o / K), k = static_cast<long>(o % K);
+      for (long a1 = 0; a1 < P; ++a1)
+        for (long a4 = 0; a4 < Q; ++a4)
+          c.data()[a1 * Q + a4] = a_val(i, k, a1, a4);
+    }
+    return t;
+  });
+  // Right operand: outer (i,m,k), inner vector of length Q.
+  ArenaArr bh(world, b_trange);
+  bh.init_tiles([&](const TA::Range& tr) {
+    ArenaOuter t = TA::detail::arena_outer_init<ArenaOuter>(
+        tr, 1, [](std::size_t) { return TA::Range{Q}; });
+    for (std::size_t o = 0; o < t.range().volume(); ++o) {
+      ArenaInner& c = t.data()[o];
+      if (!c) continue;  // cell tests false: nothing to fill
+      const long i = static_cast<long>(o / (M * K));
+      const long m = static_cast<long>((o / K) % M);
+      const long k = static_cast<long>(o % K);
+      for (long a4 = 0; a4 < Q; ++a4) c.data()[a4] = b_val(i, m, k, a4);
+    }
+    return t;
+  });
+  world.gop.fence();
+
+  // Start the witness from zero so earlier tests cannot satisfy it.
+  TiledArray::detail::g_strided_dgemm_ce_ce_right_calls.store(0);
+  auto ca = einsum(ah("i,k;a1,a4"), bh("i,m,k;a4"), "i,m;a1");
+  ca.world().gop.fence();
+  // The counter is a process-local global: sum it across the world so a rank
+  // that executed no kernel does not fail the witness.
+  std::size_t strided_calls =
+      TiledArray::detail::g_strided_dgemm_ce_ce_right_calls.load();
+  ca.world().gop.sum(strided_calls);
+  BOOST_CHECK_GT(strided_calls, 0u);
+}
+#endif
+
+// No-Hadamard ce+ce (nbatch==1) generality oracle. c(m;a1) = sum_{k,a4}
+// a(k;a1,a4) * b(m,k;a4). The strided fused core must fire (witnessed under
+// the counter) -- otherwise this oracle would pass even with the install gone.
+//
+// Runs the same einsum on arena-backed and owning tensor-of-tensor arrays
+// filled with identical values and compares the results elementwise; all
+// tallies, including the counter witness, are reduced over the world before
+// they are asserted.
+BOOST_AUTO_TEST_CASE(ce_ce_no_hadamard_arena_matches_owning) {
+  using ArenaInner = TA::ArenaTensor<double, TA::Range>;
+  using ArenaOuter = TA::Tensor<ArenaInner>;
+  using ArenaArr = TA::DistArray<ArenaOuter, TA::DensePolicy>;
+  using OwnInner = TA::Tensor<double>;
+  using OwnOuter = TA::Tensor<OwnInner>;
+  using OwnArr = TA::DistArray<OwnOuter, TA::DensePolicy>;
+
+  auto& world = TiledArray::get_default_world();
+  constexpr long M = 3, K = 2, P = 4, Q = 5;  // P = |a1|, Q = |a4|
+
+  TA::TiledRange a_trange{{0l, K}};
+  TA::TiledRange b_trange{{0l, M}, {0l, K}};
+
+  // Deterministic fill values for the two operands.
+  auto a_val = [](long k, long a1, long a4) {
+    return 1.0 + 0.25 * k + 0.125 * a1 + 0.0625 * a4;
+  };
+  auto b_val = [](long m, long k, long a4) {
+    return 2.0 + 0.2 * m + 0.1 * k + 0.05 * a4;
+  };
+
+  // Arena-backed operands. Left: outer (k), inner P x Q with a4 fastest.
+  ArenaArr ah(world, a_trange);
+  ah.init_tiles([&](const TA::Range& tr) {
+    ArenaOuter t = TA::detail::arena_outer_init<ArenaOuter>(
+        tr, 1, [](std::size_t) { return TA::Range{P, Q}; });
+    for (std::size_t o = 0; o < t.range().volume(); ++o) {
+      ArenaInner& c = t.data()[o];
+      if (!c) continue;  // cell tests false: nothing to fill
+      const long k = static_cast<long>(o);
+      for (long a1 = 0; a1 < P; ++a1)
+        for (long a4 = 0; a4 < Q; ++a4) c.data()[a1 * Q + a4] = a_val(k, a1, a4);
+    }
+    return t;
+  });
+  // Right: outer (m,k) decoded with k fastest, inner vector of length Q.
+  ArenaArr bh(world, b_trange);
+  bh.init_tiles([&](const TA::Range& tr) {
+    ArenaOuter t = TA::detail::arena_outer_init<ArenaOuter>(
+        tr, 1, [](std::size_t) { return TA::Range{Q}; });
+    for (std::size_t o = 0; o < t.range().volume(); ++o) {
+      ArenaInner& c = t.data()[o];
+      if (!c) continue;  // cell tests false: nothing to fill
+      const long m = static_cast<long>(o / K), k = static_cast<long>(o % K);
+      for (long a4 = 0; a4 < Q; ++a4) c.data()[a4] = b_val(m, k, a4);
+    }
+    return t;
+  });
+  world.gop.fence();
+
+  // Owning operands: same ranges and values, each inner cell is a TA::Tensor.
+  OwnArr ao(world, a_trange);
+  ao.init_tiles([&](const TA::Range& tr) {
+    OwnOuter t(tr);
+    for (std::size_t o = 0; o < t.range().volume(); ++o) {
+      OwnInner cell(TA::Range{P, Q});
+      const long k = static_cast<long>(o);
+      for (long a1 = 0; a1 < P; ++a1)
+        for (long a4 = 0; a4 < Q; ++a4) cell.data()[a1 * Q + a4] = a_val(k, a1, a4);
+      t.data()[o] = cell;
+    }
+    return t;
+  });
+  OwnArr bo(world, b_trange);
+  bo.init_tiles([&](const TA::Range& tr) {
+    OwnOuter t(tr);
+    for (std::size_t o = 0; o < t.range().volume(); ++o) {
+      OwnInner cell(TA::Range{Q});
+      const long m = static_cast<long>(o / K), k = static_cast<long>(o % K);
+      for (long a4 = 0; a4 < Q; ++a4) cell.data()[a4] = b_val(m, k, a4);
+      t.data()[o] = cell;
+    }
+    return t;
+  });
+  world.gop.fence();
+
+#ifdef TA_STRIDED_DGEMM_COUNT
+  // Start the witness from zero so earlier tests cannot satisfy it.
+  TiledArray::detail::g_strided_dgemm_ce_ce_right_calls.store(0);
+#endif
+  ArenaArr ca = einsum(ah("k;a1,a4"), bh("m,k;a4"), "m;a1");
+  OwnArr co = einsum(ao("k;a1,a4"), bo("m,k;a4"), "m;a1");
+  world.gop.fence();
+#ifdef TA_STRIDED_DGEMM_COUNT
+  // The counter is a process-local global: sum it across the world so a rank
+  // that executed no kernel does not fail the witness.
+  std::size_t strided_calls =
+      TiledArray::detail::g_strided_dgemm_ce_ce_right_calls.load();
+  world.gop.sum(strided_calls);
+  BOOST_CHECK_GT(strided_calls, 0u);
+#endif
+
+  // Compare every locally held, non-null arena result cell with the owning one.
+  double max_abs_diff = 0.0;
+  std::size_t elements_compared = 0, result_outer_cells_seen = 0;
+  const auto& tiles = ca.trange().tiles_range();
+  for (std::size_t ord = 0; ord < tiles.volume(); ++ord) {
+    const bool a_here = ca.is_local(ord) && !ca.is_zero(ord);
+    const bool o_here = co.is_local(ord) && !co.is_zero(ord);
+    BOOST_REQUIRE_EQUAL(a_here, o_here);
+    if (!a_here) continue;
+    const ArenaOuter& at = ca.find(ord).get();
+    const OwnOuter& ot = co.find(ord).get();
+    for (std::size_t o = 0; o < at.range().volume(); ++o) {
+      const ArenaInner& ac = at.data()[o];
+      const OwnInner& oc = ot.data()[o];
+      if (!ac) continue;
+      BOOST_REQUIRE_EQUAL(ac.size(), static_cast<std::size_t>(P));
+      ++result_outer_cells_seen;
+      for (std::size_t e = 0; e < ac.size(); ++e) {
+        const double d = std::abs(ac.data()[e] - oc.data()[e]);
+        if (d > max_abs_diff) max_abs_diff = d;
+        ++elements_compared;
+      }
+    }
+  }
+  // Combine the per-rank statistics before asserting on them. The element
+  // tally was previously accumulated but never reduced or checked.
+  world.gop.sum(result_outer_cells_seen);
+  world.gop.sum(elements_compared);
+  world.gop.max(max_abs_diff);
+  BOOST_CHECK_EQUAL(result_outer_cells_seen, static_cast<std::size_t>(M));
+  BOOST_CHECK_EQUAL(elements_compared, static_cast<std::size_t>(M * P));
+  BOOST_CHECK_LT(max_abs_diff, 1e-12);
+}
+
+// LEFT-clean ce+ce (either-side generalization): the LEFT operand inner is a
+// pure contraction vector a(m,k;a4); the RIGHT carries the inner external
+// b(k,n;a4,b1); result inner is b1. The left-oriented strided core must fire.
+//   c(m,n;b1) = sum_{k,a4} a(m,k;a4) * b(k,n;a4,b1)
+//
+// Runs the same einsum on arena-backed and owning tensor-of-tensor arrays
+// filled with identical values and compares the results elementwise; the
+// left-oriented counter witness is reduced over the world before it is
+// asserted.
+BOOST_AUTO_TEST_CASE(ce_ce_left_clean_arena_matches_owning) {
+  using ArenaInner = TA::ArenaTensor<double, TA::Range>;
+  using ArenaOuter = TA::Tensor<ArenaInner>;
+  using ArenaArr = TA::DistArray<ArenaOuter, TA::DensePolicy>;
+  using OwnInner = TA::Tensor<double>;
+  using OwnOuter = TA::Tensor<OwnInner>;
+  using OwnArr = TA::DistArray<OwnOuter, TA::DensePolicy>;
+
+  auto& world = TiledArray::get_default_world();
+  constexpr long M = 3, N = 2, K = 2, Q = 5, P = 4;  // a4=Q, b1=P
+
+  TA::TiledRange a_trange{{0l, M}, {0l, K}};  // (m,k)
+  TA::TiledRange b_trange{{0l, K}, {0l, N}};  // (k,n)
+
+  // Deterministic fill values for the two operands.
+  auto a_val = [](long m, long k, long a4) {
+    return 1.0 + 0.25 * m + 0.125 * k + 0.0625 * a4;
+  };
+  auto b_val = [](long k, long n, long a4, long b1) {
+    return 2.0 + 0.2 * k + 0.1 * n + 0.05 * a4 + 0.025 * b1;
+  };
+
+  // Arena-backed operands. Left: inner vector of length Q.
+  ArenaArr ah(world, a_trange);
+  ah.init_tiles([&](const TA::Range& tr) {
+    ArenaOuter t = TA::detail::arena_outer_init<ArenaOuter>(
+        tr, 1, [](std::size_t) { return TA::Range{Q}; });
+    for (std::size_t o = 0; o < t.range().volume(); ++o) {
+      ArenaInner& c = t.data()[o];
+      if (!c) continue;  // cell tests false: nothing to fill
+      const long m = static_cast<long>(o / K), k = static_cast<long>(o % K);
+      for (long a4 = 0; a4 < Q; ++a4) c.data()[a4] = a_val(m, k, a4);
+    }
+    return t;
+  });
+  // Right: inner Q x P with b1 fastest.
+  ArenaArr bh(world, b_trange);
+  bh.init_tiles([&](const TA::Range& tr) {
+    ArenaOuter t = TA::detail::arena_outer_init<ArenaOuter>(
+        tr, 1, [](std::size_t) { return TA::Range{Q, P}; });
+    for (std::size_t o = 0; o < t.range().volume(); ++o) {
+      ArenaInner& c = t.data()[o];
+      if (!c) continue;  // cell tests false: nothing to fill
+      const long k = static_cast<long>(o / N), n = static_cast<long>(o % N);
+      for (long a4 = 0; a4 < Q; ++a4)
+        for (long b1 = 0; b1 < P; ++b1)
+          c.data()[a4 * P + b1] = b_val(k, n, a4, b1);
+    }
+    return t;
+  });
+  world.gop.fence();
+
+  // Owning operands: same ranges and values, each inner cell is a TA::Tensor.
+  OwnArr ao(world, a_trange);
+  ao.init_tiles([&](const TA::Range& tr) {
+    OwnOuter t(tr);
+    for (std::size_t o = 0; o < t.range().volume(); ++o) {
+      OwnInner cell(TA::Range{Q});
+      const long m = static_cast<long>(o / K), k = static_cast<long>(o % K);
+      for (long a4 = 0; a4 < Q; ++a4) cell.data()[a4] = a_val(m, k, a4);
+      t.data()[o] = cell;
+    }
+    return t;
+  });
+  OwnArr bo(world, b_trange);
+  bo.init_tiles([&](const TA::Range& tr) {
+    OwnOuter t(tr);
+    for (std::size_t o = 0; o < t.range().volume(); ++o) {
+      OwnInner cell(TA::Range{Q, P});
+      const long k = static_cast<long>(o / N), n = static_cast<long>(o % N);
+      for (long a4 = 0; a4 < Q; ++a4)
+        for (long b1 = 0; b1 < P; ++b1)
+          cell.data()[a4 * P + b1] = b_val(k, n, a4, b1);
+      t.data()[o] = cell;
+    }
+    return t;
+  });
+  world.gop.fence();
+
+#ifdef TA_STRIDED_DGEMM_COUNT
+  // Start the witness from zero so earlier tests cannot satisfy it.
+  TiledArray::detail::g_strided_dgemm_ce_ce_left_calls.store(0);
+#endif
+  ArenaArr ca = einsum(ah("m,k;a4"), bh("k,n;a4,b1"), "m,n;b1");
+  OwnArr co = einsum(ao("m,k;a4"), bo("k,n;a4,b1"), "m,n;b1");
+  world.gop.fence();
+#ifdef TA_STRIDED_DGEMM_COUNT
+  // The counter is a process-local global: sum it across the world so a rank
+  // that executed no kernel does not fail the witness.
+  std::size_t strided_calls =
+      TiledArray::detail::g_strided_dgemm_ce_ce_left_calls.load();
+  world.gop.sum(strided_calls);
+  BOOST_CHECK_GT(strided_calls, 0u);
+#endif
+
+  // Compare every locally held, non-null arena result cell with the owning one.
+  double max_abs_diff = 0.0;
+  std::size_t result_outer_cells_seen = 0;
+  const auto& tiles = ca.trange().tiles_range();
+  for (std::size_t ord = 0; ord < tiles.volume(); ++ord) {
+    const bool a_here = ca.is_local(ord) && !ca.is_zero(ord);
+    const bool o_here = co.is_local(ord) && !co.is_zero(ord);
+    BOOST_REQUIRE_EQUAL(a_here, o_here);
+    if (!a_here) continue;
+    const ArenaOuter& at = ca.find(ord).get();
+    const OwnOuter& ot = co.find(ord).get();
+    for (std::size_t o = 0; o < at.range().volume(); ++o) {
+      const ArenaInner& ac = at.data()[o];
+      const OwnInner& oc = ot.data()[o];
+      if (!ac) continue;
+      BOOST_REQUIRE_EQUAL(ac.size(), static_cast<std::size_t>(P));
+      ++result_outer_cells_seen;
+      for (std::size_t e = 0; e < ac.size(); ++e)
+        max_abs_diff =
+            std::max(max_abs_diff, std::abs(ac.data()[e] - oc.data()[e]));
+    }
+  }
+  // Combine the per-rank statistics before asserting on them.
+  world.gop.sum(result_outer_cells_seen);
+  world.gop.max(max_abs_diff);
+  BOOST_CHECK_EQUAL(result_outer_cells_seen, static_cast<std::size_t>(M * N));
+  BOOST_CHECK_LT(max_abs_diff, 1e-12);
+}
+
+// GENERAL ce+ce (matrix x matrix inner): BOTH operands carry an inner external
+// (a1 on the left, b1 on the right) alongside the inner contraction a4. Not
+// strided-castable; NEITHER the right- nor left-oriented strided core may fire,
+// and the generic per-cell path must still match the owning oracle.
+//   c(m,n;a1,b1) = sum_{k,a4} a(m,k;a1,a4) * b(k,n;a4,b1)
+BOOST_AUTO_TEST_CASE(ce_ce_general_matrix_matrix_not_strided) {
+  using ArenaInner = TA::ArenaTensor<double, TA::Range>;
+  using ArenaOuter = TA::Tensor<ArenaInner>;
+  using ArenaArr = TA::DistArray<ArenaOuter, TA::DensePolicy>;
+  using OwnInner = TA::Tensor<double>;
+  using OwnOuter = TA::Tensor<OwnInner>;
+  using OwnArr = TA::DistArray<OwnOuter, TA::DensePolicy>;
+
+  auto& world = TiledArray::get_default_world();
+  constexpr long M = 3, N = 2, K = 2, P1 = 3, Q = 4, P2 = 5;  // a1,a4,b1
+
+  TA::TiledRange a_trange{{0l, M}, {0l, K}};  // (m,k)
+  TA::TiledRange b_trange{{0l, K}, {0l, N}};  // (k,n)
+
+  auto a_val = [](long m, long k, long a1, long a4) {  // left fill value
+    return 1.0 + 0.25 * m + 0.125 * k + 0.0625 * a1 + 0.03 * a4;
+  };
+  auto b_val = [](long k, long n, long a4, long b1) {  // right fill value
+    return 2.0 + 0.2 * k + 0.1 * n + 0.05 * a4 + 0.025 * b1;
+  };
+
+  ArenaArr ah(world, a_trange);  // arena-backed left operand, P1 x Q inner
+  ah.init_tiles([&](const TA::Range& tr) {
+    ArenaOuter t = TA::detail::arena_outer_init<ArenaOuter>(
+        tr, 1, [](std::size_t) { return TA::Range{P1, Q}; });
+    for (std::size_t o = 0; o < t.range().volume(); ++o) {
+      ArenaInner& c = t.data()[o];
+      if (!c) continue;  // cell tests false: nothing to fill
+      const long m = static_cast<long>(o / K), k = static_cast<long>(o % K);
+      for (long a1 = 0; a1 < P1; ++a1)
+        for (long a4 = 0; a4 < Q; ++a4)
+          c.data()[a1 * Q + a4] = a_val(m, k, a1, a4);  // a4 fastest
+    }
+    return t;
+  });
+  ArenaArr bh(world, b_trange);  // arena-backed right operand, Q x P2 inner
+  bh.init_tiles([&](const TA::Range& tr) {
+    ArenaOuter t = TA::detail::arena_outer_init<ArenaOuter>(
+        tr, 1, [](std::size_t) { return TA::Range{Q, P2}; });
+    for (std::size_t o = 0; o < t.range().volume(); ++o) {
+      ArenaInner& c = t.data()[o];
+      if (!c) continue;  // cell tests false: nothing to fill
+      const long k = static_cast<long>(o / N), n = static_cast<long>(o % N);
+      for (long a4 = 0; a4 < Q; ++a4)
+        for (long b1 = 0; b1 < P2; ++b1)
+          c.data()[a4 * P2 + b1] = b_val(k, n, a4, b1);  // b1 fastest
+    }
+    return t;
+  });
+  world.gop.fence();
+
+  OwnArr ao(world, a_trange);  // owning left operand, same values as ah
+  ao.init_tiles([&](const TA::Range& tr) {
+    OwnOuter t(tr);
+    for (std::size_t o = 0; o < t.range().volume(); ++o) {
+      OwnInner cell(TA::Range{P1, Q});
+      const long m = static_cast<long>(o / K), k = static_cast<long>(o % K);
+      for (long a1 = 0; a1 < P1; ++a1)
+        for (long a4 = 0; a4 < Q; ++a4)
+          cell.data()[a1 * Q + a4] = a_val(m, k, a1, a4);
+      t.data()[o] = cell;
+    }
+    return t;
+  });
+  OwnArr bo(world, b_trange);  // owning right operand, same values as bh
+  bo.init_tiles([&](const TA::Range& tr) {
+    OwnOuter t(tr);
+    for (std::size_t o = 0; o < t.range().volume(); ++o) {
+      OwnInner cell(TA::Range{Q, P2});
+      const long k = static_cast<long>(o / N), n = static_cast<long>(o % N);
+      for (long a4 = 0; a4 < Q; ++a4)
+        for (long b1 = 0; b1 < P2; ++b1)
+          cell.data()[a4 * P2 + b1] = b_val(k, n, a4, b1);
+      t.data()[o] = cell;
+    }
+    return t;
+  });
+  world.gop.fence();
+
+#ifdef TA_STRIDED_DGEMM_COUNT
+  TiledArray::detail::g_strided_dgemm_ce_ce_right_calls.store(0);  // reset
+  TiledArray::detail::g_strided_dgemm_ce_ce_left_calls.store(0);   // reset
+#endif
+  ArenaArr ca = einsum(ah("m,k;a1,a4"), bh("k,n;a4,b1"), "m,n;a1,b1");
+  OwnArr co = einsum(ao("m,k;a1,a4"), bo("k,n;a4,b1"), "m,n;a1,b1");
+  world.gop.fence();
+#ifdef TA_STRIDED_DGEMM_COUNT
+  // matrix x matrix inner: neither strided orientation may fire. The checks are
+  // evaluated on each process's own counters.
+  BOOST_CHECK_EQUAL(TiledArray::detail::g_strided_dgemm_ce_ce_right_calls.load(),
+                    0u);
+  BOOST_CHECK_EQUAL(TiledArray::detail::g_strided_dgemm_ce_ce_left_calls.load(),
+                    0u);
+#endif
+
+  double max_abs_diff = 0.0;
+  std::size_t result_outer_cells_seen = 0;
+  const auto& tiles = ca.trange().tiles_range();
+  for (std::size_t ord = 0; ord < tiles.volume(); ++ord) {
+    const bool a_here = ca.is_local(ord) && !ca.is_zero(ord);
+    const bool o_here = co.is_local(ord) && !co.is_zero(ord);
+    BOOST_REQUIRE_EQUAL(a_here, o_here);  // both results hold the tile or not
+    if (!a_here) continue;
+    const ArenaOuter& at = ca.find(ord).get();
+    const OwnOuter& ot = co.find(ord).get();
+    for (std::size_t o = 0; o < at.range().volume(); ++o) {
+      const ArenaInner& ac = at.data()[o];
+      const OwnInner& oc = ot.data()[o];
+      if (!ac) continue;
+      BOOST_REQUIRE_EQUAL(ac.size(), static_cast<std::size_t>(P1 * P2));
+      ++result_outer_cells_seen;
+      for (std::size_t e = 0; e < ac.size(); ++e)
+        max_abs_diff =
+            std::max(max_abs_diff, std::abs(ac.data()[e] - oc.data()[e]));
+    }
+  }
+  world.gop.sum(result_outer_cells_seen);  // combine per-rank tallies
+  world.gop.max(max_abs_diff);
+  BOOST_CHECK_EQUAL(result_outer_cells_seen, static_cast<std::size_t>(M * N));
+  BOOST_CHECK_LT(max_abs_diff, 1e-12);
+}
+
+// Addition B (left-external + Hadamard): c(h,i,k;a1) = sum_{j,a4}
+// a(h,i,j;a1,a4) * b(h,j,k;a4). h Hadamard (nbatch>1), i left-external (Mo>1),
+// j outer-contracted, k right-external. On $SRC this falls to the per-cell
+// path (left external rejected); after Addition B it must fire AND match.
+//
+// Runs the same einsum on arena-backed and owning tensor-of-tensor arrays
+// filled with identical values and compares the results elementwise; the
+// right-oriented counter witness is reduced over the world before it is
+// asserted.
+BOOST_AUTO_TEST_CASE(hce_ce_left_external_arena_matches_owning) {
+  using ArenaInner = TA::ArenaTensor<double, TA::Range>;
+  using ArenaOuter = TA::Tensor<ArenaInner>;
+  using ArenaArr = TA::DistArray<ArenaOuter, TA::DensePolicy>;
+  using OwnInner = TA::Tensor<double>;
+  using OwnOuter = TA::Tensor<OwnInner>;
+  using OwnArr = TA::DistArray<OwnOuter, TA::DensePolicy>;
+
+  auto& world = TiledArray::get_default_world();
+  // KK is the extent of outer mode k; P = |a1|, Q = |a4|.
+  constexpr long H = 2, I = 2, J = 2, KK = 3, P = 4, Q = 5;
+
+  TA::TiledRange a_trange{{0l, H}, {0l, I}, {0l, J}};   // (h,i,j)
+  TA::TiledRange b_trange{{0l, H}, {0l, J}, {0l, KK}};  // (h,j,k)
+
+  // Deterministic fill values for the two operands.
+  auto a_val = [](long h, long i, long j, long a1, long a4) {
+    return 1.0 + 0.5 * h + 0.3 * i + 0.2 * j + 0.1 * a1 + 0.05 * a4;
+  };
+  auto b_val = [](long h, long j, long k, long a4) {
+    return 2.0 - 0.4 * h + 0.25 * j + 0.15 * k + 0.07 * a4;
+  };
+
+  // Arena-backed operands. Left: inner P x Q with a4 fastest.
+  ArenaArr ah(world, a_trange);
+  ah.init_tiles([&](const TA::Range& tr) {
+    ArenaOuter t = TA::detail::arena_outer_init<ArenaOuter>(
+        tr, 1, [](std::size_t) { return TA::Range{P, Q}; });
+    for (std::size_t o = 0; o < t.range().volume(); ++o) {
+      ArenaInner& c = t.data()[o];
+      if (!c) continue;  // cell tests false: nothing to fill
+      // Decode the flat outer ordinal as (h, i, j), j fastest.
+      const long h = static_cast<long>(o / (I * J));
+      const long i = static_cast<long>((o / J) % I), j = static_cast<long>(o % J);
+      for (long a1 = 0; a1 < P; ++a1)
+        for (long a4 = 0; a4 < Q; ++a4)
+          c.data()[a1 * Q + a4] = a_val(h, i, j, a1, a4);
+    }
+    return t;
+  });
+  // Right: inner vector of length Q.
+  ArenaArr bh(world, b_trange);
+  bh.init_tiles([&](const TA::Range& tr) {
+    ArenaOuter t = TA::detail::arena_outer_init<ArenaOuter>(
+        tr, 1, [](std::size_t) { return TA::Range{Q}; });
+    for (std::size_t o = 0; o < t.range().volume(); ++o) {
+      ArenaInner& c = t.data()[o];
+      if (!c) continue;  // cell tests false: nothing to fill
+      // Decode the flat outer ordinal as (h, j, k), k fastest.
+      const long h = static_cast<long>(o / (J * KK));
+      const long j = static_cast<long>((o / KK) % J), k = static_cast<long>(o % KK);
+      for (long a4 = 0; a4 < Q; ++a4) c.data()[a4] = b_val(h, j, k, a4);
+    }
+    return t;
+  });
+  world.gop.fence();
+
+  // Owning operands: same ranges and values, each inner cell is a TA::Tensor.
+  OwnArr ao(world, a_trange);
+  ao.init_tiles([&](const TA::Range& tr) {
+    OwnOuter t(tr);
+    for (std::size_t o = 0; o < t.range().volume(); ++o) {
+      OwnInner cell(TA::Range{P, Q});
+      const long h = static_cast<long>(o / (I * J));
+      const long i = static_cast<long>((o / J) % I), j = static_cast<long>(o % J);
+      for (long a1 = 0; a1 < P; ++a1)
+        for (long a4 = 0; a4 < Q; ++a4)
+          cell.data()[a1 * Q + a4] = a_val(h, i, j, a1, a4);
+      t.data()[o] = cell;
+    }
+    return t;
+  });
+  OwnArr bo(world, b_trange);
+  bo.init_tiles([&](const TA::Range& tr) {
+    OwnOuter t(tr);
+    for (std::size_t o = 0; o < t.range().volume(); ++o) {
+      OwnInner cell(TA::Range{Q});
+      const long h = static_cast<long>(o / (J * KK));
+      const long j = static_cast<long>((o / KK) % J), k = static_cast<long>(o % KK);
+      for (long a4 = 0; a4 < Q; ++a4) cell.data()[a4] = b_val(h, j, k, a4);
+      t.data()[o] = cell;
+    }
+    return t;
+  });
+  world.gop.fence();
+
+#ifdef TA_STRIDED_DGEMM_COUNT
+  // Start the witness from zero so earlier tests cannot satisfy it.
+  TiledArray::detail::g_strided_dgemm_ce_ce_right_calls.store(0);
+#endif
+  ArenaArr ca = einsum(ah("h,i,j;a1,a4"), bh("h,j,k;a4"), "h,i,k;a1");
+  OwnArr co = einsum(ao("h,i,j;a1,a4"), bo("h,j,k;a4"), "h,i,k;a1");
+  world.gop.fence();
+#ifdef TA_STRIDED_DGEMM_COUNT
+  // The counter is a process-local global: sum it across the world so a rank
+  // that executed no kernel does not fail the witness.
+  std::size_t strided_calls =
+      TiledArray::detail::g_strided_dgemm_ce_ce_right_calls.load();
+  world.gop.sum(strided_calls);
+  BOOST_CHECK_GT(strided_calls, 0u);
+#endif
+
+  // Compare every locally held, non-null arena result cell with the owning one.
+  double max_abs_diff = 0.0;
+  std::size_t elements_compared = 0;
+  const auto& tiles = ca.trange().tiles_range();
+  for (std::size_t ord = 0; ord < tiles.volume(); ++ord) {
+    if (!(ca.is_local(ord) && !ca.is_zero(ord))) continue;
+    const ArenaOuter& at = ca.find(ord).get();
+    const OwnOuter& ot = co.find(ord).get();
+    for (std::size_t o = 0; o < at.range().volume(); ++o) {
+      const ArenaInner& ac = at.data()[o];
+      const OwnInner& oc = ot.data()[o];
+      if (!ac) continue;
+      BOOST_REQUIRE_EQUAL(ac.size(), static_cast<std::size_t>(P));
+      BOOST_REQUIRE_EQUAL(ac.size(), oc.size());
+      for (std::size_t e = 0; e < ac.size(); ++e) {
+        const double d = std::abs(ac.data()[e] - oc.data()[e]);
+        if (d > max_abs_diff) max_abs_diff = d;
+        ++elements_compared;
+      }
+    }
+  }
+  // Combine the per-rank statistics before asserting on them.
+  world.gop.sum(elements_compared);
+  world.gop.max(max_abs_diff);
+  BOOST_REQUIRE_GT(elements_compared, 0u);
+  BOOST_CHECK_LT(max_abs_diff, 1e-12);
+}
+
+// --------------------------------------------------------------------------
+// strided-DGEMM combined equivalence matrices (REVISION: multi-external +
+// left-external + nbatch>1, multi-tile contracted index, edge sizes).
+// --------------------------------------------------------------------------
+
+// Arena-vs-owning equivalence for the combined "ce+e" strided-DGEMM case:
+//   c(h,i,k;a1,a2,a3) = sum_j a(h,i,j;a1,a2) * b(h,k,j;a3)
+// Ingredients exercised together: Hadamard outer index h (nbatch>1), a
+// contracted outer index j split over two tiles of extents {2},{1} (the SUMMA
+// K-panel stream), and kept inner indices on both operands. The same product
+// is evaluated with arena-backed and with owning inner tensors and compared
+// element by element; with TA_STRIDED_DGEMM_COUNT defined the ce_e call
+// counter must also be nonzero after the contraction.
+// NOTE(review): the counter is read locally on each rank -- presumably every
+// rank has to execute at least one strided call for the witness to hold;
+// confirm for multi-rank runs.
+BOOST_AUTO_TEST_CASE(hce_e_combined_equivalence_strided) {
+  using ArenaInner = TA::ArenaTensor<double, TA::Range>;
+  using ArenaOuter = TA::Tensor<ArenaInner>;
+  using ArenaArr = TA::DistArray<ArenaOuter, TA::DensePolicy>;
+  using OwnInner = TA::Tensor<double>;
+  using OwnOuter = TA::Tensor<OwnInner>;
+  using OwnArr = TA::DistArray<OwnOuter, TA::DensePolicy>;
+
+  auto& world = TiledArray::get_default_world();
+
+  // Outer extents for h, i, k.
+  constexpr long H = 2;
+  constexpr long I = 2;
+  constexpr long KK = 2;
+  // Inner extents for a1, a2 (operand a) and a3 (operand b).
+  constexpr long A1 = 2;
+  constexpr long A2 = 2;
+  constexpr long A3 = 2;
+  // Element counts of one inner cell of a and of b.
+  constexpr long P = A1 * A2;
+  constexpr long Q = A3;
+  // J = 2+1 elements over two tiles; the tiling below spells the boundaries
+  // out literally, so J is documentation only.
+  constexpr long J = 3;
+  (void)J;
+
+  const TA::TiledRange1 h_tiling{0l, H};
+  const TA::TiledRange1 i_tiling{0l, I};
+  const TA::TiledRange1 k_tiling{0l, KK};
+  const TA::TiledRange1 j_tiling{0l, 2l, 3l};
+  TA::TiledRange a_trange{h_tiling, i_tiling, j_tiling};  // (h,i,j)
+  TA::TiledRange b_trange{h_tiling, k_tiling, j_tiling};  // (h,k,j)
+
+  // Deterministic fill values; e is the flat offset inside an inner cell.
+  auto a_val = [](long h, long i, long j, long e) {
+    return 1.0 + 0.5 * h + 0.3 * i + 0.2 * j + 0.1 * e;
+  };
+  auto b_val = [](long h, long k, long j, long e) {
+    return 2.0 - 0.4 * h + 0.25 * k + 0.15 * j + 0.07 * e;
+  };
+
+  // ---- arena-backed operands ----
+  ArenaArr a_arena(world, a_trange);
+  a_arena.init_tiles([&](const TA::Range& tile_range) {
+    ArenaOuter tile = TA::detail::arena_outer_init<ArenaOuter>(
+        tile_range, 1, [](std::size_t) { return TA::Range{A1, A2}; });
+    const auto& outer = tile.range();
+    const std::size_t n_cells = outer.volume();
+    for (std::size_t cell_ord = 0; cell_ord < n_cells; ++cell_ord) {
+      ArenaInner& cell = tile.data()[cell_ord];
+      if (!cell) continue;  // nothing to fill for a null cell
+      auto hij = outer.idx(cell_ord);  // (h,i,j) index of this cell
+      for (long off = 0; off < P; ++off)
+        cell.data()[off] = a_val(hij[0], hij[1], hij[2], off);
+    }
+    return tile;
+  });
+  ArenaArr b_arena(world, b_trange);
+  b_arena.init_tiles([&](const TA::Range& tile_range) {
+    ArenaOuter tile = TA::detail::arena_outer_init<ArenaOuter>(
+        tile_range, 1, [](std::size_t) { return TA::Range{A3}; });
+    const auto& outer = tile.range();
+    const std::size_t n_cells = outer.volume();
+    for (std::size_t cell_ord = 0; cell_ord < n_cells; ++cell_ord) {
+      ArenaInner& cell = tile.data()[cell_ord];
+      if (!cell) continue;  // nothing to fill for a null cell
+      auto hkj = outer.idx(cell_ord);  // (h,k,j) index of this cell
+      for (long off = 0; off < Q; ++off)
+        cell.data()[off] = b_val(hkj[0], hkj[1], hkj[2], off);
+    }
+    return tile;
+  });
+  world.gop.fence();
+
+  // ---- owning operands, filled with the same values ----
+  OwnArr a_own(world, a_trange);
+  a_own.init_tiles([&](const TA::Range& tile_range) {
+    OwnOuter tile(tile_range);
+    const auto& outer = tile.range();
+    const std::size_t n_cells = outer.volume();
+    for (std::size_t cell_ord = 0; cell_ord < n_cells; ++cell_ord) {
+      auto hij = outer.idx(cell_ord);
+      OwnInner inner(TA::Range{A1, A2});
+      for (long off = 0; off < P; ++off)
+        inner.data()[off] = a_val(hij[0], hij[1], hij[2], off);
+      tile.data()[cell_ord] = inner;
+    }
+    return tile;
+  });
+  OwnArr b_own(world, b_trange);
+  b_own.init_tiles([&](const TA::Range& tile_range) {
+    OwnOuter tile(tile_range);
+    const auto& outer = tile.range();
+    const std::size_t n_cells = outer.volume();
+    for (std::size_t cell_ord = 0; cell_ord < n_cells; ++cell_ord) {
+      auto hkj = outer.idx(cell_ord);
+      OwnInner inner(TA::Range{A3});
+      for (long off = 0; off < Q; ++off)
+        inner.data()[off] = b_val(hkj[0], hkj[1], hkj[2], off);
+      tile.data()[cell_ord] = inner;
+    }
+    return tile;
+  });
+  world.gop.fence();
+
+  // Zero the witness right before the contractions so only they are counted.
+#ifdef TA_STRIDED_DGEMM_COUNT
+  TiledArray::detail::g_strided_dgemm_ce_e_calls.store(0);
+#endif
+  ArenaArr c_arena =
+      einsum(a_arena("h,i,j;a1,a2"), b_arena("h,k,j;a3"), "h,i,k;a1,a2,a3");
+  OwnArr c_own =
+      einsum(a_own("h,i,j;a1,a2"), b_own("h,k,j;a3"), "h,i,k;a1,a2,a3");
+  world.gop.fence();
+#ifdef TA_STRIDED_DGEMM_COUNT
+  BOOST_CHECK_GT(TiledArray::detail::g_strided_dgemm_ce_e_calls.load(), 0u);
+#endif
+
+  // Walk the locally held, nonzero result tiles of the arena array and track
+  // the largest absolute deviation from the owning result.
+  double max_abs_diff = 0.0;
+  std::size_t elements_compared = 0;
+  const auto& result_tiles = c_arena.trange().tiles_range();
+  const std::size_t n_tiles = result_tiles.volume();
+  for (std::size_t tile_ord = 0; tile_ord < n_tiles; ++tile_ord) {
+    if (!c_arena.is_local(tile_ord) || c_arena.is_zero(tile_ord)) continue;
+    const ArenaOuter& arena_tile = c_arena.find(tile_ord).get();
+    const OwnOuter& own_tile = c_own.find(tile_ord).get();
+    const std::size_t n_cells = arena_tile.range().volume();
+    for (std::size_t cell_ord = 0; cell_ord < n_cells; ++cell_ord) {
+      const ArenaInner& ac = arena_tile.data()[cell_ord];
+      const OwnInner& oc = own_tile.data()[cell_ord];
+      if (!ac) continue;  // null arena cells are not compared
+      BOOST_REQUIRE_EQUAL(ac.size(), static_cast<std::size_t>(P * Q));
+      BOOST_REQUIRE_EQUAL(ac.size(), oc.size());
+      for (std::size_t off = 0; off < ac.size(); ++off) {
+        const double abs_err = std::abs(ac.data()[off] - oc.data()[off]);
+        if (abs_err > max_abs_diff) {
+          max_abs_diff = abs_err;
+        }
+        ++elements_compared;
+      }
+    }
+  }
+  // Combine the per-rank tallies before asserting on them.
+  world.gop.sum(elements_compared);
+  world.gop.max(max_abs_diff);
+  BOOST_REQUIRE_GT(elements_compared, 0u);
+  BOOST_CHECK_LT(max_abs_diff, 1e-12);
+}
+
+// Arena-vs-owning equivalence for the combined "ce+ce" strided-DGEMM case:
+//   c(h,i,k;a1,a2) = sum_{j,a4} a(h,i,j;a1,a2,a4) * b(h,j,k;a4)
+// Ingredients exercised together: Hadamard outer index h (nbatch>1),
+// left-external outer index i (Mo>1), a contracted outer index j split over
+// two tiles of extents {2},{1}, kept inner indices a1,a2 (P>1) and a
+// contracted inner index a4. The same product is evaluated with arena-backed
+// and with owning inner tensors and compared element by element; with
+// TA_STRIDED_DGEMM_COUNT defined the ce_ce_right call counter must also be
+// nonzero after the contraction.
+// NOTE(review): the counter is read locally on each rank -- presumably every
+// rank has to execute at least one strided call for the witness to hold;
+// confirm for multi-rank runs.
+BOOST_AUTO_TEST_CASE(hce_ce_combined_equivalence_strided) {
+  using ArenaInner = TA::ArenaTensor<double, TA::Range>;
+  using ArenaOuter = TA::Tensor<ArenaInner>;
+  using ArenaArr = TA::DistArray<ArenaOuter, TA::DensePolicy>;
+  using OwnInner = TA::Tensor<double>;
+  using OwnOuter = TA::Tensor<OwnInner>;
+  using OwnArr = TA::DistArray<OwnOuter, TA::DensePolicy>;
+
+  auto& world = TiledArray::get_default_world();
+
+  // Outer extents for h, i, k.
+  constexpr long H = 2;
+  constexpr long I = 2;
+  constexpr long KK = 2;
+  // Inner extents: a1, a2 are kept, a4 is contracted.
+  constexpr long A1 = 2;
+  constexpr long A2 = 2;
+  constexpr long A4 = 3;
+  // P: kept inner elements per cell; Q: contracted inner elements per cell.
+  constexpr long P = A1 * A2;
+  constexpr long Q = A4;
+
+  const TA::TiledRange1 h_tiling{0l, H};
+  const TA::TiledRange1 i_tiling{0l, I};
+  const TA::TiledRange1 k_tiling{0l, KK};
+  const TA::TiledRange1 j_tiling{0l, 2l, 3l};
+  TA::TiledRange a_trange{h_tiling, i_tiling, j_tiling};  // (h,i,j)
+  TA::TiledRange b_trange{h_tiling, j_tiling, k_tiling};  // (h,j,k)
+
+  // Deterministic fill values; e is the flat offset inside an inner cell.
+  auto a_val = [](long h, long i, long j, long e) {
+    return 1.0 + 0.5 * h + 0.3 * i + 0.2 * j + 0.1 * e;
+  };
+  auto b_val = [](long h, long j, long k, long e) {
+    return 2.0 - 0.4 * h + 0.25 * j + 0.15 * k + 0.07 * e;
+  };
+
+  // ---- arena-backed operands ----
+  ArenaArr a_arena(world, a_trange);
+  a_arena.init_tiles([&](const TA::Range& tile_range) {
+    ArenaOuter tile = TA::detail::arena_outer_init<ArenaOuter>(
+        tile_range, 1, [](std::size_t) { return TA::Range{A1, A2, A4}; });
+    const auto& outer = tile.range();
+    const std::size_t n_cells = outer.volume();
+    for (std::size_t cell_ord = 0; cell_ord < n_cells; ++cell_ord) {
+      ArenaInner& cell = tile.data()[cell_ord];
+      if (!cell) continue;  // nothing to fill for a null cell
+      auto hij = outer.idx(cell_ord);  // (h,i,j)
+      for (long off = 0; off < P * Q; ++off)
+        cell.data()[off] = a_val(hij[0], hij[1], hij[2], off);
+    }
+    return tile;
+  });
+  ArenaArr b_arena(world, b_trange);
+  b_arena.init_tiles([&](const TA::Range& tile_range) {
+    ArenaOuter tile = TA::detail::arena_outer_init<ArenaOuter>(
+        tile_range, 1, [](std::size_t) { return TA::Range{A4}; });
+    const auto& outer = tile.range();
+    const std::size_t n_cells = outer.volume();
+    for (std::size_t cell_ord = 0; cell_ord < n_cells; ++cell_ord) {
+      ArenaInner& cell = tile.data()[cell_ord];
+      if (!cell) continue;  // nothing to fill for a null cell
+      auto hjk = outer.idx(cell_ord);  // (h,j,k)
+      for (long off = 0; off < Q; ++off)
+        cell.data()[off] = b_val(hjk[0], hjk[1], hjk[2], off);
+    }
+    return tile;
+  });
+  world.gop.fence();
+
+  // ---- owning operands, filled with the same values ----
+  OwnArr a_own(world, a_trange);
+  a_own.init_tiles([&](const TA::Range& tile_range) {
+    OwnOuter tile(tile_range);
+    const auto& outer = tile.range();
+    const std::size_t n_cells = outer.volume();
+    for (std::size_t cell_ord = 0; cell_ord < n_cells; ++cell_ord) {
+      auto hij = outer.idx(cell_ord);
+      OwnInner inner(TA::Range{A1, A2, A4});
+      for (long off = 0; off < P * Q; ++off)
+        inner.data()[off] = a_val(hij[0], hij[1], hij[2], off);
+      tile.data()[cell_ord] = inner;
+    }
+    return tile;
+  });
+  OwnArr b_own(world, b_trange);
+  b_own.init_tiles([&](const TA::Range& tile_range) {
+    OwnOuter tile(tile_range);
+    const auto& outer = tile.range();
+    const std::size_t n_cells = outer.volume();
+    for (std::size_t cell_ord = 0; cell_ord < n_cells; ++cell_ord) {
+      auto hjk = outer.idx(cell_ord);
+      OwnInner inner(TA::Range{A4});
+      for (long off = 0; off < Q; ++off)
+        inner.data()[off] = b_val(hjk[0], hjk[1], hjk[2], off);
+      tile.data()[cell_ord] = inner;
+    }
+    return tile;
+  });
+  world.gop.fence();
+
+  // Zero the witness right before the contractions so only they are counted.
+#ifdef TA_STRIDED_DGEMM_COUNT
+  TiledArray::detail::g_strided_dgemm_ce_ce_right_calls.store(0);
+#endif
+  ArenaArr c_arena =
+      einsum(a_arena("h,i,j;a1,a2,a4"), b_arena("h,j,k;a4"), "h,i,k;a1,a2");
+  OwnArr c_own =
+      einsum(a_own("h,i,j;a1,a2,a4"), b_own("h,j,k;a4"), "h,i,k;a1,a2");
+  world.gop.fence();
+#ifdef TA_STRIDED_DGEMM_COUNT
+  BOOST_CHECK_GT(TiledArray::detail::g_strided_dgemm_ce_ce_right_calls.load(), 0u);
+#endif
+
+  // Walk the locally held, nonzero result tiles of the arena array and track
+  // the largest absolute deviation from the owning result.
+  double max_abs_diff = 0.0;
+  std::size_t elements_compared = 0;
+  const auto& result_tiles = c_arena.trange().tiles_range();
+  const std::size_t n_tiles = result_tiles.volume();
+  for (std::size_t tile_ord = 0; tile_ord < n_tiles; ++tile_ord) {
+    if (!c_arena.is_local(tile_ord) || c_arena.is_zero(tile_ord)) continue;
+    const ArenaOuter& arena_tile = c_arena.find(tile_ord).get();
+    const OwnOuter& own_tile = c_own.find(tile_ord).get();
+    const std::size_t n_cells = arena_tile.range().volume();
+    for (std::size_t cell_ord = 0; cell_ord < n_cells; ++cell_ord) {
+      const ArenaInner& ac = arena_tile.data()[cell_ord];
+      const OwnInner& oc = own_tile.data()[cell_ord];
+      if (!ac) continue;  // null arena cells are not compared
+      BOOST_REQUIRE_EQUAL(ac.size(), static_cast<std::size_t>(P));
+      BOOST_REQUIRE_EQUAL(ac.size(), oc.size());
+      for (std::size_t off = 0; off < ac.size(); ++off) {
+        const double abs_err = std::abs(ac.data()[off] - oc.data()[off]);
+        if (abs_err > max_abs_diff) {
+          max_abs_diff = abs_err;
+        }
+        ++elements_compared;
+      }
+    }
+  }
+  // Combine the per-rank tallies before asserting on them.
+  world.gop.sum(elements_compared);
+  world.gop.max(max_abs_diff);
+  BOOST_REQUIRE_GT(elements_compared, 0u);
+  BOOST_CHECK_LT(max_abs_diff, 1e-12);
+}
+
 BOOST_AUTO_TEST_SUITE_END()  // einsum_tot
 
 BOOST_AUTO_TEST_SUITE(einsum_tot_t)