diff --git a/CMakeLists.txt b/CMakeLists.txt
index 31e4838f31..c3f01d9e74 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -147,6 +147,13 @@ add_feature_info(TENSOR_MEM_PROFILE TA_TENSOR_MEM_PROFILE "instrumented profilin
option(TA_TENSOR_ASSERT_NO_MUTABLE_OPS_WHILE_SHARED "Turn on TA_ASSERT that no mutable operations occur on TA::{Tensor,Tile} objects that share data" OFF)
add_feature_info(TENSOR_ASSERT_NO_MUTABLE_OPS_WHILE_SHARED TA_TENSOR_ASSERT_NO_MUTABLE_OPS_WHILE_SHARED "TA_ASSERT that no mutable operations occur on TA::{Tensor,Tile} objects that share data")
+option(TA_STRIDED_DGEMM_COUNT
+ "Compile-in atomic counters that witness strided-DGEMM firing (tests/benches)"
+ OFF)
+if (TA_STRIDED_DGEMM_COUNT)
+ add_compile_definitions(TA_STRIDED_DGEMM_COUNT)
+endif()
+
option(TA_EXPERT "TiledArray Expert mode: disables automatically downloading or building dependencies" OFF)
redefaultable_option(TA_WERROR "Treat compiler warnings as errors when compiling TiledArray's own translation units (does not propagate to consumers of installed TiledArray targets)" OFF)
diff --git a/examples/CMakeLists.txt b/examples/CMakeLists.txt
index d240192893..0a14b99fa6 100644
--- a/examples/CMakeLists.txt
+++ b/examples/CMakeLists.txt
@@ -37,3 +37,4 @@ add_subdirectory (fock)
add_subdirectory (mpi_tests)
add_subdirectory (pmap_test)
add_subdirectory (vector_tests)
+add_subdirectory (tot_bench)
diff --git a/examples/tot_bench/CMakeLists.txt b/examples/tot_bench/CMakeLists.txt
new file mode 100644
index 0000000000..5236b48e2d
--- /dev/null
+++ b/examples/tot_bench/CMakeLists.txt
@@ -0,0 +1,26 @@
+#
+# This file is a part of TiledArray.
+# Copyright (C) 2013 Virginia Tech
+#
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program. If not, see .
+#
+# CMakeLists.txt
+#
+
+# Strided-DGEMM ToT throughput benches (strided-vs-current arena DGEMM).
+
+foreach(_exec opa_strided_arena_dgemm opb_strided_arena_dgemm regime_a_hce_e_strided_bench ce_ce_segmented_strided_bench)
+ add_ta_executable(${_exec} "${_exec}.cpp" "tiledarray")
+ add_dependencies(examples-tiledarray ${_exec})
+endforeach()
diff --git a/examples/tot_bench/ce_ce_segmented_strided_bench.cpp b/examples/tot_bench/ce_ce_segmented_strided_bench.cpp
new file mode 100644
index 0000000000..5732b95dcb
--- /dev/null
+++ b/examples/tot_bench/ce_ce_segmented_strided_bench.cpp
@@ -0,0 +1,266 @@
+// ce_ce_segmented_strided_bench.cpp
+// ---------------------------------------------------------------------------
+// Tile/BLAS-level benchmark for the hce+ce per-k SEGMENTED strided DGEMM
+// (arena_strided_dgemm_ce_ce_right). It times the SAME kernel on the SAME
+// hole-containing arena operands under the two states of the runtime kill
+// switch TiledArray::detail::ce_ce_strided_disabled():
+//
+// segmented (disabled=false): per k, walk μ̃ and emit one strided GEMM per
+// maximal contiguous present+uniform-stride segment; skip holes.
+// per-cell (disabled=true) : the legacy path -- one length-Q GEMV per
+// present (μ̃) cell (what TA did before, reverting to per-cell
+// whenever results/operands contained holes).
+//
+// The ONLY variable between the two timings is that toggle, so the ratio
+// isolates the kernel strategy swap. Operands model the measured CSV-CCk
+// fallback regime: present cells are CLUSTERED (mean segment length ~ --cluster)
+// and per-k MISALIGNED (each k shifts its hole phase), the pattern the old
+// all-or-nothing gate fell back to scalar on. The right-kernel walker is
+// identical to the left's, so this speedup represents hce+ce overall.
+// ---------------------------------------------------------------------------
+
+#include
+#include
+#include
+#include
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+namespace TA = TiledArray;
+namespace tablas = TA::math::blas;
+
+using Inner = TA::ArenaTensor;
+using Outer = TA::Tensor;
+
+using clock_type = std::chrono::steady_clock;
+static double ms_since(clock_type::time_point t0) {
+ return std::chrono::duration_cast(clock_type::now() -
+ t0)
+ .count() /
+ 1.0e6;
+}
+
+struct Cli {
+ int reps = 30; // timed reps per path
+ int warmup = 5; // untimed warmup reps per path
+ long Mmu = 256; // strided axis (right external) -> BLAS M
+ long nK = 8; // outer-contraction slabs (looped, beta=1)
+ long P = 16; // result inner free (a1)
+ long Q = 16; // contraction inner (a4)
+ long cluster = 6; // mean present-run length (~ mean segment M)
+ double c_fraction = 0.0; // fraction of otherwise-present C cells to drop to holes (sparse result)
+};
+
+static void usage() {
+ std::fprintf(stderr,
+ "ce_ce_segmented_strided_bench\n"
+ " --reps R timed reps per path (default 30)\n"
+ " --warmup W untimed warmup reps (default 5)\n"
+ " --Mmu N strided axis extent (default 256)\n"
+ " --nK N outer-contraction slabs (default 8)\n"
+ " --P N result inner free a1 (default 16)\n"
+ " --Q N contraction inner a4 (default 16)\n"
+ " --cluster N mean present-run length (default 6)\n"
+ " --c_fraction F fraction of C cells dropped to holes (default 0.0)\n");
+}
+
+static Cli parse_cli(int argc, char** argv) {
+ Cli c;
+ for (int i = 1; i < argc; ++i) {
+ std::string a = argv[i];
+ auto need = [&]() -> std::string {
+ if (i + 1 >= argc) { usage(); std::exit(1); }
+ return argv[++i];
+ };
+ if (a == "--reps") c.reps = std::stoi(need());
+ else if (a == "--warmup") c.warmup = std::stoi(need());
+ else if (a == "--Mmu") c.Mmu = std::stol(need());
+ else if (a == "--nK") c.nK = std::stol(need());
+ else if (a == "--P") c.P = std::stol(need());
+ else if (a == "--Q") c.Q = std::stol(need());
+ else if (a == "--cluster") c.cluster = std::stol(need());
+ else if (a == "--c_fraction") c.c_fraction = std::stod(need());
+ else if (a == "-h" || a == "--help") { usage(); std::exit(0); }
+ else { std::fprintf(stderr, "unknown flag: %s\n", a.c_str()); usage();
+ std::exit(1); }
+ }
+ return c;
+}
+
+// Build an arena Outer with holes: dense_shape(o) unless is_hole(o) -> Range{}.
+static Outer make_sparse(const TA::Range& outer_range, std::size_t nbatch,
+ const std::function& dense_shape,
+ const std::function& is_hole,
+ double base) {
+ Outer t = TA::detail::arena_outer_init(
+ outer_range, nbatch,
+ [&](std::size_t o) { return is_hole(o) ? TA::Range{} : dense_shape(o); });
+ for (std::size_t o = 0; o < t.range().volume() * nbatch; ++o) {
+ Inner& c = t.data()[o];
+ if (!c) continue;
+ for (std::size_t e = 0; e < c.size(); ++e) c.data()[e] = base + 0.001 * o + e;
+ }
+ return t;
+}
+
+int main(int argc, char** argv) {
+ Cli cli = parse_cli(argc, argv);
+ if (cli.reps < 1) { std::fprintf(stderr, "--reps must be >= 1\n"); return 1; }
+ auto& world = TA_SCOPED_INITIALIZE(argc, argv);
+ (void)world;
+
+ const std::size_t Mo = 1;
+ const std::size_t Mmu = static_cast(cli.Mmu);
+ const std::size_t nK = static_cast(cli.nK);
+ const long P = cli.P, Q = cli.Q;
+ const long cl = std::max(1, cli.cluster);
+
+ // Clustered + per-k-misaligned presence on R[mu,k] (canonical mu slow, k fast,
+ // ordinal o = mu*nK + k). A cell is a hole when, within its k-shifted phase,
+ // it falls in the 1-wide gap after each run of length `cl`. period = cl + 1.
+ auto rhole = [&](std::size_t o) {
+ const std::size_t mu = o / nK, k = o % nK;
+ const long period = cl + 1;
+ const long phase = (static_cast(mu) + static_cast(k) * 2) % period;
+ return phase == cl; // the single gap cell each period
+ };
+ const double c_frac = std::max(0.0, std::min(1.0, cli.c_fraction));
+ // C[mu] present iff present for at least one k (the union the kernel writes),
+ // optionally thinned: a deterministic fraction c_frac of otherwise-present
+ // cells are dropped to holes to model a genuinely SPARSE result. A hole C cell
+ // is absent regardless of operand presence (its (k,mu) contributions skip).
+ auto chole = [&](std::size_t o) {
+ bool union_present = false;
+ for (std::size_t k = 0; k < nK; ++k)
+ if (!rhole(o * nK + k)) { union_present = true; break; }
+ if (!union_present) return true; // absent for all k -> hole
+ if (c_frac > 0.0) {
+ const std::size_t h = (o * 2654435761ull) & 0xffffull; // cheap hash
+ if (static_cast(h) / 65536.0 < c_frac) return true;
+ }
+ return false;
+ };
+
+ Outer L = TA::detail::arena_outer_init(
+ TA::Range{nK}, 1, [&](std::size_t) {
+ return TA::Range{static_cast(P),
+ static_cast(Q)};
+ });
+ for (std::size_t o = 0; o < L.range().volume(); ++o)
+ for (std::size_t e = 0; e < L.data()[o].size(); ++e)
+ L.data()[o].data()[e] = 1.0 + 0.001 * o + e;
+ Outer R = make_sparse(TA::Range{Mmu, nK}, 1,
+ [&](std::size_t){ return TA::Range{static_cast(Q)}; },
+ rhole, 2.0);
+ Outer Ctemplate = make_sparse(TA::Range{Mmu}, 1,
+ [&](std::size_t){ return TA::Range{static_cast(P)}; },
+ chole, 0.0);
+
+ std::size_t present_C = 0;
+ for (std::size_t o = 0; o < Ctemplate.range().volume(); ++o)
+ if (Ctemplate.data()[o]) ++present_C;
+ std::size_t present_R = 0;
+ for (std::size_t o = 0; o < R.range().volume(); ++o)
+ if (R.data()[o]) ++present_R;
+
+ std::printf("=== hce+ce segmented-vs-per-cell strided DGEMM bench ===\n");
+ std::printf("Mmu=%zu nK=%zu P=%ld Q=%ld cluster=%ld "
+ "present C=%zu/%zu present R=%zu/%zu\n",
+ Mmu, nK, P, Q, cl, present_C, Ctemplate.range().volume(),
+ present_R, R.range().volume());
+ std::printf("reps=%d warmup=%d\n", cli.reps, cli.warmup);
+
+ // FLOP estimate: 2*P*Q per present (mu,k) contributing cell.
+ double flop = 0.0;
+ for (std::size_t mu = 0; mu < Mmu; ++mu)
+ for (std::size_t k = 0; k < nK; ++k)
+ if (R.data()[mu * nK + k] && Ctemplate.data()[mu])
+ flop += 2.0 * P * Q;
+
+ auto zero_C = [&](Outer& C) {
+ for (std::size_t o = 0; o < C.range().volume(); ++o) {
+ Inner& c = C.data()[o];
+ if (!c) continue;
+ for (std::size_t e = 0; e < c.size(); ++e) c.data()[e] = 0.0;
+ }
+ };
+ auto make_C = [&]() {
+ return make_sparse(TA::Range{Mmu}, 1,
+ [&](std::size_t){ return TA::Range{static_cast(P)}; },
+ chole, 0.0);
+ };
+ auto median = [](std::vector v) -> double {
+ std::sort(v.begin(), v.end());
+ const std::size_t n = v.size();
+ if (n == 0) return 0.0;
+ return (n % 2) ? v[n / 2] : 0.5 * (v[n / 2 - 1] + v[n / 2]);
+ };
+ // Time ONLY the kernel call. C is allocated once per path and re-zeroed
+ // OUTSIDE the timed window each rep (beta=1 needs a zero start), so the
+ // measured time isolates the segment-walker vs per-cell strategy, not the
+ // per-rep tile allocation (which is identical on both sides and would
+ // otherwise dominate this ~0.1 ms kernel and compress the ratio).
+ auto time_path = [&](bool disabled) {
+ Outer C = make_C();
+ TA::detail::ce_ce_strided_disabled() = disabled;
+ for (int w = 0; w < cli.warmup; ++w) { zero_C(C);
+ TA::detail::arena_strided_dgemm_ce_ce_right(
+ C, L, R, Mo, Mmu, nK, tablas::NoTranspose, tablas::Transpose, 1.0); }
+ std::vector ms;
+ ms.reserve(cli.reps);
+ for (int r = 0; r < cli.reps; ++r) {
+ zero_C(C); // untimed
+ auto t0 = clock_type::now();
+ TA::detail::arena_strided_dgemm_ce_ce_right(
+ C, L, R, Mo, Mmu, nK, tablas::NoTranspose, tablas::Transpose, 1.0);
+ ms.push_back(ms_since(t0));
+ }
+ return ms;
+ };
+
+#ifdef TA_STRIDED_DGEMM_COUNT
+ TA::detail::g_strided_dgemm_ce_ce_right_calls.store(0);
+#endif
+ auto seg_ms = time_path(/*disabled=*/false);
+#ifdef TA_STRIDED_DGEMM_COUNT
+ const std::size_t seg_calls =
+ TA::detail::g_strided_dgemm_ce_ce_right_calls.load();
+#endif
+ auto pc_ms = time_path(/*disabled=*/true);
+ TA::detail::ce_ce_strided_disabled() = false; // restore production default
+
+ const double seg_min = *std::min_element(seg_ms.begin(), seg_ms.end());
+ const double seg_med = median(seg_ms);
+ const double pc_min = *std::min_element(pc_ms.begin(), pc_ms.end());
+ const double pc_med = median(pc_ms);
+
+ std::printf("\n--- results (per kernel call, ms) ---\n");
+ std::printf("per-cell : min=%9.5f ms median=%9.5f ms (%.2f GFLOP/s)\n",
+ pc_min, pc_med, flop / (pc_min * 1e6));
+ std::printf("segmented : min=%9.5f ms median=%9.5f ms (%.2f GFLOP/s)\n",
+ seg_min, seg_med, flop / (seg_min * 1e6));
+ std::printf("speedup : min=%6.3fx median=%6.3fx (per-cell / segmented)\n",
+ seg_min > 0 ? pc_min / seg_min : 0.0,
+ seg_med > 0 ? pc_med / seg_med : 0.0);
+
+#ifdef TA_STRIDED_DGEMM_COUNT
+ std::printf("\n--- firing witness (TA_STRIDED_DGEMM_COUNT) ---\n");
+ std::printf("segment GEMMs over %d+%d (warmup+timed) reps = %zu "
+ "(mean %.1f per rep)\n",
+ cli.warmup, cli.reps, seg_calls,
+ double(seg_calls) / double(cli.warmup + cli.reps));
+ if (seg_calls == 0) {
+ std::fprintf(stderr, "ERROR: segmented path never issued a GEMM -- the "
+ "reported speedup would reflect a silent fallback.\n");
+ std::abort();
+ }
+ std::printf("OK: segmented path fired (counter > 0).\n");
+#endif
+ return 0;
+}
diff --git a/examples/tot_bench/opa_strided_arena_dgemm.cpp b/examples/tot_bench/opa_strided_arena_dgemm.cpp
new file mode 100644
index 0000000000..e36100064b
--- /dev/null
+++ b/examples/tot_bench/opa_strided_arena_dgemm.cpp
@@ -0,0 +1,860 @@
+// opa_strided_arena_dgemm.cpp
+// ---------------------------------------------------------------------------
+// Standalone tile/BLAS-level benchmark for the op_A ToT x ToT -> ToT
+// product, on ArenaTensor inner cells. Companion to opb_strided_arena_dgemm.cpp.
+//
+// op_A: I(i_2,i_1,Κ; a_1,a_4) * I(i_2,i_1,μ̃,Κ; a_4) -> I(μ̃,i_2,i_1; a_1)
+// Hadamard = {i_1,i_2}, outer-contracted = {Κ}, outer-external = {μ̃},
+// inner-contracted = {a_4}, inner-external (kept) = {a_1}.
+// P = |a_1|, Q = |a_4| depend (jaggedly) on the Hadamard slice.
+//
+// C(μ̃,i_2,i_1; a_1) = sum_Κ sum_{a_4} L(i_2,i_1,Κ; a_1,a_4)*R(i_2,i_1,μ̃,Κ; a_4)
+//
+// Unlike op_B (inner OUTER-product), op_A's inner part is a real CONTRACTION
+// over a_4, and the full reduction spans an OUTER index (Κ) AND an INNER index
+// (a_4). That means the *direct* op_B-style fusion of the outer contraction Κ
+// would have to merge (Κ ⊗ a_4) into a single BLAS K axis -- a two-level stride
+// no single leading-dimension can express -> a pack/deep-clone is required.
+//
+// But a DIFFERENT, ZERO-COPY fusion exists: ride the outer EXTERNAL μ̃ into the
+// GEMM M axis (one outer index, via inter-cell stride), keep a_4 as the
+// contiguous inner K, keep a_1 as inner N, and loop the outer contraction Κ
+// with beta-accumulation:
+//
+// for Κ: C̃[μ̃, a_1] += R̃_Κ[μ̃, a_4] · L_Κ[a_1, a_4]^T (M=|μ̃|, N=P, K=Q)
+//
+// This benchmark compares four ways to evaluate one Hadamard slice, all reading
+// already-fusable arena slabs (operands laid contracted/contiguous-friendly):
+//
+// current_gemm : Mμ·nK tiny (P x 1, K=Q) GEMMs -- exactly what TA dispatches
+// today (per result cell, per Κ: one inner gemm with N=1).
+// current_gemv : Mμ·nK BLAS dgemv calls (the natural mat-vec primitive).
+// strided : nK strided GEMMs (μ̃ ridden into M), zero-copy, beta-accum.
+// packed : pack (Κ,a_4) contiguous then ONE GEMM (M=Mμ,N=P,K=nK·Q);
+// the pack cost (= the deep-clone tradeoff) is timed too.
+//
+// Work units {P,Q,Mμ,nK} per Hadamard slice are reconstructed from op_dump.txt.
+// (Idealization: the μ̃ x Κ block per slice is treated as dense; Mμ and nK are
+// the per-slice nonzero μ̃ / Κ counts. The shape distribution -- which is what
+// drives the GEMV->GEMM win -- is faithful.)
+// ---------------------------------------------------------------------------
+
+#include
+#include
+#include
+#include
+
+#include
+#include
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include