From 25111765b7c1bdd9e5f7e797ca56bb08ecd3a1bd Mon Sep 17 00:00:00 2001 From: Alan-S-Andrade Date: Fri, 26 Jun 2026 10:29:02 -0700 Subject: [PATCH] Add opt-in MI_SINGLE_THREADED specialization for the free fast path In a program that only ever uses mimalloc from a single thread, every freed block is guaranteed to be thread-local, so the per-free thread ownership check in mi_free_ex (a thread-id TLS read, a relaxed atomic load of segment->thread_id, and a compare) is redundant. This adds an opt-in, off-by-default compile-time switch (-DMI_SINGLE_THREADED=1) that forces is_local=true and skips that check. The default build is unchanged (the #else path is identical to before and the full multi-threaded test suite still passes). Measured on a single-threaded, allocation-heavy workload (mimalloc-bench alloc-test, 1 thread, pinned, interleaved median-of-11): ~0.4% faster wall time, with perf showing ~2.5% fewer branches and ~1.1% fewer instructions (the eliminated per-free ownership branch). cfrac/espresso output is byte-identical to baseline. Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- src/free.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/src/free.c b/src/free.c index 43b362720..c39745c84 100644 --- a/src/free.c +++ b/src/free.c @@ -153,7 +153,14 @@ static inline void mi_free_ex(void* p, size_t* usable) mi_attr_noexcept mi_segment_t* const segment = mi_checked_ptr_segment(p,"mi_free"); if mi_unlikely(segment==NULL) return; + #if defined(MI_SINGLE_THREADED) && (MI_SINGLE_THREADED) + // Single-threaded specialization (ExGen-Malloc, CAL'25): in a program that only ever + // uses mimalloc from one thread, every block is thread-local, so we can skip the + // thread-id TLS read + atomic load + compare on every free. + const bool is_local = true; + #else const bool is_local = (_mi_prim_thread_id() == mi_atomic_load_relaxed(&segment->thread_id)); + #endif mi_page_t* const page = _mi_segment_page_of(segment, p); if (usable!=NULL) { *usable = mi_page_usable_block_size(page); }