From ed099ad26e60cea5e5a7643a8a607c7de2fdac1a Mon Sep 17 00:00:00 2001 From: Daniel Rodriguez Date: Fri, 1 May 2026 12:50:01 -0500 Subject: [PATCH 1/4] cuda.core benchmarks --- benchmarks/cuda_bindings/runner/main.py | 65 +- benchmarks/cuda_core/.gitignore | 16 + benchmarks/cuda_core/AGENTS.md | 11 + benchmarks/cuda_core/BENCHMARK_PLAN.md | 111 ++ benchmarks/cuda_core/README.md | 73 ++ .../cuda_core/benchmarks/bench_ctx_device.py | 55 + .../cuda_core/benchmarks/bench_event.py | 53 + .../cuda_core/benchmarks/bench_launch.py | 183 ++++ .../cuda_core/benchmarks/bench_memory.py | 37 + .../cuda_core/benchmarks/bench_stream.py | 29 + benchmarks/cuda_core/compare.py | 159 +++ benchmarks/cuda_core/pixi.lock | 972 ++++++++++++++++++ benchmarks/cuda_core/pixi.toml | 52 + benchmarks/cuda_core/run_pyperf.py | 32 + benchmarks/cuda_core/runtime.py | 53 + 15 files changed, 1877 insertions(+), 24 deletions(-) create mode 100644 benchmarks/cuda_core/.gitignore create mode 100644 benchmarks/cuda_core/AGENTS.md create mode 100644 benchmarks/cuda_core/BENCHMARK_PLAN.md create mode 100644 benchmarks/cuda_core/README.md create mode 100644 benchmarks/cuda_core/benchmarks/bench_ctx_device.py create mode 100644 benchmarks/cuda_core/benchmarks/bench_event.py create mode 100644 benchmarks/cuda_core/benchmarks/bench_launch.py create mode 100644 benchmarks/cuda_core/benchmarks/bench_memory.py create mode 100644 benchmarks/cuda_core/benchmarks/bench_stream.py create mode 100644 benchmarks/cuda_core/compare.py create mode 100644 benchmarks/cuda_core/pixi.lock create mode 100644 benchmarks/cuda_core/pixi.toml create mode 100644 benchmarks/cuda_core/run_pyperf.py create mode 100644 benchmarks/cuda_core/runtime.py diff --git a/benchmarks/cuda_bindings/runner/main.py b/benchmarks/cuda_bindings/runner/main.py index 2e7312c3cef..f4a138fd4e4 100644 --- a/benchmarks/cuda_bindings/runner/main.py +++ b/benchmarks/cuda_bindings/runner/main.py @@ -16,30 +16,30 @@ PROJECT_ROOT = 
Path(__file__).resolve().parent.parent BENCH_DIR = PROJECT_ROOT / "benchmarks" DEFAULT_OUTPUT = PROJECT_ROOT / "results-python.json" +DEFAULT_MODULE_NAME_PREFIX = "cuda_bindings_bench" # Env var used to propagate the --benchmark filter from the parent to pyperf # worker subprocesses. pyperf reconstructs worker argv from scratch and drops # custom flags like --benchmark, so without this the worker would register the # full bench list and pyperf would run the wrong bench by task index. -BENCH_FILTER_ENV_VAR = "CUDA_BINDINGS_BENCH_FILTER" +DEFAULT_BENCH_FILTER_ENV_VAR = "CUDA_BINDINGS_BENCH_FILTER" -PYPERF_INHERITED_ENV_VARS = ( +BASE_PYPERF_INHERITED_ENV_VARS = ( "CUDA_HOME", "CUDA_PATH", "CUDA_VISIBLE_DEVICES", "LD_LIBRARY_PATH", "NVIDIA_VISIBLE_DEVICES", - BENCH_FILTER_ENV_VAR, ) _MODULE_CACHE: dict[Path, ModuleType] = {} -def load_module(module_path: Path) -> ModuleType: +def load_module(module_path: Path, module_name_prefix: str = DEFAULT_MODULE_NAME_PREFIX) -> ModuleType: module_path = module_path.resolve() cached_module = _MODULE_CACHE.get(module_path) if cached_module is not None: return cached_module - module_name = f"cuda_bindings_bench_{module_path.stem}" + module_name = f"{module_name_prefix}_{module_path.stem}" spec = importlib.util.spec_from_file_location(module_name, module_path) if spec is None or spec.loader is None: raise RuntimeError(f"Failed to load benchmark module: {module_path}") @@ -64,13 +64,17 @@ def _discover_module_functions(module_path: Path) -> list[str]: ] -def _lazy_benchmark(module_path: Path, function_name: str) -> Callable[[int], float]: +def _lazy_benchmark( + module_path: Path, + function_name: str, + module_name_prefix: str = DEFAULT_MODULE_NAME_PREFIX, +) -> Callable[[int], float]: loaded_function: Callable[[int], float] | None = None def run(loops: int) -> float: nonlocal loaded_function if loaded_function is None: - module = load_module(module_path) + module = load_module(module_path, module_name_prefix=module_name_prefix) 
loaded_function = getattr(module, function_name) return loaded_function(loops) @@ -86,6 +90,7 @@ def run(loops: int) -> float: def _collect_skipped_benchmarks( bench_ids: list[str], registry: dict[str, Callable[[int], float]], + module_name_prefix: str = DEFAULT_MODULE_NAME_PREFIX, ) -> set[str]: """Return bench IDs that the owning module has marked as unsupported. @@ -106,7 +111,7 @@ def _collect_skipped_benchmarks( continue module = loaded_modules.get(module_path) if module is None: - module = load_module(module_path) + module = load_module(module_path, module_name_prefix=module_name_prefix) loaded_modules[module_path] = module module_skip = getattr(module, "SKIPPED_BENCHMARKS", None) if module_skip and function_name in module_skip: @@ -114,7 +119,10 @@ def _collect_skipped_benchmarks( return skipped -def discover_benchmarks() -> dict[str, Callable[[int], float]]: +def discover_benchmarks( + bench_dir: Path = BENCH_DIR, + module_name_prefix: str = DEFAULT_MODULE_NAME_PREFIX, +) -> dict[str, Callable[[int], float]]: """Discover bench_ functions. Each bench_ function must have the signature: bench_*(loops: int) -> float @@ -122,13 +130,13 @@ def discover_benchmarks() -> dict[str, Callable[[int], float]]: time in seconds (using time.perf_counter). 
""" registry: dict[str, Callable[[int], float]] = {} - for module_path in sorted(BENCH_DIR.glob("bench_*.py")): + for module_path in sorted(bench_dir.glob("bench_*.py")): module_name = module_path.stem for function_name in _discover_module_functions(module_path): bench_id = benchmark_id(module_name, function_name) if bench_id in registry: raise ValueError(f"Duplicate benchmark ID discovered: {bench_id}") - registry[bench_id] = _lazy_benchmark(module_path, function_name) + registry[bench_id] = _lazy_benchmark(module_path, function_name, module_name_prefix=module_name_prefix) return registry @@ -152,7 +160,10 @@ def _split_env_vars(arg_value: str) -> list[str]: return [env_var for env_var in arg_value.split(",") if env_var] -def ensure_pyperf_worker_env(argv: list[str]) -> list[str]: +def ensure_pyperf_worker_env( + argv: list[str], + extra_env_vars: tuple[str, ...] = (DEFAULT_BENCH_FILTER_ENV_VAR,), +) -> list[str]: if "--copy-env" in argv: return list(argv) @@ -175,7 +186,7 @@ def ensure_pyperf_worker_env(argv: list[str]) -> list[str]: if skip_next: raise ValueError("Missing value for --inherit-environ") - for env_var in PYPERF_INHERITED_ENV_VARS: + for env_var in (*BASE_PYPERF_INHERITED_ENV_VARS, *extra_env_vars): if env_var in os.environ: inherited_env.append(env_var) @@ -190,7 +201,7 @@ def ensure_pyperf_worker_env(argv: list[str]) -> list[str]: return cleaned -def parse_args(argv: list[str]) -> tuple[argparse.Namespace, list[str]]: +def parse_args(argv: list[str], default_output: Path = DEFAULT_OUTPUT) -> tuple[argparse.Namespace, list[str]]: parser = argparse.ArgumentParser(add_help=False) parser.add_argument( "--benchmark", @@ -207,19 +218,25 @@ def parse_args(argv: list[str]) -> tuple[argparse.Namespace, list[str]]: "-o", "--output", type=Path, - default=DEFAULT_OUTPUT, - help=f"JSON output file path (default: {DEFAULT_OUTPUT.name})", + default=default_output, + help=f"JSON output file path (default: {default_output.name})", ) parsed, remaining = 
parser.parse_known_args(argv) return parsed, remaining -def main() -> None: - parsed, remaining_argv = parse_args(sys.argv[1:]) +def main( + *, + bench_dir: Path = BENCH_DIR, + default_output: Path = DEFAULT_OUTPUT, + module_name_prefix: str = DEFAULT_MODULE_NAME_PREFIX, + bench_filter_env_var: str = DEFAULT_BENCH_FILTER_ENV_VAR, +) -> None: + parsed, remaining_argv = parse_args(sys.argv[1:], default_output=default_output) - registry = discover_benchmarks() + registry = discover_benchmarks(bench_dir=bench_dir, module_name_prefix=module_name_prefix) if not registry: - raise RuntimeError(f"No benchmark functions found in {BENCH_DIR}") + raise RuntimeError(f"No benchmark functions found in {bench_dir}") if parsed.list: for bench_id in sorted(registry): @@ -231,7 +248,7 @@ def main() -> None: # the wrong bench. pyperf drops unknown CLI flags when spawning workers, # so fall back to an env var carrying the filter. requested = list(parsed.benchmark) - env_filter = os.environ.get(BENCH_FILTER_ENV_VAR, "") + env_filter = os.environ.get(bench_filter_env_var, "") if not requested and env_filter: requested = [bid for bid in env_filter.split(",") if bid] @@ -243,21 +260,21 @@ def main() -> None: raise ValueError(f"Unknown benchmark(s): {unknown}. Known benchmarks: {known}") benchmark_ids = requested # Propagate to any pyperf worker we're about to spawn. - os.environ[BENCH_FILTER_ENV_VAR] = ",".join(benchmark_ids) + os.environ[bench_filter_env_var] = ",".join(benchmark_ids) else: benchmark_ids = sorted(registry) # Strip any --output args to avoid conflicts with our output handling. output_path = parsed.output.resolve() remaining_argv = strip_pyperf_output_args(remaining_argv) - remaining_argv = ensure_pyperf_worker_env(remaining_argv) + remaining_argv = ensure_pyperf_worker_env(remaining_argv, extra_env_vars=(bench_filter_env_var,)) is_worker = "--worker" in remaining_argv # Drop benchmarks that the owning module has marked as unavailable on # this driver/device. 
Without this step a single unsupported bench # (e.g. TMA on a pre-Hopper GPU) would abort the whole pyperf run, # since pyperf treats a raised exception as a fatal worker failure. - skipped = _collect_skipped_benchmarks(benchmark_ids, registry) + skipped = _collect_skipped_benchmarks(benchmark_ids, registry, module_name_prefix=module_name_prefix) if skipped and not is_worker: for bench_id in sorted(skipped): print(f"Skipping {bench_id}: unsupported on this driver/device", file=sys.stderr) diff --git a/benchmarks/cuda_core/.gitignore b/benchmarks/cuda_core/.gitignore new file mode 100644 index 00000000000..b795782a321 --- /dev/null +++ b/benchmarks/cuda_core/.gitignore @@ -0,0 +1,16 @@ +# Build artifacts +.build/ +__pycache__/ + +# Benchmark results +*.json +.benchmarks/ + +# Pixi environments +.pixi/ + +# Override root .gitignore *.cpp rule (which targets Cython-generated files) +!benchmarks/cpp/*.cpp + +results-python.json +results-cpp.json diff --git a/benchmarks/cuda_core/AGENTS.md b/benchmarks/cuda_core/AGENTS.md new file mode 100644 index 00000000000..d2a3a3357cd --- /dev/null +++ b/benchmarks/cuda_core/AGENTS.md @@ -0,0 +1,11 @@ +# cuda.core benchmarks + +Read the README.md in this directory for more details about the benchmarks. + +When generating code, verify that the code is correct based on the source for cuda-core +that can be found in ../../cuda_core. + +This suite shares the pyperf runner with `../cuda_bindings/` via a sys.path +insert in `run_pyperf.py`. The per-suite setup (`runtime.py`, the `benchmarks/` +module files) lives here. Benchmark IDs are kept identical to the cuda.bindings +suite so `compare.py` can diff them directly. 
diff --git a/benchmarks/cuda_core/BENCHMARK_PLAN.md b/benchmarks/cuda_core/BENCHMARK_PLAN.md new file mode 100644 index 00000000000..93d29a735a2 --- /dev/null +++ b/benchmarks/cuda_core/BENCHMARK_PLAN.md @@ -0,0 +1,111 @@ +# cuda.core Benchmark Plan + +## Goal + +Measure the **additional Python-side latency** that `cuda.core` adds on top +of `cuda.bindings` for every public API with a clear cuda.bindings +counterpart. Target is **< 1 µs** of extra overhead per call. + +The baseline for this suite is not C++: it is the cuda.bindings Python +results file (`../cuda_bindings/results-python.json`). Benchmark IDs are +kept identical across suites so a direct diff is possible — see +`compare.py`. + +## Phase 1 coverage + +Benchmark IDs shipped in this PR (all map 1:1 to cuda.bindings): + +| ID | cuda.core surface | +|-------------------------------------------|----------------------------------------------------| +| `ctx_device.ctx_get_current` | `Device()` | +| `ctx_device.ctx_set_current` | `dev.set_current()` | +| `ctx_device.device_get` | `Device(0)` | +| `ctx_device.device_get_attribute` | `dev.properties.compute_capability_major` (cached) | +| `stream.stream_create_destroy` | `dev.create_stream()` + `stream.close()` | +| `stream.stream_synchronize` | `stream.sync()` | +| `event.event_create_destroy` | `dev.create_event()` + `event.close()` | +| `event.event_record` | `stream.record(event)` | +| `event.event_query` | `event.is_done` | +| `event.event_synchronize` | `event.sync()` | +| `memory.mem_alloc_free` | `dev.allocate(size)` + `buf.close()` (async pool!) 
| +| `memory.mem_alloc_async_free_async` | `dev.allocate(size, stream)` + `buf.close(stream)` | +| `launch.launch_empty_kernel` | `launch(stream, config, kernel)` | +| `launch.launch_small_kernel` | `launch(..., ptr)` | +| `launch.launch_16_args` | `launch(..., *16 ptrs)` | +| `launch.launch_256_args` | `launch(..., *256 ptrs)` | +| `launch.launch_512_args` | `launch(..., *512 ptrs)` | + +## Intentionally not covered in Phase 1 + +- `ctx_device.ctx_get_device`, `ctx_device.device_primary_ctx_retain`: + cuda.core abstracts CUDA contexts away — no direct counterpart. +- `enum.*`: cuda.core does not re-export cuda.bindings enums; those + benches measure a cuda.bindings-specific cost. +- `stream.stream_query`: no public `Stream.query()` in cuda.core. +- `launch.launch_*_pre_packed`: pre-packing is a cuda.bindings-specific + optimization of its tuple-of-args API; cuda.core's `ParamHolder` + handles packing internally on every call. +- `launch.launch_512_bools / _ints / _doubles / _bytes / _longlongs`: + non-pointer scalar arg variants — deferred to Phase 2. +- `launch.launch_2048b`: struct-by-value arg — requires a + `TensorMapDescriptor`/ctypes path that is not yet settled in cuda.core. +- `memory.memcpy_htod / _dtoh / _dtod`: cuda.core's `Buffer.copy_to / + copy_from` only go buffer-to-buffer through `cuMemcpyAsync`; pairing + with cuda.bindings' synchronous `cuMemcpyDtoD` / `HtoD` / `DtoH` would + be apples-to-oranges. Deferred until the comparable host-memory + resource path is finalized. +- NVRTC / module benches: cuda.core's `Program` / `ObjectCode` pipeline + is meaningfully different from raw NVRTC; deserves its own set of + bench functions rather than reusing cuda.bindings IDs. + +## Audit notes: known driver-call mismatches + +The IDs above match cuda.bindings 1:1 at the *public API* level, but a few +measure a different underlying driver call. Readers of `compare.py` should +know which deltas are "pure cuda.core Python overhead" vs. 
a deliberate +different driver path: + +- `ctx_device.ctx_get_current`: `Device()` reads a TLS-cached device + object; cuda.bindings calls `cuCtxGetCurrent` every iteration. Expect + cuda.core to be faster. Not apples-to-apples at the driver level; + apples-to-apples at the user-facing "give me the current device" level. +- `ctx_device.device_get_attribute`: `DeviceProperties` caches the first + lookup in a Python dict (`_get_cached_attribute`, `_device.pyx:75`). + After the first iteration this is a dict hit, not a `cuDeviceGetAttribute` + driver call. Expect cuda.core to be faster here too. A future + paired bench can use an uncached attribute (e.g. `compute_mode`) to + measure the wrapper overhead on the driver-call path. +- `stream.stream_create_destroy`: default `StreamOptions(nonblocking=True)` + yields the same `CU_STREAM_NON_BLOCKING` flag as the cuda.bindings + bench, but cuda.core additionally calls `cuCtxGetStreamPriorityRange` + and builds a `StreamOptions` dataclass per create — real cuda.core + overhead, fair to measure. +- `memory.mem_alloc_free`: **deliberate mismatch**. `dev.allocate(size)` + with `stream=None` routes through `_MP_allocate` → `cuMemAllocFromPoolAsync` + on the cached default stream (`_memory_pool.pyx:302`). cuda.bindings + measures the synchronous `cuMemAlloc`. The bench captures the + user-visible cost of `dev.allocate(size)`, which is what a cuda.core + user actually pays; it does **not** isolate "Python wrapper overhead + on top of `cuMemAlloc`" because cuda.core does not expose a sync + `cuMemAlloc` path. +- `memory.mem_alloc_async_free_async`: same internal path as + `mem_alloc_free` (both go through `cuMemAllocFromPoolAsync`); the + only difference is whether `default_stream()` is fetched or a stream + is passed in. Driver call matches cuda.bindings' `cuMemAllocAsync` + semantically but uses the pool-backed variant. 
+- `launch.*`: cuda.core uses `cuLaunchKernelEx` (takes a + `CUlaunchConfig` struct) and allocates a fresh `ParamHolder` + + `LaunchConfig._to_native_launch_config()` per call. cuda.bindings + uses `cuLaunchKernel` with pre-built arg tuples. The delta captures + both the Python-side per-call work and the `Ex` vs non-`Ex` driver + cost; this is real and expected cuda.core overhead. + +## Next up (not in this PR) + +1. Scalar launch variants (512 bools/ints/doubles/bytes/longlongs) so + arg-packing overhead is covered beyond the pointer fast-path. +2. Buffer-based memcpy benchmarks once the host-memory resource path is + stable in cuda.core. +3. NVRTC / `Program` / `ObjectCode` latency benches. +4. TMA (`TensorMapDescriptor`) benches when cuda.core's CCCL-backed + helper is formalised. diff --git a/benchmarks/cuda_core/README.md b/benchmarks/cuda_core/README.md new file mode 100644 index 00000000000..de124f7ff73 --- /dev/null +++ b/benchmarks/cuda_core/README.md @@ -0,0 +1,73 @@ +# cuda.core benchmarks + +These benchmarks measure the latency overhead of the `cuda.core` public API +on top of `cuda.bindings`. Every benchmark ID here has a 1:1 counterpart in +`../cuda_bindings/benchmarks/` so a `compare.py` run produces a side-by-side +"bindings vs core" overhead table for every operation. + +This suite is **not** a throughput benchmark and does not test kernel +performance — it measures Python-side call overhead only. No C++ baseline +is built or run for `cuda.core`: the comparative baseline is the +`cuda.bindings` Python results file at +`../cuda_bindings/results-python.json`. + +The pyperf runner (`runner/main.py`) is shared with the cuda.bindings +suite via a `sys.path` insert in `run_pyperf.py`; only the per-suite +`runtime.py` and `benchmarks/*.py` live here. + +## Usage + +Requires pixi. + +Environments: + +- `wheel`: Installs released `cuda-core` from conda-forge. 
+- `source`: Installs `cuda-core` and `cuda-bindings` from the in-tree + sources, so local changes are exercised. + +Tasks: + +- `bench`: Runs the full suite. +- `bench-smoke-test`: Runs each bench with `--debug-single-value` for + a quick smoke check (not meaningful for timing). +- `bench-compare`: Prints a side-by-side table against + `../cuda_bindings/results-python.json`. + +### System tuning + +For more stable results on Linux, tune the system before running. +See: https://pyperf.readthedocs.io/en/latest/system.html#system + +```bash +pixi run -e wheel -- python -m pyperf system show +$(pixi run -e wheel -- which python) -m pyperf system tune +``` + +### Running benchmarks + +```bash +# Wheel env +pixi run -e wheel bench +pixi run -e wheel bench --min-time 0.1 + +# Source env (picks up local cuda.core / cuda.bindings changes) +pixi run -e source bench + +# Side-by-side comparison vs cuda.bindings +pixi run -e wheel bench-compare +``` + +Results are saved to `results-python.json` in this directory. Compare +against the cuda.bindings baseline by running that suite's `bench` task +first so `../cuda_bindings/results-python.json` exists. + +## Output JSON and analysis + +The suite uses [pyperf](https://pyperf.readthedocs.io/en/latest/). The +output JSON is pyperf-compatible: + +```bash +pixi run -e wheel -- python -m pyperf stats results-python.json +pixi run -e wheel -- python -m pyperf compare_to \ + ../cuda_bindings/results-python.json results-python.json +``` diff --git a/benchmarks/cuda_core/benchmarks/bench_ctx_device.py b/benchmarks/cuda_core/benchmarks/bench_ctx_device.py new file mode 100644 index 00000000000..ebe7f28730f --- /dev/null +++ b/benchmarks/cuda_core/benchmarks/bench_ctx_device.py @@ -0,0 +1,55 @@ +# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
+# +# SPDX-License-Identifier: Apache-2.0 + +import time + +from runtime import ensure_device + +from cuda.core import Device + +DEV = ensure_device() + + +def bench_ctx_get_current(loops: int) -> float: + # Device() with no args returns the TLS-cached "current" device. + _fn = Device + + t0 = time.perf_counter() + for _ in range(loops): + _fn() + return time.perf_counter() - t0 + + +def bench_ctx_set_current(loops: int) -> float: + _fn = DEV.set_current + + t0 = time.perf_counter() + for _ in range(loops): + _fn() + return time.perf_counter() - t0 + + +def bench_device_get(loops: int) -> float: + # Device(id) hits the same TLS cache after the first construction. + _fn = Device + + t0 = time.perf_counter() + for _ in range(loops): + _fn(0) + return time.perf_counter() - t0 + + +def bench_device_get_attribute(loops: int) -> float: + # Matches the cuda.bindings bench's CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR + # call. cuda.core caches this attribute in DeviceProperties, so every + # iteration past the first is a dict lookup rather than a driver call + # — the bench therefore measures the user-visible cost of the public + # API, which is legitimately faster than cuda.bindings here. See + # BENCHMARK_PLAN.md for the rationale. + _props = DEV.properties + + t0 = time.perf_counter() + for _ in range(loops): + _props.compute_capability_major # noqa: B018 + return time.perf_counter() - t0 diff --git a/benchmarks/cuda_core/benchmarks/bench_event.py b/benchmarks/cuda_core/benchmarks/bench_event.py new file mode 100644 index 00000000000..883c470be1f --- /dev/null +++ b/benchmarks/cuda_core/benchmarks/bench_event.py @@ -0,0 +1,53 @@ +# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# SPDX-License-Identifier: Apache-2.0 + +import time + +from runtime import ensure_device + +DEV = ensure_device() +STREAM = DEV.create_stream() + +# Pre-recorded event, completed, reused for record/query/sync benches. 
+EVENT = STREAM.record() +STREAM.sync() + + +def bench_event_create_destroy(loops: int) -> float: + _create = DEV.create_event + + t0 = time.perf_counter() + for _ in range(loops): + e = _create() + e.close() + return time.perf_counter() - t0 + + +def bench_event_record(loops: int) -> float: + # Reuse EVENT so we measure cuEventRecord, not event allocation. + _record = STREAM.record + _event = EVENT + + t0 = time.perf_counter() + for _ in range(loops): + _record(_event) + return time.perf_counter() - t0 + + +def bench_event_query(loops: int) -> float: + _event = EVENT + + t0 = time.perf_counter() + for _ in range(loops): + _event.is_done # noqa: B018 + return time.perf_counter() - t0 + + +def bench_event_synchronize(loops: int) -> float: + _fn = EVENT.sync + + t0 = time.perf_counter() + for _ in range(loops): + _fn() + return time.perf_counter() - t0 diff --git a/benchmarks/cuda_core/benchmarks/bench_launch.py b/benchmarks/cuda_core/benchmarks/bench_launch.py new file mode 100644 index 00000000000..09d418b0a05 --- /dev/null +++ b/benchmarks/cuda_core/benchmarks/bench_launch.py @@ -0,0 +1,183 @@ +# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# SPDX-License-Identifier: Apache-2.0 + +import time + +from runtime import compile_module, ensure_device + +from cuda.core import LaunchConfig, launch + +# Same source as the cuda.bindings launch bench, minus the variants we +# don't need here. Pointer args are passed as Python ints (via int(handle)) +# or Buffer instances; cuda.core's ParamHolder treats both as intptr_t. 
+KERNEL_SOURCE = """\ +extern "C" __global__ void empty_kernel() { return; } +extern "C" __global__ void small_kernel(float *f) { *f = 0.0f; } + +#define ITEM_PARAM(x, T) T x +#define REP1(x, T) , ITEM_PARAM(x, T) +#define REP2(x, T) REP1(x##0, T) REP1(x##1, T) +#define REP4(x, T) REP2(x##0, T) REP2(x##1, T) +#define REP8(x, T) REP4(x##0, T) REP4(x##1, T) +#define REP16(x, T) REP8(x##0, T) REP8(x##1, T) +#define REP32(x, T) REP16(x##0, T) REP16(x##1, T) +#define REP64(x, T) REP32(x##0, T) REP32(x##1, T) +#define REP128(x, T) REP64(x##0, T) REP64(x##1, T) +#define REP256(x, T) REP128(x##0, T) REP128(x##1, T) + +extern "C" __global__ +void small_kernel_16_args( + ITEM_PARAM(F, int*) + REP1(A, int*) + REP2(A, int*) + REP4(A, int*) + REP8(A, int*)) +{ *F = 0; } + +extern "C" __global__ +void small_kernel_256_args( + ITEM_PARAM(F, int*) + REP1(A, int*) + REP2(A, int*) + REP4(A, int*) + REP8(A, int*) + REP16(A, int*) + REP32(A, int*) + REP64(A, int*) + REP128(A, int*)) +{ *F = 0; } + +extern "C" __global__ +void small_kernel_512_args( + ITEM_PARAM(F, int*) + REP1(A, int*) + REP2(A, int*) + REP4(A, int*) + REP8(A, int*) + REP16(A, int*) + REP32(A, int*) + REP64(A, int*) + REP128(A, int*) + REP256(A, int*)) +{ *F = 0; } +""" + +KERNEL_NAMES = ( + "empty_kernel", + "small_kernel", + "small_kernel_16_args", + "small_kernel_256_args", + "small_kernel_512_args", +) + +DEV = ensure_device() +STREAM = DEV.create_stream() +CONFIG = LaunchConfig(grid=1, block=1) + +MODULE = None +EMPTY_KERNEL = None +SMALL_KERNEL = None +KERNEL_16_ARGS = None +KERNEL_256_ARGS = None +KERNEL_512_ARGS = None +FLOAT_BUF = None +INT_BUFS_512: tuple = () +INT_PTRS_512: tuple = () + + +def _ensure_launch_state() -> None: + global MODULE, EMPTY_KERNEL, SMALL_KERNEL + global KERNEL_16_ARGS, KERNEL_256_ARGS, KERNEL_512_ARGS + global FLOAT_BUF, INT_BUFS_512, INT_PTRS_512 + + if EMPTY_KERNEL is not None: + return + + module = compile_module(KERNEL_SOURCE, KERNEL_NAMES) + + # Pre-allocate buffers for the 
kernel args. Use ints (raw pointer + # addresses) in the launch hot path so ParamHolder skips the Buffer + # type check and goes through its int fast-path. + float_buf = DEV.allocate(4) + int_bufs_512 = tuple(DEV.allocate(4) for _ in range(512)) + int_ptrs_512 = tuple(int(b.handle) for b in int_bufs_512) + + MODULE = module + EMPTY_KERNEL = module.get_kernel("empty_kernel") + SMALL_KERNEL = module.get_kernel("small_kernel") + KERNEL_16_ARGS = module.get_kernel("small_kernel_16_args") + KERNEL_256_ARGS = module.get_kernel("small_kernel_256_args") + KERNEL_512_ARGS = module.get_kernel("small_kernel_512_args") + FLOAT_BUF = float_buf + INT_BUFS_512 = int_bufs_512 + INT_PTRS_512 = int_ptrs_512 + + +def bench_launch_empty_kernel(loops: int) -> float: + _ensure_launch_state() + _launch = launch + _kernel = EMPTY_KERNEL + _stream = STREAM + _config = CONFIG + + t0 = time.perf_counter() + for _ in range(loops): + _launch(_stream, _config, _kernel) + return time.perf_counter() - t0 + + +def bench_launch_small_kernel(loops: int) -> float: + _ensure_launch_state() + _launch = launch + _kernel = SMALL_KERNEL + _stream = STREAM + _config = CONFIG + _ptr = int(FLOAT_BUF.handle) + + t0 = time.perf_counter() + for _ in range(loops): + _launch(_stream, _config, _kernel, _ptr) + return time.perf_counter() - t0 + + +def bench_launch_16_args(loops: int) -> float: + _ensure_launch_state() + _launch = launch + _kernel = KERNEL_16_ARGS + _stream = STREAM + _config = CONFIG + _args = INT_PTRS_512[:16] + + t0 = time.perf_counter() + for _ in range(loops): + _launch(_stream, _config, _kernel, *_args) + return time.perf_counter() - t0 + + +def bench_launch_256_args(loops: int) -> float: + _ensure_launch_state() + _launch = launch + _kernel = KERNEL_256_ARGS + _stream = STREAM + _config = CONFIG + _args = INT_PTRS_512[:256] + + t0 = time.perf_counter() + for _ in range(loops): + _launch(_stream, _config, _kernel, *_args) + return time.perf_counter() - t0 + + +def bench_launch_512_args(loops: 
int) -> float: + _ensure_launch_state() + _launch = launch + _kernel = KERNEL_512_ARGS + _stream = STREAM + _config = CONFIG + _args = INT_PTRS_512 + + t0 = time.perf_counter() + for _ in range(loops): + _launch(_stream, _config, _kernel, *_args) + return time.perf_counter() - t0 diff --git a/benchmarks/cuda_core/benchmarks/bench_memory.py b/benchmarks/cuda_core/benchmarks/bench_memory.py new file mode 100644 index 00000000000..445f85e5121 --- /dev/null +++ b/benchmarks/cuda_core/benchmarks/bench_memory.py @@ -0,0 +1,37 @@ +# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# SPDX-License-Identifier: Apache-2.0 + +import time + +from runtime import ensure_device + +DEV = ensure_device() +STREAM = DEV.create_stream() + +# Small allocation size: we measure call overhead, not the allocator. +ALLOC_SIZE = 1024 + + +def bench_mem_alloc_free(loops: int) -> float: + # No-stream variant: dev.allocate() uses the default stream internally. + _alloc = DEV.allocate + _size = ALLOC_SIZE + + t0 = time.perf_counter() + for _ in range(loops): + buf = _alloc(_size) + buf.close() + return time.perf_counter() - t0 + + +def bench_mem_alloc_async_free_async(loops: int) -> float: + _alloc = DEV.allocate + _size = ALLOC_SIZE + _stream = STREAM + + t0 = time.perf_counter() + for _ in range(loops): + buf = _alloc(_size, _stream) + buf.close(_stream) + return time.perf_counter() - t0 diff --git a/benchmarks/cuda_core/benchmarks/bench_stream.py b/benchmarks/cuda_core/benchmarks/bench_stream.py new file mode 100644 index 00000000000..f1501c6f658 --- /dev/null +++ b/benchmarks/cuda_core/benchmarks/bench_stream.py @@ -0,0 +1,29 @@ +# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
+# +# SPDX-License-Identifier: Apache-2.0 + +import time + +from runtime import ensure_device + +DEV = ensure_device() +STREAM = DEV.create_stream() + + +def bench_stream_create_destroy(loops: int) -> float: + _create = DEV.create_stream + + t0 = time.perf_counter() + for _ in range(loops): + s = _create() + s.close() + return time.perf_counter() - t0 + + +def bench_stream_synchronize(loops: int) -> float: + _fn = STREAM.sync + + t0 = time.perf_counter() + for _ in range(loops): + _fn() + return time.perf_counter() - t0 diff --git a/benchmarks/cuda_core/compare.py b/benchmarks/cuda_core/compare.py new file mode 100644 index 00000000000..99cbca31a69 --- /dev/null +++ b/benchmarks/cuda_core/compare.py @@ -0,0 +1,159 @@ +# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# SPDX-License-Identifier: Apache-2.0 + +"""Compare cuda.core and cuda.bindings benchmark results side by side. + +Benchmark IDs are kept intentionally identical across the two suites +(e.g. ``stream.stream_create_destroy``) so a result present in both +files can be diffed directly. IDs that exist in only one suite are +rendered with ``-`` in the missing column. 
+""" + +import argparse +import json +import statistics +import sys +from pathlib import Path + +HERE = Path(__file__).resolve().parent +DEFAULT_CORE = HERE / "results-python.json" +DEFAULT_BINDINGS = HERE.parent / "cuda_bindings" / "results-python.json" + + +def load_benchmarks(path: Path) -> dict[str, list[float]]: + """Load a pyperf JSON file and return {name: [values]}.""" + with open(path) as f: + data = json.load(f) + + results: dict[str, list[float]] = {} + for bench in data.get("benchmarks", []): + name = bench.get("metadata", {}).get("name", "") + if not name: + for run in bench.get("runs", []): + name = run.get("metadata", {}).get("name", "") + if name: + break + values: list[float] = [] + for run in bench.get("runs", []): + values.extend(run.get("values", [])) + if name and values: + results[name] = values + return results + + +def stats(values: list[float]) -> tuple[float, float, float, int]: + mean = statistics.mean(values) + stdev = statistics.pstdev(values) if len(values) > 1 else 0.0 + rsd = (stdev / mean) if mean else 0.0 + return mean, stdev, rsd, len(values) + + +def fmt_rsd(rsd: float | None) -> str: + if rsd is None: + return "-" + return f"{rsd * 100:.1f}%" + + +def fmt_ns(seconds: float) -> str: + return f"{seconds * 1e9:,.0f}" + + +def fmt_overhead_ns(core_mean: float, bindings_mean: float) -> str: + return f"{(core_mean - bindings_mean) * 1e9:+,.0f}" + + +def fmt_overhead_pct(core_mean: float, bindings_mean: float) -> str: + if bindings_mean <= 0.0: + return "-" + pct = (core_mean - bindings_mean) / bindings_mean * 100 + return f"{pct:+,.0f}%" + + +def main() -> None: + parser = argparse.ArgumentParser(description="Compare cuda.core vs cuda.bindings benchmark results") + parser.add_argument( + "--core", + type=Path, + default=DEFAULT_CORE, + help=f"cuda.core results JSON (default: {DEFAULT_CORE.name})", + ) + parser.add_argument( + "--bindings", + type=Path, + default=DEFAULT_BINDINGS, + help=f"cuda.bindings results JSON (default: 
{DEFAULT_BINDINGS})", + ) + args = parser.parse_args() + + if not args.core.exists(): + print(f"cuda.core results not found: {args.core}", file=sys.stderr) + print("Run: pixi run -e source bench", file=sys.stderr) + sys.exit(1) + + core_benchmarks = load_benchmarks(args.core) + bindings_benchmarks = load_benchmarks(args.bindings) if args.bindings.exists() else {} + + if not core_benchmarks: + print("No benchmarks found in cuda.core results.", file=sys.stderr) + sys.exit(1) + + all_names = sorted(set(core_benchmarks) | set(bindings_benchmarks)) + name_width = max(len(n) for n in all_names) + name_width = max(name_width, len("Benchmark")) + + bind_w = 14 + core_w = 14 + rsd_w = 8 + oh_ns_w = 12 + oh_pct_w = 10 + + if bindings_benchmarks: + header = ( + f"{'Benchmark':<{name_width}} " + f"{'bindings (ns)':>{bind_w}} {'RSD':>{rsd_w}} " + f"{'core (ns)':>{core_w}} {'RSD':>{rsd_w}} " + f"{'Overhead ns':>{oh_ns_w}} {'Overhead %':>{oh_pct_w}}" + ) + else: + header = f"{'Benchmark':<{name_width}} {'core (ns)':>{core_w}} {'RSD':>{rsd_w}}" + + sep = "-" * len(header) + print(sep) + print(header) + print(sep) + + for name in all_names: + core_vals = core_benchmarks.get(name) + bind_vals = bindings_benchmarks.get(name) + + core_stats = stats(core_vals) if core_vals else None + bind_stats = stats(bind_vals) if bind_vals else None + + core_str = fmt_ns(core_stats[0]) if core_stats else "-" + bind_str = fmt_ns(bind_stats[0]) if bind_stats else "-" + core_rsd = fmt_rsd(core_stats[2]) if core_stats else "-" + bind_rsd = fmt_rsd(bind_stats[2]) if bind_stats else "-" + + if core_stats and bind_stats: + overhead_ns_str = fmt_overhead_ns(core_stats[0], bind_stats[0]) + overhead_pct_str = fmt_overhead_pct(core_stats[0], bind_stats[0]) + else: + overhead_ns_str = "-" + overhead_pct_str = "-" + + if bindings_benchmarks: + print( + f"{name:<{name_width}} " + f"{bind_str:>{bind_w}} {bind_rsd:>{rsd_w}} " + f"{core_str:>{core_w}} {core_rsd:>{rsd_w}} " + f"{overhead_ns_str:>{oh_ns_w}} 
{overhead_pct_str:>{oh_pct_w}}" + ) + else: + print(f"{name:<{name_width}} {core_str:>{core_w}} {core_rsd:>{rsd_w}}") + + print(sep) + + +if __name__ == "__main__": + main() diff --git a/benchmarks/cuda_core/pixi.lock b/benchmarks/cuda_core/pixi.lock new file mode 100644 index 00000000000..e5eca8b36ee --- /dev/null +++ b/benchmarks/cuda_core/pixi.lock @@ -0,0 +1,972 @@ +version: 6 +environments: + default: + channels: + - url: https://conda.anaconda.org/conda-forge/ + options: + channel-priority: disabled + pypi-prerelease-mode: if-necessary-or-explicit + packages: {} + source: + channels: + - url: https://conda.anaconda.org/conda-forge/ + options: + channel-priority: disabled + pypi-prerelease-mode: if-necessary-or-explicit + packages: + linux-64: + - conda: https://conda.anaconda.org/conda-forge/linux-64/_openmp_mutex-4.5-20_gnu.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/bzip2-1.0.8-hda65f42_9.conda + - conda: https://conda.anaconda.org/conda-forge/noarch/ca-certificates-2026.4.22-hbd8a1cb_0.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/cffi-2.0.0-py314h4a8dc5f_1.conda + - conda: https://conda.anaconda.org/conda-forge/noarch/cfgv-3.5.0-pyhd8ed1ab_0.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/cuda-bindings-13.2.0-py314ha0b5721_0.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/cuda-cudart-13.2.75-hecca717_0.conda + - conda: https://conda.anaconda.org/conda-forge/noarch/cuda-cudart_linux-64-13.2.75-h376f20c_0.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/cuda-nvrtc-13.2.78-hecca717_0.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/cuda-nvvm-impl-13.2.78-h4bc722e_0.conda + - conda: https://conda.anaconda.org/conda-forge/noarch/cuda-pathfinder-1.5.4-pyhc364b38_0.conda + - conda: https://conda.anaconda.org/conda-forge/noarch/cuda-version-13.2-he2cc418_3.conda + - conda: https://conda.anaconda.org/conda-forge/noarch/distlib-0.4.0-pyhd8ed1ab_0.conda + - 
conda: https://conda.anaconda.org/conda-forge/noarch/filelock-3.29.0-pyhd8ed1ab_0.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/icu-78.3-h33c6efd_0.conda + - conda: https://conda.anaconda.org/conda-forge/noarch/identify-2.6.19-pyhd8ed1ab_0.conda + - conda: https://conda.anaconda.org/conda-forge/noarch/importlib-metadata-8.8.0-pyhcf101f3_0.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/ld_impl_linux-64-2.45.1-default_hbd61a6d_102.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/libblas-3.11.0-6_h4a7cf45_openblas.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/libcap-2.77-hd0affe5_1.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/libcblas-3.11.0-6_h0358290_openblas.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/libcufile-1.17.1.22-h85c024f_0.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/libexpat-2.7.5-hecca717_0.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/libffi-3.5.2-h3435931_0.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/libgcc-15.2.0-he0feb66_18.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/libgfortran-15.2.0-h69a702a_18.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/libgfortran5-15.2.0-h68bc16d_18.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/libgomp-15.2.0-he0feb66_18.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/liblapack-3.11.0-6_h47877c9_openblas.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/liblzma-5.8.3-hb03c661_0.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/libmpdec-4.0.0-hb03c661_1.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/libnl-3.11.0-hb9d3cd8_0.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/libnvjitlink-13.2.78-hecca717_0.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/libopenblas-0.3.32-pthreads_h94d23a6_0.conda + - 
conda: https://conda.anaconda.org/conda-forge/linux-64/libsqlite-3.53.0-hf4e2dac_0.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/libstdcxx-15.2.0-h934c35e_18.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/libsystemd0-257.13-hd0affe5_0.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/libudev1-257.13-hd0affe5_0.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/libuuid-2.42-h5347b49_0.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/libzlib-1.3.2-h25fd6f3_2.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/ncurses-6.6-hdb14827_0.conda + - conda: https://conda.anaconda.org/conda-forge/noarch/nodeenv-1.10.0-pyhd8ed1ab_0.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/numpy-2.4.3-py314h2b28147_0.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/openssl-3.6.2-h35e630c_0.conda + - conda: https://conda.anaconda.org/conda-forge/noarch/platformdirs-4.9.6-pyhcf101f3_0.conda + - conda: https://conda.anaconda.org/conda-forge/noarch/pre-commit-4.6.0-pyha770c72_0.conda + - conda: https://conda.anaconda.org/conda-forge/noarch/pycparser-2.22-pyh29332c3_1.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/pyperf-2.9.0-py314hdafbbf9_0.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/python-3.14.4-habeac84_100_cp314.conda + - conda: https://conda.anaconda.org/conda-forge/noarch/python-discovery-1.2.2-pyhcf101f3_0.conda + - conda: https://conda.anaconda.org/conda-forge/noarch/python_abi-3.14-8_cp314.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/pyyaml-6.0.3-py314h67df5f8_1.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/rdma-core-61.0-h192683f_0.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/readline-8.3-h853b02a_0.conda + - conda: https://conda.anaconda.org/conda-forge/noarch/setuptools-82.0.1-pyh332efcf_0.conda + - conda: 
https://conda.anaconda.org/conda-forge/noarch/six-1.17.0-pyhe01879c_1.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/tk-8.6.13-noxft_h366c992_103.conda + - conda: https://conda.anaconda.org/conda-forge/noarch/typing_extensions-4.15.0-pyhcf101f3_0.conda + - conda: https://conda.anaconda.org/conda-forge/noarch/tzdata-2025c-hc9c84f9_1.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/ukkonen-1.1.0-py314h9891dd4_0.conda + - conda: https://conda.anaconda.org/conda-forge/noarch/virtualenv-21.3.0-pyhcf101f3_0.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/yaml-0.2.5-h280c20c_3.conda + - conda: https://conda.anaconda.org/conda-forge/noarch/zipp-3.23.1-pyhcf101f3_0.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/zstd-1.5.7-hb78ec9c_6.conda + - conda: ../../cuda_core + wheel: + channels: + - url: https://conda.anaconda.org/conda-forge/ + options: + channel-priority: disabled + pypi-prerelease-mode: if-necessary-or-explicit + packages: + linux-64: + - conda: https://conda.anaconda.org/conda-forge/linux-64/_openmp_mutex-4.5-20_gnu.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/bzip2-1.0.8-hda65f42_9.conda + - conda: https://conda.anaconda.org/conda-forge/noarch/ca-certificates-2026.4.22-hbd8a1cb_0.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/cffi-2.0.0-py314h4a8dc5f_1.conda + - conda: https://conda.anaconda.org/conda-forge/noarch/cfgv-3.5.0-pyhd8ed1ab_0.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/cuda-bindings-13.2.0-py314ha0b5721_0.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/cuda-core-0.7.0-cuda13_py314h025f531_0.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/cuda-nvrtc-13.1.115-hecca717_0.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/cuda-nvvm-impl-13.1.115-h4bc722e_0.conda + - conda: https://conda.anaconda.org/conda-forge/noarch/cuda-pathfinder-1.5.4-pyhc364b38_0.conda + - conda: 
https://conda.anaconda.org/conda-forge/noarch/cuda-version-13.1-h2ff5cdb_3.conda + - conda: https://conda.anaconda.org/conda-forge/noarch/distlib-0.4.0-pyhd8ed1ab_0.conda + - conda: https://conda.anaconda.org/conda-forge/noarch/filelock-3.29.0-pyhd8ed1ab_0.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/icu-78.3-h33c6efd_0.conda + - conda: https://conda.anaconda.org/conda-forge/noarch/identify-2.6.19-pyhd8ed1ab_0.conda + - conda: https://conda.anaconda.org/conda-forge/noarch/importlib-metadata-8.8.0-pyhcf101f3_0.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/ld_impl_linux-64-2.45.1-default_hbd61a6d_102.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/libblas-3.11.0-6_h4a7cf45_openblas.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/libcap-2.77-hd0affe5_1.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/libcblas-3.11.0-6_h0358290_openblas.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/libcufile-1.16.1.26-hd07211c_0.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/libexpat-2.7.5-hecca717_0.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/libffi-3.5.2-h3435931_0.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/libgcc-15.2.0-he0feb66_18.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/libgfortran-15.2.0-h69a702a_18.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/libgfortran5-15.2.0-h68bc16d_18.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/libgomp-15.2.0-he0feb66_18.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/liblapack-3.11.0-6_h47877c9_openblas.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/liblzma-5.8.3-hb03c661_0.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/libmpdec-4.0.0-hb03c661_1.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/libnl-3.11.0-hb9d3cd8_0.conda + - conda: 
https://conda.anaconda.org/conda-forge/linux-64/libnvjitlink-13.2.78-hecca717_0.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/libopenblas-0.3.32-pthreads_h94d23a6_0.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/libsqlite-3.53.0-hf4e2dac_0.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/libstdcxx-15.2.0-h934c35e_18.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/libsystemd0-257.13-hd0affe5_0.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/libudev1-257.13-hd0affe5_0.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/libuuid-2.42-h5347b49_0.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/libzlib-1.3.2-h25fd6f3_2.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/ncurses-6.6-hdb14827_0.conda + - conda: https://conda.anaconda.org/conda-forge/noarch/nodeenv-1.10.0-pyhd8ed1ab_0.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/numpy-2.4.3-py314h2b28147_0.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/openssl-3.6.2-h35e630c_0.conda + - conda: https://conda.anaconda.org/conda-forge/noarch/platformdirs-4.9.6-pyhcf101f3_0.conda + - conda: https://conda.anaconda.org/conda-forge/noarch/pre-commit-4.6.0-pyha770c72_0.conda + - conda: https://conda.anaconda.org/conda-forge/noarch/pycparser-2.22-pyh29332c3_1.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/pyperf-2.9.0-py314hdafbbf9_0.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/python-3.14.4-habeac84_100_cp314.conda + - conda: https://conda.anaconda.org/conda-forge/noarch/python-discovery-1.2.2-pyhcf101f3_0.conda + - conda: https://conda.anaconda.org/conda-forge/noarch/python_abi-3.14-8_cp314.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/pyyaml-6.0.3-py314h67df5f8_1.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/rdma-core-61.0-h192683f_0.conda + - conda: 
https://conda.anaconda.org/conda-forge/linux-64/readline-8.3-h853b02a_0.conda + - conda: https://conda.anaconda.org/conda-forge/noarch/setuptools-82.0.1-pyh332efcf_0.conda + - conda: https://conda.anaconda.org/conda-forge/noarch/six-1.17.0-pyhe01879c_1.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/tk-8.6.13-noxft_h366c992_103.conda + - conda: https://conda.anaconda.org/conda-forge/noarch/typing_extensions-4.15.0-pyhcf101f3_0.conda + - conda: https://conda.anaconda.org/conda-forge/noarch/tzdata-2025c-hc9c84f9_1.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/ukkonen-1.1.0-py314h9891dd4_0.conda + - conda: https://conda.anaconda.org/conda-forge/noarch/virtualenv-21.3.0-pyhcf101f3_0.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/yaml-0.2.5-h280c20c_3.conda + - conda: https://conda.anaconda.org/conda-forge/noarch/zipp-3.23.1-pyhcf101f3_0.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/zstd-1.5.7-hb78ec9c_6.conda +packages: +- conda: https://conda.anaconda.org/conda-forge/linux-64/_openmp_mutex-4.5-20_gnu.conda + build_number: 20 + sha256: 1dd3fffd892081df9726d7eb7e0dea6198962ba775bd88842135a4ddb4deb3c9 + md5: a9f577daf3de00bca7c3c76c0ecbd1de + depends: + - __glibc >=2.17,<3.0.a0 + - libgomp >=7.5.0 + constrains: + - openmp_impl <0.0a0 + license: BSD-3-Clause + license_family: BSD + size: 28948 + timestamp: 1770939786096 +- conda: https://conda.anaconda.org/conda-forge/linux-64/bzip2-1.0.8-hda65f42_9.conda + sha256: 0b75d45f0bba3e95dc693336fa51f40ea28c980131fec438afb7ce6118ed05f6 + md5: d2ffd7602c02f2b316fd921d39876885 + depends: + - __glibc >=2.17,<3.0.a0 + - libgcc >=14 + license: bzip2-1.0.6 + license_family: BSD + size: 260182 + timestamp: 1771350215188 +- conda: https://conda.anaconda.org/conda-forge/noarch/ca-certificates-2026.4.22-hbd8a1cb_0.conda + sha256: c9dbcc8039a52023660d6d1bbf87594a93dd69c6ac5a2a44323af2c92976728d + md5: e18ad67cf881dcadee8b8d9e2f8e5f73 + depends: + - __unix + license: 
ISC + size: 131039 + timestamp: 1776865545798 +- conda: https://conda.anaconda.org/conda-forge/linux-64/cffi-2.0.0-py314h4a8dc5f_1.conda + sha256: c6339858a0aaf5d939e00d345c98b99e4558f285942b27232ac098ad17ac7f8e + md5: cf45f4278afd6f4e6d03eda0f435d527 + depends: + - __glibc >=2.17,<3.0.a0 + - libffi >=3.5.2,<3.6.0a0 + - libgcc >=14 + - pycparser + - python >=3.14,<3.15.0a0 + - python_abi 3.14.* *_cp314 + license: MIT + license_family: MIT + size: 300271 + timestamp: 1761203085220 +- conda: https://conda.anaconda.org/conda-forge/noarch/cfgv-3.5.0-pyhd8ed1ab_0.conda + sha256: aa589352e61bb221351a79e5946d56916e3c595783994884accdb3b97fe9d449 + md5: 381bd45fb7aa032691f3063aff47e3a1 + depends: + - python >=3.10 + license: MIT + license_family: MIT + size: 13589 + timestamp: 1763607964133 +- conda: https://conda.anaconda.org/conda-forge/linux-64/cuda-bindings-13.2.0-py314ha0b5721_0.conda + sha256: f77e397cbf7b173dae7470aca3052184779431b7a7bcce3ecc56c2822c83cdb4 + md5: 29baeabdf5e58a696a68625b7bd8d9f8 + depends: + - __glibc >=2.17,<3.0.a0 + - cuda-nvrtc >=13,<14.0a0 + - cuda-nvvm-impl >=13,<14.0a0 + - cuda-pathfinder >=1.1.0,<2 + - cuda-version >=13,<14.0a0 + - libcufile >=1,<2.0a0 + - libgcc >=14 + - libnvjitlink >=13.0,<14.0a0 + - libstdcxx >=14 + - python >=3.14,<3.15.0a0 + - python_abi 3.14.* *_cp314 + constrains: + - cuda-cudart >=13,<14.0a0 + - cuda-python >=13.2.0,<13.3.0a0 + license: LicenseRef-NVIDIA-SOFTWARE-LICENSE + size: 4281852 + timestamp: 1773284503198 +- conda: ../../cuda_core + name: cuda-core + version: 0.7.0 + build: hb0f4dca_0 + subdir: linux-64 + variants: + target_platform: linux-64 + depends: + - python + - cuda-version + - numpy + - cuda-bindings + - cuda-pathfinder + - libgcc >=15 + - libgcc >=15 + - libstdcxx >=15 + - python_abi 3.14.* *_cp314 + - cuda-nvrtc >=13.2.78,<14.0a0 + - cuda-cudart >=13.2.75,<14.0a0 + license: Apache-2.0 +- conda: https://conda.anaconda.org/conda-forge/linux-64/cuda-core-0.7.0-cuda13_py314h025f531_0.conda + sha256: 
25841761aa7cfb2a9295afbefea5ab96ac15f07aa56fc35acb36d254c92e6747 + md5: 60c216b8b88821564e7f2409d6437bb5 + depends: + - __glibc >=2.17,<3.0.a0 + - cuda-bindings >=13,<14.0a0 + - cuda-pathfinder >=1.4.2,<2 + - cuda-version >=13,<14.0a0 + - libgcc >=14 + - libstdcxx >=14 + - numpy + - python >=3.14,<3.15.0a0 + - python_abi 3.14.* *_cp314 + license: Apache-2.0 + license_family: APACHE + size: 1541620 + timestamp: 1775673907706 +- conda: https://conda.anaconda.org/conda-forge/linux-64/cuda-cudart-13.2.75-hecca717_0.conda + sha256: 633bc9ba458a12a20a42776bf3fa25cecfddc65a22e4ed207fe09b9adcd9de58 + md5: 9b7dcd83f8a965efcf7377dc54203619 + depends: + - __glibc >=2.17,<3.0.a0 + - cuda-cudart_linux-64 13.2.75 h376f20c_0 + - cuda-version >=13.2,<13.3.0a0 + - libgcc >=14 + - libstdcxx >=14 + license: LicenseRef-NVIDIA-End-User-License-Agreement + size: 24542 + timestamp: 1776110472025 +- conda: https://conda.anaconda.org/conda-forge/noarch/cuda-cudart_linux-64-13.2.75-h376f20c_0.conda + sha256: cd03c67b2005e2e74ff278f6f8b17ca7d6f18cf43fb00775833669508d301a83 + md5: ff98f2b9b87eb8b3a4b36745d3d5b93e + depends: + - cuda-version >=13.2,<13.3.0a0 + license: LicenseRef-NVIDIA-End-User-License-Agreement + size: 203339 + timestamp: 1776110448238 +- conda: https://conda.anaconda.org/conda-forge/linux-64/cuda-nvrtc-13.1.115-hecca717_0.conda + sha256: 9cc4f9df70c02eea5121cdb0e865207b04cd52591f57ebcac2ba44fada10eb5b + md5: df16c9049d882cdaf4f83a5b90079589 + depends: + - __glibc >=2.17,<3.0.a0 + - cuda-version >=13.1,<13.2.0a0 + - libgcc >=14 + - libstdcxx >=14 + license: LicenseRef-NVIDIA-End-User-License-Agreement + size: 35339417 + timestamp: 1768272955912 +- conda: https://conda.anaconda.org/conda-forge/linux-64/cuda-nvrtc-13.2.78-hecca717_0.conda + sha256: 73fbc9d15c062c3ea60891e8183002f6b055fa6638402d17581677af0aaa20d8 + md5: 66623d882c42506fa3f1780b90841400 + depends: + - __glibc >=2.17,<3.0.a0 + - cuda-version >=13.2,<13.3.0a0 + - libgcc >=14 + - libstdcxx >=14 + license: 
LicenseRef-NVIDIA-End-User-License-Agreement + size: 35670504 + timestamp: 1776109867257 +- conda: https://conda.anaconda.org/conda-forge/linux-64/cuda-nvvm-impl-13.1.115-h4bc722e_0.conda + sha256: 12d84615684f1279799c023ce4ccc7c34f151bec2a90e0c8d04798a8c8af437c + md5: bf76661bc0de83a60537c4913f339fb3 + depends: + - __glibc >=2.17,<3.0.a0 + - cuda-version >=13.1,<13.2.0a0 + - libgcc >=12 + license: LicenseRef-NVIDIA-End-User-License-Agreement + size: 21873791 + timestamp: 1768280315627 +- conda: https://conda.anaconda.org/conda-forge/linux-64/cuda-nvvm-impl-13.2.78-h4bc722e_0.conda + sha256: 944d132f61f240131abff67646da4040ae585a1f43c6b38fabebb6cc075a7c16 + md5: 5e1021b4c73e795deabbf35ed1317dcb + depends: + - __glibc >=2.17,<3.0.a0 + - cuda-version >=13.2,<13.3.0a0 + - libgcc >=12 + license: LicenseRef-NVIDIA-End-User-License-Agreement + size: 22205958 + timestamp: 1776121258973 +- conda: https://conda.anaconda.org/conda-forge/noarch/cuda-pathfinder-1.5.4-pyhc364b38_0.conda + sha256: 662046880e1dd8f62cd2ad8ac9618ace683cb7a3f3af93c52e425a772c9b4f00 + md5: 42d4610b52102122741f9bf68f2866ed + depends: + - python >=3.10 + - cuda-version >=12.0,<14 + - python + license: Apache-2.0 + license_family: APACHE + size: 45892 + timestamp: 1777330272729 +- conda: https://conda.anaconda.org/conda-forge/noarch/cuda-version-13.1-h2ff5cdb_3.conda + sha256: 176ac20fdb95611af8fb2bf0d3d16fee998019b1d0f12fc9ddd5fa0df4553992 + md5: d85448460c25ee43ff2f8346bb9ad52b + constrains: + - cudatoolkit 13.1|13.1.* + - __cuda >=13 + license: LicenseRef-NVIDIA-End-User-License-Agreement + size: 21511 + timestamp: 1757017115788 +- conda: https://conda.anaconda.org/conda-forge/noarch/cuda-version-13.2-he2cc418_3.conda + sha256: 64aebe8ccb3a2c3ff446d3c0c0e88ef4fdb069a5732c03539bf3a37243c4c679 + md5: 45676e3dd76b30ec613f1f822d450eff + constrains: + - __cuda >=13 + - cudatoolkit 13.2|13.2.* + license: LicenseRef-NVIDIA-End-User-License-Agreement + size: 21908 + timestamp: 1773093709154 +- conda: 
https://conda.anaconda.org/conda-forge/noarch/distlib-0.4.0-pyhd8ed1ab_0.conda + sha256: 6d977f0b2fc24fee21a9554389ab83070db341af6d6f09285360b2e09ef8b26e + md5: 003b8ba0a94e2f1e117d0bd46aebc901 + depends: + - python >=3.9 + license: Apache-2.0 + license_family: APACHE + size: 275642 + timestamp: 1752823081585 +- conda: https://conda.anaconda.org/conda-forge/noarch/filelock-3.29.0-pyhd8ed1ab_0.conda + sha256: 6b471a18372bbd52bdf32fc965f71de3bc1b5219418b8e6b3875a67a7b08c483 + md5: 8fa8358d022a3a9bd101384a808044c6 + depends: + - python >=3.10 + license: Unlicense + size: 34211 + timestamp: 1776621506566 +- conda: https://conda.anaconda.org/conda-forge/linux-64/icu-78.3-h33c6efd_0.conda + sha256: fbf86c4a59c2ed05bbffb2ba25c7ed94f6185ec30ecb691615d42342baa1a16a + md5: c80d8a3b84358cb967fa81e7075fbc8a + depends: + - __glibc >=2.17,<3.0.a0 + - libgcc >=14 + - libstdcxx >=14 + license: MIT + license_family: MIT + size: 12723451 + timestamp: 1773822285671 +- conda: https://conda.anaconda.org/conda-forge/noarch/identify-2.6.19-pyhd8ed1ab_0.conda + sha256: 381cedccf0866babfc135d65ee40b778bd20e927d2a5ec810f750c5860a7c5b8 + md5: 84a3233b709a289a4ddd7a2fd27dd988 + depends: + - python >=3.10 + - ukkonen + license: MIT + license_family: MIT + size: 79757 + timestamp: 1776455344188 +- conda: https://conda.anaconda.org/conda-forge/noarch/importlib-metadata-8.8.0-pyhcf101f3_0.conda + sha256: 82ab2a0d91ca1e7e63ab6a4939356667ef683905dea631bc2121aa534d347b16 + md5: 080594bf4493e6bae2607e65390c520a + depends: + - python >=3.10 + - zipp >=3.20 + - python + license: Apache-2.0 + license_family: APACHE + size: 34387 + timestamp: 1773931568510 +- conda: https://conda.anaconda.org/conda-forge/linux-64/ld_impl_linux-64-2.45.1-default_hbd61a6d_102.conda + sha256: 3d584956604909ff5df353767f3a2a2f60e07d070b328d109f30ac40cd62df6c + md5: 18335a698559cdbcd86150a48bf54ba6 + depends: + - __glibc >=2.17,<3.0.a0 + - zstd >=1.5.7,<1.6.0a0 + constrains: + - binutils_impl_linux-64 2.45.1 + license: 
GPL-3.0-only + license_family: GPL + size: 728002 + timestamp: 1774197446916 +- conda: https://conda.anaconda.org/conda-forge/linux-64/libblas-3.11.0-6_h4a7cf45_openblas.conda + build_number: 6 + sha256: 7bfe936dbb5db04820cf300a9cc1f5ee8d5302fc896c2d66e30f1ee2f20fbfd6 + md5: 6d6d225559bfa6e2f3c90ee9c03d4e2e + depends: + - libopenblas >=0.3.32,<0.3.33.0a0 + - libopenblas >=0.3.32,<1.0a0 + constrains: + - blas 2.306 openblas + - liblapack 3.11.0 6*_openblas + - liblapacke 3.11.0 6*_openblas + - libcblas 3.11.0 6*_openblas + - mkl <2026 + license: BSD-3-Clause + license_family: BSD + size: 18621 + timestamp: 1774503034895 +- conda: https://conda.anaconda.org/conda-forge/linux-64/libcap-2.77-hd0affe5_1.conda + sha256: 37c41b1024d0c75da76822e3c079aabaf121618a32fe05e53a897b35a88008fc + md5: 499cd8e2d4358986dbe3b30e8fe1bf6a + depends: + - __glibc >=2.17,<3.0.a0 + - libgcc >=14 + license: BSD-3-Clause + license_family: BSD + size: 124432 + timestamp: 1774333989027 +- conda: https://conda.anaconda.org/conda-forge/linux-64/libcblas-3.11.0-6_h0358290_openblas.conda + build_number: 6 + sha256: 57edafa7796f6fa3ebbd5367692dd4c7f552be42109c2dd1a7c89b55089bf374 + md5: 36ae340a916635b97ac8a0655ace2a35 + depends: + - libblas 3.11.0 6_h4a7cf45_openblas + constrains: + - blas 2.306 openblas + - liblapack 3.11.0 6*_openblas + - liblapacke 3.11.0 6*_openblas + license: BSD-3-Clause + license_family: BSD + size: 18622 + timestamp: 1774503050205 +- conda: https://conda.anaconda.org/conda-forge/linux-64/libcufile-1.16.1.26-hd07211c_0.conda + sha256: 8c44b5bf947afad827df0df49fe7483cf1b2916694081b2db4fecdfd6a2bacd1 + md5: 48418c48dac04671fa46cb446122b8a5 + depends: + - __glibc >=2.28,<3.0.a0 + - cuda-version >=13.1,<13.2.0a0 + - libgcc >=14 + - libstdcxx >=14 + - rdma-core >=60.0 + license: LicenseRef-NVIDIA-End-User-License-Agreement + size: 990938 + timestamp: 1768273732081 +- conda: https://conda.anaconda.org/conda-forge/linux-64/libcufile-1.17.1.22-h85c024f_0.conda + sha256: 
a24ad0ca488aa3e237049cd5b5c6d7fe3d2d4330682ed329203064e332ea1d74 + md5: 056a67706108efd1f9c24682ba8d3685 + depends: + - __glibc >=2.28,<3.0.a0 + - cuda-version >=13.2,<13.3.0a0 + - libgcc >=14 + - libstdcxx >=14 + - rdma-core >=61.0 + license: LicenseRef-NVIDIA-End-User-License-Agreement + size: 1082447 + timestamp: 1776110053053 +- conda: https://conda.anaconda.org/conda-forge/linux-64/libexpat-2.7.5-hecca717_0.conda + sha256: e8c2b57f6aacabdf2f1b0924bd4831ce5071ba080baa4a9e8c0d720588b6794c + md5: 49f570f3bc4c874a06ea69b7225753af + depends: + - __glibc >=2.17,<3.0.a0 + - libgcc >=14 + constrains: + - expat 2.7.5.* + license: MIT + license_family: MIT + size: 76624 + timestamp: 1774719175983 +- conda: https://conda.anaconda.org/conda-forge/linux-64/libffi-3.5.2-h3435931_0.conda + sha256: 31f19b6a88ce40ebc0d5a992c131f57d919f73c0b92cd1617a5bec83f6e961e6 + md5: a360c33a5abe61c07959e449fa1453eb + depends: + - __glibc >=2.17,<3.0.a0 + - libgcc >=14 + license: MIT + license_family: MIT + size: 58592 + timestamp: 1769456073053 +- conda: https://conda.anaconda.org/conda-forge/linux-64/libgcc-15.2.0-he0feb66_18.conda + sha256: faf7d2017b4d718951e3a59d081eb09759152f93038479b768e3d612688f83f5 + md5: 0aa00f03f9e39fb9876085dee11a85d4 + depends: + - __glibc >=2.17,<3.0.a0 + - _openmp_mutex >=4.5 + constrains: + - libgcc-ng ==15.2.0=*_18 + - libgomp 15.2.0 he0feb66_18 + license: GPL-3.0-only WITH GCC-exception-3.1 + license_family: GPL + size: 1041788 + timestamp: 1771378212382 +- conda: https://conda.anaconda.org/conda-forge/linux-64/libgfortran-15.2.0-h69a702a_18.conda + sha256: d2c9fad338fd85e4487424865da8e74006ab2e2475bd788f624d7a39b2a72aee + md5: 9063115da5bc35fdc3e1002e69b9ef6e + depends: + - libgfortran5 15.2.0 h68bc16d_18 + constrains: + - libgfortran-ng ==15.2.0=*_18 + license: GPL-3.0-only WITH GCC-exception-3.1 + license_family: GPL + size: 27523 + timestamp: 1771378269450 +- conda: https://conda.anaconda.org/conda-forge/linux-64/libgfortran5-15.2.0-h68bc16d_18.conda + 
sha256: 539b57cf50ec85509a94ba9949b7e30717839e4d694bc94f30d41c9d34de2d12 + md5: 646855f357199a12f02a87382d429b75 + depends: + - __glibc >=2.17,<3.0.a0 + - libgcc >=15.2.0 + constrains: + - libgfortran 15.2.0 + license: GPL-3.0-only WITH GCC-exception-3.1 + license_family: GPL + size: 2482475 + timestamp: 1771378241063 +- conda: https://conda.anaconda.org/conda-forge/linux-64/libgomp-15.2.0-he0feb66_18.conda + sha256: 21337ab58e5e0649d869ab168d4e609b033509de22521de1bfed0c031bfc5110 + md5: 239c5e9546c38a1e884d69effcf4c882 + depends: + - __glibc >=2.17,<3.0.a0 + license: GPL-3.0-only WITH GCC-exception-3.1 + license_family: GPL + size: 603262 + timestamp: 1771378117851 +- conda: https://conda.anaconda.org/conda-forge/linux-64/liblapack-3.11.0-6_h47877c9_openblas.conda + build_number: 6 + sha256: 371f517eb7010b21c6cc882c7606daccebb943307cb9a3bf2c70456a5c024f7d + md5: 881d801569b201c2e753f03c84b85e15 + depends: + - libblas 3.11.0 6_h4a7cf45_openblas + constrains: + - blas 2.306 openblas + - liblapacke 3.11.0 6*_openblas + - libcblas 3.11.0 6*_openblas + license: BSD-3-Clause + license_family: BSD + size: 18624 + timestamp: 1774503065378 +- conda: https://conda.anaconda.org/conda-forge/linux-64/liblzma-5.8.3-hb03c661_0.conda + sha256: ec30e52a3c1bf7d0425380a189d209a52baa03f22fb66dd3eb587acaa765bd6d + md5: b88d90cad08e6bc8ad540cb310a761fb + depends: + - __glibc >=2.17,<3.0.a0 + - libgcc >=14 + constrains: + - xz 5.8.3.* + license: 0BSD + size: 113478 + timestamp: 1775825492909 +- conda: https://conda.anaconda.org/conda-forge/linux-64/libmpdec-4.0.0-hb03c661_1.conda + sha256: fe171ed5cf5959993d43ff72de7596e8ac2853e9021dec0344e583734f1e0843 + md5: 2c21e66f50753a083cbe6b80f38268fa + depends: + - __glibc >=2.17,<3.0.a0 + - libgcc >=14 + license: BSD-2-Clause + license_family: BSD + size: 92400 + timestamp: 1769482286018 +- conda: https://conda.anaconda.org/conda-forge/linux-64/libnl-3.11.0-hb9d3cd8_0.conda + sha256: 
ba7c5d294e3d80f08ac5a39564217702d1a752e352e486210faff794ac5001b4 + md5: db63358239cbe1ff86242406d440e44a + depends: + - __glibc >=2.17,<3.0.a0 + - libgcc >=13 + license: LGPL-2.1-or-later + license_family: LGPL + size: 741323 + timestamp: 1731846827427 +- conda: https://conda.anaconda.org/conda-forge/linux-64/libnvjitlink-13.2.78-hecca717_0.conda + sha256: 2ece599a2a1090eb70916061ab8b49670ee9b143e1f3b41efa0e32e0336a9465 + md5: 641ddee63cb39856291275114ce15d13 + depends: + - __glibc >=2.17,<3.0.a0 + - cuda-version >=13,<13.3.0a0 + - libgcc >=14 + - libstdcxx >=14 + license: LicenseRef-NVIDIA-End-User-License-Agreement + size: 31659345 + timestamp: 1776110062236 +- conda: https://conda.anaconda.org/conda-forge/linux-64/libopenblas-0.3.32-pthreads_h94d23a6_0.conda + sha256: 6dc30b28f32737a1c52dada10c8f3a41bc9e021854215efca04a7f00487d09d9 + md5: 89d61bc91d3f39fda0ca10fcd3c68594 + depends: + - __glibc >=2.17,<3.0.a0 + - libgcc >=14 + - libgfortran + - libgfortran5 >=14.3.0 + constrains: + - openblas >=0.3.32,<0.3.33.0a0 + license: BSD-3-Clause + license_family: BSD + size: 5928890 + timestamp: 1774471724897 +- conda: https://conda.anaconda.org/conda-forge/linux-64/libsqlite-3.53.0-hf4e2dac_0.conda + sha256: ec37c79f737933bbac965f5dc0f08ef2790247129a84bb3114fad4900adce401 + md5: 810d83373448da85c3f673fbcb7ad3a3 + depends: + - __glibc >=2.17,<3.0.a0 + - icu >=78.3,<79.0a0 + - libgcc >=14 + - libzlib >=1.3.2,<2.0a0 + license: blessing + size: 958864 + timestamp: 1775753750179 +- conda: https://conda.anaconda.org/conda-forge/linux-64/libstdcxx-15.2.0-h934c35e_18.conda + sha256: 78668020064fdaa27e9ab65cd2997e2c837b564ab26ce3bf0e58a2ce1a525c6e + md5: 1b08cd684f34175e4514474793d44bcb + depends: + - __glibc >=2.17,<3.0.a0 + - libgcc 15.2.0 he0feb66_18 + constrains: + - libstdcxx-ng ==15.2.0=*_18 + license: GPL-3.0-only WITH GCC-exception-3.1 + license_family: GPL + size: 5852330 + timestamp: 1771378262446 +- conda: 
https://conda.anaconda.org/conda-forge/linux-64/libsystemd0-257.13-hd0affe5_0.conda + sha256: c5008b602cb5c819f7b52d418b3ed17e1818cbbf6705b189e7ab36bb70cce3d8 + md5: 8ee3cb7f64be0e8c4787f3a4dbe024e6 + depends: + - __glibc >=2.17,<3.0.a0 + - libcap >=2.77,<2.78.0a0 + - libgcc >=14 + license: LGPL-2.1-or-later + size: 492799 + timestamp: 1773797095649 +- conda: https://conda.anaconda.org/conda-forge/linux-64/libudev1-257.13-hd0affe5_0.conda + sha256: 1a1e367c04d66030aa93b4d33905f7f6fbb59cfc292e816fe3e9c1e8b3f4d1e2 + md5: 2c2270f93d6f9073cbf72d821dfc7d72 + depends: + - __glibc >=2.17,<3.0.a0 + - libcap >=2.77,<2.78.0a0 + - libgcc >=14 + license: LGPL-2.1-or-later + size: 145087 + timestamp: 1773797108513 +- conda: https://conda.anaconda.org/conda-forge/linux-64/libuuid-2.42-h5347b49_0.conda + sha256: bc1b08c92626c91500fd9f26f2c797f3eb153b627d53e9c13cd167f1e12b2829 + md5: 38ffe67b78c9d4de527be8315e5ada2c + depends: + - __glibc >=2.17,<3.0.a0 + - libgcc >=14 + license: BSD-3-Clause + license_family: BSD + size: 40297 + timestamp: 1775052476770 +- conda: https://conda.anaconda.org/conda-forge/linux-64/libzlib-1.3.2-h25fd6f3_2.conda + sha256: 55044c403570f0dc26e6364de4dc5368e5f3fc7ff103e867c487e2b5ab2bcda9 + md5: d87ff7921124eccd67248aa483c23fec + depends: + - __glibc >=2.17,<3.0.a0 + constrains: + - zlib 1.3.2 *_2 + license: Zlib + license_family: Other + size: 63629 + timestamp: 1774072609062 +- conda: https://conda.anaconda.org/conda-forge/linux-64/ncurses-6.6-hdb14827_0.conda + sha256: fc89f74bbe362fb29fa3c037697a89bec140b346a2469a90f7936d1d7ea4d8a3 + md5: fc21868a1a5aacc937e7a18747acb8a5 + depends: + - __glibc >=2.17,<3.0.a0 + - libgcc >=14 + license: X11 AND BSD-3-Clause + size: 918956 + timestamp: 1777422145199 +- conda: https://conda.anaconda.org/conda-forge/noarch/nodeenv-1.10.0-pyhd8ed1ab_0.conda + sha256: 4fa40e3e13fc6ea0a93f67dfc76c96190afd7ea4ffc1bac2612d954b42cdc3ee + md5: eb52d14a901e23c39e9e7b4a1a5c015f + depends: + - python >=3.10 + - setuptools + 
license: BSD-3-Clause + license_family: BSD + size: 40866 + timestamp: 1766261270149 +- conda: https://conda.anaconda.org/conda-forge/linux-64/numpy-2.4.3-py314h2b28147_0.conda + sha256: f2ba8cb0d86a6461a6bcf0d315c80c7076083f72c6733c9290086640723f79ec + md5: 36f5b7eb328bdc204954a2225cf908e2 + depends: + - python + - libstdcxx >=14 + - libgcc >=14 + - __glibc >=2.17,<3.0.a0 + - python_abi 3.14.* *_cp314 + - libcblas >=3.9.0,<4.0a0 + - liblapack >=3.9.0,<4.0a0 + - libblas >=3.9.0,<4.0a0 + constrains: + - numpy-base <0a0 + license: BSD-3-Clause + license_family: BSD + size: 8927860 + timestamp: 1773839233468 +- conda: https://conda.anaconda.org/conda-forge/linux-64/openssl-3.6.2-h35e630c_0.conda + sha256: c0ef482280e38c71a08ad6d71448194b719630345b0c9c60744a2010e8a8e0cb + md5: da1b85b6a87e141f5140bb9924cecab0 + depends: + - __glibc >=2.17,<3.0.a0 + - ca-certificates + - libgcc >=14 + license: Apache-2.0 + license_family: Apache + size: 3167099 + timestamp: 1775587756857 +- conda: https://conda.anaconda.org/conda-forge/noarch/platformdirs-4.9.6-pyhcf101f3_0.conda + sha256: 8f29915c172f1f7f4f7c9391cd5dac3ebf5d13745c8b7c8006032615246345a5 + md5: 89c0b6d1793601a2a3a3f7d2d3d8b937 + depends: + - python >=3.10 + - python + license: MIT + license_family: MIT + size: 25862 + timestamp: 1775741140609 +- conda: https://conda.anaconda.org/conda-forge/noarch/pre-commit-4.6.0-pyha770c72_0.conda + sha256: 716960bf0a9eb334458a26b3bdcb17b8d0786062138a4f48c7f335c8418c5d0b + md5: 7859736b4f8ebe6c8481bf48d91c9a1e + depends: + - cfgv >=2.0.0 + - identify >=1.0.0 + - nodeenv >=0.11.1 + - python >=3.10 + - pyyaml >=5.1 + - virtualenv >=20.10.0 + license: MIT + license_family: MIT + size: 201606 + timestamp: 1776858157327 +- conda: https://conda.anaconda.org/conda-forge/noarch/pycparser-2.22-pyh29332c3_1.conda + sha256: 79db7928d13fab2d892592223d7570f5061c192f27b9febd1a418427b719acc6 + md5: 12c566707c80111f9799308d9e265aef + depends: + - python >=3.9 + - python + license: BSD-3-Clause + 
license_family: BSD + size: 110100 + timestamp: 1733195786147 +- conda: https://conda.anaconda.org/conda-forge/linux-64/pyperf-2.9.0-py314hdafbbf9_0.conda + sha256: 438c41b42530874928733299ca815f5994f36996c86024f3f37ca220ed910a07 + md5: ed166875b3876d5d7e6e39d2e8d1c6e3 + depends: + - python >=3.14,<3.15.0a0 + - python_abi 3.14.* *_cp314 + - six + license: MIT + license_family: MIT + size: 273897 + timestamp: 1765980972868 +- conda: https://conda.anaconda.org/conda-forge/linux-64/python-3.14.4-habeac84_100_cp314.conda + build_number: 100 + sha256: dec247c5badc811baa34d6085df9d0465535883cf745e22e8d79092ad54a3a7b + md5: a443f87920815d41bfe611296e507995 + depends: + - __glibc >=2.17,<3.0.a0 + - bzip2 >=1.0.8,<2.0a0 + - ld_impl_linux-64 >=2.36.1 + - libexpat >=2.7.5,<3.0a0 + - libffi >=3.5.2,<3.6.0a0 + - libgcc >=14 + - liblzma >=5.8.2,<6.0a0 + - libmpdec >=4.0.0,<5.0a0 + - libsqlite >=3.52.0,<4.0a0 + - libuuid >=2.42,<3.0a0 + - libzlib >=1.3.2,<2.0a0 + - ncurses >=6.5,<7.0a0 + - openssl >=3.5.6,<4.0a0 + - python_abi 3.14.* *_cp314 + - readline >=8.3,<9.0a0 + - tk >=8.6.13,<8.7.0a0 + - tzdata + - zstd >=1.5.7,<1.6.0a0 + license: Python-2.0 + size: 36705460 + timestamp: 1775614357822 + python_site_packages_path: lib/python3.14/site-packages +- conda: https://conda.anaconda.org/conda-forge/noarch/python-discovery-1.2.2-pyhcf101f3_0.conda + sha256: 498ad019d75ba31c7891dc6d9efc8a7ed48cd5d5973f3a9377eb1b174577d3db + md5: feb2e11368da12d6ce473b6573efab41 + depends: + - python >=3.10 + - filelock >=3.15.4 + - platformdirs <5,>=4.3.6 + - python + license: MIT + license_family: MIT + size: 34341 + timestamp: 1775586706825 +- conda: https://conda.anaconda.org/conda-forge/noarch/python_abi-3.14-8_cp314.conda + build_number: 8 + sha256: ad6d2e9ac39751cc0529dd1566a26751a0bf2542adb0c232533d32e176e21db5 + md5: 0539938c55b6b1a59b560e843ad864a4 + constrains: + - python 3.14.* *_cp314 + license: BSD-3-Clause + license_family: BSD + size: 6989 + timestamp: 1752805904792 +- conda: 
https://conda.anaconda.org/conda-forge/linux-64/pyyaml-6.0.3-py314h67df5f8_1.conda + sha256: b318fb070c7a1f89980ef124b80a0b5ccf3928143708a85e0053cde0169c699d + md5: 2035f68f96be30dc60a5dfd7452c7941 + depends: + - __glibc >=2.17,<3.0.a0 + - libgcc >=14 + - python >=3.14,<3.15.0a0 + - python_abi 3.14.* *_cp314 + - yaml >=0.2.5,<0.3.0a0 + license: MIT + license_family: MIT + size: 202391 + timestamp: 1770223462836 +- conda: https://conda.anaconda.org/conda-forge/linux-64/rdma-core-61.0-h192683f_0.conda + sha256: 8e0b7962cf8bec9a016cd91a6c6dc1f9ebc8e7e316b1d572f7b9047d0de54717 + md5: d487d93d170e332ab39803e05912a762 + depends: + - __glibc >=2.17,<3.0.a0 + - libgcc >=14 + - libnl >=3.11.0,<4.0a0 + - libstdcxx >=14 + - libsystemd0 >=257.10 + - libudev1 >=257.10 + license: Linux-OpenIB + license_family: BSD + size: 1268666 + timestamp: 1769154883613 +- conda: https://conda.anaconda.org/conda-forge/linux-64/readline-8.3-h853b02a_0.conda + sha256: 12ffde5a6f958e285aa22c191ca01bbd3d6e710aa852e00618fa6ddc59149002 + md5: d7d95fc8287ea7bf33e0e7116d2b95ec + depends: + - __glibc >=2.17,<3.0.a0 + - libgcc >=14 + - ncurses >=6.5,<7.0a0 + license: GPL-3.0-only + license_family: GPL + size: 345073 + timestamp: 1765813471974 +- conda: https://conda.anaconda.org/conda-forge/noarch/setuptools-82.0.1-pyh332efcf_0.conda + sha256: 82088a6e4daa33329a30bc26dc19a98c7c1d3f05c0f73ce9845d4eab4924e9e1 + md5: 8e194e7b992f99a5015edbd4ebd38efd + depends: + - python >=3.10 + license: MIT + license_family: MIT + size: 639697 + timestamp: 1773074868565 +- conda: https://conda.anaconda.org/conda-forge/noarch/six-1.17.0-pyhe01879c_1.conda + sha256: 458227f759d5e3fcec5d9b7acce54e10c9e1f4f4b7ec978f3bfd54ce4ee9853d + md5: 3339e3b65d58accf4ca4fb8748ab16b3 + depends: + - python >=3.9 + - python + license: MIT + license_family: MIT + size: 18455 + timestamp: 1753199211006 +- conda: https://conda.anaconda.org/conda-forge/linux-64/tk-8.6.13-noxft_h366c992_103.conda + sha256: 
cafeec44494f842ffeca27e9c8b0c27ed714f93ac77ddadc6aaf726b5554ebac + md5: cffd3bdd58090148f4cfcd831f4b26ab + depends: + - __glibc >=2.17,<3.0.a0 + - libgcc >=14 + - libzlib >=1.3.1,<2.0a0 + constrains: + - xorg-libx11 >=1.8.12,<2.0a0 + license: TCL + license_family: BSD + size: 3301196 + timestamp: 1769460227866 +- conda: https://conda.anaconda.org/conda-forge/noarch/typing_extensions-4.15.0-pyhcf101f3_0.conda + sha256: 032271135bca55aeb156cee361c81350c6f3fb203f57d024d7e5a1fc9ef18731 + md5: 0caa1af407ecff61170c9437a808404d + depends: + - python >=3.10 + - python + license: PSF-2.0 + license_family: PSF + size: 51692 + timestamp: 1756220668932 +- conda: https://conda.anaconda.org/conda-forge/noarch/tzdata-2025c-hc9c84f9_1.conda + sha256: 1d30098909076af33a35017eed6f2953af1c769e273a0626a04722ac4acaba3c + md5: ad659d0a2b3e47e38d829aa8cad2d610 + license: LicenseRef-Public-Domain + size: 119135 + timestamp: 1767016325805 +- conda: https://conda.anaconda.org/conda-forge/linux-64/ukkonen-1.1.0-py314h9891dd4_0.conda + sha256: c84034056dc938c853e4f61e72e5bd37e2ec91927a661fb9762f678cbea52d43 + md5: 5d3c008e54c7f49592fca9c32896a76f + depends: + - __glibc >=2.17,<3.0.a0 + - cffi + - libgcc >=14 + - libstdcxx >=14 + - python >=3.14,<3.15.0a0 + - python_abi 3.14.* *_cp314 + license: MIT + license_family: MIT + size: 15004 + timestamp: 1769438727085 +- conda: https://conda.anaconda.org/conda-forge/noarch/virtualenv-21.3.0-pyhcf101f3_0.conda + sha256: defaf2bc2a3cf6f1455149531e8be4d03e18eb1d022ffe4f4d964d49bbf0fe34 + md5: da6e70a64226740cef159121dbe40b95 + depends: + - python >=3.10 + - distlib >=0.3.7,<1 + - filelock <4,>=3.24.2 + - importlib-metadata >=6.6 + - platformdirs >=3.9.1,<5 + - python-discovery >=1 + - typing_extensions >=4.13.2 + - python + license: MIT + license_family: MIT + size: 5161814 + timestamp: 1777321763628 +- conda: https://conda.anaconda.org/conda-forge/linux-64/yaml-0.2.5-h280c20c_3.conda + sha256: 
6d9ea2f731e284e9316d95fa61869fe7bbba33df7929f82693c121022810f4ad + md5: a77f85f77be52ff59391544bfe73390a + depends: + - libgcc >=14 + - __glibc >=2.17,<3.0.a0 + license: MIT + license_family: MIT + size: 85189 + timestamp: 1753484064210 +- conda: https://conda.anaconda.org/conda-forge/noarch/zipp-3.23.1-pyhcf101f3_0.conda + sha256: 523616c0530d305d2216c2b4a8dfd3872628b60083255b89c5e0d8c42e738cca + md5: e1c36c6121a7c9c76f2f148f1e83b983 + depends: + - python >=3.10 + - python + license: MIT + license_family: MIT + size: 24461 + timestamp: 1776131454755 +- conda: https://conda.anaconda.org/conda-forge/linux-64/zstd-1.5.7-hb78ec9c_6.conda + sha256: 68f0206ca6e98fea941e5717cec780ed2873ffabc0e1ed34428c061e2c6268c7 + md5: 4a13eeac0b5c8e5b8ab496e6c4ddd829 + depends: + - __glibc >=2.17,<3.0.a0 + - libzlib >=1.3.1,<2.0a0 + license: BSD-3-Clause + license_family: BSD + size: 601375 + timestamp: 1764777111296 diff --git a/benchmarks/cuda_core/pixi.toml b/benchmarks/cuda_core/pixi.toml new file mode 100644 index 00000000000..419dfd91b92 --- /dev/null +++ b/benchmarks/cuda_core/pixi.toml @@ -0,0 +1,52 @@ +# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# SPDX-License-Identifier: Apache-2.0 + +[workspace] +channels = ["conda-forge"] +platforms = ["linux-64"] +preview = ["pixi-build"] +channel-priority = "disabled" + +[feature.cu13.system-requirements] +cuda = "13" + +[feature.cu13-pinned.dependencies] +cuda-version = "13.1.*" + +[feature.cu13-source.dependencies] +cuda-version = "13.*" + +[feature.bench.dependencies] +python = "3.14.*" +pyperf = "*" +numpy = "*" + +[feature.dev.dependencies] +pre-commit = "*" + +# Wheel environment: released cuda-core from conda-forge. +[feature.core-wheel.dependencies] +cuda-core = "==0.7.0" + +# Source environment: build cuda-core + cuda-bindings from the in-tree +# sources so local changes are exercised. Mirrors cuda_core/pixi.toml. 
+[feature.core-source.dependencies] +cuda-core = { path = "../../cuda_core" } +cuda-bindings = { path = "../../cuda_bindings" } + +[environments] +wheel = { features = ["cu13", "cu13-pinned", "bench", "dev", "core-wheel"] } +source = { features = ["cu13", "cu13-source", "bench", "dev", "core-source"] } + +[target.linux.tasks.bench] +cmd = ["python", "$PIXI_PROJECT_ROOT/run_pyperf.py"] + +[target.linux.tasks.bench-smoke-test] +cmd = ["python", "$PIXI_PROJECT_ROOT/run_pyperf.py", "--debug-single-value"] + +[target.linux.tasks.bench-compare] +cmd = ["python", "$PIXI_PROJECT_ROOT/compare.py"] + +[target.linux.tasks.lint] +cmd = ["pre-commit", "run", "--all-files"] diff --git a/benchmarks/cuda_core/run_pyperf.py b/benchmarks/cuda_core/run_pyperf.py new file mode 100644 index 00000000000..1f0c4b130b1 --- /dev/null +++ b/benchmarks/cuda_core/run_pyperf.py @@ -0,0 +1,32 @@ +# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# SPDX-License-Identifier: Apache-2.0 + +"""Entry point for the cuda.core latency benchmark suite. + +The pyperf runner lives in the cuda_bindings suite. Reuse it by putting +that directory on sys.path, then call main() with this suite's paths. +pyperf workers re-execute this script, so the sys.path tweak is done +before the worker can import anything. +""" + +import sys +from pathlib import Path + +HERE = Path(__file__).resolve().parent +CUDA_BINDINGS_SUITE = HERE.parent / "cuda_bindings" + +# Share the runner with cuda_bindings; keep cuda_core's own modules +# (benchmarks/, runtime.py) resolvable via the script's own directory. 
+if str(CUDA_BINDINGS_SUITE) not in sys.path: + sys.path.append(str(CUDA_BINDINGS_SUITE)) + +from runner.main import main + +if __name__ == "__main__": + main( + bench_dir=HERE / "benchmarks", + default_output=HERE / "results-python.json", + module_name_prefix="cuda_core_bench", + bench_filter_env_var="CUDA_CORE_BENCH_FILTER", + ) diff --git a/benchmarks/cuda_core/runtime.py b/benchmarks/cuda_core/runtime.py new file mode 100644 index 00000000000..d2942f40240 --- /dev/null +++ b/benchmarks/cuda_core/runtime.py @@ -0,0 +1,53 @@ +# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# SPDX-License-Identifier: Apache-2.0 + +"""Shared cuda.core setup for the latency benchmarks. + +Holds the one Device/stream/ObjectCode instance that every bench module +reuses. No state is shared with the cuda_bindings suite — each suite +runs standalone in its own process. +""" + +import atexit + +from cuda.core import Device, Program, ProgramOptions + +_device: Device | None = None +_modules: list = [] + + +def ensure_device() -> Device: + """Return the primary Device, initializing it on first call.""" + global _device + if _device is not None: + return _device + dev = Device() + dev.set_current() + _device = dev + return dev + + +def register_module(module) -> object: + """Keep a reference to an ObjectCode so its kernels stay alive.""" + _modules.append(module) + return module + + +def compile_module(kernel_source: str, name_expressions: tuple[str, ...]) -> object: + """Compile a CUDA C++ source with NVRTC via cuda.core.Program. + + name_expressions must list every __global__ function the caller + intends to fetch via ObjectCode.get_kernel(). 
+ """ + dev = ensure_device() + options = ProgramOptions(arch=f"sm_{dev.arch}", fma=False) + prog = Program(kernel_source, code_type="c++", options=options) + return register_module(prog.compile("cubin", name_expressions=name_expressions)) + + +def cleanup() -> None: + _modules.clear() + + +atexit.register(cleanup) From a711361244d4ca5819a94995cb534a2962106372 Mon Sep 17 00:00:00 2001 From: Daniel Rodriguez Date: Fri, 1 May 2026 12:53:01 -0500 Subject: [PATCH 2/4] cuda.core benchmarks From 2144446abde43deb429354107cf9fc8303a4b1c9 Mon Sep 17 00:00:00 2001 From: Daniel Rodriguez Date: Fri, 1 May 2026 13:27:06 -0500 Subject: [PATCH 3/4] cuda.core benchmarks --- benchmarks/cuda_bindings/runner/main.py | 7 +- benchmarks/cuda_bindings/tests/test_runner.py | 2 +- .../cuda_core/benchmarks/bench_launch.py | 14 ++-- benchmarks/cuda_core/compare.py | 71 +++++++++++++++---- 4 files changed, 74 insertions(+), 20 deletions(-) diff --git a/benchmarks/cuda_bindings/runner/main.py b/benchmarks/cuda_bindings/runner/main.py index f4a138fd4e4..9c984c340d6 100644 --- a/benchmarks/cuda_bindings/runner/main.py +++ b/benchmarks/cuda_bindings/runner/main.py @@ -120,7 +120,7 @@ def _collect_skipped_benchmarks( def discover_benchmarks( - bench_dir: Path = BENCH_DIR, + bench_dir: Path | None = None, module_name_prefix: str = DEFAULT_MODULE_NAME_PREFIX, ) -> dict[str, Callable[[int], float]]: """Discover bench_ functions. @@ -129,6 +129,11 @@ def discover_benchmarks( where it calls the operation `loops` times and returns the total elapsed time in seconds (using time.perf_counter). """ + # Resolve the default inside the call so tests (and embedders) can + # monkeypatch ``BENCH_DIR`` at the module level — Python binds default + # args at def-time, so a literal default would ignore later patches. 
+ if bench_dir is None: + bench_dir = BENCH_DIR registry: dict[str, Callable[[int], float]] = {} for module_path in sorted(bench_dir.glob("bench_*.py")): module_name = module_path.stem diff --git a/benchmarks/cuda_bindings/tests/test_runner.py b/benchmarks/cuda_bindings/tests/test_runner.py index f26baf8f5b5..56d88444c9e 100644 --- a/benchmarks/cuda_bindings/tests/test_runner.py +++ b/benchmarks/cuda_bindings/tests/test_runner.py @@ -135,7 +135,7 @@ def test_discover_benchmarks_is_lazy(monkeypatch, tmp_path): def test_ensure_pyperf_worker_env_preserves_existing_args(monkeypatch): runner_main = load_runner_main(monkeypatch) - for env_var in runner_main.PYPERF_INHERITED_ENV_VARS: + for env_var in runner_main.BASE_PYPERF_INHERITED_ENV_VARS: monkeypatch.delenv(env_var, raising=False) monkeypatch.setenv("CUDA_PATH", "/opt/cuda") monkeypatch.setenv("LD_LIBRARY_PATH", "/opt/cuda/lib64") diff --git a/benchmarks/cuda_core/benchmarks/bench_launch.py b/benchmarks/cuda_core/benchmarks/bench_launch.py index 09d418b0a05..c08f866c63c 100644 --- a/benchmarks/cuda_core/benchmarks/bench_launch.py +++ b/benchmarks/cuda_core/benchmarks/bench_launch.py @@ -96,11 +96,15 @@ def _ensure_launch_state() -> None: module = compile_module(KERNEL_SOURCE, KERNEL_NAMES) - # Pre-allocate buffers for the kernel args. Use ints (raw pointer - # addresses) in the launch hot path so ParamHolder skips the Buffer - # type check and goes through its int fast-path. - float_buf = DEV.allocate(4) - int_bufs_512 = tuple(DEV.allocate(4) for _ in range(512)) + # Pre-allocate buffers for the kernel args. Allocate on STREAM so the + # pool-async allocs are stream-ordered with the launches we will issue + # on the same stream below — matches the cuda.bindings bench, which + # uses synchronous cuMemAlloc and therefore has the pointers ready + # before the first timed launch. Use ints (raw pointer addresses) in + # the launch hot path so ParamHolder skips the Buffer type check and + # goes through its int fast-path. 
+ float_buf = DEV.allocate(4, STREAM) + int_bufs_512 = tuple(DEV.allocate(4, STREAM) for _ in range(512)) int_ptrs_512 = tuple(int(b.handle) for b in int_bufs_512) MODULE = module diff --git a/benchmarks/cuda_core/compare.py b/benchmarks/cuda_core/compare.py index 99cbca31a69..2cb1ed92bd0 100644 --- a/benchmarks/cuda_core/compare.py +++ b/benchmarks/cuda_core/compare.py @@ -20,6 +20,33 @@ DEFAULT_CORE = HERE / "results-python.json" DEFAULT_BINDINGS = HERE.parent / "cuda_bindings" / "results-python.json" +# Benchmark IDs where cuda.core and cuda.bindings exercise *different* +# underlying driver calls or hit a cuda.core-side cache, so the "Delta" +# column is NOT pure Python wrapper overhead. See BENCHMARK_PLAN.md's +# "Audit notes" section for a full explanation of each entry. +DIFFERENT_CODEPATH_BENCHMARKS: frozenset[str] = frozenset( + { + # cuCtxGetDevice (core) vs cuCtxGetCurrent (bindings). + "ctx_device.ctx_get_current", + # TLS list lookup (core) vs cuDeviceGet (bindings). + "ctx_device.device_get", + # DeviceProperties dict cache hit (core) vs cuDeviceGetAttribute + # (bindings) on every iteration. + "ctx_device.device_get_attribute", + # cuMemAllocFromPoolAsync on default stream (core) vs synchronous + # cuMemAlloc (bindings). + "memory.mem_alloc_free", + # cuLaunchKernelEx + per-call ParamHolder (core) vs cuLaunchKernel + # with pre-built arg tuple (bindings). 
+ "launch.launch_empty_kernel", + "launch.launch_small_kernel", + "launch.launch_16_args", + "launch.launch_256_args", + "launch.launch_512_args", + } +) +DIFFERENT_CODEPATH_MARKER = "*" + def load_benchmarks(path: Path) -> dict[str, list[float]]: """Load a pyperf JSON file and return {name: [values]}.""" @@ -59,11 +86,11 @@ def fmt_ns(seconds: float) -> str: return f"{seconds * 1e9:,.0f}" -def fmt_overhead_ns(core_mean: float, bindings_mean: float) -> str: +def fmt_delta_ns(core_mean: float, bindings_mean: float) -> str: return f"{(core_mean - bindings_mean) * 1e9:+,.0f}" -def fmt_overhead_pct(core_mean: float, bindings_mean: float) -> str: +def fmt_delta_pct(core_mean: float, bindings_mean: float) -> str: if bindings_mean <= 0.0: return "-" pct = (core_mean - bindings_mean) / bindings_mean * 100 @@ -99,26 +126,42 @@ def main() -> None: sys.exit(1) all_names = sorted(set(core_benchmarks) | set(bindings_benchmarks)) - name_width = max(len(n) for n in all_names) + + # Reserve a trailing column of space for the "different codepath" marker + # so it does not collide visually with the benchmark ID. + display_names = { + name: f"{name} {DIFFERENT_CODEPATH_MARKER}" if name in DIFFERENT_CODEPATH_BENCHMARKS else name + for name in all_names + } + name_width = max(len(display_names[n]) for n in all_names) name_width = max(name_width, len("Benchmark")) bind_w = 14 core_w = 14 rsd_w = 8 - oh_ns_w = 12 - oh_pct_w = 10 + delta_ns_w = 12 + delta_pct_w = 10 if bindings_benchmarks: header = ( f"{'Benchmark':<{name_width}} " f"{'bindings (ns)':>{bind_w}} {'RSD':>{rsd_w}} " f"{'core (ns)':>{core_w}} {'RSD':>{rsd_w}} " - f"{'Overhead ns':>{oh_ns_w}} {'Overhead %':>{oh_pct_w}}" + f"{'Delta ns':>{delta_ns_w}} {'Delta %':>{delta_pct_w}}" ) else: header = f"{'Benchmark':<{name_width}} {'core (ns)':>{core_w}} {'RSD':>{rsd_w}}" sep = "-" * len(header) + + if bindings_benchmarks: + # Keep legend lines shorter than the table so they don't overflow. 
+ print("Delta = core mean - bindings mean (positive = cuda.core slower).") + print(f"{DIFFERENT_CODEPATH_MARKER} marks benchmarks where core and bindings exercise different") + print(" underlying driver calls or hit a cuda.core cache — see BENCHMARK_PLAN.md") + print(" (Audit notes) for details on each row.") + print() + print(sep) print(header) print(sep) @@ -136,21 +179,23 @@ def main() -> None: bind_rsd = fmt_rsd(bind_stats[2]) if bind_stats else "-" if core_stats and bind_stats: - overhead_ns_str = fmt_overhead_ns(core_stats[0], bind_stats[0]) - overhead_pct_str = fmt_overhead_pct(core_stats[0], bind_stats[0]) + delta_ns_str = fmt_delta_ns(core_stats[0], bind_stats[0]) + delta_pct_str = fmt_delta_pct(core_stats[0], bind_stats[0]) else: - overhead_ns_str = "-" - overhead_pct_str = "-" + delta_ns_str = "-" + delta_pct_str = "-" + + display_name = display_names[name] if bindings_benchmarks: print( - f"{name:<{name_width}} " + f"{display_name:<{name_width}} " f"{bind_str:>{bind_w}} {bind_rsd:>{rsd_w}} " f"{core_str:>{core_w}} {core_rsd:>{rsd_w}} " - f"{overhead_ns_str:>{oh_ns_w}} {overhead_pct_str:>{oh_pct_w}}" + f"{delta_ns_str:>{delta_ns_w}} {delta_pct_str:>{delta_pct_w}}" ) else: - print(f"{name:<{name_width}} {core_str:>{core_w}} {core_rsd:>{rsd_w}}") + print(f"{display_name:<{name_width}} {core_str:>{core_w}} {core_rsd:>{rsd_w}}") print(sep) From c25b82f26ee089c5bcc03e77de3a230665d16b49 Mon Sep 17 00:00:00 2001 From: Daniel Rodriguez Date: Fri, 1 May 2026 13:43:15 -0500 Subject: [PATCH 4/4] cuda.core benchmarks --- benchmarks/cuda_core/BENCHMARK_PLAN.md | 111 ------------------------- benchmarks/cuda_core/compare.py | 24 ++++-- 2 files changed, 17 insertions(+), 118 deletions(-) delete mode 100644 benchmarks/cuda_core/BENCHMARK_PLAN.md diff --git a/benchmarks/cuda_core/BENCHMARK_PLAN.md b/benchmarks/cuda_core/BENCHMARK_PLAN.md deleted file mode 100644 index 93d29a735a2..00000000000 --- a/benchmarks/cuda_core/BENCHMARK_PLAN.md +++ /dev/null @@ -1,111 +0,0 @@ 
-# cuda.core Benchmark Plan - -## Goal - -Measure the **additional Python-side latency** that `cuda.core` adds on top -of `cuda.bindings` for every public API with a clear cuda.bindings -counterpart. Target is **< 1 µs** of extra overhead per call. - -The baseline for this suite is not C++: it is the cuda.bindings Python -results file (`../cuda_bindings/results-python.json`). Benchmark IDs are -kept identical across suites so a direct diff is possible — see -`compare.py`. - -## Phase 1 coverage - -Benchmark IDs shipped in this PR (all map 1:1 to cuda.bindings): - -| ID | cuda.core surface | -|-------------------------------------------|----------------------------------------------------| -| `ctx_device.ctx_get_current` | `Device()` | -| `ctx_device.ctx_set_current` | `dev.set_current()` | -| `ctx_device.device_get` | `Device(0)` | -| `ctx_device.device_get_attribute` | `dev.properties.compute_capability_major` (cached) | -| `stream.stream_create_destroy` | `dev.create_stream()` + `stream.close()` | -| `stream.stream_synchronize` | `stream.sync()` | -| `event.event_create_destroy` | `dev.create_event()` + `event.close()` | -| `event.event_record` | `stream.record(event)` | -| `event.event_query` | `event.is_done` | -| `event.event_synchronize` | `event.sync()` | -| `memory.mem_alloc_free` | `dev.allocate(size)` + `buf.close()` (async pool!) | -| `memory.mem_alloc_async_free_async` | `dev.allocate(size, stream)` + `buf.close(stream)` | -| `launch.launch_empty_kernel` | `launch(stream, config, kernel)` | -| `launch.launch_small_kernel` | `launch(..., ptr)` | -| `launch.launch_16_args` | `launch(..., *16 ptrs)` | -| `launch.launch_256_args` | `launch(..., *256 ptrs)` | -| `launch.launch_512_args` | `launch(..., *512 ptrs)` | - -## Intentionally not covered in Phase 1 - -- `ctx_device.ctx_get_device`, `ctx_device.device_primary_ctx_retain`: - cuda.core abstracts CUDA contexts away — no direct counterpart. 
-- `enum.*`: cuda.core does not re-export cuda.bindings enums; those - benches measure a cuda.bindings-specific cost. -- `stream.stream_query`: no public `Stream.query()` in cuda.core. -- `launch.launch_*_pre_packed`: pre-packing is a cuda.bindings-specific - optimization of its tuple-of-args API; cuda.core's `ParamHolder` - handles packing internally on every call. -- `launch.launch_512_bools / _ints / _doubles / _bytes / _longlongs`: - non-pointer scalar arg variants — deferred to Phase 2. -- `launch.launch_2048b`: struct-by-value arg — requires a - `TensorMapDescriptor`/ctypes path that is not yet settled in cuda.core. -- `memory.memcpy_htod / _dtoh / _dtod`: cuda.core's `Buffer.copy_to / - copy_from` only go buffer-to-buffer through `cuMemcpyAsync`; pairing - with cuda.bindings' synchronous `cuMemcpyDtoD` / `HtoD` / `DtoH` would - be apples-to-oranges. Deferred until the comparable host-memory - resource path is finalized. -- NVRTC / module benches: cuda.core's `Program` / `ObjectCode` pipeline - is meaningfully different from raw NVRTC; deserves its own set of - bench functions rather than reusing cuda.bindings IDs. - -## Audit notes: known driver-call mismatches - -The IDs above match cuda.bindings 1:1 at the *public API* level, but a few -measure a different underlying driver call. Readers of `compare.py` should -know which deltas are "pure cuda.core Python overhead" vs. a deliberate -different driver path: - -- `ctx_device.ctx_get_current`: `Device()` reads a TLS-cached device - object; cuda.bindings calls `cuCtxGetCurrent` every iteration. Expect - cuda.core to be faster. Not apples-to-apples at the driver level; - apples-to-apples at the user-facing "give me the current device" level. -- `ctx_device.device_get_attribute`: `DeviceProperties` caches the first - lookup in a Python dict (`_get_cached_attribute`, `_device.pyx:75`). - After the first iteration this is a dict hit, not a `cuDeviceGetAttribute` - driver call. 
Expect cuda.core to be faster here too. A future - paired bench can use an uncached attribute (e.g. `compute_mode`) to - measure the wrapper overhead on the driver-call path. -- `stream.stream_create_destroy`: default `StreamOptions(nonblocking=True)` - yields the same `CU_STREAM_NON_BLOCKING` flag as the cuda.bindings - bench, but cuda.core additionally calls `cuCtxGetStreamPriorityRange` - and builds a `StreamOptions` dataclass per create — real cuda.core - overhead, fair to measure. -- `memory.mem_alloc_free`: **deliberate mismatch**. `dev.allocate(size)` - with `stream=None` routes through `_MP_allocate` → `cuMemAllocFromPoolAsync` - on the cached default stream (`_memory_pool.pyx:302`). cuda.bindings - measures the synchronous `cuMemAlloc`. The bench captures the - user-visible cost of `dev.allocate(size)`, which is what a cuda.core - user actually pays; it does **not** isolate "Python wrapper overhead - on top of `cuMemAlloc`" because cuda.core does not expose a sync - `cuMemAlloc` path. -- `memory.mem_alloc_async_free_async`: same internal path as - `mem_alloc_free` (both go through `cuMemAllocFromPoolAsync`); the - only difference is whether `default_stream()` is fetched or a stream - is passed in. Driver call matches cuda.bindings' `cuMemAllocAsync` - semantically but uses the pool-backed variant. -- `launch.*`: cuda.core uses `cuLaunchKernelEx` (takes a - `CUlaunchConfig` struct) and allocates a fresh `ParamHolder` + - `LaunchConfig._to_native_launch_config()` per call. cuda.bindings - uses `cuLaunchKernel` with pre-built arg tuples. The delta captures - both the Python-side per-call work and the `Ex` vs non-`Ex` driver - cost; this is real and expected cuda.core overhead. - -## Next up (not in this PR) - -1. Scalar launch variants (512 bools/ints/doubles/bytes/longlongs) so - arg-packing overhead is covered beyond the pointer fast-path. -2. Buffer-based memcpy benchmarks once the host-memory resource path is - stable in cuda.core. -3. 
NVRTC / `Program` / `ObjectCode` latency benches. -4. TMA (`TensorMapDescriptor`) benches when cuda.core's CCCL-backed - helper is formalised. diff --git a/benchmarks/cuda_core/compare.py b/benchmarks/cuda_core/compare.py index 2cb1ed92bd0..02027ad43bd 100644 --- a/benchmarks/cuda_core/compare.py +++ b/benchmarks/cuda_core/compare.py @@ -20,10 +20,12 @@ DEFAULT_CORE = HERE / "results-python.json" DEFAULT_BINDINGS = HERE.parent / "cuda_bindings" / "results-python.json" -# Benchmark IDs where cuda.core and cuda.bindings exercise *different* -# underlying driver calls or hit a cuda.core-side cache, so the "Delta" -# column is NOT pure Python wrapper overhead. See BENCHMARK_PLAN.md's -# "Audit notes" section for a full explanation of each entry. +# Benchmark IDs where the cuda.core path invokes a different driver +# symbol, makes an additional driver call, or hits a cuda.core-side cache +# compared to the cuda.bindings bench — i.e. the "Delta" column is NOT +# pure Python wrapper overhead on top of the same driver call. +# Unstarred rows compare like-for-like driver calls. See +# BENCHMARK_PLAN.md's "Audit notes" section for per-row rationale. DIFFERENT_CODEPATH_BENCHMARKS: frozenset[str] = frozenset( { # cuCtxGetDevice (core) vs cuCtxGetCurrent (bindings). @@ -33,9 +35,15 @@ # DeviceProperties dict cache hit (core) vs cuDeviceGetAttribute # (bindings) on every iteration. "ctx_device.device_get_attribute", + # cuStreamCreateWithPriority + cuCtxGetStreamPriorityRange (core) + # vs cuStreamCreate (bindings). + "stream.stream_create_destroy", # cuMemAllocFromPoolAsync on default stream (core) vs synchronous # cuMemAlloc (bindings). "memory.mem_alloc_free", + # cuMemAllocFromPoolAsync with explicit pool handle (core) vs + # cuMemAllocAsync with implicit default pool (bindings). + "memory.mem_alloc_async_free_async", # cuLaunchKernelEx + per-call ParamHolder (core) vs cuLaunchKernel # with pre-built arg tuple (bindings). 
"launch.launch_empty_kernel", @@ -157,9 +165,11 @@ def main() -> None: if bindings_benchmarks: # Keep legend lines shorter than the table so they don't overflow. print("Delta = core mean - bindings mean (positive = cuda.core slower).") - print(f"{DIFFERENT_CODEPATH_MARKER} marks benchmarks where core and bindings exercise different") - print(" underlying driver calls or hit a cuda.core cache — see BENCHMARK_PLAN.md") - print(" (Audit notes) for details on each row.") + print(f"{DIFFERENT_CODEPATH_MARKER} marks benchmarks where the cuda.core path invokes a different driver") + print(" symbol, makes an additional driver call, or hits a cuda.core-side cache") + print(" — so Delta is not pure Python wrapper overhead on top of the same driver") + print(" call. Unstarred rows compare like-for-like driver calls; their Delta is") + print(" wrapper overhead. See BENCHMARK_PLAN.md (Audit notes) for per-row detail.") print() print(sep)