From ed099ad26e60cea5e5a7643a8a607c7de2fdac1a Mon Sep 17 00:00:00 2001 From: Daniel Rodriguez Date: Fri, 1 May 2026 12:50:01 -0500 Subject: [PATCH 1/4] cuda.core benchmarks --- benchmarks/cuda_bindings/runner/main.py | 65 +- benchmarks/cuda_core/.gitignore | 16 + benchmarks/cuda_core/AGENTS.md | 11 + benchmarks/cuda_core/BENCHMARK_PLAN.md | 111 ++ benchmarks/cuda_core/README.md | 73 ++ .../cuda_core/benchmarks/bench_ctx_device.py | 55 + .../cuda_core/benchmarks/bench_event.py | 53 + .../cuda_core/benchmarks/bench_launch.py | 183 ++++ .../cuda_core/benchmarks/bench_memory.py | 37 + .../cuda_core/benchmarks/bench_stream.py | 29 + benchmarks/cuda_core/compare.py | 159 +++ benchmarks/cuda_core/pixi.lock | 972 ++++++++++++++++++ benchmarks/cuda_core/pixi.toml | 52 + benchmarks/cuda_core/run_pyperf.py | 32 + benchmarks/cuda_core/runtime.py | 53 + 15 files changed, 1877 insertions(+), 24 deletions(-) create mode 100644 benchmarks/cuda_core/.gitignore create mode 100644 benchmarks/cuda_core/AGENTS.md create mode 100644 benchmarks/cuda_core/BENCHMARK_PLAN.md create mode 100644 benchmarks/cuda_core/README.md create mode 100644 benchmarks/cuda_core/benchmarks/bench_ctx_device.py create mode 100644 benchmarks/cuda_core/benchmarks/bench_event.py create mode 100644 benchmarks/cuda_core/benchmarks/bench_launch.py create mode 100644 benchmarks/cuda_core/benchmarks/bench_memory.py create mode 100644 benchmarks/cuda_core/benchmarks/bench_stream.py create mode 100644 benchmarks/cuda_core/compare.py create mode 100644 benchmarks/cuda_core/pixi.lock create mode 100644 benchmarks/cuda_core/pixi.toml create mode 100644 benchmarks/cuda_core/run_pyperf.py create mode 100644 benchmarks/cuda_core/runtime.py diff --git a/benchmarks/cuda_bindings/runner/main.py b/benchmarks/cuda_bindings/runner/main.py index 2e7312c3cef..f4a138fd4e4 100644 --- a/benchmarks/cuda_bindings/runner/main.py +++ b/benchmarks/cuda_bindings/runner/main.py @@ -16,30 +16,30 @@ PROJECT_ROOT = 
Path(__file__).resolve().parent.parent BENCH_DIR = PROJECT_ROOT / "benchmarks" DEFAULT_OUTPUT = PROJECT_ROOT / "results-python.json" +DEFAULT_MODULE_NAME_PREFIX = "cuda_bindings_bench" # Env var used to propagate the --benchmark filter from the parent to pyperf # worker subprocesses. pyperf reconstructs worker argv from scratch and drops # custom flags like --benchmark, so without this the worker would register the # full bench list and pyperf would run the wrong bench by task index. -BENCH_FILTER_ENV_VAR = "CUDA_BINDINGS_BENCH_FILTER" +DEFAULT_BENCH_FILTER_ENV_VAR = "CUDA_BINDINGS_BENCH_FILTER" -PYPERF_INHERITED_ENV_VARS = ( +BASE_PYPERF_INHERITED_ENV_VARS = ( "CUDA_HOME", "CUDA_PATH", "CUDA_VISIBLE_DEVICES", "LD_LIBRARY_PATH", "NVIDIA_VISIBLE_DEVICES", - BENCH_FILTER_ENV_VAR, ) _MODULE_CACHE: dict[Path, ModuleType] = {} -def load_module(module_path: Path) -> ModuleType: +def load_module(module_path: Path, module_name_prefix: str = DEFAULT_MODULE_NAME_PREFIX) -> ModuleType: module_path = module_path.resolve() cached_module = _MODULE_CACHE.get(module_path) if cached_module is not None: return cached_module - module_name = f"cuda_bindings_bench_{module_path.stem}" + module_name = f"{module_name_prefix}_{module_path.stem}" spec = importlib.util.spec_from_file_location(module_name, module_path) if spec is None or spec.loader is None: raise RuntimeError(f"Failed to load benchmark module: {module_path}") @@ -64,13 +64,17 @@ def _discover_module_functions(module_path: Path) -> list[str]: ] -def _lazy_benchmark(module_path: Path, function_name: str) -> Callable[[int], float]: +def _lazy_benchmark( + module_path: Path, + function_name: str, + module_name_prefix: str = DEFAULT_MODULE_NAME_PREFIX, +) -> Callable[[int], float]: loaded_function: Callable[[int], float] | None = None def run(loops: int) -> float: nonlocal loaded_function if loaded_function is None: - module = load_module(module_path) + module = load_module(module_path, module_name_prefix=module_name_prefix) 
loaded_function = getattr(module, function_name) return loaded_function(loops) @@ -86,6 +90,7 @@ def run(loops: int) -> float: def _collect_skipped_benchmarks( bench_ids: list[str], registry: dict[str, Callable[[int], float]], + module_name_prefix: str = DEFAULT_MODULE_NAME_PREFIX, ) -> set[str]: """Return bench IDs that the owning module has marked as unsupported. @@ -106,7 +111,7 @@ def _collect_skipped_benchmarks( continue module = loaded_modules.get(module_path) if module is None: - module = load_module(module_path) + module = load_module(module_path, module_name_prefix=module_name_prefix) loaded_modules[module_path] = module module_skip = getattr(module, "SKIPPED_BENCHMARKS", None) if module_skip and function_name in module_skip: @@ -114,7 +119,10 @@ def _collect_skipped_benchmarks( return skipped -def discover_benchmarks() -> dict[str, Callable[[int], float]]: +def discover_benchmarks( + bench_dir: Path = BENCH_DIR, + module_name_prefix: str = DEFAULT_MODULE_NAME_PREFIX, +) -> dict[str, Callable[[int], float]]: """Discover bench_ functions. Each bench_ function must have the signature: bench_*(loops: int) -> float @@ -122,13 +130,13 @@ def discover_benchmarks() -> dict[str, Callable[[int], float]]: time in seconds (using time.perf_counter). 
""" registry: dict[str, Callable[[int], float]] = {} - for module_path in sorted(BENCH_DIR.glob("bench_*.py")): + for module_path in sorted(bench_dir.glob("bench_*.py")): module_name = module_path.stem for function_name in _discover_module_functions(module_path): bench_id = benchmark_id(module_name, function_name) if bench_id in registry: raise ValueError(f"Duplicate benchmark ID discovered: {bench_id}") - registry[bench_id] = _lazy_benchmark(module_path, function_name) + registry[bench_id] = _lazy_benchmark(module_path, function_name, module_name_prefix=module_name_prefix) return registry @@ -152,7 +160,10 @@ def _split_env_vars(arg_value: str) -> list[str]: return [env_var for env_var in arg_value.split(",") if env_var] -def ensure_pyperf_worker_env(argv: list[str]) -> list[str]: +def ensure_pyperf_worker_env( + argv: list[str], + extra_env_vars: tuple[str, ...] = (DEFAULT_BENCH_FILTER_ENV_VAR,), +) -> list[str]: if "--copy-env" in argv: return list(argv) @@ -175,7 +186,7 @@ def ensure_pyperf_worker_env(argv: list[str]) -> list[str]: if skip_next: raise ValueError("Missing value for --inherit-environ") - for env_var in PYPERF_INHERITED_ENV_VARS: + for env_var in (*BASE_PYPERF_INHERITED_ENV_VARS, *extra_env_vars): if env_var in os.environ: inherited_env.append(env_var) @@ -190,7 +201,7 @@ def ensure_pyperf_worker_env(argv: list[str]) -> list[str]: return cleaned -def parse_args(argv: list[str]) -> tuple[argparse.Namespace, list[str]]: +def parse_args(argv: list[str], default_output: Path = DEFAULT_OUTPUT) -> tuple[argparse.Namespace, list[str]]: parser = argparse.ArgumentParser(add_help=False) parser.add_argument( "--benchmark", @@ -207,19 +218,25 @@ def parse_args(argv: list[str]) -> tuple[argparse.Namespace, list[str]]: "-o", "--output", type=Path, - default=DEFAULT_OUTPUT, - help=f"JSON output file path (default: {DEFAULT_OUTPUT.name})", + default=default_output, + help=f"JSON output file path (default: {default_output.name})", ) parsed, remaining = 
parser.parse_known_args(argv) return parsed, remaining -def main() -> None: - parsed, remaining_argv = parse_args(sys.argv[1:]) +def main( + *, + bench_dir: Path = BENCH_DIR, + default_output: Path = DEFAULT_OUTPUT, + module_name_prefix: str = DEFAULT_MODULE_NAME_PREFIX, + bench_filter_env_var: str = DEFAULT_BENCH_FILTER_ENV_VAR, +) -> None: + parsed, remaining_argv = parse_args(sys.argv[1:], default_output=default_output) - registry = discover_benchmarks() + registry = discover_benchmarks(bench_dir=bench_dir, module_name_prefix=module_name_prefix) if not registry: - raise RuntimeError(f"No benchmark functions found in {BENCH_DIR}") + raise RuntimeError(f"No benchmark functions found in {bench_dir}") if parsed.list: for bench_id in sorted(registry): @@ -231,7 +248,7 @@ def main() -> None: # the wrong bench. pyperf drops unknown CLI flags when spawning workers, # so fall back to an env var carrying the filter. requested = list(parsed.benchmark) - env_filter = os.environ.get(BENCH_FILTER_ENV_VAR, "") + env_filter = os.environ.get(bench_filter_env_var, "") if not requested and env_filter: requested = [bid for bid in env_filter.split(",") if bid] @@ -243,21 +260,21 @@ def main() -> None: raise ValueError(f"Unknown benchmark(s): {unknown}. Known benchmarks: {known}") benchmark_ids = requested # Propagate to any pyperf worker we're about to spawn. - os.environ[BENCH_FILTER_ENV_VAR] = ",".join(benchmark_ids) + os.environ[bench_filter_env_var] = ",".join(benchmark_ids) else: benchmark_ids = sorted(registry) # Strip any --output args to avoid conflicts with our output handling. output_path = parsed.output.resolve() remaining_argv = strip_pyperf_output_args(remaining_argv) - remaining_argv = ensure_pyperf_worker_env(remaining_argv) + remaining_argv = ensure_pyperf_worker_env(remaining_argv, extra_env_vars=(bench_filter_env_var,)) is_worker = "--worker" in remaining_argv # Drop benchmarks that the owning module has marked as unavailable on # this driver/device. 
Without this step a single unsupported bench # (e.g. TMA on a pre-Hopper GPU) would abort the whole pyperf run, # since pyperf treats a raised exception as a fatal worker failure. - skipped = _collect_skipped_benchmarks(benchmark_ids, registry) + skipped = _collect_skipped_benchmarks(benchmark_ids, registry, module_name_prefix=module_name_prefix) if skipped and not is_worker: for bench_id in sorted(skipped): print(f"Skipping {bench_id}: unsupported on this driver/device", file=sys.stderr) diff --git a/benchmarks/cuda_core/.gitignore b/benchmarks/cuda_core/.gitignore new file mode 100644 index 00000000000..b795782a321 --- /dev/null +++ b/benchmarks/cuda_core/.gitignore @@ -0,0 +1,16 @@ +# Build artifacts +.build/ +__pycache__/ + +# Benchmark results +*.json +.benchmarks/ + +# Pixi environments +.pixi/ + +# Override root .gitignore *.cpp rule (which targets Cython-generated files) +!benchmarks/cpp/*.cpp + +results-python.json +results-cpp.json diff --git a/benchmarks/cuda_core/AGENTS.md b/benchmarks/cuda_core/AGENTS.md new file mode 100644 index 00000000000..d2a3a3357cd --- /dev/null +++ b/benchmarks/cuda_core/AGENTS.md @@ -0,0 +1,11 @@ +# cuda.core benchmarks + +Read the README.md in this directory for more details about the benchmarks. + +When generating code, verify that the code is correct based on the source for cuda-core +that can be found in ../../cuda_core. + +This suite shares the pyperf runner with `../cuda_bindings/` via a sys.path +insert in `run_pyperf.py`. The per-suite setup (`runtime.py`, the `benchmarks/` +module files) lives here. Benchmark IDs are kept identical to the cuda.bindings +suite so `compare.py` can diff them directly. 
diff --git a/benchmarks/cuda_core/BENCHMARK_PLAN.md b/benchmarks/cuda_core/BENCHMARK_PLAN.md new file mode 100644 index 00000000000..93d29a735a2 --- /dev/null +++ b/benchmarks/cuda_core/BENCHMARK_PLAN.md @@ -0,0 +1,111 @@ +# cuda.core Benchmark Plan + +## Goal + +Measure the **additional Python-side latency** that `cuda.core` adds on top +of `cuda.bindings` for every public API with a clear cuda.bindings +counterpart. Target is **< 1 µs** of extra overhead per call. + +The baseline for this suite is not C++: it is the cuda.bindings Python +results file (`../cuda_bindings/results-python.json`). Benchmark IDs are +kept identical across suites so a direct diff is possible — see +`compare.py`. + +## Phase 1 coverage + +Benchmark IDs shipped in this PR (all map 1:1 to cuda.bindings): + +| ID | cuda.core surface | +|-------------------------------------------|----------------------------------------------------| +| `ctx_device.ctx_get_current` | `Device()` | +| `ctx_device.ctx_set_current` | `dev.set_current()` | +| `ctx_device.device_get` | `Device(0)` | +| `ctx_device.device_get_attribute` | `dev.properties.compute_capability_major` (cached) | +| `stream.stream_create_destroy` | `dev.create_stream()` + `stream.close()` | +| `stream.stream_synchronize` | `stream.sync()` | +| `event.event_create_destroy` | `dev.create_event()` + `event.close()` | +| `event.event_record` | `stream.record(event)` | +| `event.event_query` | `event.is_done` | +| `event.event_synchronize` | `event.sync()` | +| `memory.mem_alloc_free` | `dev.allocate(size)` + `buf.close()` (async pool!) 
| +| `memory.mem_alloc_async_free_async` | `dev.allocate(size, stream)` + `buf.close(stream)` | +| `launch.launch_empty_kernel` | `launch(stream, config, kernel)` | +| `launch.launch_small_kernel` | `launch(..., ptr)` | +| `launch.launch_16_args` | `launch(..., *16 ptrs)` | +| `launch.launch_256_args` | `launch(..., *256 ptrs)` | +| `launch.launch_512_args` | `launch(..., *512 ptrs)` | + +## Intentionally not covered in Phase 1 + +- `ctx_device.ctx_get_device`, `ctx_device.device_primary_ctx_retain`: + cuda.core abstracts CUDA contexts away — no direct counterpart. +- `enum.*`: cuda.core does not re-export cuda.bindings enums; those + benches measure a cuda.bindings-specific cost. +- `stream.stream_query`: no public `Stream.query()` in cuda.core. +- `launch.launch_*_pre_packed`: pre-packing is a cuda.bindings-specific + optimization of its tuple-of-args API; cuda.core's `ParamHolder` + handles packing internally on every call. +- `launch.launch_512_bools / _ints / _doubles / _bytes / _longlongs`: + non-pointer scalar arg variants — deferred to Phase 2. +- `launch.launch_2048b`: struct-by-value arg — requires a + `TensorMapDescriptor`/ctypes path that is not yet settled in cuda.core. +- `memory.memcpy_htod / _dtoh / _dtod`: cuda.core's `Buffer.copy_to / + copy_from` only go buffer-to-buffer through `cuMemcpyAsync`; pairing + with cuda.bindings' synchronous `cuMemcpyDtoD` / `HtoD` / `DtoH` would + be apples-to-oranges. Deferred until the comparable host-memory + resource path is finalized. +- NVRTC / module benches: cuda.core's `Program` / `ObjectCode` pipeline + is meaningfully different from raw NVRTC; deserves its own set of + bench functions rather than reusing cuda.bindings IDs. + +## Audit notes: known driver-call mismatches + +The IDs above match cuda.bindings 1:1 at the *public API* level, but a few +measure a different underlying driver call. Readers of `compare.py` should +know which deltas are "pure cuda.core Python overhead" vs. 
a deliberate +different driver path: + +- `ctx_device.ctx_get_current`: `Device()` reads a TLS-cached device + object; cuda.bindings calls `cuCtxGetCurrent` every iteration. Expect + cuda.core to be faster. Not apples-to-apples at the driver level; + apples-to-apples at the user-facing "give me the current device" level. +- `ctx_device.device_get_attribute`: `DeviceProperties` caches the first + lookup in a Python dict (`_get_cached_attribute`, `_device.pyx:75`). + After the first iteration this is a dict hit, not a `cuDeviceGetAttribute` + driver call. Expect cuda.core to be faster here too. A future + paired bench can use an uncached attribute (e.g. `compute_mode`) to + measure the wrapper overhead on the driver-call path. +- `stream.stream_create_destroy`: default `StreamOptions(nonblocking=True)` + yields the same `CU_STREAM_NON_BLOCKING` flag as the cuda.bindings + bench, but cuda.core additionally calls `cuCtxGetStreamPriorityRange` + and builds a `StreamOptions` dataclass per create — real cuda.core + overhead, fair to measure. +- `memory.mem_alloc_free`: **deliberate mismatch**. `dev.allocate(size)` + with `stream=None` routes through `_MP_allocate` → `cuMemAllocFromPoolAsync` + on the cached default stream (`_memory_pool.pyx:302`). cuda.bindings + measures the synchronous `cuMemAlloc`. The bench captures the + user-visible cost of `dev.allocate(size)`, which is what a cuda.core + user actually pays; it does **not** isolate "Python wrapper overhead + on top of `cuMemAlloc`" because cuda.core does not expose a sync + `cuMemAlloc` path. +- `memory.mem_alloc_async_free_async`: same internal path as + `mem_alloc_free` (both go through `cuMemAllocFromPoolAsync`); the + only difference is whether `default_stream()` is fetched or a stream + is passed in. Driver call matches cuda.bindings' `cuMemAllocAsync` + semantically but uses the pool-backed variant. 
+- `launch.*`: cuda.core uses `cuLaunchKernelEx` (takes a + `CUlaunchConfig` struct) and allocates a fresh `ParamHolder` + + `LaunchConfig._to_native_launch_config()` per call. cuda.bindings + uses `cuLaunchKernel` with pre-built arg tuples. The delta captures + both the Python-side per-call work and the `Ex` vs non-`Ex` driver + cost; this is real and expected cuda.core overhead. + +## Next up (not in this PR) + +1. Scalar launch variants (512 bools/ints/doubles/bytes/longlongs) so + arg-packing overhead is covered beyond the pointer fast-path. +2. Buffer-based memcpy benchmarks once the host-memory resource path is + stable in cuda.core. +3. NVRTC / `Program` / `ObjectCode` latency benches. +4. TMA (`TensorMapDescriptor`) benches when cuda.core's CCCL-backed + helper is formalised. diff --git a/benchmarks/cuda_core/README.md b/benchmarks/cuda_core/README.md new file mode 100644 index 00000000000..de124f7ff73 --- /dev/null +++ b/benchmarks/cuda_core/README.md @@ -0,0 +1,73 @@ +# cuda.core benchmarks + +These benchmarks measure the latency overhead of the `cuda.core` public API +on top of `cuda.bindings`. Every benchmark ID here has a 1:1 counterpart in +`../cuda_bindings/benchmarks/` so a `compare.py` run produces a side-by-side +"bindings vs core" overhead table for every operation. + +This suite is **not** a throughput benchmark and does not test kernel +performance — it measures Python-side call overhead only. No C++ baseline +is built or run for `cuda.core`: the comparative baseline is the +`cuda.bindings` Python results file at +`../cuda_bindings/results-python.json`. + +The pyperf runner (`runner/main.py`) is shared with the cuda.bindings +suite via a `sys.path` insert in `run_pyperf.py`; only the per-suite +`runtime.py` and `benchmarks/*.py` live here. + +## Usage + +Requires pixi. + +Environments: + +- `wheel`: Installs released `cuda-core` from conda-forge. 
+- `source`: Installs `cuda-core` and `cuda-bindings` from the in-tree + sources, so local changes are exercised. + +Tasks: + +- `bench`: Runs the full suite. +- `bench-smoke-test`: Runs each bench with `--debug-single-value` for + a quick smoke check (not meaningful for timing). +- `bench-compare`: Prints a side-by-side table against + `../cuda_bindings/results-python.json`. + +### System tuning + +For more stable results on Linux, tune the system before running. +See: https://pyperf.readthedocs.io/en/latest/system.html#system + +```bash +pixi run -e wheel -- python -m pyperf system show +$(pixi run -e wheel -- which python) -m pyperf system tune +``` + +### Running benchmarks + +```bash +# Wheel env +pixi run -e wheel bench +pixi run -e wheel bench --min-time 0.1 + +# Source env (picks up local cuda.core / cuda.bindings changes) +pixi run -e source bench + +# Side-by-side comparison vs cuda.bindings +pixi run -e wheel bench-compare +``` + +Results are saved to `results-python.json` in this directory. Compare +against the cuda.bindings baseline by running that suite's `bench` task +first so `../cuda_bindings/results-python.json` exists. + +## Output JSON and analysis + +The suite uses [pyperf](https://pyperf.readthedocs.io/en/latest/). The +output JSON is pyperf-compatible: + +```bash +pixi run -e wheel -- python -m pyperf stats results-python.json +pixi run -e wheel -- python -m pyperf compare_to \ + ../cuda_bindings/results-python.json results-python.json +``` diff --git a/benchmarks/cuda_core/benchmarks/bench_ctx_device.py b/benchmarks/cuda_core/benchmarks/bench_ctx_device.py new file mode 100644 index 00000000000..ebe7f28730f --- /dev/null +++ b/benchmarks/cuda_core/benchmarks/bench_ctx_device.py @@ -0,0 +1,55 @@ +# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
+# +# SPDX-License-Identifier: Apache-2.0 + +import time + +from runtime import ensure_device + +from cuda.core import Device + +DEV = ensure_device() + + +def bench_ctx_get_current(loops: int) -> float: + # Device() with no args returns the TLS-cached "current" device. + _fn = Device + + t0 = time.perf_counter() + for _ in range(loops): + _fn() + return time.perf_counter() - t0 + + +def bench_ctx_set_current(loops: int) -> float: + _fn = DEV.set_current + + t0 = time.perf_counter() + for _ in range(loops): + _fn() + return time.perf_counter() - t0 + + +def bench_device_get(loops: int) -> float: + # Device(id) hits the same TLS cache after the first construction. + _fn = Device + + t0 = time.perf_counter() + for _ in range(loops): + _fn(0) + return time.perf_counter() - t0 + + +def bench_device_get_attribute(loops: int) -> float: + # Matches the cuda.bindings bench's CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR + # call. cuda.core caches this attribute in DeviceProperties, so every + # iteration past the first is a dict lookup rather than a driver call + # — the bench therefore measures the user-visible cost of the public + # API, which is legitimately faster than cuda.bindings here. See + # BENCHMARK_PLAN.md for the rationale. + _props = DEV.properties + + t0 = time.perf_counter() + for _ in range(loops): + _props.compute_capability_major # noqa: B018 + return time.perf_counter() - t0 diff --git a/benchmarks/cuda_core/benchmarks/bench_event.py b/benchmarks/cuda_core/benchmarks/bench_event.py new file mode 100644 index 00000000000..883c470be1f --- /dev/null +++ b/benchmarks/cuda_core/benchmarks/bench_event.py @@ -0,0 +1,53 @@ +# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# SPDX-License-Identifier: Apache-2.0 + +import time + +from runtime import ensure_device + +DEV = ensure_device() +STREAM = DEV.create_stream() + +# Pre-recorded event, completed, reused for record/query/sync benches. 
+EVENT = STREAM.record() +STREAM.sync() + + +def bench_event_create_destroy(loops: int) -> float: + _create = DEV.create_event + + t0 = time.perf_counter() + for _ in range(loops): + e = _create() + e.close() + return time.perf_counter() - t0 + + +def bench_event_record(loops: int) -> float: + # Reuse EVENT so we measure cuEventRecord, not event allocation. + _record = STREAM.record + _event = EVENT + + t0 = time.perf_counter() + for _ in range(loops): + _record(_event) + return time.perf_counter() - t0 + + +def bench_event_query(loops: int) -> float: + _event = EVENT + + t0 = time.perf_counter() + for _ in range(loops): + _event.is_done # noqa: B018 + return time.perf_counter() - t0 + + +def bench_event_synchronize(loops: int) -> float: + _fn = EVENT.sync + + t0 = time.perf_counter() + for _ in range(loops): + _fn() + return time.perf_counter() - t0 diff --git a/benchmarks/cuda_core/benchmarks/bench_launch.py b/benchmarks/cuda_core/benchmarks/bench_launch.py new file mode 100644 index 00000000000..09d418b0a05 --- /dev/null +++ b/benchmarks/cuda_core/benchmarks/bench_launch.py @@ -0,0 +1,183 @@ +# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# SPDX-License-Identifier: Apache-2.0 + +import time + +from runtime import compile_module, ensure_device + +from cuda.core import LaunchConfig, launch + +# Same source as the cuda.bindings launch bench, minus the variants we +# don't need here. Pointer args are passed as Python ints (via int(handle)) +# or Buffer instances; cuda.core's ParamHolder treats both as intptr_t. 
+KERNEL_SOURCE = """\ +extern "C" __global__ void empty_kernel() { return; } +extern "C" __global__ void small_kernel(float *f) { *f = 0.0f; } + +#define ITEM_PARAM(x, T) T x +#define REP1(x, T) , ITEM_PARAM(x, T) +#define REP2(x, T) REP1(x##0, T) REP1(x##1, T) +#define REP4(x, T) REP2(x##0, T) REP2(x##1, T) +#define REP8(x, T) REP4(x##0, T) REP4(x##1, T) +#define REP16(x, T) REP8(x##0, T) REP8(x##1, T) +#define REP32(x, T) REP16(x##0, T) REP16(x##1, T) +#define REP64(x, T) REP32(x##0, T) REP32(x##1, T) +#define REP128(x, T) REP64(x##0, T) REP64(x##1, T) +#define REP256(x, T) REP128(x##0, T) REP128(x##1, T) + +extern "C" __global__ +void small_kernel_16_args( + ITEM_PARAM(F, int*) + REP1(A, int*) + REP2(A, int*) + REP4(A, int*) + REP8(A, int*)) +{ *F = 0; } + +extern "C" __global__ +void small_kernel_256_args( + ITEM_PARAM(F, int*) + REP1(A, int*) + REP2(A, int*) + REP4(A, int*) + REP8(A, int*) + REP16(A, int*) + REP32(A, int*) + REP64(A, int*) + REP128(A, int*)) +{ *F = 0; } + +extern "C" __global__ +void small_kernel_512_args( + ITEM_PARAM(F, int*) + REP1(A, int*) + REP2(A, int*) + REP4(A, int*) + REP8(A, int*) + REP16(A, int*) + REP32(A, int*) + REP64(A, int*) + REP128(A, int*) + REP256(A, int*)) +{ *F = 0; } +""" + +KERNEL_NAMES = ( + "empty_kernel", + "small_kernel", + "small_kernel_16_args", + "small_kernel_256_args", + "small_kernel_512_args", +) + +DEV = ensure_device() +STREAM = DEV.create_stream() +CONFIG = LaunchConfig(grid=1, block=1) + +MODULE = None +EMPTY_KERNEL = None +SMALL_KERNEL = None +KERNEL_16_ARGS = None +KERNEL_256_ARGS = None +KERNEL_512_ARGS = None +FLOAT_BUF = None +INT_BUFS_512: tuple = () +INT_PTRS_512: tuple = () + + +def _ensure_launch_state() -> None: + global MODULE, EMPTY_KERNEL, SMALL_KERNEL + global KERNEL_16_ARGS, KERNEL_256_ARGS, KERNEL_512_ARGS + global FLOAT_BUF, INT_BUFS_512, INT_PTRS_512 + + if EMPTY_KERNEL is not None: + return + + module = compile_module(KERNEL_SOURCE, KERNEL_NAMES) + + # Pre-allocate buffers for the 
kernel args. Use ints (raw pointer + # addresses) in the launch hot path so ParamHolder skips the Buffer + # type check and goes through its int fast-path. + float_buf = DEV.allocate(4) + int_bufs_512 = tuple(DEV.allocate(4) for _ in range(512)) + int_ptrs_512 = tuple(int(b.handle) for b in int_bufs_512) + + MODULE = module + EMPTY_KERNEL = module.get_kernel("empty_kernel") + SMALL_KERNEL = module.get_kernel("small_kernel") + KERNEL_16_ARGS = module.get_kernel("small_kernel_16_args") + KERNEL_256_ARGS = module.get_kernel("small_kernel_256_args") + KERNEL_512_ARGS = module.get_kernel("small_kernel_512_args") + FLOAT_BUF = float_buf + INT_BUFS_512 = int_bufs_512 + INT_PTRS_512 = int_ptrs_512 + + +def bench_launch_empty_kernel(loops: int) -> float: + _ensure_launch_state() + _launch = launch + _kernel = EMPTY_KERNEL + _stream = STREAM + _config = CONFIG + + t0 = time.perf_counter() + for _ in range(loops): + _launch(_stream, _config, _kernel) + return time.perf_counter() - t0 + + +def bench_launch_small_kernel(loops: int) -> float: + _ensure_launch_state() + _launch = launch + _kernel = SMALL_KERNEL + _stream = STREAM + _config = CONFIG + _ptr = int(FLOAT_BUF.handle) + + t0 = time.perf_counter() + for _ in range(loops): + _launch(_stream, _config, _kernel, _ptr) + return time.perf_counter() - t0 + + +def bench_launch_16_args(loops: int) -> float: + _ensure_launch_state() + _launch = launch + _kernel = KERNEL_16_ARGS + _stream = STREAM + _config = CONFIG + _args = INT_PTRS_512[:16] + + t0 = time.perf_counter() + for _ in range(loops): + _launch(_stream, _config, _kernel, *_args) + return time.perf_counter() - t0 + + +def bench_launch_256_args(loops: int) -> float: + _ensure_launch_state() + _launch = launch + _kernel = KERNEL_256_ARGS + _stream = STREAM + _config = CONFIG + _args = INT_PTRS_512[:256] + + t0 = time.perf_counter() + for _ in range(loops): + _launch(_stream, _config, _kernel, *_args) + return time.perf_counter() - t0 + + +def bench_launch_512_args(loops: 
int) -> float: + _ensure_launch_state() + _launch = launch + _kernel = KERNEL_512_ARGS + _stream = STREAM + _config = CONFIG + _args = INT_PTRS_512 + + t0 = time.perf_counter() + for _ in range(loops): + _launch(_stream, _config, _kernel, *_args) + return time.perf_counter() - t0 diff --git a/benchmarks/cuda_core/benchmarks/bench_memory.py b/benchmarks/cuda_core/benchmarks/bench_memory.py new file mode 100644 index 00000000000..445f85e5121 --- /dev/null +++ b/benchmarks/cuda_core/benchmarks/bench_memory.py @@ -0,0 +1,37 @@ +# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# SPDX-License-Identifier: Apache-2.0 + +import time + +from runtime import ensure_device + +DEV = ensure_device() +STREAM = DEV.create_stream() + +# Small allocation size: we measure call overhead, not the allocator. +ALLOC_SIZE = 1024 + + +def bench_mem_alloc_free(loops: int) -> float: + # No-stream variant: dev.allocate() uses the default stream internally. + _alloc = DEV.allocate + _size = ALLOC_SIZE + + t0 = time.perf_counter() + for _ in range(loops): + buf = _alloc(_size) + buf.close() + return time.perf_counter() - t0 + + +def bench_mem_alloc_async_free_async(loops: int) -> float: + _alloc = DEV.allocate + _size = ALLOC_SIZE + _stream = STREAM + + t0 = time.perf_counter() + for _ in range(loops): + buf = _alloc(_size, _stream) + buf.close(_stream) + return time.perf_counter() - t0 diff --git a/benchmarks/cuda_core/benchmarks/bench_stream.py b/benchmarks/cuda_core/benchmarks/bench_stream.py new file mode 100644 index 00000000000..f1501c6f658 --- /dev/null +++ b/benchmarks/cuda_core/benchmarks/bench_stream.py @@ -0,0 +1,29 @@ +# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
+# +# SPDX-License-Identifier: Apache-2.0 + +import time + +from runtime import ensure_device + +DEV = ensure_device() +STREAM = DEV.create_stream() + + +def bench_stream_create_destroy(loops: int) -> float: + _create = DEV.create_stream + + t0 = time.perf_counter() + for _ in range(loops): + s = _create() + s.close() + return time.perf_counter() - t0 + + +def bench_stream_synchronize(loops: int) -> float: + _fn = STREAM.sync + + t0 = time.perf_counter() + for _ in range(loops): + _fn() + return time.perf_counter() - t0 diff --git a/benchmarks/cuda_core/compare.py b/benchmarks/cuda_core/compare.py new file mode 100644 index 00000000000..99cbca31a69 --- /dev/null +++ b/benchmarks/cuda_core/compare.py @@ -0,0 +1,159 @@ +# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# SPDX-License-Identifier: Apache-2.0 + +"""Compare cuda.core and cuda.bindings benchmark results side by side. + +Benchmark IDs are kept intentionally identical across the two suites +(e.g. ``stream.stream_create_destroy``) so a result present in both +files can be diffed directly. IDs that exist in only one suite are +rendered with ``-`` in the missing column. 
+""" + +import argparse +import json +import statistics +import sys +from pathlib import Path + +HERE = Path(__file__).resolve().parent +DEFAULT_CORE = HERE / "results-python.json" +DEFAULT_BINDINGS = HERE.parent / "cuda_bindings" / "results-python.json" + + +def load_benchmarks(path: Path) -> dict[str, list[float]]: + """Load a pyperf JSON file and return {name: [values]}.""" + with open(path) as f: + data = json.load(f) + + results: dict[str, list[float]] = {} + for bench in data.get("benchmarks", []): + name = bench.get("metadata", {}).get("name", "") + if not name: + for run in bench.get("runs", []): + name = run.get("metadata", {}).get("name", "") + if name: + break + values: list[float] = [] + for run in bench.get("runs", []): + values.extend(run.get("values", [])) + if name and values: + results[name] = values + return results + + +def stats(values: list[float]) -> tuple[float, float, float, int]: + mean = statistics.mean(values) + stdev = statistics.pstdev(values) if len(values) > 1 else 0.0 + rsd = (stdev / mean) if mean else 0.0 + return mean, stdev, rsd, len(values) + + +def fmt_rsd(rsd: float | None) -> str: + if rsd is None: + return "-" + return f"{rsd * 100:.1f}%" + + +def fmt_ns(seconds: float) -> str: + return f"{seconds * 1e9:,.0f}" + + +def fmt_overhead_ns(core_mean: float, bindings_mean: float) -> str: + return f"{(core_mean - bindings_mean) * 1e9:+,.0f}" + + +def fmt_overhead_pct(core_mean: float, bindings_mean: float) -> str: + if bindings_mean <= 0.0: + return "-" + pct = (core_mean - bindings_mean) / bindings_mean * 100 + return f"{pct:+,.0f}%" + + +def main() -> None: + parser = argparse.ArgumentParser(description="Compare cuda.core vs cuda.bindings benchmark results") + parser.add_argument( + "--core", + type=Path, + default=DEFAULT_CORE, + help=f"cuda.core results JSON (default: {DEFAULT_CORE.name})", + ) + parser.add_argument( + "--bindings", + type=Path, + default=DEFAULT_BINDINGS, + help=f"cuda.bindings results JSON (default: 
{DEFAULT_BINDINGS})", + ) + args = parser.parse_args() + + if not args.core.exists(): + print(f"cuda.core results not found: {args.core}", file=sys.stderr) + print("Run: pixi run -e source bench", file=sys.stderr) + sys.exit(1) + + core_benchmarks = load_benchmarks(args.core) + bindings_benchmarks = load_benchmarks(args.bindings) if args.bindings.exists() else {} + + if not core_benchmarks: + print("No benchmarks found in cuda.core results.", file=sys.stderr) + sys.exit(1) + + all_names = sorted(set(core_benchmarks) | set(bindings_benchmarks)) + name_width = max(len(n) for n in all_names) + name_width = max(name_width, len("Benchmark")) + + bind_w = 14 + core_w = 14 + rsd_w = 8 + oh_ns_w = 12 + oh_pct_w = 10 + + if bindings_benchmarks: + header = ( + f"{'Benchmark':<{name_width}} " + f"{'bindings (ns)':>{bind_w}} {'RSD':>{rsd_w}} " + f"{'core (ns)':>{core_w}} {'RSD':>{rsd_w}} " + f"{'Overhead ns':>{oh_ns_w}} {'Overhead %':>{oh_pct_w}}" + ) + else: + header = f"{'Benchmark':<{name_width}} {'core (ns)':>{core_w}} {'RSD':>{rsd_w}}" + + sep = "-" * len(header) + print(sep) + print(header) + print(sep) + + for name in all_names: + core_vals = core_benchmarks.get(name) + bind_vals = bindings_benchmarks.get(name) + + core_stats = stats(core_vals) if core_vals else None + bind_stats = stats(bind_vals) if bind_vals else None + + core_str = fmt_ns(core_stats[0]) if core_stats else "-" + bind_str = fmt_ns(bind_stats[0]) if bind_stats else "-" + core_rsd = fmt_rsd(core_stats[2]) if core_stats else "-" + bind_rsd = fmt_rsd(bind_stats[2]) if bind_stats else "-" + + if core_stats and bind_stats: + overhead_ns_str = fmt_overhead_ns(core_stats[0], bind_stats[0]) + overhead_pct_str = fmt_overhead_pct(core_stats[0], bind_stats[0]) + else: + overhead_ns_str = "-" + overhead_pct_str = "-" + + if bindings_benchmarks: + print( + f"{name:<{name_width}} " + f"{bind_str:>{bind_w}} {bind_rsd:>{rsd_w}} " + f"{core_str:>{core_w}} {core_rsd:>{rsd_w}} " + f"{overhead_ns_str:>{oh_ns_w}} 
{overhead_pct_str:>{oh_pct_w}}" + ) + else: + print(f"{name:<{name_width}} {core_str:>{core_w}} {core_rsd:>{rsd_w}}") + + print(sep) + + +if __name__ == "__main__": + main() diff --git a/benchmarks/cuda_core/pixi.lock b/benchmarks/cuda_core/pixi.lock new file mode 100644 index 00000000000..e5eca8b36ee --- /dev/null +++ b/benchmarks/cuda_core/pixi.lock @@ -0,0 +1,972 @@ +version: 6 +environments: + default: + channels: + - url: https://conda.anaconda.org/conda-forge/ + options: + channel-priority: disabled + pypi-prerelease-mode: if-necessary-or-explicit + packages: {} + source: + channels: + - url: https://conda.anaconda.org/conda-forge/ + options: + channel-priority: disabled + pypi-prerelease-mode: if-necessary-or-explicit + packages: + linux-64: + - conda: https://conda.anaconda.org/conda-forge/linux-64/_openmp_mutex-4.5-20_gnu.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/bzip2-1.0.8-hda65f42_9.conda + - conda: https://conda.anaconda.org/conda-forge/noarch/ca-certificates-2026.4.22-hbd8a1cb_0.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/cffi-2.0.0-py314h4a8dc5f_1.conda + - conda: https://conda.anaconda.org/conda-forge/noarch/cfgv-3.5.0-pyhd8ed1ab_0.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/cuda-bindings-13.2.0-py314ha0b5721_0.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/cuda-cudart-13.2.75-hecca717_0.conda + - conda: https://conda.anaconda.org/conda-forge/noarch/cuda-cudart_linux-64-13.2.75-h376f20c_0.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/cuda-nvrtc-13.2.78-hecca717_0.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/cuda-nvvm-impl-13.2.78-h4bc722e_0.conda + - conda: https://conda.anaconda.org/conda-forge/noarch/cuda-pathfinder-1.5.4-pyhc364b38_0.conda + - conda: https://conda.anaconda.org/conda-forge/noarch/cuda-version-13.2-he2cc418_3.conda + - conda: https://conda.anaconda.org/conda-forge/noarch/distlib-0.4.0-pyhd8ed1ab_0.conda + - 
conda: https://conda.anaconda.org/conda-forge/noarch/filelock-3.29.0-pyhd8ed1ab_0.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/icu-78.3-h33c6efd_0.conda + - conda: https://conda.anaconda.org/conda-forge/noarch/identify-2.6.19-pyhd8ed1ab_0.conda + - conda: https://conda.anaconda.org/conda-forge/noarch/importlib-metadata-8.8.0-pyhcf101f3_0.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/ld_impl_linux-64-2.45.1-default_hbd61a6d_102.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/libblas-3.11.0-6_h4a7cf45_openblas.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/libcap-2.77-hd0affe5_1.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/libcblas-3.11.0-6_h0358290_openblas.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/libcufile-1.17.1.22-h85c024f_0.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/libexpat-2.7.5-hecca717_0.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/libffi-3.5.2-h3435931_0.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/libgcc-15.2.0-he0feb66_18.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/libgfortran-15.2.0-h69a702a_18.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/libgfortran5-15.2.0-h68bc16d_18.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/libgomp-15.2.0-he0feb66_18.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/liblapack-3.11.0-6_h47877c9_openblas.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/liblzma-5.8.3-hb03c661_0.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/libmpdec-4.0.0-hb03c661_1.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/libnl-3.11.0-hb9d3cd8_0.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/libnvjitlink-13.2.78-hecca717_0.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/libopenblas-0.3.32-pthreads_h94d23a6_0.conda + - 
conda: https://conda.anaconda.org/conda-forge/linux-64/libsqlite-3.53.0-hf4e2dac_0.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/libstdcxx-15.2.0-h934c35e_18.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/libsystemd0-257.13-hd0affe5_0.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/libudev1-257.13-hd0affe5_0.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/libuuid-2.42-h5347b49_0.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/libzlib-1.3.2-h25fd6f3_2.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/ncurses-6.6-hdb14827_0.conda + - conda: https://conda.anaconda.org/conda-forge/noarch/nodeenv-1.10.0-pyhd8ed1ab_0.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/numpy-2.4.3-py314h2b28147_0.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/openssl-3.6.2-h35e630c_0.conda + - conda: https://conda.anaconda.org/conda-forge/noarch/platformdirs-4.9.6-pyhcf101f3_0.conda + - conda: https://conda.anaconda.org/conda-forge/noarch/pre-commit-4.6.0-pyha770c72_0.conda + - conda: https://conda.anaconda.org/conda-forge/noarch/pycparser-2.22-pyh29332c3_1.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/pyperf-2.9.0-py314hdafbbf9_0.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/python-3.14.4-habeac84_100_cp314.conda + - conda: https://conda.anaconda.org/conda-forge/noarch/python-discovery-1.2.2-pyhcf101f3_0.conda + - conda: https://conda.anaconda.org/conda-forge/noarch/python_abi-3.14-8_cp314.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/pyyaml-6.0.3-py314h67df5f8_1.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/rdma-core-61.0-h192683f_0.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/readline-8.3-h853b02a_0.conda + - conda: https://conda.anaconda.org/conda-forge/noarch/setuptools-82.0.1-pyh332efcf_0.conda + - conda: 
https://conda.anaconda.org/conda-forge/noarch/six-1.17.0-pyhe01879c_1.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/tk-8.6.13-noxft_h366c992_103.conda + - conda: https://conda.anaconda.org/conda-forge/noarch/typing_extensions-4.15.0-pyhcf101f3_0.conda + - conda: https://conda.anaconda.org/conda-forge/noarch/tzdata-2025c-hc9c84f9_1.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/ukkonen-1.1.0-py314h9891dd4_0.conda + - conda: https://conda.anaconda.org/conda-forge/noarch/virtualenv-21.3.0-pyhcf101f3_0.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/yaml-0.2.5-h280c20c_3.conda + - conda: https://conda.anaconda.org/conda-forge/noarch/zipp-3.23.1-pyhcf101f3_0.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/zstd-1.5.7-hb78ec9c_6.conda + - conda: ../../cuda_core + wheel: + channels: + - url: https://conda.anaconda.org/conda-forge/ + options: + channel-priority: disabled + pypi-prerelease-mode: if-necessary-or-explicit + packages: + linux-64: + - conda: https://conda.anaconda.org/conda-forge/linux-64/_openmp_mutex-4.5-20_gnu.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/bzip2-1.0.8-hda65f42_9.conda + - conda: https://conda.anaconda.org/conda-forge/noarch/ca-certificates-2026.4.22-hbd8a1cb_0.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/cffi-2.0.0-py314h4a8dc5f_1.conda + - conda: https://conda.anaconda.org/conda-forge/noarch/cfgv-3.5.0-pyhd8ed1ab_0.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/cuda-bindings-13.2.0-py314ha0b5721_0.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/cuda-core-0.7.0-cuda13_py314h025f531_0.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/cuda-nvrtc-13.1.115-hecca717_0.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/cuda-nvvm-impl-13.1.115-h4bc722e_0.conda + - conda: https://conda.anaconda.org/conda-forge/noarch/cuda-pathfinder-1.5.4-pyhc364b38_0.conda + - conda: 
https://conda.anaconda.org/conda-forge/noarch/cuda-version-13.1-h2ff5cdb_3.conda + - conda: https://conda.anaconda.org/conda-forge/noarch/distlib-0.4.0-pyhd8ed1ab_0.conda + - conda: https://conda.anaconda.org/conda-forge/noarch/filelock-3.29.0-pyhd8ed1ab_0.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/icu-78.3-h33c6efd_0.conda + - conda: https://conda.anaconda.org/conda-forge/noarch/identify-2.6.19-pyhd8ed1ab_0.conda + - conda: https://conda.anaconda.org/conda-forge/noarch/importlib-metadata-8.8.0-pyhcf101f3_0.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/ld_impl_linux-64-2.45.1-default_hbd61a6d_102.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/libblas-3.11.0-6_h4a7cf45_openblas.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/libcap-2.77-hd0affe5_1.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/libcblas-3.11.0-6_h0358290_openblas.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/libcufile-1.16.1.26-hd07211c_0.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/libexpat-2.7.5-hecca717_0.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/libffi-3.5.2-h3435931_0.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/libgcc-15.2.0-he0feb66_18.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/libgfortran-15.2.0-h69a702a_18.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/libgfortran5-15.2.0-h68bc16d_18.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/libgomp-15.2.0-he0feb66_18.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/liblapack-3.11.0-6_h47877c9_openblas.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/liblzma-5.8.3-hb03c661_0.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/libmpdec-4.0.0-hb03c661_1.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/libnl-3.11.0-hb9d3cd8_0.conda + - conda: 
https://conda.anaconda.org/conda-forge/linux-64/libnvjitlink-13.2.78-hecca717_0.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/libopenblas-0.3.32-pthreads_h94d23a6_0.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/libsqlite-3.53.0-hf4e2dac_0.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/libstdcxx-15.2.0-h934c35e_18.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/libsystemd0-257.13-hd0affe5_0.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/libudev1-257.13-hd0affe5_0.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/libuuid-2.42-h5347b49_0.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/libzlib-1.3.2-h25fd6f3_2.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/ncurses-6.6-hdb14827_0.conda + - conda: https://conda.anaconda.org/conda-forge/noarch/nodeenv-1.10.0-pyhd8ed1ab_0.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/numpy-2.4.3-py314h2b28147_0.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/openssl-3.6.2-h35e630c_0.conda + - conda: https://conda.anaconda.org/conda-forge/noarch/platformdirs-4.9.6-pyhcf101f3_0.conda + - conda: https://conda.anaconda.org/conda-forge/noarch/pre-commit-4.6.0-pyha770c72_0.conda + - conda: https://conda.anaconda.org/conda-forge/noarch/pycparser-2.22-pyh29332c3_1.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/pyperf-2.9.0-py314hdafbbf9_0.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/python-3.14.4-habeac84_100_cp314.conda + - conda: https://conda.anaconda.org/conda-forge/noarch/python-discovery-1.2.2-pyhcf101f3_0.conda + - conda: https://conda.anaconda.org/conda-forge/noarch/python_abi-3.14-8_cp314.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/pyyaml-6.0.3-py314h67df5f8_1.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/rdma-core-61.0-h192683f_0.conda + - conda: 
https://conda.anaconda.org/conda-forge/linux-64/readline-8.3-h853b02a_0.conda + - conda: https://conda.anaconda.org/conda-forge/noarch/setuptools-82.0.1-pyh332efcf_0.conda + - conda: https://conda.anaconda.org/conda-forge/noarch/six-1.17.0-pyhe01879c_1.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/tk-8.6.13-noxft_h366c992_103.conda + - conda: https://conda.anaconda.org/conda-forge/noarch/typing_extensions-4.15.0-pyhcf101f3_0.conda + - conda: https://conda.anaconda.org/conda-forge/noarch/tzdata-2025c-hc9c84f9_1.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/ukkonen-1.1.0-py314h9891dd4_0.conda + - conda: https://conda.anaconda.org/conda-forge/noarch/virtualenv-21.3.0-pyhcf101f3_0.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/yaml-0.2.5-h280c20c_3.conda + - conda: https://conda.anaconda.org/conda-forge/noarch/zipp-3.23.1-pyhcf101f3_0.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/zstd-1.5.7-hb78ec9c_6.conda +packages: +- conda: https://conda.anaconda.org/conda-forge/linux-64/_openmp_mutex-4.5-20_gnu.conda + build_number: 20 + sha256: 1dd3fffd892081df9726d7eb7e0dea6198962ba775bd88842135a4ddb4deb3c9 + md5: a9f577daf3de00bca7c3c76c0ecbd1de + depends: + - __glibc >=2.17,<3.0.a0 + - libgomp >=7.5.0 + constrains: + - openmp_impl <0.0a0 + license: BSD-3-Clause + license_family: BSD + size: 28948 + timestamp: 1770939786096 +- conda: https://conda.anaconda.org/conda-forge/linux-64/bzip2-1.0.8-hda65f42_9.conda + sha256: 0b75d45f0bba3e95dc693336fa51f40ea28c980131fec438afb7ce6118ed05f6 + md5: d2ffd7602c02f2b316fd921d39876885 + depends: + - __glibc >=2.17,<3.0.a0 + - libgcc >=14 + license: bzip2-1.0.6 + license_family: BSD + size: 260182 + timestamp: 1771350215188 +- conda: https://conda.anaconda.org/conda-forge/noarch/ca-certificates-2026.4.22-hbd8a1cb_0.conda + sha256: c9dbcc8039a52023660d6d1bbf87594a93dd69c6ac5a2a44323af2c92976728d + md5: e18ad67cf881dcadee8b8d9e2f8e5f73 + depends: + - __unix + license: 
ISC + size: 131039 + timestamp: 1776865545798 +- conda: https://conda.anaconda.org/conda-forge/linux-64/cffi-2.0.0-py314h4a8dc5f_1.conda + sha256: c6339858a0aaf5d939e00d345c98b99e4558f285942b27232ac098ad17ac7f8e + md5: cf45f4278afd6f4e6d03eda0f435d527 + depends: + - __glibc >=2.17,<3.0.a0 + - libffi >=3.5.2,<3.6.0a0 + - libgcc >=14 + - pycparser + - python >=3.14,<3.15.0a0 + - python_abi 3.14.* *_cp314 + license: MIT + license_family: MIT + size: 300271 + timestamp: 1761203085220 +- conda: https://conda.anaconda.org/conda-forge/noarch/cfgv-3.5.0-pyhd8ed1ab_0.conda + sha256: aa589352e61bb221351a79e5946d56916e3c595783994884accdb3b97fe9d449 + md5: 381bd45fb7aa032691f3063aff47e3a1 + depends: + - python >=3.10 + license: MIT + license_family: MIT + size: 13589 + timestamp: 1763607964133 +- conda: https://conda.anaconda.org/conda-forge/linux-64/cuda-bindings-13.2.0-py314ha0b5721_0.conda + sha256: f77e397cbf7b173dae7470aca3052184779431b7a7bcce3ecc56c2822c83cdb4 + md5: 29baeabdf5e58a696a68625b7bd8d9f8 + depends: + - __glibc >=2.17,<3.0.a0 + - cuda-nvrtc >=13,<14.0a0 + - cuda-nvvm-impl >=13,<14.0a0 + - cuda-pathfinder >=1.1.0,<2 + - cuda-version >=13,<14.0a0 + - libcufile >=1,<2.0a0 + - libgcc >=14 + - libnvjitlink >=13.0,<14.0a0 + - libstdcxx >=14 + - python >=3.14,<3.15.0a0 + - python_abi 3.14.* *_cp314 + constrains: + - cuda-cudart >=13,<14.0a0 + - cuda-python >=13.2.0,<13.3.0a0 + license: LicenseRef-NVIDIA-SOFTWARE-LICENSE + size: 4281852 + timestamp: 1773284503198 +- conda: ../../cuda_core + name: cuda-core + version: 0.7.0 + build: hb0f4dca_0 + subdir: linux-64 + variants: + target_platform: linux-64 + depends: + - python + - cuda-version + - numpy + - cuda-bindings + - cuda-pathfinder + - libgcc >=15 + - libgcc >=15 + - libstdcxx >=15 + - python_abi 3.14.* *_cp314 + - cuda-nvrtc >=13.2.78,<14.0a0 + - cuda-cudart >=13.2.75,<14.0a0 + license: Apache-2.0 +- conda: https://conda.anaconda.org/conda-forge/linux-64/cuda-core-0.7.0-cuda13_py314h025f531_0.conda + sha256: 
25841761aa7cfb2a9295afbefea5ab96ac15f07aa56fc35acb36d254c92e6747 + md5: 60c216b8b88821564e7f2409d6437bb5 + depends: + - __glibc >=2.17,<3.0.a0 + - cuda-bindings >=13,<14.0a0 + - cuda-pathfinder >=1.4.2,<2 + - cuda-version >=13,<14.0a0 + - libgcc >=14 + - libstdcxx >=14 + - numpy + - python >=3.14,<3.15.0a0 + - python_abi 3.14.* *_cp314 + license: Apache-2.0 + license_family: APACHE + size: 1541620 + timestamp: 1775673907706 +- conda: https://conda.anaconda.org/conda-forge/linux-64/cuda-cudart-13.2.75-hecca717_0.conda + sha256: 633bc9ba458a12a20a42776bf3fa25cecfddc65a22e4ed207fe09b9adcd9de58 + md5: 9b7dcd83f8a965efcf7377dc54203619 + depends: + - __glibc >=2.17,<3.0.a0 + - cuda-cudart_linux-64 13.2.75 h376f20c_0 + - cuda-version >=13.2,<13.3.0a0 + - libgcc >=14 + - libstdcxx >=14 + license: LicenseRef-NVIDIA-End-User-License-Agreement + size: 24542 + timestamp: 1776110472025 +- conda: https://conda.anaconda.org/conda-forge/noarch/cuda-cudart_linux-64-13.2.75-h376f20c_0.conda + sha256: cd03c67b2005e2e74ff278f6f8b17ca7d6f18cf43fb00775833669508d301a83 + md5: ff98f2b9b87eb8b3a4b36745d3d5b93e + depends: + - cuda-version >=13.2,<13.3.0a0 + license: LicenseRef-NVIDIA-End-User-License-Agreement + size: 203339 + timestamp: 1776110448238 +- conda: https://conda.anaconda.org/conda-forge/linux-64/cuda-nvrtc-13.1.115-hecca717_0.conda + sha256: 9cc4f9df70c02eea5121cdb0e865207b04cd52591f57ebcac2ba44fada10eb5b + md5: df16c9049d882cdaf4f83a5b90079589 + depends: + - __glibc >=2.17,<3.0.a0 + - cuda-version >=13.1,<13.2.0a0 + - libgcc >=14 + - libstdcxx >=14 + license: LicenseRef-NVIDIA-End-User-License-Agreement + size: 35339417 + timestamp: 1768272955912 +- conda: https://conda.anaconda.org/conda-forge/linux-64/cuda-nvrtc-13.2.78-hecca717_0.conda + sha256: 73fbc9d15c062c3ea60891e8183002f6b055fa6638402d17581677af0aaa20d8 + md5: 66623d882c42506fa3f1780b90841400 + depends: + - __glibc >=2.17,<3.0.a0 + - cuda-version >=13.2,<13.3.0a0 + - libgcc >=14 + - libstdcxx >=14 + license: 
LicenseRef-NVIDIA-End-User-License-Agreement + size: 35670504 + timestamp: 1776109867257 +- conda: https://conda.anaconda.org/conda-forge/linux-64/cuda-nvvm-impl-13.1.115-h4bc722e_0.conda + sha256: 12d84615684f1279799c023ce4ccc7c34f151bec2a90e0c8d04798a8c8af437c + md5: bf76661bc0de83a60537c4913f339fb3 + depends: + - __glibc >=2.17,<3.0.a0 + - cuda-version >=13.1,<13.2.0a0 + - libgcc >=12 + license: LicenseRef-NVIDIA-End-User-License-Agreement + size: 21873791 + timestamp: 1768280315627 +- conda: https://conda.anaconda.org/conda-forge/linux-64/cuda-nvvm-impl-13.2.78-h4bc722e_0.conda + sha256: 944d132f61f240131abff67646da4040ae585a1f43c6b38fabebb6cc075a7c16 + md5: 5e1021b4c73e795deabbf35ed1317dcb + depends: + - __glibc >=2.17,<3.0.a0 + - cuda-version >=13.2,<13.3.0a0 + - libgcc >=12 + license: LicenseRef-NVIDIA-End-User-License-Agreement + size: 22205958 + timestamp: 1776121258973 +- conda: https://conda.anaconda.org/conda-forge/noarch/cuda-pathfinder-1.5.4-pyhc364b38_0.conda + sha256: 662046880e1dd8f62cd2ad8ac9618ace683cb7a3f3af93c52e425a772c9b4f00 + md5: 42d4610b52102122741f9bf68f2866ed + depends: + - python >=3.10 + - cuda-version >=12.0,<14 + - python + license: Apache-2.0 + license_family: APACHE + size: 45892 + timestamp: 1777330272729 +- conda: https://conda.anaconda.org/conda-forge/noarch/cuda-version-13.1-h2ff5cdb_3.conda + sha256: 176ac20fdb95611af8fb2bf0d3d16fee998019b1d0f12fc9ddd5fa0df4553992 + md5: d85448460c25ee43ff2f8346bb9ad52b + constrains: + - cudatoolkit 13.1|13.1.* + - __cuda >=13 + license: LicenseRef-NVIDIA-End-User-License-Agreement + size: 21511 + timestamp: 1757017115788 +- conda: https://conda.anaconda.org/conda-forge/noarch/cuda-version-13.2-he2cc418_3.conda + sha256: 64aebe8ccb3a2c3ff446d3c0c0e88ef4fdb069a5732c03539bf3a37243c4c679 + md5: 45676e3dd76b30ec613f1f822d450eff + constrains: + - __cuda >=13 + - cudatoolkit 13.2|13.2.* + license: LicenseRef-NVIDIA-End-User-License-Agreement + size: 21908 + timestamp: 1773093709154 +- conda: 
https://conda.anaconda.org/conda-forge/noarch/distlib-0.4.0-pyhd8ed1ab_0.conda + sha256: 6d977f0b2fc24fee21a9554389ab83070db341af6d6f09285360b2e09ef8b26e + md5: 003b8ba0a94e2f1e117d0bd46aebc901 + depends: + - python >=3.9 + license: Apache-2.0 + license_family: APACHE + size: 275642 + timestamp: 1752823081585 +- conda: https://conda.anaconda.org/conda-forge/noarch/filelock-3.29.0-pyhd8ed1ab_0.conda + sha256: 6b471a18372bbd52bdf32fc965f71de3bc1b5219418b8e6b3875a67a7b08c483 + md5: 8fa8358d022a3a9bd101384a808044c6 + depends: + - python >=3.10 + license: Unlicense + size: 34211 + timestamp: 1776621506566 +- conda: https://conda.anaconda.org/conda-forge/linux-64/icu-78.3-h33c6efd_0.conda + sha256: fbf86c4a59c2ed05bbffb2ba25c7ed94f6185ec30ecb691615d42342baa1a16a + md5: c80d8a3b84358cb967fa81e7075fbc8a + depends: + - __glibc >=2.17,<3.0.a0 + - libgcc >=14 + - libstdcxx >=14 + license: MIT + license_family: MIT + size: 12723451 + timestamp: 1773822285671 +- conda: https://conda.anaconda.org/conda-forge/noarch/identify-2.6.19-pyhd8ed1ab_0.conda + sha256: 381cedccf0866babfc135d65ee40b778bd20e927d2a5ec810f750c5860a7c5b8 + md5: 84a3233b709a289a4ddd7a2fd27dd988 + depends: + - python >=3.10 + - ukkonen + license: MIT + license_family: MIT + size: 79757 + timestamp: 1776455344188 +- conda: https://conda.anaconda.org/conda-forge/noarch/importlib-metadata-8.8.0-pyhcf101f3_0.conda + sha256: 82ab2a0d91ca1e7e63ab6a4939356667ef683905dea631bc2121aa534d347b16 + md5: 080594bf4493e6bae2607e65390c520a + depends: + - python >=3.10 + - zipp >=3.20 + - python + license: Apache-2.0 + license_family: APACHE + size: 34387 + timestamp: 1773931568510 +- conda: https://conda.anaconda.org/conda-forge/linux-64/ld_impl_linux-64-2.45.1-default_hbd61a6d_102.conda + sha256: 3d584956604909ff5df353767f3a2a2f60e07d070b328d109f30ac40cd62df6c + md5: 18335a698559cdbcd86150a48bf54ba6 + depends: + - __glibc >=2.17,<3.0.a0 + - zstd >=1.5.7,<1.6.0a0 + constrains: + - binutils_impl_linux-64 2.45.1 + license: 
GPL-3.0-only + license_family: GPL + size: 728002 + timestamp: 1774197446916 +- conda: https://conda.anaconda.org/conda-forge/linux-64/libblas-3.11.0-6_h4a7cf45_openblas.conda + build_number: 6 + sha256: 7bfe936dbb5db04820cf300a9cc1f5ee8d5302fc896c2d66e30f1ee2f20fbfd6 + md5: 6d6d225559bfa6e2f3c90ee9c03d4e2e + depends: + - libopenblas >=0.3.32,<0.3.33.0a0 + - libopenblas >=0.3.32,<1.0a0 + constrains: + - blas 2.306 openblas + - liblapack 3.11.0 6*_openblas + - liblapacke 3.11.0 6*_openblas + - libcblas 3.11.0 6*_openblas + - mkl <2026 + license: BSD-3-Clause + license_family: BSD + size: 18621 + timestamp: 1774503034895 +- conda: https://conda.anaconda.org/conda-forge/linux-64/libcap-2.77-hd0affe5_1.conda + sha256: 37c41b1024d0c75da76822e3c079aabaf121618a32fe05e53a897b35a88008fc + md5: 499cd8e2d4358986dbe3b30e8fe1bf6a + depends: + - __glibc >=2.17,<3.0.a0 + - libgcc >=14 + license: BSD-3-Clause + license_family: BSD + size: 124432 + timestamp: 1774333989027 +- conda: https://conda.anaconda.org/conda-forge/linux-64/libcblas-3.11.0-6_h0358290_openblas.conda + build_number: 6 + sha256: 57edafa7796f6fa3ebbd5367692dd4c7f552be42109c2dd1a7c89b55089bf374 + md5: 36ae340a916635b97ac8a0655ace2a35 + depends: + - libblas 3.11.0 6_h4a7cf45_openblas + constrains: + - blas 2.306 openblas + - liblapack 3.11.0 6*_openblas + - liblapacke 3.11.0 6*_openblas + license: BSD-3-Clause + license_family: BSD + size: 18622 + timestamp: 1774503050205 +- conda: https://conda.anaconda.org/conda-forge/linux-64/libcufile-1.16.1.26-hd07211c_0.conda + sha256: 8c44b5bf947afad827df0df49fe7483cf1b2916694081b2db4fecdfd6a2bacd1 + md5: 48418c48dac04671fa46cb446122b8a5 + depends: + - __glibc >=2.28,<3.0.a0 + - cuda-version >=13.1,<13.2.0a0 + - libgcc >=14 + - libstdcxx >=14 + - rdma-core >=60.0 + license: LicenseRef-NVIDIA-End-User-License-Agreement + size: 990938 + timestamp: 1768273732081 +- conda: https://conda.anaconda.org/conda-forge/linux-64/libcufile-1.17.1.22-h85c024f_0.conda + sha256: 
a24ad0ca488aa3e237049cd5b5c6d7fe3d2d4330682ed329203064e332ea1d74 + md5: 056a67706108efd1f9c24682ba8d3685 + depends: + - __glibc >=2.28,<3.0.a0 + - cuda-version >=13.2,<13.3.0a0 + - libgcc >=14 + - libstdcxx >=14 + - rdma-core >=61.0 + license: LicenseRef-NVIDIA-End-User-License-Agreement + size: 1082447 + timestamp: 1776110053053 +- conda: https://conda.anaconda.org/conda-forge/linux-64/libexpat-2.7.5-hecca717_0.conda + sha256: e8c2b57f6aacabdf2f1b0924bd4831ce5071ba080baa4a9e8c0d720588b6794c + md5: 49f570f3bc4c874a06ea69b7225753af + depends: + - __glibc >=2.17,<3.0.a0 + - libgcc >=14 + constrains: + - expat 2.7.5.* + license: MIT + license_family: MIT + size: 76624 + timestamp: 1774719175983 +- conda: https://conda.anaconda.org/conda-forge/linux-64/libffi-3.5.2-h3435931_0.conda + sha256: 31f19b6a88ce40ebc0d5a992c131f57d919f73c0b92cd1617a5bec83f6e961e6 + md5: a360c33a5abe61c07959e449fa1453eb + depends: + - __glibc >=2.17,<3.0.a0 + - libgcc >=14 + license: MIT + license_family: MIT + size: 58592 + timestamp: 1769456073053 +- conda: https://conda.anaconda.org/conda-forge/linux-64/libgcc-15.2.0-he0feb66_18.conda + sha256: faf7d2017b4d718951e3a59d081eb09759152f93038479b768e3d612688f83f5 + md5: 0aa00f03f9e39fb9876085dee11a85d4 + depends: + - __glibc >=2.17,<3.0.a0 + - _openmp_mutex >=4.5 + constrains: + - libgcc-ng ==15.2.0=*_18 + - libgomp 15.2.0 he0feb66_18 + license: GPL-3.0-only WITH GCC-exception-3.1 + license_family: GPL + size: 1041788 + timestamp: 1771378212382 +- conda: https://conda.anaconda.org/conda-forge/linux-64/libgfortran-15.2.0-h69a702a_18.conda + sha256: d2c9fad338fd85e4487424865da8e74006ab2e2475bd788f624d7a39b2a72aee + md5: 9063115da5bc35fdc3e1002e69b9ef6e + depends: + - libgfortran5 15.2.0 h68bc16d_18 + constrains: + - libgfortran-ng ==15.2.0=*_18 + license: GPL-3.0-only WITH GCC-exception-3.1 + license_family: GPL + size: 27523 + timestamp: 1771378269450 +- conda: https://conda.anaconda.org/conda-forge/linux-64/libgfortran5-15.2.0-h68bc16d_18.conda + 
sha256: 539b57cf50ec85509a94ba9949b7e30717839e4d694bc94f30d41c9d34de2d12 + md5: 646855f357199a12f02a87382d429b75 + depends: + - __glibc >=2.17,<3.0.a0 + - libgcc >=15.2.0 + constrains: + - libgfortran 15.2.0 + license: GPL-3.0-only WITH GCC-exception-3.1 + license_family: GPL + size: 2482475 + timestamp: 1771378241063 +- conda: https://conda.anaconda.org/conda-forge/linux-64/libgomp-15.2.0-he0feb66_18.conda + sha256: 21337ab58e5e0649d869ab168d4e609b033509de22521de1bfed0c031bfc5110 + md5: 239c5e9546c38a1e884d69effcf4c882 + depends: + - __glibc >=2.17,<3.0.a0 + license: GPL-3.0-only WITH GCC-exception-3.1 + license_family: GPL + size: 603262 + timestamp: 1771378117851 +- conda: https://conda.anaconda.org/conda-forge/linux-64/liblapack-3.11.0-6_h47877c9_openblas.conda + build_number: 6 + sha256: 371f517eb7010b21c6cc882c7606daccebb943307cb9a3bf2c70456a5c024f7d + md5: 881d801569b201c2e753f03c84b85e15 + depends: + - libblas 3.11.0 6_h4a7cf45_openblas + constrains: + - blas 2.306 openblas + - liblapacke 3.11.0 6*_openblas + - libcblas 3.11.0 6*_openblas + license: BSD-3-Clause + license_family: BSD + size: 18624 + timestamp: 1774503065378 +- conda: https://conda.anaconda.org/conda-forge/linux-64/liblzma-5.8.3-hb03c661_0.conda + sha256: ec30e52a3c1bf7d0425380a189d209a52baa03f22fb66dd3eb587acaa765bd6d + md5: b88d90cad08e6bc8ad540cb310a761fb + depends: + - __glibc >=2.17,<3.0.a0 + - libgcc >=14 + constrains: + - xz 5.8.3.* + license: 0BSD + size: 113478 + timestamp: 1775825492909 +- conda: https://conda.anaconda.org/conda-forge/linux-64/libmpdec-4.0.0-hb03c661_1.conda + sha256: fe171ed5cf5959993d43ff72de7596e8ac2853e9021dec0344e583734f1e0843 + md5: 2c21e66f50753a083cbe6b80f38268fa + depends: + - __glibc >=2.17,<3.0.a0 + - libgcc >=14 + license: BSD-2-Clause + license_family: BSD + size: 92400 + timestamp: 1769482286018 +- conda: https://conda.anaconda.org/conda-forge/linux-64/libnl-3.11.0-hb9d3cd8_0.conda + sha256: 
ba7c5d294e3d80f08ac5a39564217702d1a752e352e486210faff794ac5001b4 + md5: db63358239cbe1ff86242406d440e44a + depends: + - __glibc >=2.17,<3.0.a0 + - libgcc >=13 + license: LGPL-2.1-or-later + license_family: LGPL + size: 741323 + timestamp: 1731846827427 +- conda: https://conda.anaconda.org/conda-forge/linux-64/libnvjitlink-13.2.78-hecca717_0.conda + sha256: 2ece599a2a1090eb70916061ab8b49670ee9b143e1f3b41efa0e32e0336a9465 + md5: 641ddee63cb39856291275114ce15d13 + depends: + - __glibc >=2.17,<3.0.a0 + - cuda-version >=13,<13.3.0a0 + - libgcc >=14 + - libstdcxx >=14 + license: LicenseRef-NVIDIA-End-User-License-Agreement + size: 31659345 + timestamp: 1776110062236 +- conda: https://conda.anaconda.org/conda-forge/linux-64/libopenblas-0.3.32-pthreads_h94d23a6_0.conda + sha256: 6dc30b28f32737a1c52dada10c8f3a41bc9e021854215efca04a7f00487d09d9 + md5: 89d61bc91d3f39fda0ca10fcd3c68594 + depends: + - __glibc >=2.17,<3.0.a0 + - libgcc >=14 + - libgfortran + - libgfortran5 >=14.3.0 + constrains: + - openblas >=0.3.32,<0.3.33.0a0 + license: BSD-3-Clause + license_family: BSD + size: 5928890 + timestamp: 1774471724897 +- conda: https://conda.anaconda.org/conda-forge/linux-64/libsqlite-3.53.0-hf4e2dac_0.conda + sha256: ec37c79f737933bbac965f5dc0f08ef2790247129a84bb3114fad4900adce401 + md5: 810d83373448da85c3f673fbcb7ad3a3 + depends: + - __glibc >=2.17,<3.0.a0 + - icu >=78.3,<79.0a0 + - libgcc >=14 + - libzlib >=1.3.2,<2.0a0 + license: blessing + size: 958864 + timestamp: 1775753750179 +- conda: https://conda.anaconda.org/conda-forge/linux-64/libstdcxx-15.2.0-h934c35e_18.conda + sha256: 78668020064fdaa27e9ab65cd2997e2c837b564ab26ce3bf0e58a2ce1a525c6e + md5: 1b08cd684f34175e4514474793d44bcb + depends: + - __glibc >=2.17,<3.0.a0 + - libgcc 15.2.0 he0feb66_18 + constrains: + - libstdcxx-ng ==15.2.0=*_18 + license: GPL-3.0-only WITH GCC-exception-3.1 + license_family: GPL + size: 5852330 + timestamp: 1771378262446 +- conda: 
https://conda.anaconda.org/conda-forge/linux-64/libsystemd0-257.13-hd0affe5_0.conda + sha256: c5008b602cb5c819f7b52d418b3ed17e1818cbbf6705b189e7ab36bb70cce3d8 + md5: 8ee3cb7f64be0e8c4787f3a4dbe024e6 + depends: + - __glibc >=2.17,<3.0.a0 + - libcap >=2.77,<2.78.0a0 + - libgcc >=14 + license: LGPL-2.1-or-later + size: 492799 + timestamp: 1773797095649 +- conda: https://conda.anaconda.org/conda-forge/linux-64/libudev1-257.13-hd0affe5_0.conda + sha256: 1a1e367c04d66030aa93b4d33905f7f6fbb59cfc292e816fe3e9c1e8b3f4d1e2 + md5: 2c2270f93d6f9073cbf72d821dfc7d72 + depends: + - __glibc >=2.17,<3.0.a0 + - libcap >=2.77,<2.78.0a0 + - libgcc >=14 + license: LGPL-2.1-or-later + size: 145087 + timestamp: 1773797108513 +- conda: https://conda.anaconda.org/conda-forge/linux-64/libuuid-2.42-h5347b49_0.conda + sha256: bc1b08c92626c91500fd9f26f2c797f3eb153b627d53e9c13cd167f1e12b2829 + md5: 38ffe67b78c9d4de527be8315e5ada2c + depends: + - __glibc >=2.17,<3.0.a0 + - libgcc >=14 + license: BSD-3-Clause + license_family: BSD + size: 40297 + timestamp: 1775052476770 +- conda: https://conda.anaconda.org/conda-forge/linux-64/libzlib-1.3.2-h25fd6f3_2.conda + sha256: 55044c403570f0dc26e6364de4dc5368e5f3fc7ff103e867c487e2b5ab2bcda9 + md5: d87ff7921124eccd67248aa483c23fec + depends: + - __glibc >=2.17,<3.0.a0 + constrains: + - zlib 1.3.2 *_2 + license: Zlib + license_family: Other + size: 63629 + timestamp: 1774072609062 +- conda: https://conda.anaconda.org/conda-forge/linux-64/ncurses-6.6-hdb14827_0.conda + sha256: fc89f74bbe362fb29fa3c037697a89bec140b346a2469a90f7936d1d7ea4d8a3 + md5: fc21868a1a5aacc937e7a18747acb8a5 + depends: + - __glibc >=2.17,<3.0.a0 + - libgcc >=14 + license: X11 AND BSD-3-Clause + size: 918956 + timestamp: 1777422145199 +- conda: https://conda.anaconda.org/conda-forge/noarch/nodeenv-1.10.0-pyhd8ed1ab_0.conda + sha256: 4fa40e3e13fc6ea0a93f67dfc76c96190afd7ea4ffc1bac2612d954b42cdc3ee + md5: eb52d14a901e23c39e9e7b4a1a5c015f + depends: + - python >=3.10 + - setuptools + 
license: BSD-3-Clause + license_family: BSD + size: 40866 + timestamp: 1766261270149 +- conda: https://conda.anaconda.org/conda-forge/linux-64/numpy-2.4.3-py314h2b28147_0.conda + sha256: f2ba8cb0d86a6461a6bcf0d315c80c7076083f72c6733c9290086640723f79ec + md5: 36f5b7eb328bdc204954a2225cf908e2 + depends: + - python + - libstdcxx >=14 + - libgcc >=14 + - __glibc >=2.17,<3.0.a0 + - python_abi 3.14.* *_cp314 + - libcblas >=3.9.0,<4.0a0 + - liblapack >=3.9.0,<4.0a0 + - libblas >=3.9.0,<4.0a0 + constrains: + - numpy-base <0a0 + license: BSD-3-Clause + license_family: BSD + size: 8927860 + timestamp: 1773839233468 +- conda: https://conda.anaconda.org/conda-forge/linux-64/openssl-3.6.2-h35e630c_0.conda + sha256: c0ef482280e38c71a08ad6d71448194b719630345b0c9c60744a2010e8a8e0cb + md5: da1b85b6a87e141f5140bb9924cecab0 + depends: + - __glibc >=2.17,<3.0.a0 + - ca-certificates + - libgcc >=14 + license: Apache-2.0 + license_family: Apache + size: 3167099 + timestamp: 1775587756857 +- conda: https://conda.anaconda.org/conda-forge/noarch/platformdirs-4.9.6-pyhcf101f3_0.conda + sha256: 8f29915c172f1f7f4f7c9391cd5dac3ebf5d13745c8b7c8006032615246345a5 + md5: 89c0b6d1793601a2a3a3f7d2d3d8b937 + depends: + - python >=3.10 + - python + license: MIT + license_family: MIT + size: 25862 + timestamp: 1775741140609 +- conda: https://conda.anaconda.org/conda-forge/noarch/pre-commit-4.6.0-pyha770c72_0.conda + sha256: 716960bf0a9eb334458a26b3bdcb17b8d0786062138a4f48c7f335c8418c5d0b + md5: 7859736b4f8ebe6c8481bf48d91c9a1e + depends: + - cfgv >=2.0.0 + - identify >=1.0.0 + - nodeenv >=0.11.1 + - python >=3.10 + - pyyaml >=5.1 + - virtualenv >=20.10.0 + license: MIT + license_family: MIT + size: 201606 + timestamp: 1776858157327 +- conda: https://conda.anaconda.org/conda-forge/noarch/pycparser-2.22-pyh29332c3_1.conda + sha256: 79db7928d13fab2d892592223d7570f5061c192f27b9febd1a418427b719acc6 + md5: 12c566707c80111f9799308d9e265aef + depends: + - python >=3.9 + - python + license: BSD-3-Clause + 
license_family: BSD + size: 110100 + timestamp: 1733195786147 +- conda: https://conda.anaconda.org/conda-forge/linux-64/pyperf-2.9.0-py314hdafbbf9_0.conda + sha256: 438c41b42530874928733299ca815f5994f36996c86024f3f37ca220ed910a07 + md5: ed166875b3876d5d7e6e39d2e8d1c6e3 + depends: + - python >=3.14,<3.15.0a0 + - python_abi 3.14.* *_cp314 + - six + license: MIT + license_family: MIT + size: 273897 + timestamp: 1765980972868 +- conda: https://conda.anaconda.org/conda-forge/linux-64/python-3.14.4-habeac84_100_cp314.conda + build_number: 100 + sha256: dec247c5badc811baa34d6085df9d0465535883cf745e22e8d79092ad54a3a7b + md5: a443f87920815d41bfe611296e507995 + depends: + - __glibc >=2.17,<3.0.a0 + - bzip2 >=1.0.8,<2.0a0 + - ld_impl_linux-64 >=2.36.1 + - libexpat >=2.7.5,<3.0a0 + - libffi >=3.5.2,<3.6.0a0 + - libgcc >=14 + - liblzma >=5.8.2,<6.0a0 + - libmpdec >=4.0.0,<5.0a0 + - libsqlite >=3.52.0,<4.0a0 + - libuuid >=2.42,<3.0a0 + - libzlib >=1.3.2,<2.0a0 + - ncurses >=6.5,<7.0a0 + - openssl >=3.5.6,<4.0a0 + - python_abi 3.14.* *_cp314 + - readline >=8.3,<9.0a0 + - tk >=8.6.13,<8.7.0a0 + - tzdata + - zstd >=1.5.7,<1.6.0a0 + license: Python-2.0 + size: 36705460 + timestamp: 1775614357822 + python_site_packages_path: lib/python3.14/site-packages +- conda: https://conda.anaconda.org/conda-forge/noarch/python-discovery-1.2.2-pyhcf101f3_0.conda + sha256: 498ad019d75ba31c7891dc6d9efc8a7ed48cd5d5973f3a9377eb1b174577d3db + md5: feb2e11368da12d6ce473b6573efab41 + depends: + - python >=3.10 + - filelock >=3.15.4 + - platformdirs <5,>=4.3.6 + - python + license: MIT + license_family: MIT + size: 34341 + timestamp: 1775586706825 +- conda: https://conda.anaconda.org/conda-forge/noarch/python_abi-3.14-8_cp314.conda + build_number: 8 + sha256: ad6d2e9ac39751cc0529dd1566a26751a0bf2542adb0c232533d32e176e21db5 + md5: 0539938c55b6b1a59b560e843ad864a4 + constrains: + - python 3.14.* *_cp314 + license: BSD-3-Clause + license_family: BSD + size: 6989 + timestamp: 1752805904792 +- conda: 
https://conda.anaconda.org/conda-forge/linux-64/pyyaml-6.0.3-py314h67df5f8_1.conda + sha256: b318fb070c7a1f89980ef124b80a0b5ccf3928143708a85e0053cde0169c699d + md5: 2035f68f96be30dc60a5dfd7452c7941 + depends: + - __glibc >=2.17,<3.0.a0 + - libgcc >=14 + - python >=3.14,<3.15.0a0 + - python_abi 3.14.* *_cp314 + - yaml >=0.2.5,<0.3.0a0 + license: MIT + license_family: MIT + size: 202391 + timestamp: 1770223462836 +- conda: https://conda.anaconda.org/conda-forge/linux-64/rdma-core-61.0-h192683f_0.conda + sha256: 8e0b7962cf8bec9a016cd91a6c6dc1f9ebc8e7e316b1d572f7b9047d0de54717 + md5: d487d93d170e332ab39803e05912a762 + depends: + - __glibc >=2.17,<3.0.a0 + - libgcc >=14 + - libnl >=3.11.0,<4.0a0 + - libstdcxx >=14 + - libsystemd0 >=257.10 + - libudev1 >=257.10 + license: Linux-OpenIB + license_family: BSD + size: 1268666 + timestamp: 1769154883613 +- conda: https://conda.anaconda.org/conda-forge/linux-64/readline-8.3-h853b02a_0.conda + sha256: 12ffde5a6f958e285aa22c191ca01bbd3d6e710aa852e00618fa6ddc59149002 + md5: d7d95fc8287ea7bf33e0e7116d2b95ec + depends: + - __glibc >=2.17,<3.0.a0 + - libgcc >=14 + - ncurses >=6.5,<7.0a0 + license: GPL-3.0-only + license_family: GPL + size: 345073 + timestamp: 1765813471974 +- conda: https://conda.anaconda.org/conda-forge/noarch/setuptools-82.0.1-pyh332efcf_0.conda + sha256: 82088a6e4daa33329a30bc26dc19a98c7c1d3f05c0f73ce9845d4eab4924e9e1 + md5: 8e194e7b992f99a5015edbd4ebd38efd + depends: + - python >=3.10 + license: MIT + license_family: MIT + size: 639697 + timestamp: 1773074868565 +- conda: https://conda.anaconda.org/conda-forge/noarch/six-1.17.0-pyhe01879c_1.conda + sha256: 458227f759d5e3fcec5d9b7acce54e10c9e1f4f4b7ec978f3bfd54ce4ee9853d + md5: 3339e3b65d58accf4ca4fb8748ab16b3 + depends: + - python >=3.9 + - python + license: MIT + license_family: MIT + size: 18455 + timestamp: 1753199211006 +- conda: https://conda.anaconda.org/conda-forge/linux-64/tk-8.6.13-noxft_h366c992_103.conda + sha256: 
cafeec44494f842ffeca27e9c8b0c27ed714f93ac77ddadc6aaf726b5554ebac + md5: cffd3bdd58090148f4cfcd831f4b26ab + depends: + - __glibc >=2.17,<3.0.a0 + - libgcc >=14 + - libzlib >=1.3.1,<2.0a0 + constrains: + - xorg-libx11 >=1.8.12,<2.0a0 + license: TCL + license_family: BSD + size: 3301196 + timestamp: 1769460227866 +- conda: https://conda.anaconda.org/conda-forge/noarch/typing_extensions-4.15.0-pyhcf101f3_0.conda + sha256: 032271135bca55aeb156cee361c81350c6f3fb203f57d024d7e5a1fc9ef18731 + md5: 0caa1af407ecff61170c9437a808404d + depends: + - python >=3.10 + - python + license: PSF-2.0 + license_family: PSF + size: 51692 + timestamp: 1756220668932 +- conda: https://conda.anaconda.org/conda-forge/noarch/tzdata-2025c-hc9c84f9_1.conda + sha256: 1d30098909076af33a35017eed6f2953af1c769e273a0626a04722ac4acaba3c + md5: ad659d0a2b3e47e38d829aa8cad2d610 + license: LicenseRef-Public-Domain + size: 119135 + timestamp: 1767016325805 +- conda: https://conda.anaconda.org/conda-forge/linux-64/ukkonen-1.1.0-py314h9891dd4_0.conda + sha256: c84034056dc938c853e4f61e72e5bd37e2ec91927a661fb9762f678cbea52d43 + md5: 5d3c008e54c7f49592fca9c32896a76f + depends: + - __glibc >=2.17,<3.0.a0 + - cffi + - libgcc >=14 + - libstdcxx >=14 + - python >=3.14,<3.15.0a0 + - python_abi 3.14.* *_cp314 + license: MIT + license_family: MIT + size: 15004 + timestamp: 1769438727085 +- conda: https://conda.anaconda.org/conda-forge/noarch/virtualenv-21.3.0-pyhcf101f3_0.conda + sha256: defaf2bc2a3cf6f1455149531e8be4d03e18eb1d022ffe4f4d964d49bbf0fe34 + md5: da6e70a64226740cef159121dbe40b95 + depends: + - python >=3.10 + - distlib >=0.3.7,<1 + - filelock <4,>=3.24.2 + - importlib-metadata >=6.6 + - platformdirs >=3.9.1,<5 + - python-discovery >=1 + - typing_extensions >=4.13.2 + - python + license: MIT + license_family: MIT + size: 5161814 + timestamp: 1777321763628 +- conda: https://conda.anaconda.org/conda-forge/linux-64/yaml-0.2.5-h280c20c_3.conda + sha256: 
6d9ea2f731e284e9316d95fa61869fe7bbba33df7929f82693c121022810f4ad + md5: a77f85f77be52ff59391544bfe73390a + depends: + - libgcc >=14 + - __glibc >=2.17,<3.0.a0 + license: MIT + license_family: MIT + size: 85189 + timestamp: 1753484064210 +- conda: https://conda.anaconda.org/conda-forge/noarch/zipp-3.23.1-pyhcf101f3_0.conda + sha256: 523616c0530d305d2216c2b4a8dfd3872628b60083255b89c5e0d8c42e738cca + md5: e1c36c6121a7c9c76f2f148f1e83b983 + depends: + - python >=3.10 + - python + license: MIT + license_family: MIT + size: 24461 + timestamp: 1776131454755 +- conda: https://conda.anaconda.org/conda-forge/linux-64/zstd-1.5.7-hb78ec9c_6.conda + sha256: 68f0206ca6e98fea941e5717cec780ed2873ffabc0e1ed34428c061e2c6268c7 + md5: 4a13eeac0b5c8e5b8ab496e6c4ddd829 + depends: + - __glibc >=2.17,<3.0.a0 + - libzlib >=1.3.1,<2.0a0 + license: BSD-3-Clause + license_family: BSD + size: 601375 + timestamp: 1764777111296 diff --git a/benchmarks/cuda_core/pixi.toml b/benchmarks/cuda_core/pixi.toml new file mode 100644 index 00000000000..419dfd91b92 --- /dev/null +++ b/benchmarks/cuda_core/pixi.toml @@ -0,0 +1,52 @@ +# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# SPDX-License-Identifier: Apache-2.0 + +[workspace] +channels = ["conda-forge"] +platforms = ["linux-64"] +preview = ["pixi-build"] +channel-priority = "disabled" + +[feature.cu13.system-requirements] +cuda = "13" + +[feature.cu13-pinned.dependencies] +cuda-version = "13.1.*" + +[feature.cu13-source.dependencies] +cuda-version = "13.*" + +[feature.bench.dependencies] +python = "3.14.*" +pyperf = "*" +numpy = "*" + +[feature.dev.dependencies] +pre-commit = "*" + +# Wheel environment: released cuda-core from conda-forge. +[feature.core-wheel.dependencies] +cuda-core = "==0.7.0" + +# Source environment: build cuda-core + cuda-bindings from the in-tree +# sources so local changes are exercised. Mirrors cuda_core/pixi.toml. 
+[feature.core-source.dependencies] +cuda-core = { path = "../../cuda_core" } +cuda-bindings = { path = "../../cuda_bindings" } + +[environments] +wheel = { features = ["cu13", "cu13-pinned", "bench", "dev", "core-wheel"] } +source = { features = ["cu13", "cu13-source", "bench", "dev", "core-source"] } + +[target.linux.tasks.bench] +cmd = ["python", "$PIXI_PROJECT_ROOT/run_pyperf.py"] + +[target.linux.tasks.bench-smoke-test] +cmd = ["python", "$PIXI_PROJECT_ROOT/run_pyperf.py", "--debug-single-value"] + +[target.linux.tasks.bench-compare] +cmd = ["python", "$PIXI_PROJECT_ROOT/compare.py"] + +[target.linux.tasks.lint] +cmd = ["pre-commit", "run", "--all-files"] diff --git a/benchmarks/cuda_core/run_pyperf.py b/benchmarks/cuda_core/run_pyperf.py new file mode 100644 index 00000000000..1f0c4b130b1 --- /dev/null +++ b/benchmarks/cuda_core/run_pyperf.py @@ -0,0 +1,32 @@ +# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# SPDX-License-Identifier: Apache-2.0 + +"""Entry point for the cuda.core latency benchmark suite. + +The pyperf runner lives in the cuda_bindings suite. Reuse it by putting +that directory on sys.path, then call main() with this suite's paths. +pyperf workers re-execute this script, so the sys.path tweak is done +before the worker can import anything. +""" + +import sys +from pathlib import Path + +HERE = Path(__file__).resolve().parent +CUDA_BINDINGS_SUITE = HERE.parent / "cuda_bindings" + +# Share the runner with cuda_bindings; keep cuda_core's own modules +# (benchmarks/, runtime.py) resolvable via the script's own directory. 
+if str(CUDA_BINDINGS_SUITE) not in sys.path: + sys.path.append(str(CUDA_BINDINGS_SUITE)) + +from runner.main import main + +if __name__ == "__main__": + main( + bench_dir=HERE / "benchmarks", + default_output=HERE / "results-python.json", + module_name_prefix="cuda_core_bench", + bench_filter_env_var="CUDA_CORE_BENCH_FILTER", + ) diff --git a/benchmarks/cuda_core/runtime.py b/benchmarks/cuda_core/runtime.py new file mode 100644 index 00000000000..d2942f40240 --- /dev/null +++ b/benchmarks/cuda_core/runtime.py @@ -0,0 +1,53 @@ +# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# SPDX-License-Identifier: Apache-2.0 + +"""Shared cuda.core setup for the latency benchmarks. + +Holds the one Device/stream/ObjectCode instance that every bench module +reuses. No state is shared with the cuda_bindings suite — each suite +runs standalone in its own process. +""" + +import atexit + +from cuda.core import Device, Program, ProgramOptions + +_device: Device | None = None +_modules: list = [] + + +def ensure_device() -> Device: + """Return the primary Device, initializing it on first call.""" + global _device + if _device is not None: + return _device + dev = Device() + dev.set_current() + _device = dev + return dev + + +def register_module(module) -> object: + """Keep a reference to an ObjectCode so its kernels stay alive.""" + _modules.append(module) + return module + + +def compile_module(kernel_source: str, name_expressions: tuple[str, ...]) -> object: + """Compile a CUDA C++ source with NVRTC via cuda.core.Program. + + name_expressions must list every __global__ function the caller + intends to fetch via ObjectCode.get_kernel(). 
+ """ + dev = ensure_device() + options = ProgramOptions(arch=f"sm_{dev.arch}", fma=False) + prog = Program(kernel_source, code_type="c++", options=options) + return register_module(prog.compile("cubin", name_expressions=name_expressions)) + + +def cleanup() -> None: + _modules.clear() + + +atexit.register(cleanup) From a711361244d4ca5819a94995cb534a2962106372 Mon Sep 17 00:00:00 2001 From: Daniel Rodriguez Date: Fri, 1 May 2026 12:53:01 -0500 Subject: [PATCH 2/4] cuda.core benchmarks From 2144446abde43deb429354107cf9fc8303a4b1c9 Mon Sep 17 00:00:00 2001 From: Daniel Rodriguez Date: Fri, 1 May 2026 13:27:06 -0500 Subject: [PATCH 3/4] cuda.core benchmarks --- benchmarks/cuda_bindings/runner/main.py | 7 +- benchmarks/cuda_bindings/tests/test_runner.py | 2 +- .../cuda_core/benchmarks/bench_launch.py | 14 ++-- benchmarks/cuda_core/compare.py | 71 +++++++++++++++---- 4 files changed, 74 insertions(+), 20 deletions(-) diff --git a/benchmarks/cuda_bindings/runner/main.py b/benchmarks/cuda_bindings/runner/main.py index f4a138fd4e4..9c984c340d6 100644 --- a/benchmarks/cuda_bindings/runner/main.py +++ b/benchmarks/cuda_bindings/runner/main.py @@ -120,7 +120,7 @@ def _collect_skipped_benchmarks( def discover_benchmarks( - bench_dir: Path = BENCH_DIR, + bench_dir: Path | None = None, module_name_prefix: str = DEFAULT_MODULE_NAME_PREFIX, ) -> dict[str, Callable[[int], float]]: """Discover bench_ functions. @@ -129,6 +129,11 @@ def discover_benchmarks( where it calls the operation `loops` times and returns the total elapsed time in seconds (using time.perf_counter). """ + # Resolve the default inside the call so tests (and embedders) can + # monkeypatch ``BENCH_DIR`` at the module level — Python binds default + # args at def-time, so a literal default would ignore later patches. 
+ if bench_dir is None: + bench_dir = BENCH_DIR registry: dict[str, Callable[[int], float]] = {} for module_path in sorted(bench_dir.glob("bench_*.py")): module_name = module_path.stem diff --git a/benchmarks/cuda_bindings/tests/test_runner.py b/benchmarks/cuda_bindings/tests/test_runner.py index f26baf8f5b5..56d88444c9e 100644 --- a/benchmarks/cuda_bindings/tests/test_runner.py +++ b/benchmarks/cuda_bindings/tests/test_runner.py @@ -135,7 +135,7 @@ def test_discover_benchmarks_is_lazy(monkeypatch, tmp_path): def test_ensure_pyperf_worker_env_preserves_existing_args(monkeypatch): runner_main = load_runner_main(monkeypatch) - for env_var in runner_main.PYPERF_INHERITED_ENV_VARS: + for env_var in runner_main.BASE_PYPERF_INHERITED_ENV_VARS: monkeypatch.delenv(env_var, raising=False) monkeypatch.setenv("CUDA_PATH", "/opt/cuda") monkeypatch.setenv("LD_LIBRARY_PATH", "/opt/cuda/lib64") diff --git a/benchmarks/cuda_core/benchmarks/bench_launch.py b/benchmarks/cuda_core/benchmarks/bench_launch.py index 09d418b0a05..c08f866c63c 100644 --- a/benchmarks/cuda_core/benchmarks/bench_launch.py +++ b/benchmarks/cuda_core/benchmarks/bench_launch.py @@ -96,11 +96,15 @@ def _ensure_launch_state() -> None: module = compile_module(KERNEL_SOURCE, KERNEL_NAMES) - # Pre-allocate buffers for the kernel args. Use ints (raw pointer - # addresses) in the launch hot path so ParamHolder skips the Buffer - # type check and goes through its int fast-path. - float_buf = DEV.allocate(4) - int_bufs_512 = tuple(DEV.allocate(4) for _ in range(512)) + # Pre-allocate buffers for the kernel args. Allocate on STREAM so the + # pool-async allocs are stream-ordered with the launches we will issue + # on the same stream below — matches the cuda.bindings bench, which + # uses synchronous cuMemAlloc and therefore has the pointers ready + # before the first timed launch. Use ints (raw pointer addresses) in + # the launch hot path so ParamHolder skips the Buffer type check and + # goes through its int fast-path. 
+ float_buf = DEV.allocate(4, STREAM) + int_bufs_512 = tuple(DEV.allocate(4, STREAM) for _ in range(512)) int_ptrs_512 = tuple(int(b.handle) for b in int_bufs_512) MODULE = module diff --git a/benchmarks/cuda_core/compare.py b/benchmarks/cuda_core/compare.py index 99cbca31a69..2cb1ed92bd0 100644 --- a/benchmarks/cuda_core/compare.py +++ b/benchmarks/cuda_core/compare.py @@ -20,6 +20,33 @@ DEFAULT_CORE = HERE / "results-python.json" DEFAULT_BINDINGS = HERE.parent / "cuda_bindings" / "results-python.json" +# Benchmark IDs where cuda.core and cuda.bindings exercise *different* +# underlying driver calls or hit a cuda.core-side cache, so the "Delta" +# column is NOT pure Python wrapper overhead. See BENCHMARK_PLAN.md's +# "Audit notes" section for a full explanation of each entry. +DIFFERENT_CODEPATH_BENCHMARKS: frozenset[str] = frozenset( + { + # cuCtxGetDevice (core) vs cuCtxGetCurrent (bindings). + "ctx_device.ctx_get_current", + # TLS list lookup (core) vs cuDeviceGet (bindings). + "ctx_device.device_get", + # DeviceProperties dict cache hit (core) vs cuDeviceGetAttribute + # (bindings) on every iteration. + "ctx_device.device_get_attribute", + # cuMemAllocFromPoolAsync on default stream (core) vs synchronous + # cuMemAlloc (bindings). + "memory.mem_alloc_free", + # cuLaunchKernelEx + per-call ParamHolder (core) vs cuLaunchKernel + # with pre-built arg tuple (bindings). 
+ "launch.launch_empty_kernel", + "launch.launch_small_kernel", + "launch.launch_16_args", + "launch.launch_256_args", + "launch.launch_512_args", + } +) +DIFFERENT_CODEPATH_MARKER = "*" + def load_benchmarks(path: Path) -> dict[str, list[float]]: """Load a pyperf JSON file and return {name: [values]}.""" @@ -59,11 +86,11 @@ def fmt_ns(seconds: float) -> str: return f"{seconds * 1e9:,.0f}" -def fmt_overhead_ns(core_mean: float, bindings_mean: float) -> str: +def fmt_delta_ns(core_mean: float, bindings_mean: float) -> str: return f"{(core_mean - bindings_mean) * 1e9:+,.0f}" -def fmt_overhead_pct(core_mean: float, bindings_mean: float) -> str: +def fmt_delta_pct(core_mean: float, bindings_mean: float) -> str: if bindings_mean <= 0.0: return "-" pct = (core_mean - bindings_mean) / bindings_mean * 100 @@ -99,26 +126,42 @@ def main() -> None: sys.exit(1) all_names = sorted(set(core_benchmarks) | set(bindings_benchmarks)) - name_width = max(len(n) for n in all_names) + + # Reserve a trailing column of space for the "different codepath" marker + # so it does not collide visually with the benchmark ID. + display_names = { + name: f"{name} {DIFFERENT_CODEPATH_MARKER}" if name in DIFFERENT_CODEPATH_BENCHMARKS else name + for name in all_names + } + name_width = max(len(display_names[n]) for n in all_names) name_width = max(name_width, len("Benchmark")) bind_w = 14 core_w = 14 rsd_w = 8 - oh_ns_w = 12 - oh_pct_w = 10 + delta_ns_w = 12 + delta_pct_w = 10 if bindings_benchmarks: header = ( f"{'Benchmark':<{name_width}} " f"{'bindings (ns)':>{bind_w}} {'RSD':>{rsd_w}} " f"{'core (ns)':>{core_w}} {'RSD':>{rsd_w}} " - f"{'Overhead ns':>{oh_ns_w}} {'Overhead %':>{oh_pct_w}}" + f"{'Delta ns':>{delta_ns_w}} {'Delta %':>{delta_pct_w}}" ) else: header = f"{'Benchmark':<{name_width}} {'core (ns)':>{core_w}} {'RSD':>{rsd_w}}" sep = "-" * len(header) + + if bindings_benchmarks: + # Keep legend lines shorter than the table so they don't overflow. 
+ print("Delta = core mean - bindings mean (positive = cuda.core slower).") + print(f"{DIFFERENT_CODEPATH_MARKER} marks benchmarks where core and bindings exercise different") + print(" underlying driver calls or hit a cuda.core cache — see BENCHMARK_PLAN.md") + print(" (Audit notes) for details on each row.") + print() + print(sep) print(header) print(sep) @@ -136,21 +179,23 @@ def main() -> None: bind_rsd = fmt_rsd(bind_stats[2]) if bind_stats else "-" if core_stats and bind_stats: - overhead_ns_str = fmt_overhead_ns(core_stats[0], bind_stats[0]) - overhead_pct_str = fmt_overhead_pct(core_stats[0], bind_stats[0]) + delta_ns_str = fmt_delta_ns(core_stats[0], bind_stats[0]) + delta_pct_str = fmt_delta_pct(core_stats[0], bind_stats[0]) else: - overhead_ns_str = "-" - overhead_pct_str = "-" + delta_ns_str = "-" + delta_pct_str = "-" + + display_name = display_names[name] if bindings_benchmarks: print( - f"{name:<{name_width}} " + f"{display_name:<{name_width}} " f"{bind_str:>{bind_w}} {bind_rsd:>{rsd_w}} " f"{core_str:>{core_w}} {core_rsd:>{rsd_w}} " - f"{overhead_ns_str:>{oh_ns_w}} {overhead_pct_str:>{oh_pct_w}}" + f"{delta_ns_str:>{delta_ns_w}} {delta_pct_str:>{delta_pct_w}}" ) else: - print(f"{name:<{name_width}} {core_str:>{core_w}} {core_rsd:>{rsd_w}}") + print(f"{display_name:<{name_width}} {core_str:>{core_w}} {core_rsd:>{rsd_w}}") print(sep) From c25b82f26ee089c5bcc03e77de3a230665d16b49 Mon Sep 17 00:00:00 2001 From: Daniel Rodriguez Date: Fri, 1 May 2026 13:43:15 -0500 Subject: [PATCH 4/4] cuda.core benchmarks --- benchmarks/cuda_core/BENCHMARK_PLAN.md | 111 ------------------------- benchmarks/cuda_core/compare.py | 24 ++++-- 2 files changed, 17 insertions(+), 118 deletions(-) delete mode 100644 benchmarks/cuda_core/BENCHMARK_PLAN.md diff --git a/benchmarks/cuda_core/BENCHMARK_PLAN.md b/benchmarks/cuda_core/BENCHMARK_PLAN.md deleted file mode 100644 index 93d29a735a2..00000000000 --- a/benchmarks/cuda_core/BENCHMARK_PLAN.md +++ /dev/null @@ -1,111 +0,0 @@ 
-# cuda.core Benchmark Plan - -## Goal - -Measure the **additional Python-side latency** that `cuda.core` adds on top -of `cuda.bindings` for every public API with a clear cuda.bindings -counterpart. Target is **< 1 µs** of extra overhead per call. - -The baseline for this suite is not C++: it is the cuda.bindings Python -results file (`../cuda_bindings/results-python.json`). Benchmark IDs are -kept identical across suites so a direct diff is possible — see -`compare.py`. - -## Phase 1 coverage - -Benchmark IDs shipped in this PR (all map 1:1 to cuda.bindings): - -| ID | cuda.core surface | -|-------------------------------------------|----------------------------------------------------| -| `ctx_device.ctx_get_current` | `Device()` | -| `ctx_device.ctx_set_current` | `dev.set_current()` | -| `ctx_device.device_get` | `Device(0)` | -| `ctx_device.device_get_attribute` | `dev.properties.compute_capability_major` (cached) | -| `stream.stream_create_destroy` | `dev.create_stream()` + `stream.close()` | -| `stream.stream_synchronize` | `stream.sync()` | -| `event.event_create_destroy` | `dev.create_event()` + `event.close()` | -| `event.event_record` | `stream.record(event)` | -| `event.event_query` | `event.is_done` | -| `event.event_synchronize` | `event.sync()` | -| `memory.mem_alloc_free` | `dev.allocate(size)` + `buf.close()` (async pool!) | -| `memory.mem_alloc_async_free_async` | `dev.allocate(size, stream)` + `buf.close(stream)` | -| `launch.launch_empty_kernel` | `launch(stream, config, kernel)` | -| `launch.launch_small_kernel` | `launch(..., ptr)` | -| `launch.launch_16_args` | `launch(..., *16 ptrs)` | -| `launch.launch_256_args` | `launch(..., *256 ptrs)` | -| `launch.launch_512_args` | `launch(..., *512 ptrs)` | - -## Intentionally not covered in Phase 1 - -- `ctx_device.ctx_get_device`, `ctx_device.device_primary_ctx_retain`: - cuda.core abstracts CUDA contexts away — no direct counterpart. 
-- `enum.*`: cuda.core does not re-export cuda.bindings enums; those - benches measure a cuda.bindings-specific cost. -- `stream.stream_query`: no public `Stream.query()` in cuda.core. -- `launch.launch_*_pre_packed`: pre-packing is a cuda.bindings-specific - optimization of its tuple-of-args API; cuda.core's `ParamHolder` - handles packing internally on every call. -- `launch.launch_512_bools / _ints / _doubles / _bytes / _longlongs`: - non-pointer scalar arg variants — deferred to Phase 2. -- `launch.launch_2048b`: struct-by-value arg — requires a - `TensorMapDescriptor`/ctypes path that is not yet settled in cuda.core. -- `memory.memcpy_htod / _dtoh / _dtod`: cuda.core's `Buffer.copy_to / - copy_from` only go buffer-to-buffer through `cuMemcpyAsync`; pairing - with cuda.bindings' synchronous `cuMemcpyDtoD` / `HtoD` / `DtoH` would - be apples-to-oranges. Deferred until the comparable host-memory - resource path is finalized. -- NVRTC / module benches: cuda.core's `Program` / `ObjectCode` pipeline - is meaningfully different from raw NVRTC; deserves its own set of - bench functions rather than reusing cuda.bindings IDs. - -## Audit notes: known driver-call mismatches - -The IDs above match cuda.bindings 1:1 at the *public API* level, but a few -measure a different underlying driver call. Readers of `compare.py` should -know which deltas are "pure cuda.core Python overhead" vs. a deliberate -different driver path: - -- `ctx_device.ctx_get_current`: `Device()` reads a TLS-cached device - object; cuda.bindings calls `cuCtxGetCurrent` every iteration. Expect - cuda.core to be faster. Not apples-to-apples at the driver level; - apples-to-apples at the user-facing "give me the current device" level. -- `ctx_device.device_get_attribute`: `DeviceProperties` caches the first - lookup in a Python dict (`_get_cached_attribute`, `_device.pyx:75`). - After the first iteration this is a dict hit, not a `cuDeviceGetAttribute` - driver call. 
Expect cuda.core to be faster here too. A future - paired bench can use an uncached attribute (e.g. `compute_mode`) to - measure the wrapper overhead on the driver-call path. -- `stream.stream_create_destroy`: default `StreamOptions(nonblocking=True)` - yields the same `CU_STREAM_NON_BLOCKING` flag as the cuda.bindings - bench, but cuda.core additionally calls `cuCtxGetStreamPriorityRange` - and builds a `StreamOptions` dataclass per create — real cuda.core - overhead, fair to measure. -- `memory.mem_alloc_free`: **deliberate mismatch**. `dev.allocate(size)` - with `stream=None` routes through `_MP_allocate` → `cuMemAllocFromPoolAsync` - on the cached default stream (`_memory_pool.pyx:302`). cuda.bindings - measures the synchronous `cuMemAlloc`. The bench captures the - user-visible cost of `dev.allocate(size)`, which is what a cuda.core - user actually pays; it does **not** isolate "Python wrapper overhead - on top of `cuMemAlloc`" because cuda.core does not expose a sync - `cuMemAlloc` path. -- `memory.mem_alloc_async_free_async`: same internal path as - `mem_alloc_free` (both go through `cuMemAllocFromPoolAsync`); the - only difference is whether `default_stream()` is fetched or a stream - is passed in. Driver call matches cuda.bindings' `cuMemAllocAsync` - semantically but uses the pool-backed variant. -- `launch.*`: cuda.core uses `cuLaunchKernelEx` (takes a - `CUlaunchConfig` struct) and allocates a fresh `ParamHolder` + - `LaunchConfig._to_native_launch_config()` per call. cuda.bindings - uses `cuLaunchKernel` with pre-built arg tuples. The delta captures - both the Python-side per-call work and the `Ex` vs non-`Ex` driver - cost; this is real and expected cuda.core overhead. - -## Next up (not in this PR) - -1. Scalar launch variants (512 bools/ints/doubles/bytes/longlongs) so - arg-packing overhead is covered beyond the pointer fast-path. -2. Buffer-based memcpy benchmarks once the host-memory resource path is - stable in cuda.core. -3. 
NVRTC / `Program` / `ObjectCode` latency benches. -4. TMA (`TensorMapDescriptor`) benches when cuda.core's CCCL-backed - helper is formalised. diff --git a/benchmarks/cuda_core/compare.py b/benchmarks/cuda_core/compare.py index 2cb1ed92bd0..02027ad43bd 100644 --- a/benchmarks/cuda_core/compare.py +++ b/benchmarks/cuda_core/compare.py @@ -20,10 +20,12 @@ DEFAULT_CORE = HERE / "results-python.json" DEFAULT_BINDINGS = HERE.parent / "cuda_bindings" / "results-python.json" -# Benchmark IDs where cuda.core and cuda.bindings exercise *different* -# underlying driver calls or hit a cuda.core-side cache, so the "Delta" -# column is NOT pure Python wrapper overhead. See BENCHMARK_PLAN.md's -# "Audit notes" section for a full explanation of each entry. +# Benchmark IDs where the cuda.core path invokes a different driver +# symbol, makes an additional driver call, or hits a cuda.core-side cache +# compared to the cuda.bindings bench — i.e. the "Delta" column is NOT +# pure Python wrapper overhead on top of the same driver call. +# Unstarred rows compare like-for-like driver calls. See +# BENCHMARK_PLAN.md's "Audit notes" section for per-row rationale. DIFFERENT_CODEPATH_BENCHMARKS: frozenset[str] = frozenset( { # cuCtxGetDevice (core) vs cuCtxGetCurrent (bindings). @@ -33,9 +35,15 @@ # DeviceProperties dict cache hit (core) vs cuDeviceGetAttribute # (bindings) on every iteration. "ctx_device.device_get_attribute", + # cuStreamCreateWithPriority + cuCtxGetStreamPriorityRange (core) + # vs cuStreamCreate (bindings). + "stream.stream_create_destroy", # cuMemAllocFromPoolAsync on default stream (core) vs synchronous # cuMemAlloc (bindings). "memory.mem_alloc_free", + # cuMemAllocFromPoolAsync with explicit pool handle (core) vs + # cuMemAllocAsync with implicit default pool (bindings). + "memory.mem_alloc_async_free_async", # cuLaunchKernelEx + per-call ParamHolder (core) vs cuLaunchKernel # with pre-built arg tuple (bindings). 
"launch.launch_empty_kernel", @@ -157,9 +165,11 @@ def main() -> None: if bindings_benchmarks: # Keep legend lines shorter than the table so they don't overflow. print("Delta = core mean - bindings mean (positive = cuda.core slower).") - print(f"{DIFFERENT_CODEPATH_MARKER} marks benchmarks where core and bindings exercise different") - print(" underlying driver calls or hit a cuda.core cache — see BENCHMARK_PLAN.md") - print(" (Audit notes) for details on each row.") + print(f"{DIFFERENT_CODEPATH_MARKER} marks benchmarks where the cuda.core path invokes a different driver") + print(" symbol, makes an additional driver call, or hits a cuda.core-side cache") + print(" — so Delta is not pure Python wrapper overhead on top of the same driver") + print(" call. Unstarred rows compare like-for-like driver calls; their Delta is") + print(" wrapper overhead. See BENCHMARK_PLAN.md (Audit notes) for per-row detail.") print() print(sep)