63 commits
abdec47
wip
rparolin Mar 17, 2026
c418050
wip
rparolin Mar 17, 2026
b879fa5
fixing ci compiler errors
rparolin Mar 17, 2026
04ee3de
skipping tests that aren't supported
rparolin Mar 17, 2026
9ab3f46
cu12 support
rparolin Mar 17, 2026
bd75bc3
Merge branch 'main' into rparolin/managed_mem_advise_prefetch
rparolin Mar 17, 2026
1b1343b
Merge branch 'main' into rparolin/managed_mem_advise_prefetch
rparolin Mar 17, 2026
a948066
Moving to function from Buffer class methods to free standing functio…
rparolin Mar 17, 2026
1457599
precommit format
rparolin Mar 17, 2026
acb4024
iterating on implementation
rparolin Mar 18, 2026
d10ab07
Simplify managed-memory helpers: remove long-form aliases, cache look…
rparolin Mar 18, 2026
ae1de36
Merge branch 'main' into rparolin/managed_mem_advise_prefetch
rparolin Mar 18, 2026
c250c92
fix(test): reset _V2_BINDINGS cache so legacy-signature tests take th…
rparolin Mar 18, 2026
89329d9
fix(test): require concurrent_managed_access for advise tests that hi…
rparolin Mar 18, 2026
8a75d1b
fix: validate managed buffer before checking discard_prefetch binding…
rparolin Mar 18, 2026
9e9b1e0
refactor: extract managed memory ops into dedicated _managed_memory_o…
rparolin Mar 18, 2026
90f0711
pre-commit fix
rparolin Mar 18, 2026
b4d252c
Removing blank file
rparolin Mar 19, 2026
faaa1d8
wip
rparolin Mar 19, 2026
18786be
Merge branch 'main' into rparolin/managed_mem_advise_prefetch
rparolin Apr 6, 2026
9766ddc
Merge remote-tracking branch 'upstream/main' into rparolin/managed_me…
rparolin Apr 27, 2026
cf2f20d
fix(cuda.core): update binding_version import after upstream merge
rparolin Apr 27, 2026
db3bac2
revert: drop managed_memory shim in cuda.core.experimental
rparolin Apr 27, 2026
20d036e
feat(cuda.core): add Location dataclass for managed memory
rparolin Apr 27, 2026
c2dae53
feat(cuda.core): add _coerce_location helper
rparolin Apr 28, 2026
935c8ba
test(cuda.core): update monkeypatch target after binding_version rename
rparolin Apr 28, 2026
dc46535
refactor(cuda.core): tighten memory-attr query
rparolin Apr 28, 2026
818f5d2
feat(cuda.core): unified 1..N managed_memory.prefetch with cydriver
rparolin Apr 28, 2026
e296e72
feat(cuda.core): add managed_memory.discard
rparolin Apr 28, 2026
e697131
feat(cuda.core): unified 1..N managed_memory.discard_prefetch with cy…
rparolin Apr 28, 2026
3bc1021
feat(cuda.core): unified 1..N managed_memory.advise + drop legacy app…
rparolin Apr 28, 2026
fa23869
refactor(cuda.core): use Buffer.is_managed property in managed_memory…
rparolin Apr 28, 2026
68bdd14
docs(cuda.core): document Location, discard, and 1..N managed_memory ops
rparolin Apr 28, 2026
b4d9cbf
chore(cuda.core): drop narrative comments and tighten _coerce_locatio…
rparolin Apr 28, 2026
ee96758
chore(cuda.core): satisfy pre-commit hooks
rparolin Apr 28, 2026
d6f60f2
refactor(cuda.core): move managed_memory ops to cuda.core.utils
rparolin Apr 28, 2026
3176271
chore(cuda.core): use __all__ in utils instead of per-import noqa
rparolin Apr 28, 2026
782f6a9
chore(cuda.core): collapse nested if in Location.__post_init__ (SIM102)
rparolin Apr 28, 2026
0789bf6
test(cuda.core): share one DummyUnifiedMemoryResource per batched test
rparolin Apr 28, 2026
e0c782a
test(cuda.core): query all buffers before closing in test_batched_sam…
rparolin Apr 28, 2026
10de998
review(cuda.core): address PR #1775 feedback
rparolin Apr 30, 2026
ab9a3ab
test(cuda.core): split managed-memory ops tests into tests/memory/
rparolin Apr 30, 2026
a3f342f
test(cuda.core): fix options regex for AdviseOptions ("an" vs "a")
rparolin Apr 30, 2026
c2a9662
chore(cuda.core): drop unused utils import + trailing blank lines
rparolin Apr 30, 2026
bede674
feat(cuda.core): add ManagedBuffer subclass + Host location
rparolin Apr 30, 2026
f59af4e
chore(cuda.core): simplify ManagedBuffer per /simplify review
rparolin Apr 30, 2026
5147a7d
ci: re-trigger CI (transient cuInit INVALID_DEVICE on l4 runner)
rparolin Apr 30, 2026
2151e61
refactor(cuda.core): use libcpp.vector for batched-op C arrays (R14)
rparolin May 1, 2026
5c6d054
fix(cuda.core): restore CUDA_ERROR_NOT_INITIALIZED auto-init in _quer…
rparolin May 1, 2026
47d5609
refactor(cuda.core): make Host a plain class instead of a dataclass (R1)
rparolin May 1, 2026
a40bb81
feat(cuda.core)!: drop int location shorthand from managed-memory ops…
rparolin May 1, 2026
c43e81e
docs(cuda.core): add AccessedBySet to api_private.rst (R5)
rparolin May 1, 2026
71e9daa
docs(cuda.core): note the legacy NUMA round-trip limitation on prefer…
rparolin May 1, 2026
df928a0
refactor(cuda.core): use collections.abc.Sequence for input checks (R…
rparolin May 1, 2026
f522916
refactor(cuda.core): narrow Buffer.from_handle to Buffer-only (R3)
rparolin May 1, 2026
6204c57
refactor(cuda.core): single API surface per operation (R9, R10, R11)
rparolin May 1, 2026
36012fd
refactor(cuda.core): build advise reverse-lookup eagerly at module lo…
rparolin May 1, 2026
067fb15
refactor(cuda.core): factor shared body of _do_batch_{prefetch,discar…
rparolin May 1, 2026
a9cd713
test(cuda.core): reuse production _get_int_attr in managed-memory tes…
rparolin May 1, 2026
d75a7bd
feat(cuda.core): cu12 fallback for prefetch_batch (N3)
rparolin May 1, 2026
0af5bd4
test(cuda.core): cover AccessedBySet read methods (N7)
rparolin May 1, 2026
b0d1a21
feat(cuda.core): cu13 NUMA round-trip for ManagedBuffer.preferred_loc…
rparolin May 1, 2026
4c228eb
docs(cuda.core): replace stale utils autosummary entries
rparolin May 1, 2026
2 changes: 2 additions & 0 deletions cuda_core/cuda/core/__init__.py
@@ -32,6 +32,7 @@ def _import_versioned_module():
from cuda.core._device import Device
from cuda.core._event import Event, EventOptions
from cuda.core._graphics import GraphicsResource
from cuda.core._host import Host
from cuda.core._launch_config import LaunchConfig
from cuda.core._launcher import launch
from cuda.core._linker import Linker, LinkerOptions
@@ -41,6 +42,7 @@
DeviceMemoryResourceOptions,
GraphMemoryResource,
LegacyPinnedMemoryResource,
ManagedBuffer,
ManagedMemoryResource,
ManagedMemoryResourceOptions,
MemoryResource,
61 changes: 61 additions & 0 deletions cuda_core/cuda/core/_host.py
@@ -0,0 +1,61 @@
# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0

from __future__ import annotations


class Host:
"""Host (CPU) location for managed-memory operations.

Use one of the three forms:

* ``Host()`` — generic host (any NUMA node).
* ``Host(numa_id=N)`` — specific NUMA node ``N``.
* ``Host.numa_current()`` — NUMA node of the calling thread.

``Host`` is the symmetric counterpart of :class:`~cuda.core.Device`
for managed-memory `prefetch`, `advise`, and `discard_prefetch`
targets. Pass either a ``Device`` or a ``Host`` to those operations
and to ``ManagedBuffer.preferred_location`` / ``accessed_by``.
"""

__slots__ = ("_is_numa_current", "_numa_id")

def __init__(self, numa_id: int | None = None) -> None:
if numa_id is not None and (not isinstance(numa_id, int) or numa_id < 0):
raise ValueError(f"numa_id must be a non-negative int, got {numa_id!r}")
object.__setattr__(self, "_numa_id", numa_id)
object.__setattr__(self, "_is_numa_current", False)

@property
def numa_id(self) -> int | None:
return self._numa_id

@property
def is_numa_current(self) -> bool:
return self._is_numa_current

@classmethod
def numa_current(cls) -> Host:
"""Construct a ``Host`` referring to the calling thread's NUMA node."""
h = cls()
object.__setattr__(h, "_is_numa_current", True)
return h

def __setattr__(self, name: str, value) -> None:
raise AttributeError(f"{type(self).__name__} is immutable; cannot set {name!r}")

def __eq__(self, other: object) -> bool:
if not isinstance(other, Host):
return NotImplemented
return self._numa_id == other._numa_id and self._is_numa_current == other._is_numa_current

def __hash__(self) -> int:
return hash((Host, self._numa_id, self._is_numa_current))

def __repr__(self) -> str:
if self.is_numa_current:
return "Host.numa_current()"
if self.numa_id is None:
return "Host()"
return f"Host(numa_id={self.numa_id})"
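The value-object semantics of `Host` (three construction forms, frozen attributes, hashable identity) can be exercised with a minimal standalone sketch. `HostSketch` is a toy stand-in written for illustration; the real class is the `cuda.core._host.Host` shown above:

```python
class HostSketch:
    """Toy stand-in for cuda.core.Host: an immutable, hashable host location."""

    __slots__ = ("_numa_id", "_is_numa_current")

    def __init__(self, numa_id=None):
        if numa_id is not None and (not isinstance(numa_id, int) or numa_id < 0):
            raise ValueError(f"numa_id must be a non-negative int, got {numa_id!r}")
        # Bypass the frozen __setattr__ below, exactly once, at construction.
        object.__setattr__(self, "_numa_id", numa_id)
        object.__setattr__(self, "_is_numa_current", False)

    @classmethod
    def numa_current(cls):
        """The 'calling thread's NUMA node' form."""
        h = cls()
        object.__setattr__(h, "_is_numa_current", True)
        return h

    def __setattr__(self, name, value):
        raise AttributeError(f"{type(self).__name__} is immutable")

    def __eq__(self, other):
        if not isinstance(other, HostSketch):
            return NotImplemented
        return (self._numa_id, self._is_numa_current) == (other._numa_id, other._is_numa_current)

    def __hash__(self):
        return hash((HostSketch, self._numa_id, self._is_numa_current))


# The three forms are three distinct values:
assert HostSketch() == HostSketch()
assert HostSketch(numa_id=2) != HostSketch()
assert HostSketch.numa_current() != HostSketch()
assert len({HostSketch(), HostSketch(numa_id=2), HostSketch.numa_current()}) == 3
```

Freezing via `object.__setattr__` plus a raising `__setattr__` keeps the class usable as a dict key or set member, which the `accessed_by` set view relies on.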
1 change: 1 addition & 0 deletions cuda_core/cuda/core/_memory/__init__.py
@@ -7,6 +7,7 @@
from ._graph_memory_resource import *
from ._ipc import *
from ._legacy import *
from ._managed_buffer import ManagedBuffer
from ._managed_memory_resource import *
from ._pinned_memory_resource import *
from ._virtual_memory_resource import *
15 changes: 13 additions & 2 deletions cuda_core/cuda/core/_memory/_buffer.pxd
@@ -4,6 +4,7 @@

from libc.stdint cimport uintptr_t

from cuda.bindings cimport cydriver
from cuda.core._resource_handles cimport DevicePtrHandle
from cuda.core._stream cimport Stream

@@ -31,10 +32,20 @@ cdef class MemoryResource:
pass


# Helper function to create a Buffer from a DevicePtrHandle
# Helper function to create a Buffer from a DevicePtrHandle.
# `cls` lets callers materialize Buffer subclasses (e.g. ManagedBuffer for
# managed-memory allocations); defaults to Buffer.
cdef Buffer Buffer_from_deviceptr_handle(
DevicePtrHandle h_ptr,
size_t size,
MemoryResource mr,
object ipc_descriptor = *
object ipc_descriptor = *,
type cls = *,
)

# Memory attribute query helpers (used by _managed_memory_ops)
cdef void _init_mem_attrs(Buffer self)
cdef int _query_memory_attrs(
_MemAttrs& out,
cydriver.CUdeviceptr ptr,
) except -1 nogil
17 changes: 10 additions & 7 deletions cuda_core/cuda/core/_memory/_buffer.pyx
@@ -71,6 +71,7 @@ A type union of :obj:`~driver.CUdeviceptr`, `int` and `None` for hinting
:attr:`Buffer.handle`.
"""


cdef class Buffer:
"""Represent a handle to allocated memory.

@@ -455,12 +456,15 @@
ret = cydriver.cuPointerGetAttributes(3, attrs, <void**>vals, ptr)
HANDLE_RETURN(ret)

# TODO: HMM/ATS-enabled sysmem should also report is_managed=True; the
# CU_POINTER_ATTRIBUTE_IS_MANAGED query does not capture that yet.
out.is_managed = is_managed != 0

if memory_type == 0:
# unregistered host pointer
out.is_host_accessible = True
out.is_device_accessible = False
out.device_id = -1
out.is_managed = False
elif (
is_managed
or memory_type == cydriver.CUmemorytype.CU_MEMORYTYPE_HOST
@@ -469,12 +473,10 @@
out.is_host_accessible = True
out.is_device_accessible = True
out.device_id = device_id
out.is_managed = is_managed
elif memory_type == cydriver.CUmemorytype.CU_MEMORYTYPE_DEVICE:
out.is_host_accessible = False
out.is_device_accessible = True
out.device_id = device_id
out.is_managed = False
else:
with cython.gil:
raise ValueError(f"Unsupported memory type: {memory_type}")
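The branch structure above (with `is_managed` now hoisted out of the branches) maps `(memory_type, is_managed)` onto accessibility flags. A pure-Python sketch makes the mapping explicit; the integer constants here are stand-ins assumed for illustration, not taken from `cydriver`, and the real elided `elif` condition may cover additional memory types:

```python
# Assumed stand-ins for cydriver.CUmemorytype values (illustrative only).
CU_MEMORYTYPE_HOST = 1
CU_MEMORYTYPE_DEVICE = 2


def classify(memory_type, is_managed, device_id):
    """Mirror the accessibility logic of _query_memory_attrs in plain Python."""
    out = {"is_managed": bool(is_managed)}  # set once, independent of branch
    if memory_type == 0:
        # Unregistered host pointer: host-only, no owning device.
        out.update(is_host_accessible=True, is_device_accessible=False, device_id=-1)
    elif is_managed or memory_type == CU_MEMORYTYPE_HOST:
        # Managed or registered host memory is visible from both sides.
        out.update(is_host_accessible=True, is_device_accessible=True, device_id=device_id)
    elif memory_type == CU_MEMORYTYPE_DEVICE:
        out.update(is_host_accessible=False, is_device_accessible=True, device_id=device_id)
    else:
        raise ValueError(f"Unsupported memory type: {memory_type}")
    return out


assert classify(0, 0, 0)["device_id"] == -1
assert classify(CU_MEMORYTYPE_DEVICE, 0, 1) == {
    "is_managed": False,
    "is_host_accessible": False,
    "is_device_accessible": True,
    "device_id": 1,
}
```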
@@ -552,14 +554,15 @@

# Buffer Implementation Helpers
# -----------------------------
cdef inline Buffer Buffer_from_deviceptr_handle(
cdef Buffer Buffer_from_deviceptr_handle(
DevicePtrHandle h_ptr,
size_t size,
MemoryResource mr,
object ipc_descriptor = None
object ipc_descriptor = None,
type cls = Buffer,
):
"""Create a Buffer from an existing DevicePtrHandle."""
cdef Buffer buf = Buffer.__new__(Buffer)
"""Create a Buffer (or subclass instance) from an existing DevicePtrHandle."""
cdef Buffer buf = cls.__new__(cls)
buf._h_ptr = h_ptr
buf._size = size
buf._memory_resource = mr
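The new `cls` parameter follows a common factory pattern: allocate via `cls.__new__(cls)` so callers can materialize a subclass (here, `ManagedBuffer`) without re-running an `__init__`. A minimal sketch of the pattern with hypothetical names and no CUDA dependency:

```python
class Buffer:
    __slots__ = ("_ptr", "_size")


class ManagedBuffer(Buffer):
    __slots__ = ()


def buffer_from_handle(ptr, size, cls=Buffer):
    """Create a Buffer (or subclass instance) without invoking cls.__init__,
    mirroring the Cython helper's `cls.__new__(cls)` construction."""
    buf = cls.__new__(cls)
    buf._ptr = ptr
    buf._size = size
    return buf


b = buffer_from_handle(0x1000, 64)
mb = buffer_from_handle(0x2000, 64, cls=ManagedBuffer)
assert type(b) is Buffer and type(mb) is ManagedBuffer
assert isinstance(mb, Buffer)
```

Bypassing `__init__` matters in the real helper because the handle, size, and memory resource are assigned directly onto the freshly created instance.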
221 changes: 221 additions & 0 deletions cuda_core/cuda/core/_memory/_managed_buffer.py
@@ -0,0 +1,221 @@
# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0

from __future__ import annotations

from typing import TYPE_CHECKING

from cuda.core._device import Device
from cuda.core._host import Host
from cuda.core._memory._buffer import Buffer
from cuda.core._memory._managed_memory_ops import (
_advise_one,
_do_single_discard_prefetch_py,
_do_single_discard_py,
_do_single_prefetch_py,
)
from cuda.core._utils.cuda_utils import driver, handle_return
from cuda.core._utils.version import binding_version

if TYPE_CHECKING:
from cuda.core._memory._buffer import MemoryResource
from cuda.core._stream import Stream
from cuda.core.graph import GraphBuilder


_INT_SIZE = 4

# Enum aliases — referenced once per property write, so cache the lookup.
_ADV = driver.CUmem_advise
_SET_READ_MOSTLY = _ADV.CU_MEM_ADVISE_SET_READ_MOSTLY
_UNSET_READ_MOSTLY = _ADV.CU_MEM_ADVISE_UNSET_READ_MOSTLY
_SET_PREFERRED = _ADV.CU_MEM_ADVISE_SET_PREFERRED_LOCATION
_UNSET_PREFERRED = _ADV.CU_MEM_ADVISE_UNSET_PREFERRED_LOCATION
_SET_ACCESSED_BY = _ADV.CU_MEM_ADVISE_SET_ACCESSED_BY
_UNSET_ACCESSED_BY = _ADV.CU_MEM_ADVISE_UNSET_ACCESSED_BY

_RANGE = driver.CUmem_range_attribute
_ATTR_READ_MOSTLY = _RANGE.CU_MEM_RANGE_ATTRIBUTE_READ_MOSTLY
_ATTR_PREFERRED = _RANGE.CU_MEM_RANGE_ATTRIBUTE_PREFERRED_LOCATION
_ATTR_ACCESSED_BY = _RANGE.CU_MEM_RANGE_ATTRIBUTE_ACCESSED_BY


def _get_int_attr(buf: Buffer, attribute) -> int:
return handle_return(driver.cuMemRangeGetAttribute(_INT_SIZE, attribute, buf.handle, buf.size))


def _query_accessed_by(buf: Buffer) -> list[Device | Host]:
"""Read the live ``CU_MEM_RANGE_ATTRIBUTE_ACCESSED_BY`` list.

Driver fills an int32 array: device id, ``-1`` = host, ``-2`` = empty.
Sized to ``cuDeviceGetCount() + 1`` (every visible device plus host).
"""
num_devices = handle_return(driver.cuDeviceGetCount())
n = num_devices + 1
raw = handle_return(driver.cuMemRangeGetAttribute(n * _INT_SIZE, _ATTR_ACCESSED_BY, buf.handle, buf.size))
return [Host() if v == -1 else Device(v) for v in raw if v != -2]
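The decoding rule described in the docstring (device ordinal `>= 0`, `-1` = host, `-2` = unused slot) can be sketched without a driver. Strings stand in for the `Device`/`Host` objects the real helper returns:

```python
def decode_accessed_by(raw):
    """Decode a CU_MEM_RANGE_ATTRIBUTE_ACCESSED_BY-style int array into labels."""
    out = []
    for v in raw:
        if v == -2:
            # Unused slot: nothing further is in the accessed-by set.
            continue
        out.append("host" if v == -1 else f"device:{v}")
    return out


# A 2-GPU system is queried with n = device_count + 1 = 3 slots:
assert decode_accessed_by([0, -1, -2]) == ["device:0", "host"]
assert decode_accessed_by([-2, -2, -2]) == []
```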


class AccessedBySet:
"""Live driver-backed view of ``set_accessed_by`` advice for a managed buffer.

Reads (``__contains__``, ``__iter__``, ``len(...)``) call
``cuMemRangeGetAttribute``; writes (``add``, ``discard``) call
``cuMemAdvise``. There is no in-memory mirror, so the view always
reflects the current driver state.

Note
----
The driver returns integer device ordinals (``-1`` for host); host
NUMA distinctions applied via ``Host(numa_id=...)`` collapse to a
generic ``Host()`` when iterating this set.
"""

__slots__ = ("_buf",)

def __init__(self, buf: ManagedBuffer):
self._buf = buf

def __contains__(self, location) -> bool:
return location in _query_accessed_by(self._buf)

def __iter__(self):
return iter(_query_accessed_by(self._buf))

def __len__(self) -> int:
return len(_query_accessed_by(self._buf))

def __eq__(self, other) -> bool:
if isinstance(other, AccessedBySet):
return set(_query_accessed_by(self._buf)) == set(_query_accessed_by(other._buf))
if isinstance(other, (set, frozenset)):
return set(_query_accessed_by(self._buf)) == other
return NotImplemented

def __repr__(self) -> str:
return f"AccessedBySet({set(_query_accessed_by(self._buf))!r})"

def add(self, location: Device | Host) -> None:
"""Apply ``set_accessed_by`` advice for ``location``."""
_advise_one(self._buf, _SET_ACCESSED_BY, location)

def discard(self, location: Device | Host) -> None:
"""Apply ``unset_accessed_by`` advice for ``location``."""
_advise_one(self._buf, _UNSET_ACCESSED_BY, location)
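The live-view idea — every read re-queries the backend, no cached mirror — can be sketched with a plain set standing in for driver state (a toy analogue of `cuMemRangeGetAttribute` reads and `cuMemAdvise` writes):

```python
class LiveSetView:
    """Set-like view over externally owned mutable state; reads always
    go back to the source, so out-of-band changes are visible."""

    __slots__ = ("_backend",)

    def __init__(self, backend):
        self._backend = backend  # a set owned by someone else

    def __contains__(self, item):
        return item in self._backend

    def __iter__(self):
        return iter(self._backend)

    def __len__(self):
        return len(self._backend)

    def add(self, item):
        self._backend.add(item)

    def discard(self, item):
        self._backend.discard(item)


state = {"device:0"}
view = LiveSetView(state)
state.add("host")  # change made behind the view's back
assert "host" in view and len(view) == 2
view.discard("device:0")
assert state == {"host"}
```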


class ManagedBuffer(Buffer):
"""Managed (unified) memory buffer with a property-style advice API.

Returned by :meth:`ManagedMemoryResource.allocate`, or wrap an
existing managed-memory pointer with :meth:`ManagedBuffer.from_handle`.

Examples
--------
>>> buf = mr.allocate(size)
>>> buf.read_mostly = True
>>> buf.preferred_location = Device(0)
>>> buf.accessed_by.add(Device(1))
>>> buf.prefetch(Device(0), stream=stream)

Note
----
On CUDA 13 builds, ``preferred_location`` round-trips full NUMA
information. On CUDA 12 the legacy ``cuMemRangeGetAttribute`` query
path returns integer device ordinals, so ``Host(numa_id=...)``
collapses to ``Host()`` on read-back. Setters preserve full NUMA
information when issuing advice on both.
"""

@classmethod
def from_handle(
cls,
ptr,
size: int,
mr: MemoryResource | None = None,
owner: object | None = None,
) -> ManagedBuffer:
"""Wrap an existing managed-memory pointer in a :class:`ManagedBuffer`.

Use this when you have an externally-allocated managed pointer
and want the property-style advice API (:attr:`read_mostly`,
:attr:`preferred_location`, :attr:`accessed_by`).

Parameters
----------
ptr : :obj:`~_memory.DevicePointerT`
Pointer to a managed allocation.
size : int
Allocation size in bytes.
mr : :obj:`~_memory.MemoryResource`, optional
Memory resource that owns ``ptr``. When provided, its
``deallocate`` is called when the buffer is closed.
owner : object, optional
An object that keeps the underlying allocation alive.
``owner`` and ``mr`` cannot both be specified.
"""
return cls._init(ptr, size, mr=mr, owner=owner)

@property
def read_mostly(self) -> bool:
"""Whether ``set_read_mostly`` advice is currently applied."""
return _get_int_attr(self, _ATTR_READ_MOSTLY) != 0

@read_mostly.setter
def read_mostly(self, value: bool) -> None:
_advise_one(self, _SET_READ_MOSTLY if value else _UNSET_READ_MOSTLY, None)

@property
def preferred_location(self) -> Device | Host | None:
"""Currently applied ``set_preferred_location`` target, or ``None``.

On CUDA 13 builds, fully round-trips ``Host(numa_id=N)``. On CUDA 12
the legacy attribute carries only a device ordinal (or ``-1`` for
host), so ``Host(numa_id=N)`` set via the setter round-trips back
as ``Host()``.
"""
if binding_version() >= (13, 0, 0):
from cuda.core._memory._managed_memory_ops import _read_preferred_location_v2

return _read_preferred_location_v2(self)
# CUDA 12 legacy path (no NUMA info available).
loc_id = _get_int_attr(self, _ATTR_PREFERRED)
if loc_id == -2:
return None
if loc_id == -1:
return Host()
return Device(loc_id)

@preferred_location.setter
def preferred_location(self, value: Device | Host | None) -> None:
if value is None:
_advise_one(self, _UNSET_PREFERRED, None)
else:
_advise_one(self, _SET_PREFERRED, value)

@property
def accessed_by(self) -> AccessedBySet:
"""Live set-like view of ``set_accessed_by`` locations."""
return AccessedBySet(self)

@accessed_by.setter
def accessed_by(self, locations) -> None:
# Diff against the current driver state and advise only the deltas.
current = set(_query_accessed_by(self))
target = set(locations)
for loc in current - target:
_advise_one(self, _UNSET_ACCESSED_BY, loc)
for loc in target - current:
_advise_one(self, _SET_ACCESSED_BY, loc)
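The setter's diffing strategy — unset only what is no longer wanted, set only what is missing — can be isolated into a standalone sketch that records the advice calls it would issue:

```python
def apply_set_delta(current, target, set_op, unset_op):
    """Issue only the calls needed to move `current` to `target`."""
    current, target = set(current), set(target)
    for loc in current - target:
        unset_op(loc)
    for loc in target - current:
        set_op(loc)


calls = []
apply_set_delta(
    current={"device:0", "device:1"},
    target={"device:1", "host"},
    set_op=lambda loc: calls.append(("set", loc)),
    unset_op=lambda loc: calls.append(("unset", loc)),
)
# "device:1" is in both sets, so it generates no call at all.
assert ("unset", "device:0") in calls and ("set", "host") in calls
assert len(calls) == 2
```

Diffing against live driver state (rather than a Python-side mirror) keeps the setter idempotent: assigning the same set twice issues no second round of `cuMemAdvise` calls.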

def prefetch(self, location: Device | Host, *, stream: Stream | GraphBuilder) -> None:
"""Prefetch this range to ``location`` on ``stream``."""
_do_single_prefetch_py(self, location, stream)

def discard(self, *, stream: Stream | GraphBuilder) -> None:
"""Discard this range's resident pages on ``stream`` (CUDA 13+)."""
_do_single_discard_py(self, stream)

def discard_prefetch(self, location: Device | Host, *, stream: Stream | GraphBuilder) -> None:
"""Discard this range and prefetch to ``location`` on ``stream`` (CUDA 13+)."""
_do_single_discard_prefetch_py(self, location, stream)