From 0b6efe3a5b8c0a47bd17ec394ea7422ea4d86c9e Mon Sep 17 00:00:00 2001
From: Rui Luo <ruluo@nvidia.com>
Date: Tue, 9 Jun 2026 09:18:52 +0800
Subject: [PATCH 1/4] coverage: add cuda.core test_utils.py tests for
 DLPack/StridedMemoryView

---
 cuda_core/tests/test_utils.py | 458 ++++++++++++++++++++++++++++++++--
 1 file changed, 440 insertions(+), 18 deletions(-)

diff --git a/cuda_core/tests/test_utils.py b/cuda_core/tests/test_utils.py
index 3d4059b6961..80631c90c36 100644
--- a/cuda_core/tests/test_utils.py
+++ b/cuda_core/tests/test_utils.py
@@ -1060,25 +1060,25 @@ def test_dlpack_export_non_native_endian_rejected():
         bad_view.__dlpack__()
 
 
-@pytest.mark.parametrize(
-    "dtype",
-    [
-        np.uint8,
-        np.uint16,
-        np.uint32,
-        np.uint64,
-        np.int8,
-        np.int16,
-        np.int32,
-        np.int64,
-        np.float16,
-        np.float32,
-        np.float64,
-        np.complex64,
-        np.complex128,
-        np.bool_,
-    ],
+_NUMPY_NATIVE_DLPACK_DTYPES = (
+    np.uint8,
+    np.uint16,
+    np.uint32,
+    np.uint64,
+    np.int8,
+    np.int16,
+    np.int32,
+    np.int64,
+    np.float16,
+    np.float32,
+    np.float64,
+    np.complex64,
+    np.complex128,
+    np.bool_,
 )
+
+
+@pytest.mark.parametrize("dtype", _NUMPY_NATIVE_DLPACK_DTYPES)
 def test_strided_memory_view_dtype_roundtrip_all(dtype):
     """Exercise dtype_dlpack_to_numpy for every NumPy-native DLPack dtype.
 
@@ -1094,3 +1094,425 @@ def test_strided_memory_view_dtype_roundtrip_all(dtype):
         pytest.skip(f"NumPy does not export {np.dtype(dtype)} via DLPack: {e}")
     view = StridedMemoryView.from_dlpack(src, stream_ptr=-1)
     assert view.dtype == np.dtype(dtype)  # .dtype triggers dtype_dlpack_to_numpy
+
+
+def test_as_tensor_map_assembles_kwargs(monkeypatch):
+    """``as_tensor_map`` forwards the view + box_dim and only the non-None
+    tiled options to ``TensorMapDescriptor._from_tiled``.
+
+    The real ``_from_tiled`` requires a device-accessible, 16-byte-aligned view
+    on TMA-capable hardware (sm90+), so we replace the (module-level) class the
+    method imports with a recorder and assert the assembled call instead.
+    """
+    captured = {}
+    sentinel = object()
+
+    class _RecordingTMD:
+        @classmethod
+        def _from_tiled(cls, view, box_dim=None, **kwargs):
+            captured["view"] = view
+            captured["box_dim"] = box_dim
+            captured["kwargs"] = kwargs
+            return sentinel
+
+    # as_tensor_map does `from cuda.core._tensor_map import TensorMapDescriptor`
+    # on each call, so patching the module attribute swaps the bound name.
+    monkeypatch.setattr("cuda.core._tensor_map.TensorMapDescriptor", _RecordingTMD)
+
+    src = np.zeros(6, dtype=np.float32)
+    view = StridedMemoryView.from_any_interface(src, stream_ptr=-1)
+    result = view.as_tensor_map(
+        (2, 3),
+        options="OPT",
+        element_strides=(1, 1),
+        data_type="DT",
+        interleave="IL",
+        swizzle="SW",
+        l2_promotion="L2",
+        oob_fill="OOB",
+    )
+    assert result is sentinel
+    assert captured["view"] is view
+    assert captured["box_dim"] == (2, 3)
+    assert captured["kwargs"] == {
+        "options": "OPT",
+        "element_strides": (1, 1),
+        "data_type": "DT",
+        "interleave": "IL",
+        "swizzle": "SW",
+        "l2_promotion": "L2",
+        "oob_fill": "OOB",
+    }
+
+
+def test_as_tensor_map_omits_none_kwargs(monkeypatch):
+    """Tiled options left as None are not forwarded to ``_from_tiled``."""
+    captured = {}
+
+    class _RecordingTMD:
+        @classmethod
+        def _from_tiled(cls, _view, _box_dim=None, **kwargs):
+            captured["kwargs"] = kwargs
+            return None
+
+    monkeypatch.setattr("cuda.core._tensor_map.TensorMapDescriptor", _RecordingTMD)
+    view = StridedMemoryView.from_any_interface(np.zeros(6, dtype=np.float32), stream_ptr=-1)
+    view.as_tensor_map((6,))
+    assert captured["kwargs"] == {}
+
+
+def _assert_dlpack_export_roundtrip(src):
+    # Skip only if NumPy itself can't round-trip this dtype/shape; past the
+    # probe, a failure on our view is a regression, not an env limitation.
+    try:
+        np.from_dlpack(src)
+    except (BufferError, TypeError, RuntimeError) as e:
+        pytest.skip(f"NumPy does not support DLPack for {src.dtype} {src.shape}: {e}")
+    view = StridedMemoryView.from_any_interface(src, stream_ptr=-1)
+    out = np.from_dlpack(view)
+    assert out.dtype == src.dtype
+    assert out.shape == src.shape
+    assert np.array_equal(out, src)
+
+
+@pytest.mark.parametrize("dtype", _NUMPY_NATIVE_DLPACK_DTYPES)
+def test_dlpack_export_roundtrip_dtypes(dtype):
+    """Export every NumPy-native DLPack dtype through ``StridedMemoryView.__dlpack__``."""
+    _assert_dlpack_export_roundtrip(np.zeros((2, 3), dtype=dtype))
+
+
+@pytest.mark.parametrize(
+    "shape",
+    [pytest.param((), id="scalar"), pytest.param((0, 3), id="empty")],
+)
+def test_dlpack_export_roundtrip_special_shapes(shape):
+    """Export scalar and zero-volume shapes through ``StridedMemoryView.__dlpack__``."""
+    _assert_dlpack_export_roundtrip(np.zeros(shape, dtype=np.complex128))
+
+
+def test_dlpack_export_unversioned_capsule_and_deleter():
+    """``__dlpack__()`` with no ``max_version`` yields an *unversioned* unused
+    DLPack capsule; dropping it unconsumed runs ``_smv_pycapsule_deleter`` on
+    the non-versioned branch (freeing the managed tensor)."""
+    src = np.arange(6, dtype=np.int32)
+    view = StridedMemoryView.from_any_interface(src, stream_ptr=-1)
+    capsule = view.__dlpack__()
+    assert _PyCapsule_IsValid(capsule, b"dltensor") == 1
+    assert _PyCapsule_IsValid(capsule, b"dltensor_versioned") == 0
+    del capsule  # unconsumed -> deleter frees dlm_tensor
+
+
+def test_dlpack_export_versioned_capsule_and_deleter():
+    """``__dlpack__(max_version=(1, 0))`` yields a *versioned* unused capsule;
+    dropping it unconsumed runs the versioned ``_smv_pycapsule_deleter`` branch."""
+    src = np.arange(6, dtype=np.int32)
+    view = StridedMemoryView.from_any_interface(src, stream_ptr=-1)
+    capsule = view.__dlpack__(max_version=(1, 0))
+    assert _PyCapsule_IsValid(capsule, b"dltensor_versioned") == 1
+    assert _PyCapsule_IsValid(capsule, b"dltensor") == 0
+    del capsule  # unconsumed -> versioned deleter frees dlm_tensor_ver
+
+
+def test_from_dlpack_cpu_stream_none_ambiguous():
+    """A CPU DLPack source with ``stream_ptr=None`` is rejected as ambiguous."""
+    src = np.arange(4, dtype=np.float32)
+    with pytest.raises(BufferError, match="stream=None is ambiguous"):
+        StridedMemoryView.from_dlpack(src, stream_ptr=None)
+
+
+def test_from_dlpack_unsupported_device_type():
+    """``view_as_dlpack`` rejects a DLPack device that is neither CPU, CUDA,
+    CUDA-pinned, nor CUDA-managed before ever calling ``__dlpack__``."""
+
+    class _FakeUnsupportedDevice:
+        def __dlpack_device__(self):
+            return (7, 0)  # e.g. kDLVulkan -- unsupported by cuda.core
+
+        def __dlpack__(self, **kwargs):
+            raise AssertionError("__dlpack__ must not be reached")
+
+    with pytest.raises(BufferError, match="device not supported"):
+        StridedMemoryView.from_dlpack(_FakeUnsupportedDevice(), stream_ptr=0)
+
+
+class _DLPackNoMaxVersion:
+    """Wraps a NumPy array but rejects the ``max_version`` kwarg, forcing the
+    TypeError fallback in ``view_as_dlpack`` and an *unversioned* capsule import."""
+
+    def __init__(self, arr):
+        self._arr = arr
+        self.max_versions = []  # max_version seen on each __dlpack__ call, in order
+
+    def __dlpack_device__(self):
+        return self._arr.__dlpack_device__()
+
+    def __dlpack__(self, *, stream=None, max_version=None, **kwargs):
+        self.max_versions.append(max_version)
+        if max_version is not None:
+            raise TypeError("max_version is not supported")
+        return self._arr.__dlpack__(stream=stream)
+
+
+def test_from_dlpack_typeerror_fallback_unversioned_import():
+    """When ``__dlpack__(max_version=...)`` raises TypeError, view_as_dlpack
+    retries without it and imports the resulting unversioned capsule; the view
+    then owns that capsule and frees it on ``__dealloc__``."""
+    src = np.arange(6, dtype=np.int32)
+    wrapper = _DLPackNoMaxVersion(src)
+    view = StridedMemoryView.from_dlpack(wrapper, stream_ptr=-1)
+    # Guard the TypeError fallback path: versioned attempt, then legacy retry.
+    assert len(wrapper.max_versions) == 2, f"expected versioned attempt + retry, got {wrapper.max_versions}"
+    assert isinstance(wrapper.max_versions[0], tuple)  # versioned attempt was made
+    assert wrapper.max_versions[1] is None  # fallback retried without max_version
+    assert view.ptr == src.ctypes.data
+    out = np.from_dlpack(view)
+    assert np.array_equal(out, src)
+    del view  # exercise __dealloc__ on the imported (used) unversioned capsule
+
+
+def test_strided_memory_view_proxy_cai_only_has_dlpack_false():
+    """``_StridedMemoryViewProxy`` records ``has_dlpack=False`` for an object
+    that exposes only ``__cuda_array_interface__`` (check_has_dlpack CAI branch)."""
+    from cuda.core._memoryview import _StridedMemoryViewProxy
+
+    obj = _make_cuda_array_interface_obj(shape=(2,), strides=None)
+    proxy = _StridedMemoryViewProxy(obj)
+    assert proxy.has_dlpack is False
+    assert proxy.obj is obj
+
+
+def test_view_as_cai_device_pointer_and_stream_ordering(init_cuda):
+    """``view_as_cai`` on a real device pointer resolves the device ordinal via
+    ``cuPointerGetAttribute`` and, when the CAI ``stream`` differs from the
+    consumer stream, establishes stream ordering through an event.
+
+    Uses a synthetic CAI object backed by a genuine device allocation, so the
+    cupy/numba-only device branch is exercised without those optional deps.
+    """
+    dev = init_cuda
+    buffer = dev.memory_resource.allocate(64, stream=dev.default_stream)
+    producer = dev.create_stream()
+    consumer = dev.create_stream()
+    obj = _make_cuda_array_interface_obj(
+        shape=(8,),
+        strides=None,
+        typestr="<f4",
+        data=(int(buffer.handle), False),
+    )
+    obj.__cuda_array_interface__["stream"] = int(producer.handle)
+
+    view = StridedMemoryView.from_cuda_array_interface(obj, stream_ptr=consumer.handle)
+
+    assert view.is_device_accessible is True
+    assert view.ptr == int(buffer.handle)
+    assert view.device_id == dev.device_id
+    assert view.shape == (8,)
+    dev.default_stream.sync()
+
+
+def test_strided_memory_view_init_cai_path_deprecated(init_cuda):
+    """The deprecated ``StridedMemoryView(obj)`` constructor routes a CAI-only
+    object through the CAI branch (warn + ``view_as_cai``), not the DLPack one."""
+    obj = _make_cuda_array_interface_obj(shape=(4,), strides=None, typestr="<f4", data=(0, False))
+    with pytest.deprecated_call(match="deprecated"):
+        view = StridedMemoryView(obj, stream_ptr=-1)
+    assert view.is_device_accessible is True
+    assert view.shape == (4,)
+    assert view.device_id == init_cuda.device_id
+
+
+def test_dlpack_export_device_accessible_cai_view(init_cuda):
+    """Exporting a device-accessible CAI-backed view (no dl_tensor) drives the
+    ``_smv_get_dl_device`` branch that calls ``get_buffer``/``classify_dl_device``
+    and reports a CUDA device via ``__dlpack_device__``."""
+    dev = init_cuda
+    buffer = dev.memory_resource.allocate(64, stream=dev.default_stream)
+    obj = _make_cuda_array_interface_obj(
+        shape=(8,),
+        strides=None,
+        typestr="<f4",
+        data=(int(buffer.handle), False),
+    )
+    view = StridedMemoryView.from_cuda_array_interface(obj, stream_ptr=-1)
+
+    device_type, device_id = view.__dlpack_device__()
+    assert device_type == int(DLDeviceType.kDLCUDA)
+    assert device_id == dev.device_id
+
+    capsule = view.__dlpack__()
+    assert _PyCapsule_IsValid(capsule, b"dltensor") == 1
+    del capsule  # unconsumed -> deleter frees the managed tensor
+    dev.default_stream.sync()
+
+
+def test_strided_memory_view_repr_with_none_dtype(init_cuda):
+    """``__repr__`` of a view whose dtype is None renders the dtype via
+    ``get_simple_repr`` taking the builtins branch (NoneType)."""
+    dev = init_cuda
+    buffer = dev.memory_resource.allocate(16, stream=dev.default_stream)
+    view = StridedMemoryView.from_buffer(buffer, shape=(16,), itemsize=1, dtype=None)
+    assert view.dtype is None
+    r = repr(view)
+    assert r.startswith("StridedMemoryView(ptr=")
+    assert "dtype=NoneType" in r
+
+
+# ---------------------------------------------------------------------------
+# DLPack C exchange API (`__dlpack_c_exchange_api__`)
+#
+# Drive the C function pointers exposed by the capsule the way a native
+# consumer would, exercising the StridedMemoryView exchange-API implementation.
+# ---------------------------------------------------------------------------
+
+_PyCapsule_GetPointer = ctypes.pythonapi.PyCapsule_GetPointer
+_PyCapsule_GetPointer.argtypes = (ctypes.py_object, ctypes.c_char_p)
+_PyCapsule_GetPointer.restype = ctypes.c_void_p
+
+
+class _DLPackVersion(ctypes.Structure):
+    _fields_ = [("major", ctypes.c_uint32), ("minor", ctypes.c_uint32)]
+
+
+class _DLPackExchangeAPIHeader(ctypes.Structure):
+    _fields_ = [("version", _DLPackVersion), ("prev_api", ctypes.c_void_p)]
+
+
+class _DLDevice(ctypes.Structure):
+    _fields_ = [("device_type", ctypes.c_int), ("device_id", ctypes.c_int32)]
+
+
+class _DLDataType(ctypes.Structure):
+    _fields_ = [("code", ctypes.c_uint8), ("bits", ctypes.c_uint8), ("lanes", ctypes.c_uint16)]
+
+
+class _DLTensor(ctypes.Structure):
+    _fields_ = [
+        ("data", ctypes.c_void_p),
+        ("device", _DLDevice),
+        ("ndim", ctypes.c_int32),
+        ("dtype", _DLDataType),
+        ("shape", ctypes.POINTER(ctypes.c_int64)),
+        ("strides", ctypes.POINTER(ctypes.c_int64)),
+        ("byte_offset", ctypes.c_uint64),
+    ]
+
+
+_FN_FROM_PY = ctypes.CFUNCTYPE(ctypes.c_int, ctypes.c_void_p, ctypes.POINTER(ctypes.c_void_p))
+_FN_TO_PY = ctypes.CFUNCTYPE(ctypes.c_int, ctypes.c_void_p, ctypes.POINTER(ctypes.c_void_p))
+_FN_DLTENSOR_FROM_PY = ctypes.CFUNCTYPE(ctypes.c_int, ctypes.c_void_p, ctypes.c_void_p)
+_FN_ALLOCATOR = ctypes.CFUNCTYPE(
+    ctypes.c_int, ctypes.c_void_p, ctypes.POINTER(ctypes.c_void_p), ctypes.c_void_p, ctypes.c_void_p
+)
+_FN_CURRENT_STREAM = ctypes.CFUNCTYPE(ctypes.c_int, ctypes.c_int, ctypes.c_int32, ctypes.POINTER(ctypes.c_void_p))
+
+
+class _DLPackExchangeAPI(ctypes.Structure):
+    _fields_ = [
+        ("header", _DLPackExchangeAPIHeader),
+        ("managed_tensor_allocator", _FN_ALLOCATOR),
+        ("managed_tensor_from_py_object_no_sync", _FN_FROM_PY),
+        ("managed_tensor_to_py_object_no_sync", _FN_TO_PY),
+        ("dltensor_from_py_object_no_sync", _FN_DLTENSOR_FROM_PY),
+        ("current_work_stream", _FN_CURRENT_STREAM),
+    ]
+
+
+def _get_exchange_api():
+    capsule = StridedMemoryView.__dlpack_c_exchange_api__
+    ptr = _PyCapsule_GetPointer(capsule, b"dlpack_exchange_api")
+    assert ptr
+    return ctypes.cast(ptr, ctypes.POINTER(_DLPackExchangeAPI)).contents
+
+
+def test_dlpack_c_exchange_api_header_version():
+    """The exchange-API header advertises a non-zero DLPack version."""
+    api = _get_exchange_api()
+    assert (api.header.version.major, api.header.version.minor) >= (1, 0)
+    assert not api.header.prev_api
+
+
+def test_dlpack_c_exchange_api_current_work_stream():
+    """``current_work_stream`` reports no current stream (cuda.core has none)."""
+    api = _get_exchange_api()
+    out = ctypes.c_void_p(123)
+    rc = api.current_work_stream(int(DLDeviceType.kDLCPU), 0, ctypes.byref(out))
+    assert rc == 0
+    assert not out.value  # set back to NULL
+
+
+def test_dlpack_c_exchange_api_dltensor_from_py_object():
+    """``dltensor_from_py_object_no_sync`` fills a borrowed DLTensor from a view."""
+    api = _get_exchange_api()
+    src = np.arange(12, dtype=np.int32).reshape(3, 4)
+    view = StridedMemoryView.from_any_interface(src, stream_ptr=-1)
+    out = _DLTensor()
+    rc = api.dltensor_from_py_object_no_sync(id(view), ctypes.byref(out))
+    assert rc == 0
+    assert out.ndim == 2
+    assert out.device.device_type == int(DLDeviceType.kDLCPU)
+    assert out.data == src.ctypes.data
+    assert [out.shape[i] for i in range(out.ndim)] == [3, 4]
+
+
+def _exchange_api_cause(exc):
+    """Underlying exception raised by the noexcept C fn (surfaced by ctypes as
+    SystemError, with the real error chained as __cause__ or __context__)."""
+    return exc.value.__cause__ or exc.value.__context__
+
+
+def test_dlpack_c_exchange_api_dltensor_from_py_object_type_error():
+    """A non-StridedMemoryView py_object is rejected (TypeError, rc=-1)."""
+    api = _get_exchange_api()
+    not_a_view = object()
+    out = _DLTensor()
+    with pytest.raises(SystemError) as exc:
+        api.dltensor_from_py_object_no_sync(id(not_a_view), ctypes.byref(out))
+    assert isinstance(_exchange_api_cause(exc), TypeError)
+
+
+def test_dlpack_c_exchange_api_managed_tensor_roundtrip():
+    """``managed_tensor_from_py_object_no_sync`` produces a managed tensor that
+    ``managed_tensor_to_py_object_no_sync`` turns back into a StridedMemoryView.
+
+    This exercises the versioned export fill and the capsule-import helper.
+    The reconstructed view intentionally keeps a reference (the C side holds one
+    via Py_INCREF), so the managed tensor is not freed here -- avoiding any
+    double-free across the two calls that share the same tensor.
+    """
+    api = _get_exchange_api()
+    src = np.arange(6, dtype=np.float64).reshape(2, 3)
+    view = StridedMemoryView.from_any_interface(src, stream_ptr=-1)
+
+    tensor = ctypes.c_void_p(0)
+    rc = api.managed_tensor_from_py_object_no_sync(id(view), ctypes.byref(tensor))
+    assert rc == 0
+    assert tensor.value  # non-NULL DLManagedTensorVersioned*
+
+    out_obj = ctypes.c_void_p(0)
+    rc = api.managed_tensor_to_py_object_no_sync(tensor, ctypes.byref(out_obj))
+    assert rc == 0
+    assert out_obj.value
+    imported = ctypes.cast(ctypes.c_void_p(out_obj.value), ctypes.py_object).value
+    assert isinstance(imported, StridedMemoryView)
+    assert imported.shape == (2, 3)
+    assert imported.ptr == src.ctypes.data
+
+
+def test_dlpack_c_exchange_api_to_py_object_null_tensor():
+    """``managed_tensor_to_py_object_no_sync`` rejects a NULL tensor (RuntimeError)."""
+    api = _get_exchange_api()
+    out_obj = ctypes.c_void_p(0)
+    with pytest.raises(SystemError) as exc:
+        api.managed_tensor_to_py_object_no_sync(None, ctypes.byref(out_obj))
+    assert isinstance(_exchange_api_cause(exc), RuntimeError)
+    assert not out_obj.value  # set to NULL before the error
+
+
+def test_dlpack_c_exchange_api_managed_tensor_allocator_not_supported():
+    """``managed_tensor_allocator`` is unsupported (NotImplementedError)."""
+    api = _get_exchange_api()
+    out = ctypes.c_void_p(123)
+    with pytest.raises(SystemError) as exc:
+        api.managed_tensor_allocator(None, ctypes.byref(out), None, None)
+    assert isinstance(_exchange_api_cause(exc), NotImplementedError)
+    assert not out.value  # set to NULL before the error

From f22ad8e363cfcb648bafbb8f535baaed561cc2dc Mon Sep 17 00:00:00 2001
From: Rui Luo <ruluo@nvidia.com>
Date: Wed, 17 Jun 2026 12:51:45 +0800
Subject: [PATCH 2/4] Address cuda.core coverage test review comments

---
 cuda_core/tests/test_tensor_map.py   |  21 ++
 cuda_core/tests/test_utils.py        | 387 +--------------------------
 cuda_core/tests/test_utils_dlpack.py | 324 ++++++++++++++++++++++
 3 files changed, 355 insertions(+), 377 deletions(-)
 create mode 100644 cuda_core/tests/test_utils_dlpack.py

diff --git a/cuda_core/tests/test_tensor_map.py b/cuda_core/tests/test_tensor_map.py
index 3ec4cb5b020..77629632844 100644
--- a/cuda_core/tests/test_tensor_map.py
+++ b/cuda_core/tests/test_tensor_map.py
@@ -308,6 +308,27 @@ def test_invalid_data_type(self, dev, skip_if_no_tma):
                 data_type=42,
             )
 
+    def test_as_tensor_map_host_view_rejected_without_tma(self):
+        """``as_tensor_map`` on a non-device-accessible (host) view fails
+        gracefully with a clear error, without needing TMA-capable hardware.
+
+        This drives the real ``as_tensor_map`` -> ``_from_tiled`` path: the
+        keywords passed below are assembled and coerced (``options`` and
+        ``interleave`` are left unset) before the device-accessibility guard
+        rejects the host pointer, so the forwarding logic is covered without monkeypatching.
+        """
+        host = np.zeros((64, 64), dtype=np.float32)
+        view = StridedMemoryView.from_any_interface(host, stream_ptr=-1)
+        with pytest.raises(ValueError, match="device-accessible"):
+            view.as_tensor_map(
+                box_dim=(32, 32),
+                data_type=TensorMapDataType.FLOAT32,
+                element_strides=(1, 1),
+                swizzle=TensorMapSwizzle.SWIZZLE_128B,
+                l2_promotion=TensorMapL2Promotion.L2_128B,
+                oob_fill=TensorMapOOBFill.NAN_REQUEST_ZERO_FMA,
+            )
+
 
 class TestTensorMapDtypeMapping:
     """Test automatic dtype inference from numpy dtypes."""
diff --git a/cuda_core/tests/test_utils.py b/cuda_core/tests/test_utils.py
index 80631c90c36..fbb3367db88 100644
--- a/cuda_core/tests/test_utils.py
+++ b/cuda_core/tests/test_utils.py
@@ -1060,216 +1060,6 @@ def test_dlpack_export_non_native_endian_rejected():
         bad_view.__dlpack__()
 
 
-_NUMPY_NATIVE_DLPACK_DTYPES = (
-    np.uint8,
-    np.uint16,
-    np.uint32,
-    np.uint64,
-    np.int8,
-    np.int16,
-    np.int32,
-    np.int64,
-    np.float16,
-    np.float32,
-    np.float64,
-    np.complex64,
-    np.complex128,
-    np.bool_,
-)
-
-
-@pytest.mark.parametrize("dtype", _NUMPY_NATIVE_DLPACK_DTYPES)
-def test_strided_memory_view_dtype_roundtrip_all(dtype):
-    """Exercise dtype_dlpack_to_numpy for every NumPy-native DLPack dtype.
-
-    bfloat16 (kDLBfloat) is excluded -- NumPy's __dlpack__ doesn't reliably
-    export ml_dtypes-extended dtypes; cover separately via jax/torch if needed.
-    """
-    src = np.zeros(3, dtype=dtype)
-    # Probe NumPy first: if it can't export this dtype, skip as env limit.
-    # Any failure AFTER the probe is OUR consumer regression and must fail.
-    try:
-        src.__dlpack__()
-    except (BufferError, TypeError) as e:
-        pytest.skip(f"NumPy does not export {np.dtype(dtype)} via DLPack: {e}")
-    view = StridedMemoryView.from_dlpack(src, stream_ptr=-1)
-    assert view.dtype == np.dtype(dtype)  # .dtype triggers dtype_dlpack_to_numpy
-
-
-def test_as_tensor_map_assembles_kwargs(monkeypatch):
-    """``as_tensor_map`` forwards the view + box_dim and only the non-None
-    tiled options to ``TensorMapDescriptor._from_tiled``.
-
-    The real ``_from_tiled`` requires a device-accessible, 16-byte-aligned view
-    on TMA-capable hardware (sm90+), so we replace the (module-level) class the
-    method imports with a recorder and assert the assembled call instead.
-    """
-    captured = {}
-    sentinel = object()
-
-    class _RecordingTMD:
-        @classmethod
-        def _from_tiled(cls, view, box_dim=None, **kwargs):
-            captured["view"] = view
-            captured["box_dim"] = box_dim
-            captured["kwargs"] = kwargs
-            return sentinel
-
-    # as_tensor_map does `from cuda.core._tensor_map import TensorMapDescriptor`
-    # on each call, so patching the module attribute swaps the bound name.
-    monkeypatch.setattr("cuda.core._tensor_map.TensorMapDescriptor", _RecordingTMD)
-
-    src = np.zeros(6, dtype=np.float32)
-    view = StridedMemoryView.from_any_interface(src, stream_ptr=-1)
-    result = view.as_tensor_map(
-        (2, 3),
-        options="OPT",
-        element_strides=(1, 1),
-        data_type="DT",
-        interleave="IL",
-        swizzle="SW",
-        l2_promotion="L2",
-        oob_fill="OOB",
-    )
-    assert result is sentinel
-    assert captured["view"] is view
-    assert captured["box_dim"] == (2, 3)
-    assert captured["kwargs"] == {
-        "options": "OPT",
-        "element_strides": (1, 1),
-        "data_type": "DT",
-        "interleave": "IL",
-        "swizzle": "SW",
-        "l2_promotion": "L2",
-        "oob_fill": "OOB",
-    }
-
-
-def test_as_tensor_map_omits_none_kwargs(monkeypatch):
-    """Tiled options left as None are not forwarded to ``_from_tiled``."""
-    captured = {}
-
-    class _RecordingTMD:
-        @classmethod
-        def _from_tiled(cls, _view, _box_dim=None, **kwargs):
-            captured["kwargs"] = kwargs
-            return None
-
-    monkeypatch.setattr("cuda.core._tensor_map.TensorMapDescriptor", _RecordingTMD)
-    view = StridedMemoryView.from_any_interface(np.zeros(6, dtype=np.float32), stream_ptr=-1)
-    view.as_tensor_map((6,))
-    assert captured["kwargs"] == {}
-
-
-def _assert_dlpack_export_roundtrip(src):
-    # Skip only if NumPy itself can't round-trip this dtype/shape; past the
-    # probe, a failure on our view is a regression, not an env limitation.
-    try:
-        np.from_dlpack(src)
-    except (BufferError, TypeError, RuntimeError) as e:
-        pytest.skip(f"NumPy does not support DLPack for {src.dtype} {src.shape}: {e}")
-    view = StridedMemoryView.from_any_interface(src, stream_ptr=-1)
-    out = np.from_dlpack(view)
-    assert out.dtype == src.dtype
-    assert out.shape == src.shape
-    assert np.array_equal(out, src)
-
-
-@pytest.mark.parametrize("dtype", _NUMPY_NATIVE_DLPACK_DTYPES)
-def test_dlpack_export_roundtrip_dtypes(dtype):
-    """Export every NumPy-native DLPack dtype through ``StridedMemoryView.__dlpack__``."""
-    _assert_dlpack_export_roundtrip(np.zeros((2, 3), dtype=dtype))
-
-
-@pytest.mark.parametrize(
-    "shape",
-    [pytest.param((), id="scalar"), pytest.param((0, 3), id="empty")],
-)
-def test_dlpack_export_roundtrip_special_shapes(shape):
-    """Export scalar and zero-volume shapes through ``StridedMemoryView.__dlpack__``."""
-    _assert_dlpack_export_roundtrip(np.zeros(shape, dtype=np.complex128))
-
-
-def test_dlpack_export_unversioned_capsule_and_deleter():
-    """``__dlpack__()`` with no ``max_version`` yields an *unversioned* unused
-    DLPack capsule; dropping it unconsumed runs ``_smv_pycapsule_deleter`` on
-    the non-versioned branch (freeing the managed tensor)."""
-    src = np.arange(6, dtype=np.int32)
-    view = StridedMemoryView.from_any_interface(src, stream_ptr=-1)
-    capsule = view.__dlpack__()
-    assert _PyCapsule_IsValid(capsule, b"dltensor") == 1
-    assert _PyCapsule_IsValid(capsule, b"dltensor_versioned") == 0
-    del capsule  # unconsumed -> deleter frees dlm_tensor
-
-
-def test_dlpack_export_versioned_capsule_and_deleter():
-    """``__dlpack__(max_version=(1, 0))`` yields a *versioned* unused capsule;
-    dropping it unconsumed runs the versioned ``_smv_pycapsule_deleter`` branch."""
-    src = np.arange(6, dtype=np.int32)
-    view = StridedMemoryView.from_any_interface(src, stream_ptr=-1)
-    capsule = view.__dlpack__(max_version=(1, 0))
-    assert _PyCapsule_IsValid(capsule, b"dltensor_versioned") == 1
-    assert _PyCapsule_IsValid(capsule, b"dltensor") == 0
-    del capsule  # unconsumed -> versioned deleter frees dlm_tensor_ver
-
-
-def test_from_dlpack_cpu_stream_none_ambiguous():
-    """A CPU DLPack source with ``stream_ptr=None`` is rejected as ambiguous."""
-    src = np.arange(4, dtype=np.float32)
-    with pytest.raises(BufferError, match="stream=None is ambiguous"):
-        StridedMemoryView.from_dlpack(src, stream_ptr=None)
-
-
-def test_from_dlpack_unsupported_device_type():
-    """``view_as_dlpack`` rejects a DLPack device that is neither CPU, CUDA,
-    CUDA-pinned, nor CUDA-managed before ever calling ``__dlpack__``."""
-
-    class _FakeUnsupportedDevice:
-        def __dlpack_device__(self):
-            return (7, 0)  # e.g. kDLVulkan -- unsupported by cuda.core
-
-        def __dlpack__(self, **kwargs):
-            raise AssertionError("__dlpack__ must not be reached")
-
-    with pytest.raises(BufferError, match="device not supported"):
-        StridedMemoryView.from_dlpack(_FakeUnsupportedDevice(), stream_ptr=0)
-
-
-class _DLPackNoMaxVersion:
-    """Wraps a NumPy array but rejects the ``max_version`` kwarg, forcing the
-    TypeError fallback in ``view_as_dlpack`` and an *unversioned* capsule import."""
-
-    def __init__(self, arr):
-        self._arr = arr
-        self.max_versions = []  # max_version seen on each __dlpack__ call, in order
-
-    def __dlpack_device__(self):
-        return self._arr.__dlpack_device__()
-
-    def __dlpack__(self, *, stream=None, max_version=None, **kwargs):
-        self.max_versions.append(max_version)
-        if max_version is not None:
-            raise TypeError("max_version is not supported")
-        return self._arr.__dlpack__(stream=stream)
-
-
-def test_from_dlpack_typeerror_fallback_unversioned_import():
-    """When ``__dlpack__(max_version=...)`` raises TypeError, view_as_dlpack
-    retries without it and imports the resulting unversioned capsule; the view
-    then owns that capsule and frees it on ``__dealloc__``."""
-    src = np.arange(6, dtype=np.int32)
-    wrapper = _DLPackNoMaxVersion(src)
-    view = StridedMemoryView.from_dlpack(wrapper, stream_ptr=-1)
-    # Guard the TypeError fallback path: versioned attempt, then legacy retry.
-    assert len(wrapper.max_versions) == 2, f"expected versioned attempt + retry, got {wrapper.max_versions}"
-    assert isinstance(wrapper.max_versions[0], tuple)  # versioned attempt was made
-    assert wrapper.max_versions[1] is None  # fallback retried without max_version
-    assert view.ptr == src.ctypes.data
-    out = np.from_dlpack(view)
-    assert np.array_equal(out, src)
-    del view  # exercise __dealloc__ on the imported (used) unversioned capsule
-
-
 def test_strided_memory_view_proxy_cai_only_has_dlpack_false():
     """``_StridedMemoryViewProxy`` records ``has_dlpack=False`` for an object
     that exposes only ``__cuda_array_interface__`` (check_has_dlpack CAI branch)."""
@@ -1283,11 +1073,15 @@ def test_strided_memory_view_proxy_cai_only_has_dlpack_false():
 
 def test_view_as_cai_device_pointer_and_stream_ordering(init_cuda):
     """``view_as_cai`` on a real device pointer resolves the device ordinal via
-    ``cuPointerGetAttribute`` and, when the CAI ``stream`` differs from the
-    consumer stream, establishes stream ordering through an event.
-
-    Uses a synthetic CAI object backed by a genuine device allocation, so the
-    cupy/numba-only device branch is exercised without those optional deps.
+    ``cuPointerGetAttribute`` and takes the cross-stream branch when the CAI
+    ``stream`` differs from the consumer stream.
+
+    This only exercises the code path and checks *device* correctness (ptr,
+    device_id, shape); it does NOT verify stream-order correctness -- that would
+    need many queued kernels on the producer stream plus an observable
+    dependency on the consumer side. Uses a synthetic CAI object backed by a
+    genuine device allocation, so the cupy/numba-only device branch is exercised
+    without those optional deps.
     """
     dev = init_cuda
     buffer = dev.memory_resource.allocate(64, stream=dev.default_stream)
@@ -1314,7 +1108,7 @@ def test_strided_memory_view_init_cai_path_deprecated(init_cuda):
     """The deprecated ``StridedMemoryView(obj)`` constructor routes a CAI-only
     object through the CAI branch (warn + ``view_as_cai``), not the DLPack one."""
     obj = _make_cuda_array_interface_obj(shape=(4,), strides=None, typestr="<f4", data=(0, False))
-    with pytest.deprecated_call(match="deprecated"):
+    with pytest.deprecated_call(match="CUDA-array-interface-supporting"):
         view = StridedMemoryView(obj, stream_ptr=-1)
     assert view.is_device_accessible is True
     assert view.shape == (4,)
@@ -1355,164 +1149,3 @@ def test_strided_memory_view_repr_with_none_dtype(init_cuda):
     r = repr(view)
     assert r.startswith("StridedMemoryView(ptr=")
     assert "dtype=NoneType" in r
-
-
-# ---------------------------------------------------------------------------
-# DLPack C exchange API (`__dlpack_c_exchange_api__`)
-#
-# Drive the C function pointers exposed by the capsule the way a native
-# consumer would, exercising the StridedMemoryView exchange-API implementation.
-# ---------------------------------------------------------------------------
-
-_PyCapsule_GetPointer = ctypes.pythonapi.PyCapsule_GetPointer
-_PyCapsule_GetPointer.argtypes = (ctypes.py_object, ctypes.c_char_p)
-_PyCapsule_GetPointer.restype = ctypes.c_void_p
-
-
-class _DLPackVersion(ctypes.Structure):
-    _fields_ = [("major", ctypes.c_uint32), ("minor", ctypes.c_uint32)]
-
-
-class _DLPackExchangeAPIHeader(ctypes.Structure):
-    _fields_ = [("version", _DLPackVersion), ("prev_api", ctypes.c_void_p)]
-
-
-class _DLDevice(ctypes.Structure):
-    _fields_ = [("device_type", ctypes.c_int), ("device_id", ctypes.c_int32)]
-
-
-class _DLDataType(ctypes.Structure):
-    _fields_ = [("code", ctypes.c_uint8), ("bits", ctypes.c_uint8), ("lanes", ctypes.c_uint16)]
-
-
-class _DLTensor(ctypes.Structure):
-    _fields_ = [
-        ("data", ctypes.c_void_p),
-        ("device", _DLDevice),
-        ("ndim", ctypes.c_int32),
-        ("dtype", _DLDataType),
-        ("shape", ctypes.POINTER(ctypes.c_int64)),
-        ("strides", ctypes.POINTER(ctypes.c_int64)),
-        ("byte_offset", ctypes.c_uint64),
-    ]
-
-
-_FN_FROM_PY = ctypes.CFUNCTYPE(ctypes.c_int, ctypes.c_void_p, ctypes.POINTER(ctypes.c_void_p))
-_FN_TO_PY = ctypes.CFUNCTYPE(ctypes.c_int, ctypes.c_void_p, ctypes.POINTER(ctypes.c_void_p))
-_FN_DLTENSOR_FROM_PY = ctypes.CFUNCTYPE(ctypes.c_int, ctypes.c_void_p, ctypes.c_void_p)
-_FN_ALLOCATOR = ctypes.CFUNCTYPE(
-    ctypes.c_int, ctypes.c_void_p, ctypes.POINTER(ctypes.c_void_p), ctypes.c_void_p, ctypes.c_void_p
-)
-_FN_CURRENT_STREAM = ctypes.CFUNCTYPE(ctypes.c_int, ctypes.c_int, ctypes.c_int32, ctypes.POINTER(ctypes.c_void_p))
-
-
-class _DLPackExchangeAPI(ctypes.Structure):
-    _fields_ = [
-        ("header", _DLPackExchangeAPIHeader),
-        ("managed_tensor_allocator", _FN_ALLOCATOR),
-        ("managed_tensor_from_py_object_no_sync", _FN_FROM_PY),
-        ("managed_tensor_to_py_object_no_sync", _FN_TO_PY),
-        ("dltensor_from_py_object_no_sync", _FN_DLTENSOR_FROM_PY),
-        ("current_work_stream", _FN_CURRENT_STREAM),
-    ]
-
-
-def _get_exchange_api():
-    capsule = StridedMemoryView.__dlpack_c_exchange_api__
-    ptr = _PyCapsule_GetPointer(capsule, b"dlpack_exchange_api")
-    assert ptr
-    return ctypes.cast(ptr, ctypes.POINTER(_DLPackExchangeAPI)).contents
-
-
-def test_dlpack_c_exchange_api_header_version():
-    """The exchange-API header advertises a non-zero DLPack version."""
-    api = _get_exchange_api()
-    assert (api.header.version.major, api.header.version.minor) >= (1, 0)
-    assert not api.header.prev_api
-
-
-def test_dlpack_c_exchange_api_current_work_stream():
-    """``current_work_stream`` reports no current stream (cuda.core has none)."""
-    api = _get_exchange_api()
-    out = ctypes.c_void_p(123)
-    rc = api.current_work_stream(int(DLDeviceType.kDLCPU), 0, ctypes.byref(out))
-    assert rc == 0
-    assert not out.value  # set back to NULL
-
-
-def test_dlpack_c_exchange_api_dltensor_from_py_object():
-    """``dltensor_from_py_object_no_sync`` fills a borrowed DLTensor from a view."""
-    api = _get_exchange_api()
-    src = np.arange(12, dtype=np.int32).reshape(3, 4)
-    view = StridedMemoryView.from_any_interface(src, stream_ptr=-1)
-    out = _DLTensor()
-    rc = api.dltensor_from_py_object_no_sync(id(view), ctypes.byref(out))
-    assert rc == 0
-    assert out.ndim == 2
-    assert out.device.device_type == int(DLDeviceType.kDLCPU)
-    assert out.data == src.ctypes.data
-    assert [out.shape[i] for i in range(out.ndim)] == [3, 4]
-
-
-def _exchange_api_cause(exc):
-    """Underlying exception raised by the noexcept C fn (surfaced by ctypes as
-    SystemError, with the real error chained as __cause__ or __context__)."""
-    return exc.value.__cause__ or exc.value.__context__
-
-
-def test_dlpack_c_exchange_api_dltensor_from_py_object_type_error():
-    """A non-StridedMemoryView py_object is rejected (TypeError, rc=-1)."""
-    api = _get_exchange_api()
-    not_a_view = object()
-    out = _DLTensor()
-    with pytest.raises(SystemError) as exc:
-        api.dltensor_from_py_object_no_sync(id(not_a_view), ctypes.byref(out))
-    assert isinstance(_exchange_api_cause(exc), TypeError)
-
-
-def test_dlpack_c_exchange_api_managed_tensor_roundtrip():
-    """``managed_tensor_from_py_object_no_sync`` produces a managed tensor that
-    ``managed_tensor_to_py_object_no_sync`` turns back into a StridedMemoryView.
-
-    This exercises the versioned export fill and the capsule-import helper.
-    The reconstructed view intentionally keeps a reference (the C side holds one
-    via Py_INCREF), so the managed tensor is not freed here -- avoiding any
-    double-free across the two calls that share the same tensor.
-    """
-    api = _get_exchange_api()
-    src = np.arange(6, dtype=np.float64).reshape(2, 3)
-    view = StridedMemoryView.from_any_interface(src, stream_ptr=-1)
-
-    tensor = ctypes.c_void_p(0)
-    rc = api.managed_tensor_from_py_object_no_sync(id(view), ctypes.byref(tensor))
-    assert rc == 0
-    assert tensor.value  # non-NULL DLManagedTensorVersioned*
-
-    out_obj = ctypes.c_void_p(0)
-    rc = api.managed_tensor_to_py_object_no_sync(tensor, ctypes.byref(out_obj))
-    assert rc == 0
-    assert out_obj.value
-    imported = ctypes.cast(ctypes.c_void_p(out_obj.value), ctypes.py_object).value
-    assert isinstance(imported, StridedMemoryView)
-    assert imported.shape == (2, 3)
-    assert imported.ptr == src.ctypes.data
-
-
-def test_dlpack_c_exchange_api_to_py_object_null_tensor():
-    """``managed_tensor_to_py_object_no_sync`` rejects a NULL tensor (RuntimeError)."""
-    api = _get_exchange_api()
-    out_obj = ctypes.c_void_p(0)
-    with pytest.raises(SystemError) as exc:
-        api.managed_tensor_to_py_object_no_sync(None, ctypes.byref(out_obj))
-    assert isinstance(_exchange_api_cause(exc), RuntimeError)
-    assert not out_obj.value  # set to NULL before the error
-
-
-def test_dlpack_c_exchange_api_managed_tensor_allocator_not_supported():
-    """``managed_tensor_allocator`` is unsupported (NotImplementedError)."""
-    api = _get_exchange_api()
-    out = ctypes.c_void_p(123)
-    with pytest.raises(SystemError) as exc:
-        api.managed_tensor_allocator(None, ctypes.byref(out), None, None)
-    assert isinstance(_exchange_api_cause(exc), NotImplementedError)
-    assert not out.value  # set to NULL before the error
diff --git a/cuda_core/tests/test_utils_dlpack.py b/cuda_core/tests/test_utils_dlpack.py
new file mode 100644
index 00000000000..03f6e8c3432
--- /dev/null
+++ b/cuda_core/tests/test_utils_dlpack.py
@@ -0,0 +1,324 @@
+# SPDX-FileCopyrightText: Copyright (c) 2024-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+#
+# SPDX-License-Identifier: Apache-2.0
+
+"""DLPack-focused tests for ``StridedMemoryView``.
+
+Split out of ``test_utils.py`` (which had grown large): export/import
+round-trips, capsule + deleter paths, ``from_dlpack`` error handling, and the
+``__dlpack_c_exchange_api__`` C exchange-API helpers driven through ctypes.
+CAI-only behavior stays in ``test_utils.py``.
+"""
+
+import ctypes
+
+try:
+    import ml_dtypes
+except ImportError:
+    ml_dtypes = None
+import numpy as np
+import pytest
+
+from cuda.core._dlpack import DLDeviceType
+from cuda.core.utils import StridedMemoryView
+
+_PyCapsule_IsValid = ctypes.pythonapi.PyCapsule_IsValid
+_PyCapsule_IsValid.argtypes = (ctypes.py_object, ctypes.c_char_p)
+_PyCapsule_IsValid.restype = ctypes.c_int
+
+
+_NUMPY_NATIVE_DLPACK_DTYPES = (
+    np.uint8,
+    np.uint16,
+    np.uint32,
+    np.uint64,
+    np.int8,
+    np.int16,
+    np.int32,
+    np.int64,
+    np.float16,
+    np.float32,
+    np.float64,
+    np.complex64,
+    np.complex128,
+    np.bool_,
+)
+if ml_dtypes is not None:
+    # Supported on NumPy 2.5 and ml_dtypes (probably) 0.5.5+. On older stacks the
+    # per-test probe skips it, since NumPy's __dlpack__ doesn't reliably export
+    # ml_dtypes-extended dtypes (covered separately via jax/torch).
+    _NUMPY_NATIVE_DLPACK_DTYPES += (ml_dtypes.bfloat16,)
+
+
+def _assert_dlpack_export_roundtrip(src):
+    # Skip only if NumPy itself can't round-trip this dtype/shape; past the
+    # probe, a failure on our view is a regression, not an env limitation.
+    try:
+        np.from_dlpack(src)
+    except (BufferError, TypeError, RuntimeError) as e:
+        pytest.skip(f"NumPy does not support DLPack for {src.dtype} {src.shape}: {e}")
+    view = StridedMemoryView.from_any_interface(src, stream_ptr=-1)
+    out = np.from_dlpack(view)
+    assert out.dtype == src.dtype
+    assert out.shape == src.shape
+    assert np.array_equal(out, src)
+
+
+@pytest.mark.parametrize("dtype", _NUMPY_NATIVE_DLPACK_DTYPES)
+def test_dlpack_export_roundtrip_dtypes(dtype):
+    """Export every NumPy-native DLPack dtype through ``StridedMemoryView.__dlpack__``."""
+    _assert_dlpack_export_roundtrip(np.zeros((2, 3), dtype=dtype))
+
+
+@pytest.mark.parametrize(
+    "shape",
+    [pytest.param((), id="scalar"), pytest.param((0, 3), id="empty")],
+)
+def test_dlpack_export_roundtrip_special_shapes(shape):
+    """Export scalar and zero-volume shapes through ``StridedMemoryView.__dlpack__``."""
+    _assert_dlpack_export_roundtrip(np.zeros(shape, dtype=np.complex128))
+
+
+def test_dlpack_export_unversioned_capsule_and_deleter():
+    """``__dlpack__()`` with no ``max_version`` yields an *unversioned* unused
+    DLPack capsule; dropping it unconsumed runs ``_smv_pycapsule_deleter`` on
+    the non-versioned branch (freeing the managed tensor)."""
+    src = np.arange(6, dtype=np.int32)
+    view = StridedMemoryView.from_any_interface(src, stream_ptr=-1)
+    capsule = view.__dlpack__()
+    assert _PyCapsule_IsValid(capsule, b"dltensor") == 1
+    assert _PyCapsule_IsValid(capsule, b"dltensor_versioned") == 0
+    del capsule  # unconsumed -> deleter frees dlm_tensor
+
+
+def test_dlpack_export_versioned_capsule_and_deleter():
+    """``__dlpack__(max_version=(1, 0))`` yields a *versioned* unused capsule;
+    dropping it unconsumed runs the versioned ``_smv_pycapsule_deleter`` branch."""
+    src = np.arange(6, dtype=np.int32)
+    view = StridedMemoryView.from_any_interface(src, stream_ptr=-1)
+    capsule = view.__dlpack__(max_version=(1, 0))
+    assert _PyCapsule_IsValid(capsule, b"dltensor_versioned") == 1
+    assert _PyCapsule_IsValid(capsule, b"dltensor") == 0
+    del capsule  # unconsumed -> versioned deleter frees dlm_tensor_ver
+
+
+def test_from_dlpack_cpu_stream_none_ambiguous():
+    """A CPU DLPack source with ``stream_ptr=None`` is rejected as ambiguous."""
+    src = np.arange(4, dtype=np.float32)
+    with pytest.raises(BufferError, match="stream=None is ambiguous"):
+        StridedMemoryView.from_dlpack(src, stream_ptr=None)
+
+
+def test_from_dlpack_unsupported_device_type():
+    """``view_as_dlpack`` rejects a DLPack device that is neither CPU, CUDA,
+    CUDA-pinned, nor CUDA-managed before ever calling ``__dlpack__``."""
+
+    class _FakeUnsupportedDevice:
+        def __dlpack_device__(self):
+            return (7, 0)  # e.g. kDLVulkan -- unsupported by cuda.core
+
+        def __dlpack__(self, **kwargs):
+            raise AssertionError("__dlpack__ must not be reached")
+
+    with pytest.raises(BufferError, match="device not supported"):
+        StridedMemoryView.from_dlpack(_FakeUnsupportedDevice(), stream_ptr=0)
+
+
+class _DLPackNoMaxVersion:
+    """Wraps a StridedMemoryView but rejects the ``max_version`` kwarg, forcing the
+    TypeError fallback in ``view_as_dlpack`` and an *unversioned* capsule import.
+
+    Backed by a StridedMemoryView (not NumPy directly) so the test stays valid
+    even if NumPy eventually stops exporting unversioned (0.x) DLPack capsules."""
+
+    def __init__(self, arr):
+        self._arr = StridedMemoryView.from_any_interface(arr, stream_ptr=-1)
+        self.max_versions = []  # max_version seen on each __dlpack__ call, in order
+
+    def __dlpack_device__(self):
+        return self._arr.__dlpack_device__()
+
+    def __dlpack__(self, *, stream=None, max_version=None, **kwargs):
+        self.max_versions.append(max_version)
+        if max_version is not None:
+            raise TypeError("max_version is not supported")
+        return self._arr.__dlpack__(stream=stream)
+
+
+def test_from_dlpack_typeerror_fallback_unversioned_import():
+    """When ``__dlpack__(max_version=...)`` raises TypeError, view_as_dlpack
+    retries without it and imports the resulting unversioned capsule; the view
+    then owns that capsule and frees it on ``__dealloc__``."""
+    src = np.arange(6, dtype=np.int32)
+    wrapper = _DLPackNoMaxVersion(src)
+    view = StridedMemoryView.from_dlpack(wrapper, stream_ptr=-1)
+    # Guard the TypeError fallback path: versioned attempt, then legacy retry.
+    assert len(wrapper.max_versions) == 2, f"expected versioned attempt + retry, got {wrapper.max_versions}"
+    assert isinstance(wrapper.max_versions[0], tuple)  # versioned attempt was made
+    assert wrapper.max_versions[1] is None  # fallback retried without max_version
+    assert view.ptr == src.ctypes.data
+    out = np.from_dlpack(view)
+    assert np.array_equal(out, src)
+    del view  # exercise __dealloc__ on the imported (used) unversioned capsule
+
+
+# ---------------------------------------------------------------------------
+# DLPack C exchange API (`__dlpack_c_exchange_api__`)
+#
+# Drive the C function pointers exposed by the capsule the way a native
+# consumer would, exercising the StridedMemoryView exchange-API implementation.
+#
+# The C functions report failure by setting a Python error and returning -1.
+# Defining the pointers with PYFUNCTYPE (Python calling convention) lets ctypes
+# propagate that real exception (TypeError/RuntimeError/NotImplementedError)
+# instead of wrapping it in a SystemError, so the tests assert the meaningful
+# type directly.
+# ---------------------------------------------------------------------------
+
+_PyCapsule_GetPointer = ctypes.pythonapi.PyCapsule_GetPointer
+_PyCapsule_GetPointer.argtypes = (ctypes.py_object, ctypes.c_char_p)
+_PyCapsule_GetPointer.restype = ctypes.c_void_p
+
+
+class _DLPackVersion(ctypes.Structure):
+    _fields_ = [("major", ctypes.c_uint32), ("minor", ctypes.c_uint32)]
+
+
+class _DLPackExchangeAPIHeader(ctypes.Structure):
+    _fields_ = [("version", _DLPackVersion), ("prev_api", ctypes.c_void_p)]
+
+
+class _DLDevice(ctypes.Structure):
+    _fields_ = [("device_type", ctypes.c_int), ("device_id", ctypes.c_int32)]
+
+
+class _DLDataType(ctypes.Structure):
+    _fields_ = [("code", ctypes.c_uint8), ("bits", ctypes.c_uint8), ("lanes", ctypes.c_uint16)]
+
+
+class _DLTensor(ctypes.Structure):
+    _fields_ = [
+        ("data", ctypes.c_void_p),
+        ("device", _DLDevice),
+        ("ndim", ctypes.c_int32),
+        ("dtype", _DLDataType),
+        ("shape", ctypes.POINTER(ctypes.c_int64)),
+        ("strides", ctypes.POINTER(ctypes.c_int64)),
+        ("byte_offset", ctypes.c_uint64),
+    ]
+
+
+_FN_FROM_PY = ctypes.PYFUNCTYPE(ctypes.c_int, ctypes.c_void_p, ctypes.POINTER(ctypes.c_void_p))
+_FN_TO_PY = ctypes.PYFUNCTYPE(ctypes.c_int, ctypes.c_void_p, ctypes.POINTER(ctypes.c_void_p))
+_FN_DLTENSOR_FROM_PY = ctypes.PYFUNCTYPE(ctypes.c_int, ctypes.c_void_p, ctypes.c_void_p)
+_FN_ALLOCATOR = ctypes.PYFUNCTYPE(
+    ctypes.c_int, ctypes.c_void_p, ctypes.POINTER(ctypes.c_void_p), ctypes.c_void_p, ctypes.c_void_p
+)
+_FN_CURRENT_STREAM = ctypes.PYFUNCTYPE(ctypes.c_int, ctypes.c_int, ctypes.c_int32, ctypes.POINTER(ctypes.c_void_p))
+
+
+class _DLPackExchangeAPI(ctypes.Structure):
+    _fields_ = [
+        ("header", _DLPackExchangeAPIHeader),
+        ("managed_tensor_allocator", _FN_ALLOCATOR),
+        ("managed_tensor_from_py_object_no_sync", _FN_FROM_PY),
+        ("managed_tensor_to_py_object_no_sync", _FN_TO_PY),
+        ("dltensor_from_py_object_no_sync", _FN_DLTENSOR_FROM_PY),
+        ("current_work_stream", _FN_CURRENT_STREAM),
+    ]
+
+
+def _get_exchange_api():
+    capsule = StridedMemoryView.__dlpack_c_exchange_api__
+    ptr = _PyCapsule_GetPointer(capsule, b"dlpack_exchange_api")
+    assert ptr
+    return ctypes.cast(ptr, ctypes.POINTER(_DLPackExchangeAPI)).contents
+
+
+def test_dlpack_c_exchange_api_header_version():
+    """The exchange-API header advertises DLPack version >= 1.0 and no ``prev_api``."""
+    api = _get_exchange_api()
+    assert (api.header.version.major, api.header.version.minor) >= (1, 0)
+    assert not api.header.prev_api
+
+
+def test_dlpack_c_exchange_api_current_work_stream():
+    """``current_work_stream`` reports no current stream (cuda.core has none)."""
+    api = _get_exchange_api()
+    out = ctypes.c_void_p(123)
+    rc = api.current_work_stream(int(DLDeviceType.kDLCPU), 0, ctypes.byref(out))
+    assert rc == 0
+    assert not out.value  # set back to NULL
+
+
+def test_dlpack_c_exchange_api_dltensor_from_py_object():
+    """``dltensor_from_py_object_no_sync`` fills a borrowed DLTensor from a view."""
+    api = _get_exchange_api()
+    src = np.arange(12, dtype=np.int32).reshape(3, 4)
+    view = StridedMemoryView.from_any_interface(src, stream_ptr=-1)
+    out = _DLTensor()
+    rc = api.dltensor_from_py_object_no_sync(id(view), ctypes.byref(out))
+    assert rc == 0
+    assert out.ndim == 2
+    assert out.device.device_type == int(DLDeviceType.kDLCPU)
+    assert out.data == src.ctypes.data
+    assert [out.shape[i] for i in range(out.ndim)] == [3, 4]
+
+
+def test_dlpack_c_exchange_api_dltensor_from_py_object_type_error():
+    """A non-StridedMemoryView py_object is rejected (TypeError, rc=-1)."""
+    api = _get_exchange_api()
+    not_a_view = object()
+    out = _DLTensor()
+    with pytest.raises(TypeError, match="must be a StridedMemoryView"):
+        api.dltensor_from_py_object_no_sync(id(not_a_view), ctypes.byref(out))
+
+
+def test_dlpack_c_exchange_api_managed_tensor_roundtrip():
+    """``managed_tensor_from_py_object_no_sync`` produces a managed tensor that
+    ``managed_tensor_to_py_object_no_sync`` turns back into a StridedMemoryView.
+
+    This exercises the versioned export fill and the capsule-import helper.
+    The reconstructed view intentionally keeps a reference (the C side holds one
+    via Py_INCREF), so the managed tensor is not freed here -- avoiding any
+    double-free across the two calls that share the same tensor.
+    """
+    api = _get_exchange_api()
+    src = np.arange(6, dtype=np.float64).reshape(2, 3)
+    view = StridedMemoryView.from_any_interface(src, stream_ptr=-1)
+
+    tensor = ctypes.c_void_p(0)
+    rc = api.managed_tensor_from_py_object_no_sync(id(view), ctypes.byref(tensor))
+    assert rc == 0
+    assert tensor.value  # non-NULL DLManagedTensorVersioned*
+
+    out_obj = ctypes.c_void_p(0)
+    rc = api.managed_tensor_to_py_object_no_sync(tensor, ctypes.byref(out_obj))
+    assert rc == 0
+    assert out_obj.value
+    imported = ctypes.cast(ctypes.c_void_p(out_obj.value), ctypes.py_object).value
+    assert isinstance(imported, StridedMemoryView)
+    assert imported.shape == (2, 3)
+    assert imported.ptr == src.ctypes.data
+
+
+def test_dlpack_c_exchange_api_to_py_object_null_tensor():
+    """``managed_tensor_to_py_object_no_sync`` rejects a NULL tensor (RuntimeError)."""
+    api = _get_exchange_api()
+    out_obj = ctypes.c_void_p(0)
+    with pytest.raises(RuntimeError, match="tensor cannot be NULL"):
+        api.managed_tensor_to_py_object_no_sync(None, ctypes.byref(out_obj))
+    assert not out_obj.value  # set to NULL before the error
+
+
+def test_dlpack_c_exchange_api_managed_tensor_allocator_not_supported():
+    """``managed_tensor_allocator`` is unsupported (NotImplementedError).
+
+    The implementation sets a Python error even when no ``SetError`` callback is
+    passed, so with PYFUNCTYPE ctypes surfaces the NotImplementedError directly.
+    """
+    api = _get_exchange_api()
+    out = ctypes.c_void_p(123)
+    with pytest.raises(NotImplementedError, match="not supported"):
+        api.managed_tensor_allocator(None, ctypes.byref(out), None, None)
+    assert not out.value  # set to NULL before the error

From c22e9dec3a00bd8080b8c5c855a45947dc307fac Mon Sep 17 00:00:00 2001
From: Rui Luo <ruluo@nvidia.com>
Date: Wed, 17 Jun 2026 13:23:02 +0800
Subject: [PATCH 3/4] Fix pytest parametrize collection for CPU samples

---
 cuda_core/tests/test_utils.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/cuda_core/tests/test_utils.py b/cuda_core/tests/test_utils.py
index fbb3367db88..1ac5e8a67eb 100644
--- a/cuda_core/tests/test_utils.py
+++ b/cuda_core/tests/test_utils.py
@@ -148,7 +148,7 @@ def _cpu_array_samples():
     return samples
 
 
-@pytest.mark.parametrize("in_arr,", _cpu_array_samples())
+@pytest.mark.parametrize("in_arr", _cpu_array_samples())
 class TestViewCPU:
     def test_args_viewable_as_strided_memory_cpu(self, in_arr):
         @args_viewable_as_strided_memory((0,))

From 007cbdd9382d9a1d7cc7d1d0c02b0459d5c983ca Mon Sep 17 00:00:00 2001
From: Rui Luo <ruluo@nvidia.com>
Date: Wed, 17 Jun 2026 15:33:54 +0800
Subject: [PATCH 4/4] coverage: refine cuda.core coverage tests comments

---
 cuda_core/tests/test_tensor_map.py   | 11 +++--------
 cuda_core/tests/test_utils.py        | 10 ++++------
 cuda_core/tests/test_utils_dlpack.py | 23 +++++++----------------
 3 files changed, 14 insertions(+), 30 deletions(-)

diff --git a/cuda_core/tests/test_tensor_map.py b/cuda_core/tests/test_tensor_map.py
index 77629632844..7abbaadb483 100644
--- a/cuda_core/tests/test_tensor_map.py
+++ b/cuda_core/tests/test_tensor_map.py
@@ -309,14 +309,9 @@ def test_invalid_data_type(self, dev, skip_if_no_tma):
             )
 
     def test_as_tensor_map_host_view_rejected_without_tma(self):
-        """``as_tensor_map`` on a non-device-accessible (host) view fails
-        gracefully with a clear error, without needing TMA-capable hardware.
-
-        This drives the real ``as_tensor_map`` -> ``_from_tiled`` path: every
-        keyword is assembled and the options are coerced before the
-        device-accessibility guard rejects the host pointer, so no monkeypatching
-        is required to cover the forwarding logic.
-        """
+        """``as_tensor_map`` rejects a non-device-accessible (host) view with a
+        clear error, exercising the ``as_tensor_map`` -> ``_from_tiled`` path
+        without needing TMA-capable hardware."""
         host = np.zeros((64, 64), dtype=np.float32)
         view = StridedMemoryView.from_any_interface(host, stream_ptr=-1)
         with pytest.raises(ValueError, match="device-accessible"):
diff --git a/cuda_core/tests/test_utils.py b/cuda_core/tests/test_utils.py
index 1ac5e8a67eb..ebee8d87b04 100644
--- a/cuda_core/tests/test_utils.py
+++ b/cuda_core/tests/test_utils.py
@@ -1077,11 +1077,9 @@ def test_view_as_cai_device_pointer_and_stream_ordering(init_cuda):
     ``stream`` differs from the consumer stream.
 
     This only exercises the code path and checks *device* correctness (ptr,
-    device_id, shape); it does NOT verify stream-order correctness -- that would
-    need many queued kernels on the producer stream plus an observable
-    dependency on the consumer side. Uses a synthetic CAI object backed by a
-    genuine device allocation, so the cupy/numba-only device branch is exercised
-    without those optional deps.
+    device_id, shape); it does NOT verify stream-order correctness. Uses a
+    synthetic CAI object backed by a genuine device allocation, so the
+    cupy/numba-only device branch is exercised without those optional deps.
     """
     dev = init_cuda
     buffer = dev.memory_resource.allocate(64, stream=dev.default_stream)
@@ -1108,7 +1106,7 @@ def test_strided_memory_view_init_cai_path_deprecated(init_cuda):
     """The deprecated ``StridedMemoryView(obj)`` constructor routes a CAI-only
     object through the CAI branch (warn + ``view_as_cai``), not the DLPack one."""
     obj = _make_cuda_array_interface_obj(shape=(4,), strides=None, typestr="<f4", data=(0, False))
-    with pytest.deprecated_call(match="CUDA-array-interface-supporting"):
+    with pytest.deprecated_call(match="CUDA-array-interface-supporting object is deprecated"):
         view = StridedMemoryView(obj, stream_ptr=-1)
     assert view.is_device_accessible is True
     assert view.shape == (4,)
diff --git a/cuda_core/tests/test_utils_dlpack.py b/cuda_core/tests/test_utils_dlpack.py
index 03f6e8c3432..8033d8250b2 100644
--- a/cuda_core/tests/test_utils_dlpack.py
+++ b/cuda_core/tests/test_utils_dlpack.py
@@ -2,12 +2,9 @@
 #
 # SPDX-License-Identifier: Apache-2.0
 
-"""DLPack-focused tests for ``StridedMemoryView``.
-
-Split out of ``test_utils.py`` (which had grown large): export/import
-round-trips, capsule + deleter paths, ``from_dlpack`` error handling, and the
+"""DLPack tests for ``StridedMemoryView``: export/import round-trips, capsule +
+deleter paths, ``from_dlpack`` error handling, and the
 ``__dlpack_c_exchange_api__`` C exchange-API helpers driven through ctypes.
-CAI-only behavior stays in ``test_utils.py``.
 """
 
 import ctypes
@@ -167,12 +164,8 @@ def test_from_dlpack_typeerror_fallback_unversioned_import():
 #
 # Drive the C function pointers exposed by the capsule the way a native
 # consumer would, exercising the StridedMemoryView exchange-API implementation.
-#
-# The C functions report failure by setting a Python error and returning -1.
-# Defining the pointers with PYFUNCTYPE (Python calling convention) lets ctypes
-# propagate that real exception (TypeError/RuntimeError/NotImplementedError)
-# instead of wrapping it in a SystemError, so the tests assert the meaningful
-# type directly.
+# Pointers use PYFUNCTYPE so a failing call raises its real Python exception
+# (TypeError/RuntimeError/NotImplementedError).
 # ---------------------------------------------------------------------------
 
 _PyCapsule_GetPointer = ctypes.pythonapi.PyCapsule_GetPointer
@@ -312,13 +305,11 @@ def test_dlpack_c_exchange_api_to_py_object_null_tensor():
 
 
 def test_dlpack_c_exchange_api_managed_tensor_allocator_not_supported():
-    """``managed_tensor_allocator`` is unsupported (NotImplementedError).
-
-    The implementation sets a Python error even when no ``SetError`` callback is
-    passed, so with PYFUNCTYPE ctypes surfaces the NotImplementedError directly.
-    """
+    """Covers the ``managed_tensor_allocator`` entry point, which is unsupported
+    and only ever raises NotImplementedError (StridedMemoryView never allocates)."""
     api = _get_exchange_api()
     out = ctypes.c_void_p(123)
     with pytest.raises(NotImplementedError, match="not supported"):
+        # The implementation sets a Python error even when no `SetError` callback is passed.
         api.managed_tensor_allocator(None, ctypes.byref(out), None, None)
     assert not out.value  # set to NULL before the error