From f4ce3dc140f5d3abee707f853574bb12bf620131 Mon Sep 17 00:00:00 2001
From: Arkadiusz Lachowicz <areklachowicz@gmail.com>
Date: Wed, 6 May 2026 09:28:24 +0200
Subject: [PATCH 01/27] Move CUDA interop behind extension target

---
 CMakeLists.txt                                | 15 +--
 cmake/FindZLIB.cmake                          |  4 +-
 examples_tests                                |  2 +-
 .../{video => ext/CUDAInterop}/CCUDADevice.h  | 13 ++-
 .../CUDAInterop}/CCUDAExportableMemory.h      |  6 +-
 .../{video => ext/CUDAInterop}/CCUDAHandler.h | 70 ++++++++++++-
 .../CUDAInterop}/CCUDAImportedMemory.h        | 16 +--
 .../CUDAInterop}/CCUDAImportedSemaphore.h     |  6 +-
 include/nbl/ext/CUDAInterop/CUDAInterop.h     |  9 ++
 include/nbl/ext/OptiX/IDenoiser.h             |  4 +-
 include/nbl/system/DefaultFuncPtrLoader.h     |  4 +-
 include/nbl/video/EApiType.h                  |  6 ++
 include/nbl/video/declarations.h              |  5 +-
 src/nbl/CMakeLists.txt                        | 23 +----
 src/nbl/ext/CMakeLists.txt                    | 12 +++
 .../CUDAInterop}/CCUDADevice.cpp              |  5 +-
 .../CUDAInterop}/CCUDAExportableMemory.cpp    |  7 +-
 .../CUDAInterop}/CCUDAHandler.cpp             | 20 +++-
 .../CUDAInterop}/CCUDAImportedMemory.cpp      |  7 +-
 .../CUDAInterop}/CCUDAImportedSemaphore.cpp   |  7 +-
 src/nbl/ext/CUDAInterop/CMakeLists.txt        | 46 +++++++++
 src/nbl/ext/CUDAInterop/smoke/CMakeLists.txt  | 35 +++++++
 src/nbl/ext/CUDAInterop/smoke/opt_in.cpp      | 97 +++++++++++++++++++
 .../ext/CUDAInterop/smoke/public_boundary.cpp | 15 +++
 24 files changed, 366 insertions(+), 68 deletions(-)
 rename include/nbl/{video => ext/CUDAInterop}/CCUDADevice.h (93%)
 rename include/nbl/{video => ext/CUDAInterop}/CCUDAExportableMemory.h (93%)
 rename include/nbl/{video => ext/CUDAInterop}/CCUDAHandler.h (78%)
 rename include/nbl/{video => ext/CUDAInterop}/CCUDAImportedMemory.h (74%)
 rename include/nbl/{video => ext/CUDAInterop}/CCUDAImportedSemaphore.h (90%)
 create mode 100644 include/nbl/ext/CUDAInterop/CUDAInterop.h
 rename src/nbl/{video => ext/CUDAInterop}/CCUDADevice.cpp (98%)
 rename src/nbl/{video => ext/CUDAInterop}/CCUDAExportableMemory.cpp (90%)
 rename src/nbl/{video => ext/CUDAInterop}/CCUDAHandler.cpp (97%)
 rename src/nbl/{video => ext/CUDAInterop}/CCUDAImportedMemory.cpp (84%)
 rename src/nbl/{video => ext/CUDAInterop}/CCUDAImportedSemaphore.cpp (71%)
 create mode 100644 src/nbl/ext/CUDAInterop/CMakeLists.txt
 create mode 100644 src/nbl/ext/CUDAInterop/smoke/CMakeLists.txt
 create mode 100644 src/nbl/ext/CUDAInterop/smoke/opt_in.cpp
 create mode 100644 src/nbl/ext/CUDAInterop/smoke/public_boundary.cpp

diff --git a/CMakeLists.txt b/CMakeLists.txt
index fa74e167f0..ff90d862ce 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -70,9 +70,13 @@ else()
 	message(STATUS "Vulkan SDK is not found")
 endif()
 
-option(NBL_COMPILE_WITH_CUDA "Compile with CUDA interop?" OFF)
+option(NBL_COMPILE_WITH_CUDA "Build the CUDA interop extension?" OFF)
+set(NBL_CUDA_TOOLKIT_ROOT "" CACHE PATH "Optional CUDA Toolkit root used when NBL_COMPILE_WITH_CUDA is ON")
 
 if(NBL_COMPILE_WITH_CUDA)
+	if(NBL_CUDA_TOOLKIT_ROOT)
+		set(CUDAToolkit_ROOT "${NBL_CUDA_TOOLKIT_ROOT}" CACHE PATH "CUDA Toolkit root" FORCE)
+	endif()
 	find_package(CUDAToolkit REQUIRED)
 	if(${CUDAToolkit_VERSION} VERSION_GREATER_EQUAL "13.0")
 		message(STATUS "CUDA version ${CUDAToolkit_VERSION} found!")
@@ -183,13 +187,12 @@ option(NBL_BUILD_IMGUI "Enable nbl::ext::ImGui?" ON)
 option(NBL_BUILD_DEBUG_DRAW "Enable Nabla Debug Draw extension?" ON)
 
 option(NBL_BUILD_OPTIX "Enable nbl::ext::OptiX?" OFF)
-if(NBL_COMPILE_WITH_CUDA)
-	find_package(OPTIX REQUIRED)
-	message(STATUS "CUDA enabled and OptiX found!")
-else()
-	if(NBL_BUILD_OPTIX)
+if(NBL_BUILD_OPTIX)
+	if(NOT NBL_COMPILE_WITH_CUDA)
 		message(FATAL_ERROR "You cannot build Optix without enabled CUDA! NBL_COMPILE_WITH_CUDA must be ON!")
 	endif()
+	find_package(OPTIX REQUIRED)
+	message(STATUS "CUDA enabled and OptiX found!")
 endif()
 
 option(NBL_BUILD_BULLET "Enable Bullet Physics building and integration?" OFF)
diff --git a/cmake/FindZLIB.cmake b/cmake/FindZLIB.cmake
index f855c396b9..42aa789bee 100644
--- a/cmake/FindZLIB.cmake
+++ b/cmake/FindZLIB.cmake
@@ -4,4 +4,6 @@ endif()
 
 set(ZLIB_FOUND TRUE)
 set(ZLIB_LIBRARY ZLIB::ZLIB)
-set(ZLIB_INCLUDE_DIR "${THIRD_PARTY_SOURCE_DIR}/zlib;${THIRD_PARTY_BINARY_DIR}/zlib")
\ No newline at end of file
+set(ZLIB_LIBRARIES ZLIB::ZLIB)
+set(ZLIB_INCLUDE_DIR "${THIRD_PARTY_SOURCE_DIR}/zlib;${THIRD_PARTY_BINARY_DIR}/zlib")
+set(ZLIB_INCLUDE_DIRS "${ZLIB_INCLUDE_DIR}")
diff --git a/examples_tests b/examples_tests
index 93ca5efe58..cbb24a6404 160000
--- a/examples_tests
+++ b/examples_tests
@@ -1 +1 @@
-Subproject commit 93ca5efe588ca85c1eaf81a486b611df98403580
+Subproject commit cbb24a640442ace7bd01a7987f280ab0b6139e22
diff --git a/include/nbl/video/CCUDADevice.h b/include/nbl/ext/CUDAInterop/CCUDADevice.h
similarity index 93%
rename from include/nbl/video/CCUDADevice.h
rename to include/nbl/ext/CUDAInterop/CCUDADevice.h
index 02f85fdac8..d7886a4c53 100644
--- a/include/nbl/video/CCUDADevice.h
+++ b/include/nbl/ext/CUDAInterop/CCUDADevice.h
@@ -5,14 +5,13 @@
 #define _NBL_VIDEO_C_CUDA_DEVICE_H_
 
 
-#include "nbl/video/IPhysicalDevice.h"
-#include "nbl/video/CCUDAExportableMemory.h"
-#include "nbl/video/CCUDAImportedMemory.h"
-#include "nbl/video/CCUDAImportedSemaphore.h"
-
-
 #ifdef _NBL_COMPILE_WITH_CUDA_
 
+#include "nbl/video/declarations.h"
+#include "nbl/ext/CUDAInterop/CCUDAExportableMemory.h"
+#include "nbl/ext/CUDAInterop/CCUDAImportedMemory.h"
+#include "nbl/ext/CUDAInterop/CCUDAImportedSemaphore.h"
+
 #include "cuda.h"
 #include "nvrtc.h"
 #if CUDA_VERSION < 9000
@@ -27,7 +26,7 @@ namespace nbl::video
 {
 class CCUDAHandler;
 
-class NBL_API2 CCUDADevice : public core::IReferenceCounted
+class CCUDADevice : public core::IReferenceCounted
 {
   public:
 #ifdef _WIN32
diff --git a/include/nbl/video/CCUDAExportableMemory.h b/include/nbl/ext/CUDAInterop/CCUDAExportableMemory.h
similarity index 93%
rename from include/nbl/video/CCUDAExportableMemory.h
rename to include/nbl/ext/CUDAInterop/CCUDAExportableMemory.h
index 1c3d206906..10bf911717 100644
--- a/include/nbl/video/CCUDAExportableMemory.h
+++ b/include/nbl/ext/CUDAInterop/CCUDAExportableMemory.h
@@ -7,6 +7,8 @@
 
 #ifdef _NBL_COMPILE_WITH_CUDA_
 
+#include "nbl/video/declarations.h"
+
 #include "cuda.h"
 #include "nvrtc.h"
 #if CUDA_VERSION < 9000
@@ -22,7 +24,7 @@ namespace nbl::video
 
 class CCUDADevice;
 
-class NBL_API2 CCUDAExportableMemory : public core::IReferenceCounted
+class CCUDAExportableMemory : public core::IReferenceCounted
 {
     public:
 
@@ -62,4 +64,4 @@ class NBL_API2 CCUDAExportableMemory : public core::IReferenceCounted
 
 #endif // _NBL_COMPILE_WITH_CUDA_
 
-#endif
\ No newline at end of file
+#endif
diff --git a/include/nbl/video/CCUDAHandler.h b/include/nbl/ext/CUDAInterop/CCUDAHandler.h
similarity index 78%
rename from include/nbl/video/CCUDAHandler.h
rename to include/nbl/ext/CUDAInterop/CCUDAHandler.h
index 61e9522a66..8c86d9102c 100644
--- a/include/nbl/video/CCUDAHandler.h
+++ b/include/nbl/ext/CUDAInterop/CCUDAHandler.h
@@ -9,7 +9,7 @@
 
 #include "nbl/system/declarations.h"
 
-#include "nbl/video/CCUDADevice.h"
+#include "nbl/ext/CUDAInterop/CCUDADevice.h"
 
 
 #ifdef _NBL_COMPILE_WITH_CUDA_
@@ -17,7 +17,7 @@ namespace nbl::video
 {
 
 
-class NBL_API2 CCUDAHandler : public core::IReferenceCounted
+class CCUDAHandler : public core::IReferenceCounted
 {
 		public:
 		static bool defaultHandleResult(CUresult result, const system::logger_opt_ptr& logger);
@@ -151,6 +151,8 @@ class NBL_API2 CCUDAHandler : public core::IReferenceCounted
 			nvrtcCreateProgram,
 			nvrtcDestroyProgram,
 			nvrtcGetLoweredName,
+			nvrtcGetCUBIN,
+			nvrtcGetCUBINSize,
 			nvrtcGetPTX,
 			nvrtcGetPTXSize,
 			nvrtcGetProgramLog,
@@ -216,6 +218,13 @@ class NBL_API2 CCUDAHandler : public core::IReferenceCounted
 		};
 		ptx_and_nvrtcResult_t getPTX(nvrtcProgram prog);
 
+		struct cubin_and_nvrtcResult_t
+		{
+			core::smart_refctd_ptr<asset::ICPUBuffer> cubin;
+			nvrtcResult result;
+		};
+		cubin_and_nvrtcResult_t getCUBIN(nvrtcProgram prog);
+
 		//
 		inline ptx_and_nvrtcResult_t compileDirectlyToPTX(
 			std::string&& source, const char* filename, core::SRange<const char* const> nvrtcOptions,
@@ -260,6 +269,49 @@ class NBL_API2 CCUDAHandler : public core::IReferenceCounted
 			return compileDirectlyToPTX_impl(result,program,nvrtcOptions,log);
 		}
 
+		inline cubin_and_nvrtcResult_t compileDirectlyToCUBIN(
+			std::string&& source, const char* filename, core::SRange<const char* const> nvrtcOptions,
+			const int headerCount=0, const char* const* headerContents=nullptr, const char* const* includeNames=nullptr,
+			std::string* log=nullptr
+		)
+		{
+			nvrtcProgram program = nullptr;
+			nvrtcResult result = NVRTC_ERROR_PROGRAM_CREATION_FAILURE;
+			auto cleanup = core::makeRAIIExiter([&]() -> void
+			{
+				if (result!=NVRTC_SUCCESS && program)
+					m_nvrtc.pnvrtcDestroyProgram(&program);
+			});
+
+			result = createProgram(&program,std::move(source),filename,headerCount,headerContents,includeNames);
+			return compileDirectlyToCUBIN_impl(result,program,nvrtcOptions,log);
+		}
+		inline cubin_and_nvrtcResult_t compileDirectlyToCUBIN(
+			const char* source, const char* filename, core::SRange<const char* const> nvrtcOptions,
+			const int headerCount=0, const char* const* headerContents=nullptr, const char* const* includeNames=nullptr,
+			std::string* log=nullptr
+		)
+		{
+			return compileDirectlyToCUBIN(std::string(source),filename,nvrtcOptions,headerCount,headerContents,includeNames,log);
+		}
+		inline cubin_and_nvrtcResult_t compileDirectlyToCUBIN(
+			system::IFile* file, core::SRange<const char* const> nvrtcOptions,
+			const int headerCount=0, const char* const* headerContents=nullptr, const char* const* includeNames=nullptr,
+			std::string* log=nullptr
+		)
+		{
+			nvrtcProgram program = nullptr;
+			nvrtcResult result = NVRTC_ERROR_PROGRAM_CREATION_FAILURE;
+			auto cleanup = core::makeRAIIExiter([&]() -> void
+			{
+				if (result!=NVRTC_SUCCESS && program)
+					m_nvrtc.pnvrtcDestroyProgram(&program);
+			});
+
+			result = createProgram(&program,file,headerCount,headerContents,includeNames);
+			return compileDirectlyToCUBIN_impl(result,program,nvrtcOptions,log);
+		}
+
 		core::smart_refctd_ptr<CCUDADevice> createDevice(core::smart_refctd_ptr<CVulkanConnection>&& vulkanConnection, IPhysicalDevice* physicalDevice);
 
 	protected:
@@ -281,6 +333,20 @@ class NBL_API2 CCUDAHandler : public core::IReferenceCounted
 			return getPTX(program);
 		}
 
+		inline cubin_and_nvrtcResult_t compileDirectlyToCUBIN_impl(nvrtcResult result, nvrtcProgram program, core::SRange<const char* const> nvrtcOptions, std::string* log)
+		{
+			if (result!=NVRTC_SUCCESS)
+				return {nullptr,result};
+
+			result = compileProgram(program,nvrtcOptions);
+			if (log)
+				getProgramLog(program,*log);
+			if (result!=NVRTC_SUCCESS)
+				return {nullptr,result};
+			
+			return getCUBIN(program);
+		}
+
 		// function tables
 		CUDA m_cuda;
 		NVRTC m_nvrtc;
diff --git a/include/nbl/video/CCUDAImportedMemory.h b/include/nbl/ext/CUDAInterop/CCUDAImportedMemory.h
similarity index 74%
rename from include/nbl/video/CCUDAImportedMemory.h
rename to include/nbl/ext/CUDAInterop/CCUDAImportedMemory.h
index 4e3bfcd085..5f885abd2d 100644
--- a/include/nbl/video/CCUDAImportedMemory.h
+++ b/include/nbl/ext/CUDAInterop/CCUDAImportedMemory.h
@@ -1,20 +1,22 @@
-#ifndef _NBL_VIDEO_C_CUDA_IMPORTED_MEMORY_H
-#define _NBL_VIDEO_C_CUDA_IMPORTED_MEMORY_H
+#ifndef _NBL_EXT_CUDA_INTEROP_C_CUDA_IMPORTED_MEMORY_H_
+#define _NBL_EXT_CUDA_INTEROP_C_CUDA_IMPORTED_MEMORY_H_
 
 #ifdef _NBL_COMPILE_WITH_CUDA_
 
+#include "nbl/video/declarations.h"
+
 #include "cuda.h"
 #include "nvrtc.h"
 #if CUDA_VERSION < 9000
   #error "Need CUDA 9.0 SDK or higher."
 #endif
 
-#endif // _NBL_COMPILE_WITH_CUDA
-
 namespace nbl::video
 {
 
-class NBL_API2 CCUDAImportedMemory : public core::IReferenceCounted
+class CCUDADevice;
+
+class CCUDAImportedMemory : public core::IReferenceCounted
 {
     public:
 
@@ -39,4 +41,6 @@ class NBL_API2 CCUDAImportedMemory : public core::IReferenceCounted
 
 }
 
-#endif
\ No newline at end of file
+#endif // _NBL_COMPILE_WITH_CUDA_
+
+#endif
diff --git a/include/nbl/video/CCUDAImportedSemaphore.h b/include/nbl/ext/CUDAInterop/CCUDAImportedSemaphore.h
similarity index 90%
rename from include/nbl/video/CCUDAImportedSemaphore.h
rename to include/nbl/ext/CUDAInterop/CCUDAImportedSemaphore.h
index 2e5010fa2d..409ef1a676 100644
--- a/include/nbl/video/CCUDAImportedSemaphore.h
+++ b/include/nbl/ext/CUDAInterop/CCUDAImportedSemaphore.h
@@ -6,6 +6,8 @@
 
 #ifdef _NBL_COMPILE_WITH_CUDA_
 
+#include "nbl/video/declarations.h"
+
 #include "cuda.h"
 #include "nvrtc.h"
 #if CUDA_VERSION < 9000
@@ -19,7 +21,9 @@
 namespace nbl::video
 {
 
-class NBL_API2 CCUDAImportedSemaphore : public core::IReferenceCounted
+class CCUDADevice;
+
+class CCUDAImportedSemaphore : public core::IReferenceCounted
 {
     public:
 
diff --git a/include/nbl/ext/CUDAInterop/CUDAInterop.h b/include/nbl/ext/CUDAInterop/CUDAInterop.h
new file mode 100644
index 0000000000..b30d096049
--- /dev/null
+++ b/include/nbl/ext/CUDAInterop/CUDAInterop.h
@@ -0,0 +1,9 @@
+// Copyright (C) 2018-2020 - DevSH Graphics Programming Sp. z O.O.
+// This file is part of the "Nabla Engine".
+// For conditions of distribution and use, see copyright notice in nabla.h
+#ifndef _NBL_EXT_CUDA_INTEROP_H_INCLUDED_
+#define _NBL_EXT_CUDA_INTEROP_H_INCLUDED_
+
+#include "nbl/ext/CUDAInterop/CCUDAHandler.h"
+
+#endif
diff --git a/include/nbl/ext/OptiX/IDenoiser.h b/include/nbl/ext/OptiX/IDenoiser.h
index 7820aa1222..496383d92d 100644
--- a/include/nbl/ext/OptiX/IDenoiser.h
+++ b/include/nbl/ext/OptiX/IDenoiser.h
@@ -5,7 +5,7 @@
 #ifndef __NBL_EXT_OPTIX_DENOISER_H_INCLUDED__
 #define __NBL_EXT_OPTIX_DENOISER_H_INCLUDED__
 
-#include "../../../../src/nbl/video/CCUDAHandler.h"
+#include "nbl/ext/CUDAInterop/CCUDAHandler.h"
 
 #include <optix.h>
 #include <optix_denoiser_tiling.h>
@@ -122,4 +122,4 @@ class IDenoiser final : public core::IReferenceCounted
 }
 }
 
-#endif
\ No newline at end of file
+#endif
diff --git a/include/nbl/system/DefaultFuncPtrLoader.h b/include/nbl/system/DefaultFuncPtrLoader.h
index 56142448c8..bbb9884e7a 100644
--- a/include/nbl/system/DefaultFuncPtrLoader.h
+++ b/include/nbl/system/DefaultFuncPtrLoader.h
@@ -35,9 +35,9 @@ class DefaultFuncPtrLoader final : FuncPtrLoader
 			return lib!=nullptr;
 		}
 
-		void* loadFuncPtr(const char* funcname) override final;
+		NBL_API2 void* loadFuncPtr(const char* funcname) override final;
 };
 
 }
 
-#endif
\ No newline at end of file
+#endif
diff --git a/include/nbl/video/EApiType.h b/include/nbl/video/EApiType.h
index 7f99d40309..db29abe54d 100644
--- a/include/nbl/video/EApiType.h
+++ b/include/nbl/video/EApiType.h
@@ -4,6 +4,12 @@
 #include "nbl/core/declarations.h"
 #include <cstdint>
 
+#ifdef _WIN32
+#include <windows.h>
+#else
+#include <unistd.h>
+#endif
+
 namespace nbl::video
 {
 
diff --git a/include/nbl/video/declarations.h b/include/nbl/video/declarations.h
index 37f2f864bf..4393af1768 100644
--- a/include/nbl/video/declarations.h
+++ b/include/nbl/video/declarations.h
@@ -24,9 +24,6 @@
 #include "nbl/video/CVulkanImage.h"
 #include "nbl/video/surface/CSurfaceVulkan.h"
 
-// CUDA
-#include "nbl/video/CCUDAHandler.h"
-
 // utilities
 #include "nbl/video/utilities/CDumbPresentationOracle.h"
 #include "nbl/video/utilities/ICommandPoolCache.h"
@@ -44,4 +41,4 @@
 //#include "nbl/video/IGPUVirtualTexture.h"
 
 
-#endif
\ No newline at end of file
+#endif
diff --git a/src/nbl/CMakeLists.txt b/src/nbl/CMakeLists.txt
index 692efec8bd..de9bde3952 100644
--- a/src/nbl/CMakeLists.txt
+++ b/src/nbl/CMakeLists.txt
@@ -95,12 +95,8 @@ configure_file("${NBL_ROOT_PATH}/include/nbl/config/BuildConfigOptions.h.in" "${
 file(GENERATE OUTPUT "${CONFIG_OUTPUT}" INPUT "${CONFIG_DIRECOTORY}/.int/BuildConfigOptions.h.conf")
 nbl_install_file_spec("${CONFIG_OUTPUT}" nbl/config)
 
-if (NBL_COMPILE_WITH_CUDA)
-	message(STATUS "Building with CUDA interop")
-	set(_NBL_COMPILE_WITH_CUDA_ ${NBL_COMPILE_WITH_CUDA})
-	if (NBL_BUILD_OPTIX)
-		set(_NBL_BUILD_OPTIX_ ${NBL_BUILD_OPTIX})
-	endif()
+if (NBL_BUILD_OPTIX)
+	set(_NBL_BUILD_OPTIX_ ${NBL_BUILD_OPTIX})
 endif()
 
 # => TODO: clean!
@@ -291,12 +287,6 @@ set(NBL_VIDEO_SOURCES
 	video/CVulkanEvent.cpp
 	video/CSurfaceVulkan.cpp
 	
-# CUDA
-	video/CCUDAHandler.cpp
-	video/CCUDADevice.cpp
-	video/CCUDAImportedSemaphore.cpp
-	video/CCUDAExportableMemory.cpp
-	video/CCUDAImportedMemory.cpp
 )
 
 set(NBL_SCENE_SOURCES
@@ -425,10 +415,6 @@ if(NBL_CPACK_NO_BUILD_DIRECTORY_MODULES)
 	target_compile_definitions(Nabla PUBLIC NBL_CPACK_NO_BUILD_DIRECTORY_MODULES)
 endif()
 
-if(NBL_COMPILE_WITH_CUDA)
-	target_compile_definitions(Nabla PUBLIC _NBL_COMPILE_WITH_CUDA_)
-endif()
-
 set(INTERFACE_BUILD_DEFINITIONS
 	_DXC_DLL_="${DXC_DLL}"
 )
@@ -664,11 +650,6 @@ target_link_libraries(Nabla PRIVATE volk)
 # volk is part of public interface headers in Nabla
 target_compile_definitions(Nabla PUBLIC $<$<PLATFORM_ID:Windows>:VK_USE_PLATFORM_WIN32_KHR>)
 
-# CUDA
-if (NBL_COMPILE_WITH_CUDA)
-	list(APPEND PUBLIC_BUILD_INCLUDE_DIRS "${CUDAToolkit_INCLUDE_DIRS}")
-endif()
-
 list(APPEND PUBLIC_BUILD_INCLUDE_DIRS
 	# this should be PRIVATE, but things from /src (or /source) are sometimes included in things in /include and so examples have to put source dirs into theirs Include Path
 	# -> TODO
diff --git a/src/nbl/ext/CMakeLists.txt b/src/nbl/ext/CMakeLists.txt
index f3b55531c2..1f815413e8 100644
--- a/src/nbl/ext/CMakeLists.txt
+++ b/src/nbl/ext/CMakeLists.txt
@@ -38,6 +38,18 @@ if (NBL_BUILD_OPTIX)
     )
 endif()
 
+add_subdirectory(CUDAInterop)
+if (NBL_COMPILE_WITH_CUDA)
+    set(NBL_EXT_CUDA_INTEROP_INCLUDE_DIRS
+        ${NBL_EXT_CUDA_INTEROP_INCLUDE_DIRS}
+        PARENT_SCOPE
+    )
+    set(NBL_EXT_CUDA_INTEROP_LIB
+        ${NBL_EXT_CUDA_INTEROP_LIB}
+        PARENT_SCOPE
+    )
+endif()
+
 if (NBL_BUILD_IMGUI)
     add_subdirectory(ImGui)
     set(NBL_EXT_IMGUI_UI_INCLUDE_DIRS
diff --git a/src/nbl/video/CCUDADevice.cpp b/src/nbl/ext/CUDAInterop/CCUDADevice.cpp
similarity index 98%
rename from src/nbl/video/CCUDADevice.cpp
rename to src/nbl/ext/CUDAInterop/CCUDADevice.cpp
index 27f8f6f906..aa06c6e7bf 100644
--- a/src/nbl/video/CCUDADevice.cpp
+++ b/src/nbl/ext/CUDAInterop/CCUDADevice.cpp
@@ -1,13 +1,14 @@
 // Copyright (C) 2018-2020 - DevSH Graphics Programming Sp. z O.O.
 // This file is part of the "Nabla Engine".
 // For conditions of distribution and use, see copyright notice in nabla.h
-#include "nbl/video/CCUDADevice.h"
+#include "nbl/ext/CUDAInterop/CCUDADevice.h"
+#include "nbl/ext/CUDAInterop/CCUDAHandler.h"
 
 #ifdef _WIN32
 #include <winternl.h>
 #endif
 
-#include "nbl/video/CCUDAImportedMemory.h"
+#include "nbl/ext/CUDAInterop/CCUDAImportedMemory.h"
 
 #ifdef _NBL_COMPILE_WITH_CUDA_
 namespace nbl::video
diff --git a/src/nbl/video/CCUDAExportableMemory.cpp b/src/nbl/ext/CUDAInterop/CCUDAExportableMemory.cpp
similarity index 90%
rename from src/nbl/video/CCUDAExportableMemory.cpp
rename to src/nbl/ext/CUDAInterop/CCUDAExportableMemory.cpp
index 66cbbdcf4f..65afdca660 100644
--- a/src/nbl/video/CCUDAExportableMemory.cpp
+++ b/src/nbl/ext/CUDAInterop/CCUDAExportableMemory.cpp
@@ -2,8 +2,9 @@
 // This file is part of the "Nabla Engine".
 // For conditions of distribution and use, see copyright notice in nabla.h
 
-#include "nbl/video/CCUDAExportableMemory.h"
-#include "nbl/video/CCUDADevice.h"
+#include "nbl/ext/CUDAInterop/CCUDAExportableMemory.h"
+#include "nbl/ext/CUDAInterop/CCUDADevice.h"
+#include "nbl/ext/CUDAInterop/CCUDAHandler.h"
 
 #ifdef _NBL_COMPILE_WITH_CUDA_
 namespace nbl::video
@@ -51,4 +52,4 @@ CCUDAExportableMemory::~CCUDAExportableMemory()
 }
 }
 
-#endif // _NBL_COMPILE_WITH_CUDA_
\ No newline at end of file
+#endif // _NBL_COMPILE_WITH_CUDA_
diff --git a/src/nbl/video/CCUDAHandler.cpp b/src/nbl/ext/CUDAInterop/CCUDAHandler.cpp
similarity index 97%
rename from src/nbl/video/CCUDAHandler.cpp
rename to src/nbl/ext/CUDAInterop/CCUDAHandler.cpp
index 060afe6631..f9048d3bb6 100644
--- a/src/nbl/video/CCUDAHandler.cpp
+++ b/src/nbl/ext/CUDAInterop/CCUDAHandler.cpp
@@ -2,7 +2,7 @@
 // This file is part of the "Nabla Engine".
 // For conditions of distribution and use, see copyright notice in nabla.h
 
-#include "nbl/video/CCUDAHandler.h"
+#include "nbl/ext/CUDAInterop/CCUDAHandler.h"
 #include "nbl/system/CFileView.h"
 
 #ifdef _NBL_COMPILE_WITH_CUDA_
@@ -488,7 +488,7 @@ core::smart_refctd_ptr<CCUDAHandler> CCUDAHandler::create(system::ISystem* syste
 	{\
 		if (!cuda.p ## FUNC)\
 			return nullptr;\
-		auto result = cuda.p ## FUNC ## (__VA_ARGS__);\
+		auto result = cuda.p ## FUNC(__VA_ARGS__);\
 		if (result!=CUDA_SUCCESS)\
 			return nullptr;\
 	}
@@ -570,6 +570,22 @@ CCUDAHandler::ptx_and_nvrtcResult_t CCUDAHandler::getPTX(nvrtcProgram prog)
 	return {std::move(ptx),m_nvrtc.pnvrtcGetPTX(prog,ptxPtr)};
 }
 
+CCUDAHandler::cubin_and_nvrtcResult_t CCUDAHandler::getCUBIN(nvrtcProgram prog)
+{
+	size_t _size = 0ull;
+	nvrtcResult sizeRes = m_nvrtc.pnvrtcGetCUBINSize(prog,&_size);
+	if (sizeRes!=NVRTC_SUCCESS)
+		return {nullptr,sizeRes};
+	if (_size==0ull)
+		return {nullptr,NVRTC_ERROR_INVALID_INPUT};
+
+	asset::ICPUBuffer::SCreationParams cubinParams = {};
+	cubinParams.size = _size;
+	auto cubin = asset::ICPUBuffer::create(std::move(cubinParams));
+	auto cubinPtr = static_cast<char*>(cubin->getPointer());
+	return {std::move(cubin),m_nvrtc.pnvrtcGetCUBIN(prog,cubinPtr)};
+}
+
 core::smart_refctd_ptr<CCUDADevice> CCUDAHandler::createDevice(core::smart_refctd_ptr<CVulkanConnection>&& vulkanConnection, IPhysicalDevice* physicalDevice)
 {
 	if (!vulkanConnection)
diff --git a/src/nbl/video/CCUDAImportedMemory.cpp b/src/nbl/ext/CUDAInterop/CCUDAImportedMemory.cpp
similarity index 84%
rename from src/nbl/video/CCUDAImportedMemory.cpp
rename to src/nbl/ext/CUDAInterop/CCUDAImportedMemory.cpp
index 7e21b05ef1..a785bad9b9 100644
--- a/src/nbl/video/CCUDAImportedMemory.cpp
+++ b/src/nbl/ext/CUDAInterop/CCUDAImportedMemory.cpp
@@ -2,8 +2,9 @@
 // This file is part of the "Nabla Engine".
 // For conditions of distribution and use, see copyright notice in nabla.h
 
-#include "nbl/video/CCUDAImportedMemory.h"
-#include "nbl/video/CCUDADevice.h"
+#include "nbl/ext/CUDAInterop/CCUDAImportedMemory.h"
+#include "nbl/ext/CUDAInterop/CCUDADevice.h"
+#include "nbl/ext/CUDAInterop/CCUDAHandler.h"
 
 #ifdef _NBL_COMPILE_WITH_CUDA_
 
@@ -29,4 +30,4 @@ CCUDAImportedMemory::~CCUDAImportedMemory()
 
 }
 
-#endif
\ No newline at end of file
+#endif
diff --git a/src/nbl/video/CCUDAImportedSemaphore.cpp b/src/nbl/ext/CUDAInterop/CCUDAImportedSemaphore.cpp
similarity index 71%
rename from src/nbl/video/CCUDAImportedSemaphore.cpp
rename to src/nbl/ext/CUDAInterop/CCUDAImportedSemaphore.cpp
index 0dc750a4a9..1ca4a34190 100644
--- a/src/nbl/video/CCUDAImportedSemaphore.cpp
+++ b/src/nbl/ext/CUDAInterop/CCUDAImportedSemaphore.cpp
@@ -2,8 +2,9 @@
 // This file is part of the "Nabla Engine".
 // For conditions of distribution and use, see copyright notice in nabla.h
 
-#include "nbl/video/CCUDAImportedSemaphore.h"
-#include "nbl/video/CCUDADevice.h"
+#include "nbl/ext/CUDAInterop/CCUDAImportedSemaphore.h"
+#include "nbl/ext/CUDAInterop/CCUDADevice.h"
+#include "nbl/ext/CUDAInterop/CCUDAHandler.h"
 
 #ifdef _NBL_COMPILE_WITH_CUDA_
 namespace nbl::video
@@ -15,4 +16,4 @@ CCUDAImportedSemaphore::~CCUDAImportedSemaphore()
 }
 }
 
-#endif // _NBL_COMPILE_WITH_CUDA_
\ No newline at end of file
+#endif // _NBL_COMPILE_WITH_CUDA_
diff --git a/src/nbl/ext/CUDAInterop/CMakeLists.txt b/src/nbl/ext/CUDAInterop/CMakeLists.txt
new file mode 100644
index 0000000000..d3f8e85169
--- /dev/null
+++ b/src/nbl/ext/CUDAInterop/CMakeLists.txt
@@ -0,0 +1,46 @@
+include(${NBL_ROOT_PATH}/cmake/common.cmake)
+
+if (NBL_COMPILE_WITH_CUDA) # the extension library and its alias only exist when CUDA interop is enabled
+	set(NBL_EXT_INTERNAL_INCLUDE_DIR "${NBL_ROOT_PATH}/include/nbl/ext/CUDAInterop")
+
+	set(NBL_EXT_CUDA_INTEROP_H # headers of the extension, all under NBL_EXT_INTERNAL_INCLUDE_DIR
+		${NBL_EXT_INTERNAL_INCLUDE_DIR}/CUDAInterop.h
+		${NBL_EXT_INTERNAL_INCLUDE_DIR}/CCUDADevice.h
+		${NBL_EXT_INTERNAL_INCLUDE_DIR}/CCUDAExportableMemory.h
+		${NBL_EXT_INTERNAL_INCLUDE_DIR}/CCUDAHandler.h
+		${NBL_EXT_INTERNAL_INCLUDE_DIR}/CCUDAImportedMemory.h
+		${NBL_EXT_INTERNAL_INCLUDE_DIR}/CCUDAImportedSemaphore.h
+	)
+
+	set(NBL_EXT_CUDA_INTEROP_SRC # translation units, given relative to this directory
+		CCUDADevice.cpp
+		CCUDAExportableMemory.cpp
+		CCUDAHandler.cpp
+		CCUDAImportedMemory.cpp
+		CCUDAImportedSemaphore.cpp
+	)
+
+	nbl_create_ext_library_project( # NOTE(review): presumably defines the library target and sets LIB_NAME - confirm in cmake/common.cmake
+		CUDA_INTEROP
+		"${NBL_EXT_CUDA_INTEROP_H}"
+		"${NBL_EXT_CUDA_INTEROP_SRC}"
+		""
+		""
+		"_NBL_COMPILE_WITH_CUDA_" # NOTE(review): presumably the compile definition that the CUDA headers' #ifdef guards test - confirm how the macro applies it
+	)
+
+	set(NBL_EXT_CUDA_INTEROP_CUDA_INCLUDE_DIRS) # clear any previous value before appending
+	if(CUDAToolkit_ROOT) # when a toolkit root is known, list its include directories explicitly
+		list(APPEND NBL_EXT_CUDA_INTEROP_CUDA_INCLUDE_DIRS
+			"${CUDAToolkit_ROOT}/include"
+			"${CUDAToolkit_ROOT}/include/cccl"
+		)
+	endif()
+	list(APPEND NBL_EXT_CUDA_INTEROP_CUDA_INCLUDE_DIRS ${CUDAToolkit_INCLUDE_DIRS}) # directories reported by find_package(CUDAToolkit)
+	list(REMOVE_DUPLICATES NBL_EXT_CUDA_INTEROP_CUDA_INCLUDE_DIRS) # both sources above can name the same directory
+
+	target_include_directories(${LIB_NAME} BEFORE PUBLIC ${NBL_EXT_CUDA_INTEROP_CUDA_INCLUDE_DIRS}) # BEFORE prepends; PUBLIC also hands the CUDA include dirs to consumers
+	add_library(Nabla::ext::CUDAInterop ALIAS ${LIB_NAME}) # namespaced name; smoke/CMakeLists.txt tests for this target
+endif()
+
+add_subdirectory(smoke) # added regardless of NBL_COMPILE_WITH_CUDA
diff --git a/src/nbl/ext/CUDAInterop/smoke/CMakeLists.txt b/src/nbl/ext/CUDAInterop/smoke/CMakeLists.txt
new file mode 100644
index 0000000000..7805153e32
--- /dev/null
+++ b/src/nbl/ext/CUDAInterop/smoke/CMakeLists.txt
@@ -0,0 +1,35 @@
+enable_testing() # enables add_test() for this directory and below
+
+set(_NBL_CUDA_INTEROP_SMOKE_CONFIG_ARGS)
+if(CMAKE_CONFIGURATION_TYPES) # multi-config generators: tell cmake --build which configuration to build
+	set(_NBL_CUDA_INTEROP_SMOKE_CONFIG_ARGS --config $<CONFIG>)
+endif()
+
+function(nbl_add_cuda_interop_smoke TARGET_NAME SOURCE_FILE) # adds executable TARGET_NAME from SOURCE_FILE plus tests TARGET_NAME.build and TARGET_NAME.run
+	add_executable(${TARGET_NAME} EXCLUDE_FROM_ALL ${SOURCE_FILE}) # not part of the default build; built by the .build test below
+	target_compile_features(${TARGET_NAME} PRIVATE cxx_std_20)
+	nbl_adjust_flags(TARGET ${TARGET_NAME} MAP_RELEASE Release MAP_RELWITHDEBINFO RelWithDebInfo MAP_DEBUG Debug)
+
+	set(_NBL_CUDA_INTEROP_SMOKE_PATH_MODS "PATH=path_list_prepend:$<TARGET_FILE_DIR:Nabla::Nabla>") # put the directory of the Nabla binary on PATH for the test process
+	if(CUDAToolkit_BIN_DIR)
+		list(APPEND _NBL_CUDA_INTEROP_SMOKE_PATH_MODS "PATH=path_list_prepend:${CUDAToolkit_BIN_DIR}")
+	endif()
+
+	add_test(
+		NAME ${TARGET_NAME}.build
+		COMMAND ${CMAKE_COMMAND} --build "${CMAKE_BINARY_DIR}" --target ${TARGET_NAME} ${_NBL_CUDA_INTEROP_SMOKE_CONFIG_ARGS}
+	) # cmake --build takes the top-level build tree, not this subdirectory's binary dir
+	add_test(NAME ${TARGET_NAME}.run COMMAND $<TARGET_FILE:${TARGET_NAME}>)
+	set_tests_properties(${TARGET_NAME}.run PROPERTIES
+		DEPENDS ${TARGET_NAME}.build
+		ENVIRONMENT_MODIFICATION "${_NBL_CUDA_INTEROP_SMOKE_PATH_MODS}"
+	) # DEPENDS only orders the tests: .run is scheduled after .build
+endfunction()
+
+nbl_add_cuda_interop_smoke(NblExtCUDAInteropPublicBoundarySmoke public_boundary.cpp)
+target_link_libraries(NblExtCUDAInteropPublicBoundarySmoke PRIVATE Nabla::Nabla) # links plain Nabla only, without the CUDA extension
+
+if(TARGET Nabla::ext::CUDAInterop) # the opt-in smoke test exists only when the extension target was created
+	nbl_add_cuda_interop_smoke(NblExtCUDAInteropOptInSmoke opt_in.cpp)
+	target_link_libraries(NblExtCUDAInteropOptInSmoke PRIVATE Nabla::ext::CUDAInterop)
+endif()
diff --git a/src/nbl/ext/CUDAInterop/smoke/opt_in.cpp b/src/nbl/ext/CUDAInterop/smoke/opt_in.cpp
new file mode 100644
index 0000000000..d6afab79d2
--- /dev/null
+++ b/src/nbl/ext/CUDAInterop/smoke/opt_in.cpp
@@ -0,0 +1,97 @@
+#include "nbl/ext/CUDAInterop/CUDAInterop.h"
+
+#include <algorithm>
+#include <array>
+#include <cstdint>
+#include <type_traits>
+
+#ifndef _NBL_COMPILE_WITH_CUDA_
+#error "CUDA interop consumers must opt in through Nabla::ext::CUDAInterop."
+#endif
+
+namespace
+{
+using namespace nbl;
+using namespace nbl::video;
+
+[[maybe_unused]] bool compileVulkanCudaInteropRecipe( // not called from main(); it only has to compile against the extension's API
+	CCUDADevice& cudaDevice,
+	ILogicalDevice* vulkanDevice,
+	core::smart_refctd_ptr<IDeviceMemoryAllocation> vulkanMemory,
+	core::smart_refctd_ptr<ISemaphore> vulkanSemaphore)
+{
+	auto cudaMemory = cudaDevice.createExportableMemory({
+		.size = 4096,
+		.alignment = 4096,
+		.location = CU_MEM_LOCATION_TYPE_DEVICE,
+	});
+	if (!cudaMemory)
+		return false;
+
+	auto exportedToVulkan = cudaMemory->exportAsMemory(vulkanDevice); // CUDA allocation handed to Vulkan
+	auto importedFromVulkan = cudaDevice.importExternalMemory(std::move(vulkanMemory)); // Vulkan allocation handed to CUDA
+	auto importedSemaphore = cudaDevice.importExternalSemaphore(std::move(vulkanSemaphore));
+
+	CUdeviceptr mappedVulkanMemory = 0;
+	if (importedFromVulkan)
+		importedFromVulkan->getMappedBuffer(&mappedVulkanMemory);
+
+	const CUexternalSemaphore cudaSemaphore = importedSemaphore ? importedSemaphore->getInternalObject():nullptr;
+	return exportedToVulkan.get() && mappedVulkanMemory && cudaSemaphore; // true only when all three objects are non-null
+}
+
+bool cudaDriverRoundtrip(CCUDAHandler& handler, CUdevice device) // true when a host->device->host copy on `device` returns the input unchanged
+{
+	auto& cuda = handler.getCUDAFunctionTable();
+
+	CUcontext context = nullptr;
+	if (cuda.pcuDevicePrimaryCtxRetain(&context, device)!=CUDA_SUCCESS)
+		return false;
+
+	CUcontext poppedContext = nullptr;
+	auto releaseContext = [&]() // undoes push + retain; only valid after our context was pushed successfully
+	{
+		if (context)
+		{
+			cuda.pcuCtxPopCurrent_v2(&poppedContext);
+			cuda.pcuDevicePrimaryCtxRelease_v2(device);
+		}
+	};
+
+	if (cuda.pcuCtxPushCurrent_v2(context)!=CUDA_SUCCESS)
+	{
+		cuda.pcuDevicePrimaryCtxRelease_v2(device); // push failed: nothing of ours to pop, so only drop the retain
+		return false;
+	}
+
+	constexpr std::array<uint32_t, 4> input = {0x12345678u, 0x90abcdefu, 0xfedcba09u, 0x87654321u};
+	std::array<uint32_t, input.size()> output = {};
+
+	CUdeviceptr deviceMemory = 0;
+	bool ok = cuda.pcuMemAlloc_v2(&deviceMemory, sizeof(input))==CUDA_SUCCESS;
+	ok = ok && cuda.pcuMemcpyHtoD_v2(deviceMemory, input.data(), sizeof(input))==CUDA_SUCCESS;
+	ok = ok && cuda.pcuMemcpyDtoH_v2(output.data(), deviceMemory, sizeof(output))==CUDA_SUCCESS;
+	if (deviceMemory)
+		ok = cuda.pcuMemFree_v2(deviceMemory)==CUDA_SUCCESS && ok; // always free, and fold a failed free into the result
+
+	releaseContext();
+	return ok && std::ranges::equal(input, output);
+}
+}
+
+int main()
+{
+	static_assert(std::is_same_v<decltype(std::declval<const nbl::video::CCUDADevice&>().getInternalObject()), CUdevice>); // compile-time check of the accessor's return type
+	CUdeviceptr devicePtr = 0; // only proves the CUDA driver type is visible through the extension header
+	static_cast<void>(devicePtr);
+
+	auto handler = nbl::video::CCUDAHandler::create(nullptr, nullptr);
+	if (!handler)
+		return 0; // no handler is reported as success, not as a test failure
+
+	const auto& devices = handler->getAvailableDevices();
+	if (devices.empty())
+		return 0; // likewise when no device is available
+
+	return cudaDriverRoundtrip(*handler, devices.front().handle) ? 0:1; // exit code 1 only when the roundtrip itself fails
+}
diff --git a/src/nbl/ext/CUDAInterop/smoke/public_boundary.cpp b/src/nbl/ext/CUDAInterop/smoke/public_boundary.cpp
new file mode 100644
index 0000000000..809d1e7b93
--- /dev/null
+++ b/src/nbl/ext/CUDAInterop/smoke/public_boundary.cpp
@@ -0,0 +1,15 @@
+#include "nabla.h"
+#include "nbl/ext/CUDAInterop/CUDAInterop.h"
+
+#ifdef _NBL_COMPILE_WITH_CUDA_
+#error "Default Nabla consumers must not get the CUDA opt-in define."
+#endif
+
+#ifdef CUDA_VERSION
+#error "Default Nabla consumers must not include CUDA SDK headers."
+#endif
+
+int main()
+{
+	return 0; // nothing to check at runtime; the #error directives earlier in this file fail the build instead
+}

From 78845ae3f2bfb360316aab2f905d0b415165d52c Mon Sep 17 00:00:00 2001
From: Arkadiusz Lachowicz <areklachowicz@gmail.com>
Date: Wed, 6 May 2026 11:43:32 +0200
Subject: [PATCH 02/27] Address CUDA interop review cleanup

---
 CMakeLists.txt                                |  2 +-
 examples_tests                                |  2 +-
 include/nbl/ext/CUDAInterop/CCUDAHandler.h    | 66 -------------------
 include/nbl/ext/CUDAInterop/CUDAInterop.h     |  4 ++
 include/nbl/system/DefaultFuncPtrLoader.h     |  8 +--
 include/nbl/video/EApiType.h                  | 31 +--------
 src/nbl/CMakeLists.txt                        |  1 +
 src/nbl/ext/CUDAInterop/CCUDAHandler.cpp      | 16 -----
 src/nbl/ext/CUDAInterop/smoke/CMakeLists.txt  | 36 ++++++----
 src/nbl/ext/CUDAInterop/smoke/opt_in.cpp      | 50 ++++++++++----
 .../ext/CUDAInterop/smoke/public_boundary.cpp | 24 ++++++-
 src/nbl/video/EApiType.cpp                    | 37 +++++++++++
 12 files changed, 130 insertions(+), 147 deletions(-)
 create mode 100644 src/nbl/video/EApiType.cpp

diff --git a/CMakeLists.txt b/CMakeLists.txt
index ff90d862ce..c5e1bfac20 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -75,7 +75,7 @@ set(NBL_CUDA_TOOLKIT_ROOT "" CACHE PATH "Optional CUDA Toolkit root used when NB
 
 if(NBL_COMPILE_WITH_CUDA)
 	if(NBL_CUDA_TOOLKIT_ROOT)
-		set(CUDAToolkit_ROOT "${NBL_CUDA_TOOLKIT_ROOT}" CACHE PATH "CUDA Toolkit root" FORCE)
+		set(CUDAToolkit_ROOT "${NBL_CUDA_TOOLKIT_ROOT}")
 	endif()
 	find_package(CUDAToolkit REQUIRED)
 	if(${CUDAToolkit_VERSION} VERSION_GREATER_EQUAL "13.0")
diff --git a/examples_tests b/examples_tests
index cbb24a6404..5c604d274b 160000
--- a/examples_tests
+++ b/examples_tests
@@ -1 +1 @@
-Subproject commit cbb24a640442ace7bd01a7987f280ab0b6139e22
+Subproject commit 5c604d274b8aac99d8855f5b7aaf615910c8a5f6
diff --git a/include/nbl/ext/CUDAInterop/CCUDAHandler.h b/include/nbl/ext/CUDAInterop/CCUDAHandler.h
index 8c86d9102c..5128aad575 100644
--- a/include/nbl/ext/CUDAInterop/CCUDAHandler.h
+++ b/include/nbl/ext/CUDAInterop/CCUDAHandler.h
@@ -151,8 +151,6 @@ class CCUDAHandler : public core::IReferenceCounted
 			nvrtcCreateProgram,
 			nvrtcDestroyProgram,
 			nvrtcGetLoweredName,
-			nvrtcGetCUBIN,
-			nvrtcGetCUBINSize,
 			nvrtcGetPTX,
 			nvrtcGetPTXSize,
 			nvrtcGetProgramLog,
@@ -218,13 +216,6 @@ class CCUDAHandler : public core::IReferenceCounted
 		};
 		ptx_and_nvrtcResult_t getPTX(nvrtcProgram prog);
 
-		struct cubin_and_nvrtcResult_t
-		{
-			core::smart_refctd_ptr<asset::ICPUBuffer> cubin;
-			nvrtcResult result;
-		};
-		cubin_and_nvrtcResult_t getCUBIN(nvrtcProgram prog);
-
 		//
 		inline ptx_and_nvrtcResult_t compileDirectlyToPTX(
 			std::string&& source, const char* filename, core::SRange<const char* const> nvrtcOptions,
@@ -269,49 +260,6 @@ class CCUDAHandler : public core::IReferenceCounted
 			return compileDirectlyToPTX_impl(result,program,nvrtcOptions,log);
 		}
 
-		inline cubin_and_nvrtcResult_t compileDirectlyToCUBIN(
-			std::string&& source, const char* filename, core::SRange<const char* const> nvrtcOptions,
-			const int headerCount=0, const char* const* headerContents=nullptr, const char* const* includeNames=nullptr,
-			std::string* log=nullptr
-		)
-		{
-			nvrtcProgram program = nullptr;
-			nvrtcResult result = NVRTC_ERROR_PROGRAM_CREATION_FAILURE;
-			auto cleanup = core::makeRAIIExiter([&]() -> void
-			{
-				if (result!=NVRTC_SUCCESS && program)
-					m_nvrtc.pnvrtcDestroyProgram(&program);
-			});
-
-			result = createProgram(&program,std::move(source),filename,headerCount,headerContents,includeNames);
-			return compileDirectlyToCUBIN_impl(result,program,nvrtcOptions,log);
-		}
-		inline cubin_and_nvrtcResult_t compileDirectlyToCUBIN(
-			const char* source, const char* filename, core::SRange<const char* const> nvrtcOptions,
-			const int headerCount=0, const char* const* headerContents=nullptr, const char* const* includeNames=nullptr,
-			std::string* log=nullptr
-		)
-		{
-			return compileDirectlyToCUBIN(std::string(source),filename,nvrtcOptions,headerCount,headerContents,includeNames,log);
-		}
-		inline cubin_and_nvrtcResult_t compileDirectlyToCUBIN(
-			system::IFile* file, core::SRange<const char* const> nvrtcOptions,
-			const int headerCount=0, const char* const* headerContents=nullptr, const char* const* includeNames=nullptr,
-			std::string* log=nullptr
-		)
-		{
-			nvrtcProgram program = nullptr;
-			nvrtcResult result = NVRTC_ERROR_PROGRAM_CREATION_FAILURE;
-			auto cleanup = core::makeRAIIExiter([&]() -> void
-			{
-				if (result!=NVRTC_SUCCESS && program)
-					m_nvrtc.pnvrtcDestroyProgram(&program);
-			});
-
-			result = createProgram(&program,file,headerCount,headerContents,includeNames);
-			return compileDirectlyToCUBIN_impl(result,program,nvrtcOptions,log);
-		}
-
 		core::smart_refctd_ptr<CCUDADevice> createDevice(core::smart_refctd_ptr<CVulkanConnection>&& vulkanConnection, IPhysicalDevice* physicalDevice);
 
 	protected:
@@ -333,20 +281,6 @@ class CCUDAHandler : public core::IReferenceCounted
 			return getPTX(program);
 		}
 
-		inline cubin_and_nvrtcResult_t compileDirectlyToCUBIN_impl(nvrtcResult result, nvrtcProgram program, core::SRange<const char* const> nvrtcOptions, std::string* log)
-		{
-			if (result!=NVRTC_SUCCESS)
-				return {nullptr,result};
-
-			result = compileProgram(program,nvrtcOptions);
-			if (log)
-				getProgramLog(program,*log);
-			if (result!=NVRTC_SUCCESS)
-				return {nullptr,result};
-			
-			return getCUBIN(program);
-		}
-
 		// function tables
 		CUDA m_cuda;
 		NVRTC m_nvrtc;
diff --git a/include/nbl/ext/CUDAInterop/CUDAInterop.h b/include/nbl/ext/CUDAInterop/CUDAInterop.h
index b30d096049..06d9016dc8 100644
--- a/include/nbl/ext/CUDAInterop/CUDAInterop.h
+++ b/include/nbl/ext/CUDAInterop/CUDAInterop.h
@@ -4,6 +4,10 @@
 #ifndef _NBL_EXT_CUDA_INTEROP_H_INCLUDED_
 #define _NBL_EXT_CUDA_INTEROP_H_INCLUDED_
 
+#include "nbl/ext/CUDAInterop/CCUDADevice.h"
+#include "nbl/ext/CUDAInterop/CCUDAExportableMemory.h"
 #include "nbl/ext/CUDAInterop/CCUDAHandler.h"
+#include "nbl/ext/CUDAInterop/CCUDAImportedMemory.h"
+#include "nbl/ext/CUDAInterop/CCUDAImportedSemaphore.h"
 
 #endif
diff --git a/include/nbl/system/DefaultFuncPtrLoader.h b/include/nbl/system/DefaultFuncPtrLoader.h
index bbb9884e7a..10fab3a454 100644
--- a/include/nbl/system/DefaultFuncPtrLoader.h
+++ b/include/nbl/system/DefaultFuncPtrLoader.h
@@ -11,18 +11,18 @@
 namespace nbl::system
 {
 
-class DefaultFuncPtrLoader final : FuncPtrLoader
+class NBL_API2 DefaultFuncPtrLoader final : FuncPtrLoader
 {
 		void* lib;
 
 	public:
 		inline DefaultFuncPtrLoader() : lib(nullptr) {}
-		NBL_API2 DefaultFuncPtrLoader(const char* name);
+		DefaultFuncPtrLoader(const char* name);
 		inline DefaultFuncPtrLoader(DefaultFuncPtrLoader&& other) : DefaultFuncPtrLoader()
 		{
 			operator=(std::move(other));
 		}
-		NBL_API2 ~DefaultFuncPtrLoader();
+		~DefaultFuncPtrLoader();
 
 		inline DefaultFuncPtrLoader& operator=(DefaultFuncPtrLoader&& other)
 		{
@@ -35,7 +35,7 @@ class DefaultFuncPtrLoader final : FuncPtrLoader
 			return lib!=nullptr;
 		}
 
-		NBL_API2 void* loadFuncPtr(const char* funcname) override final;
+		void* loadFuncPtr(const char* funcname) override final;
 };
 
 }
diff --git a/include/nbl/video/EApiType.h b/include/nbl/video/EApiType.h
index db29abe54d..44a31ecf90 100644
--- a/include/nbl/video/EApiType.h
+++ b/include/nbl/video/EApiType.h
@@ -4,12 +4,6 @@
 #include "nbl/core/declarations.h"
 #include <cstdint>
 
-#ifdef _WIN32
-#include <windows.h>
-#else
-#include <unistd.h>
-#endif
-
 namespace nbl::video
 {
 
@@ -34,29 +28,8 @@ constexpr external_handle_t ExternalHandleNull = nullptr;
 constexpr external_handle_t ExternalHandleNull = -1;
 #endif
 
-inline bool CloseExternalHandle(external_handle_t handle)
-{
-#ifdef _WIN32
-    return CloseHandle(handle);
-#else
-    return (close(handle) == 0);
-#endif
-}
-
-inline external_handle_t DuplicateExternalHandle(external_handle_t handle)
-{
-#ifdef _WIN32
-    HANDLE re = ExternalHandleNull;
-
-    const HANDLE cur = GetCurrentProcess();
-    if (!DuplicateHandle(cur, handle, cur, &re, GENERIC_ALL, 0, DUPLICATE_SAME_ACCESS))
-        return ExternalHandleNull;
-
-    return re;
-#else
-    return dup(handle);
-#endif
-}
+NBL_API2 bool CloseExternalHandle(external_handle_t handle);
+NBL_API2 external_handle_t DuplicateExternalHandle(external_handle_t handle);
 
 }
 
diff --git a/src/nbl/CMakeLists.txt b/src/nbl/CMakeLists.txt
index de9bde3952..acbf4d4dda 100644
--- a/src/nbl/CMakeLists.txt
+++ b/src/nbl/CMakeLists.txt
@@ -248,6 +248,7 @@ set(NBL_VIDEO_SOURCES
 	video/IGPUAccelerationStructure.cpp
 	video/IGPUCommandBuffer.cpp
 	video/IQueue.cpp
+	video/EApiType.cpp
 	video/IGPUDescriptorSet.cpp
 	video/IDeviceMemoryAllocation.cpp
 	video/IDeviceMemoryBacked.cpp
diff --git a/src/nbl/ext/CUDAInterop/CCUDAHandler.cpp b/src/nbl/ext/CUDAInterop/CCUDAHandler.cpp
index f9048d3bb6..748a88d1a1 100644
--- a/src/nbl/ext/CUDAInterop/CCUDAHandler.cpp
+++ b/src/nbl/ext/CUDAInterop/CCUDAHandler.cpp
@@ -570,22 +570,6 @@ CCUDAHandler::ptx_and_nvrtcResult_t CCUDAHandler::getPTX(nvrtcProgram prog)
 	return {std::move(ptx),m_nvrtc.pnvrtcGetPTX(prog,ptxPtr)};
 }
 
-CCUDAHandler::cubin_and_nvrtcResult_t CCUDAHandler::getCUBIN(nvrtcProgram prog)
-{
-	size_t _size = 0ull;
-	nvrtcResult sizeRes = m_nvrtc.pnvrtcGetCUBINSize(prog,&_size);
-	if (sizeRes!=NVRTC_SUCCESS)
-		return {nullptr,sizeRes};
-	if (_size==0ull)
-		return {nullptr,NVRTC_ERROR_INVALID_INPUT};
-
-	asset::ICPUBuffer::SCreationParams cubinParams = {};
-	cubinParams.size = _size;
-	auto cubin = asset::ICPUBuffer::create(std::move(cubinParams));
-	auto cubinPtr = static_cast<char*>(cubin->getPointer());
-	return {std::move(cubin),m_nvrtc.pnvrtcGetCUBIN(prog,cubinPtr)};
-}
-
 core::smart_refctd_ptr<CCUDADevice> CCUDAHandler::createDevice(core::smart_refctd_ptr<CVulkanConnection>&& vulkanConnection, IPhysicalDevice* physicalDevice)
 {
 	if (!vulkanConnection)
diff --git a/src/nbl/ext/CUDAInterop/smoke/CMakeLists.txt b/src/nbl/ext/CUDAInterop/smoke/CMakeLists.txt
index 7805153e32..678cd29d84 100644
--- a/src/nbl/ext/CUDAInterop/smoke/CMakeLists.txt
+++ b/src/nbl/ext/CUDAInterop/smoke/CMakeLists.txt
@@ -1,27 +1,35 @@
-enable_testing()
-
-set(_NBL_CUDA_INTEROP_SMOKE_CONFIG_ARGS)
-if(CMAKE_CONFIGURATION_TYPES)
-	set(_NBL_CUDA_INTEROP_SMOKE_CONFIG_ARGS --config $<CONFIG>)
+if(CMAKE_SOURCE_DIR STREQUAL CMAKE_CURRENT_SOURCE_DIR)
+	cmake_minimum_required(VERSION 3.30)
+	project(NblExtCUDAInteropSmoke CXX)
+	find_package(Nabla REQUIRED CONFIG)
 endif()
 
+enable_testing()
+
 function(nbl_add_cuda_interop_smoke TARGET_NAME SOURCE_FILE)
-	add_executable(${TARGET_NAME} EXCLUDE_FROM_ALL ${SOURCE_FILE})
+	add_executable(${TARGET_NAME} ${SOURCE_FILE})
 	target_compile_features(${TARGET_NAME} PRIVATE cxx_std_20)
-	nbl_adjust_flags(TARGET ${TARGET_NAME} MAP_RELEASE Release MAP_RELWITHDEBINFO RelWithDebInfo MAP_DEBUG Debug)
+	if(MSVC)
+		target_compile_options(${TARGET_NAME} PRIVATE
+			/Gm-
+			/bigobj
+			/Zc:wchar_t
+			/Zc:preprocessor
+			/Zc:inline
+			/Zc:forScope
+		)
+	endif()
+	if(COMMAND nbl_adjust_flags)
+		nbl_adjust_flags(TARGET ${TARGET_NAME} MAP_RELEASE Release MAP_RELWITHDEBINFO RelWithDebInfo MAP_DEBUG Debug)
+	endif()
 
 	set(_NBL_CUDA_INTEROP_SMOKE_PATH_MODS "PATH=path_list_prepend:$<TARGET_FILE_DIR:Nabla::Nabla>")
 	if(CUDAToolkit_BIN_DIR)
 		list(APPEND _NBL_CUDA_INTEROP_SMOKE_PATH_MODS "PATH=path_list_prepend:${CUDAToolkit_BIN_DIR}")
 	endif()
 
-	add_test(
-		NAME ${TARGET_NAME}.build
-		COMMAND ${CMAKE_COMMAND} --build "${CMAKE_CURRENT_BINARY_DIR}" --target ${TARGET_NAME} ${_NBL_CUDA_INTEROP_SMOKE_CONFIG_ARGS}
-	)
-	add_test(NAME ${TARGET_NAME}.run COMMAND $<TARGET_FILE:${TARGET_NAME}>)
-	set_tests_properties(${TARGET_NAME}.run PROPERTIES
-		DEPENDS ${TARGET_NAME}.build
+	add_test(NAME ${TARGET_NAME} COMMAND $<TARGET_FILE:${TARGET_NAME}>)
+	set_tests_properties(${TARGET_NAME} PROPERTIES
 		ENVIRONMENT_MODIFICATION "${_NBL_CUDA_INTEROP_SMOKE_PATH_MODS}"
 	)
 endfunction()
diff --git a/src/nbl/ext/CUDAInterop/smoke/opt_in.cpp b/src/nbl/ext/CUDAInterop/smoke/opt_in.cpp
index d6afab79d2..adcb48e6de 100644
--- a/src/nbl/ext/CUDAInterop/smoke/opt_in.cpp
+++ b/src/nbl/ext/CUDAInterop/smoke/opt_in.cpp
@@ -1,9 +1,11 @@
 #include "nbl/ext/CUDAInterop/CUDAInterop.h"
+#include "nbl/system/IApplicationFramework.h"
 
 #include <algorithm>
 #include <array>
 #include <cstdint>
 #include <type_traits>
+#include <utility>
 
 #ifndef _NBL_COMPILE_WITH_CUDA_
 #error "CUDA interop consumers must opt in through Nabla::ext::CUDAInterop."
@@ -69,8 +71,10 @@ bool cudaDriverRoundtrip(CCUDAHandler& handler, CUdevice device)
 
 	CUdeviceptr deviceMemory = 0;
 	bool ok = cuda.pcuMemAlloc_v2(&deviceMemory, sizeof(input))==CUDA_SUCCESS;
-	ok = ok && cuda.pcuMemcpyHtoD_v2(deviceMemory, input.data(), sizeof(input))==CUDA_SUCCESS;
-	ok = ok && cuda.pcuMemcpyDtoH_v2(output.data(), deviceMemory, sizeof(output))==CUDA_SUCCESS;
+	if (ok)
+		ok = cuda.pcuMemcpyHtoD_v2(deviceMemory,input.data(),sizeof(input))==CUDA_SUCCESS;
+	if (ok)
+		ok = cuda.pcuMemcpyDtoH_v2(output.data(),deviceMemory,sizeof(output))==CUDA_SUCCESS;
 	if (deviceMemory)
 		ok = cuda.pcuMemFree_v2(deviceMemory)==CUDA_SUCCESS && ok;
 
@@ -79,19 +83,37 @@ bool cudaDriverRoundtrip(CCUDAHandler& handler, CUdevice device)
 }
 }
 
-int main()
+class CUDAInteropOptInSmoke final : public nbl::system::IApplicationFramework
 {
-	static_assert(std::is_same_v<decltype(std::declval<const nbl::video::CCUDADevice&>().getInternalObject()), CUdevice>);
-	CUdeviceptr devicePtr = 0;
-	static_cast<void>(devicePtr);
+	using base_t = nbl::system::IApplicationFramework;
 
-	auto handler = nbl::video::CCUDAHandler::create(nullptr, nullptr);
-	if (!handler)
-		return 0;
+public:
+	using base_t::base_t;
 
-	const auto& devices = handler->getAvailableDevices();
-	if (devices.empty())
-		return 0;
+	bool onAppInitialized(nbl::core::smart_refctd_ptr<nbl::system::ISystem>&& system) override
+	{
+		static_cast<void>(system);
 
-	return cudaDriverRoundtrip(*handler, devices.front().handle) ? 0:1;
-}
+		if (!isAPILoaded())
+			return false;
+
+		static_assert(std::is_same_v<decltype(std::declval<const nbl::video::CCUDADevice&>().getInternalObject()), CUdevice>);
+		CUdeviceptr devicePtr = 0;
+		static_cast<void>(devicePtr);
+
+		auto handler = nbl::video::CCUDAHandler::create(nullptr, nullptr);
+		if (!handler)
+			return true;
+
+		const auto& devices = handler->getAvailableDevices();
+		if (devices.empty())
+			return true;
+
+		return cudaDriverRoundtrip(*handler, devices.front().handle);
+	}
+
+	void workLoopBody() override {}
+	bool keepRunning() override { return false; }
+};
+
+NBL_MAIN_FUNC(CUDAInteropOptInSmoke)
diff --git a/src/nbl/ext/CUDAInterop/smoke/public_boundary.cpp b/src/nbl/ext/CUDAInterop/smoke/public_boundary.cpp
index 809d1e7b93..c39ba076d4 100644
--- a/src/nbl/ext/CUDAInterop/smoke/public_boundary.cpp
+++ b/src/nbl/ext/CUDAInterop/smoke/public_boundary.cpp
@@ -1,4 +1,5 @@
 #include "nabla.h"
+#include "nbl/system/IApplicationFramework.h"
 #include "nbl/ext/CUDAInterop/CUDAInterop.h"
 
 #ifdef _NBL_COMPILE_WITH_CUDA_
@@ -9,7 +10,26 @@
 #error "Default Nabla consumers must not include CUDA SDK headers."
 #endif
 
-int main()
+namespace
 {
-	return 0;
+
+class CUDAInteropPublicBoundarySmoke final : public nbl::system::IApplicationFramework
+{
+	using base_t = nbl::system::IApplicationFramework;
+
+public:
+	using base_t::base_t;
+
+	bool onAppInitialized(nbl::core::smart_refctd_ptr<nbl::system::ISystem>&& system) override
+	{
+		static_cast<void>(system);
+		return isAPILoaded();
+	}
+
+	void workLoopBody() override {}
+	bool keepRunning() override { return false; }
+};
+
 }
+
+NBL_MAIN_FUNC(CUDAInteropPublicBoundarySmoke)
diff --git a/src/nbl/video/EApiType.cpp b/src/nbl/video/EApiType.cpp
new file mode 100644
index 0000000000..d7eadd8b08
--- /dev/null
+++ b/src/nbl/video/EApiType.cpp
@@ -0,0 +1,37 @@
+#include "nbl/video/EApiType.h"
+
+#ifdef _WIN32
+#define WIN32_LEAN_AND_MEAN
+#include <windows.h>
+#else
+#include <unistd.h>
+#endif
+
+namespace nbl::video
+{
+
+bool CloseExternalHandle(external_handle_t handle) // closes a native external handle; true on success
+{
+#ifdef _WIN32
+	return CloseHandle(handle); // Win32 BOOL result, implicitly converted to bool
+#else
+	return close(handle)==0; // POSIX close() reports success as 0
+#endif
+}
+
+external_handle_t DuplicateExternalHandle(external_handle_t handle) // duplicates a native handle inside the current process
+{
+#ifdef _WIN32
+	HANDLE duplicated = ExternalHandleNull;
+
+	const HANDLE process = GetCurrentProcess(); // used as both the source and the target process below
+	if (!DuplicateHandle(process,handle,process,&duplicated,GENERIC_ALL,0,DUPLICATE_SAME_ACCESS)) // per Win32 docs DUPLICATE_SAME_ACCESS makes the GENERIC_ALL argument ignored; 0 = not inheritable
+		return ExternalHandleNull;
+
+	return duplicated;
+#else
+	return dup(handle); // dup()'s result is returned unchanged, including its -1 failure value
+#endif
+}
+
+}

From ab9a7e560fadaf960a1a9f4879a02f6e66833d2a Mon Sep 17 00:00:00 2001
From: Arkadiusz Lachowicz <areklachowicz@gmail.com>
Date: Wed, 6 May 2026 12:04:30 +0200
Subject: [PATCH 03/27] Simplify CUDA interop smoke CMake

---
 src/nbl/ext/CUDAInterop/smoke/CMakeLists.txt | 14 +++-----------
 1 file changed, 3 insertions(+), 11 deletions(-)

diff --git a/src/nbl/ext/CUDAInterop/smoke/CMakeLists.txt b/src/nbl/ext/CUDAInterop/smoke/CMakeLists.txt
index 678cd29d84..89dd821add 100644
--- a/src/nbl/ext/CUDAInterop/smoke/CMakeLists.txt
+++ b/src/nbl/ext/CUDAInterop/smoke/CMakeLists.txt
@@ -1,6 +1,9 @@
 if(CMAKE_SOURCE_DIR STREQUAL CMAKE_CURRENT_SOURCE_DIR)
 	cmake_minimum_required(VERSION 3.30)
 	project(NblExtCUDAInteropSmoke CXX)
+endif()
+
+if(NOT TARGET Nabla::Nabla)
 	find_package(Nabla REQUIRED CONFIG)
 endif()
 
@@ -19,19 +22,8 @@ function(nbl_add_cuda_interop_smoke TARGET_NAME SOURCE_FILE)
 			/Zc:forScope
 		)
 	endif()
-	if(COMMAND nbl_adjust_flags)
-		nbl_adjust_flags(TARGET ${TARGET_NAME} MAP_RELEASE Release MAP_RELWITHDEBINFO RelWithDebInfo MAP_DEBUG Debug)
-	endif()
-
-	set(_NBL_CUDA_INTEROP_SMOKE_PATH_MODS "PATH=path_list_prepend:$<TARGET_FILE_DIR:Nabla::Nabla>")
-	if(CUDAToolkit_BIN_DIR)
-		list(APPEND _NBL_CUDA_INTEROP_SMOKE_PATH_MODS "PATH=path_list_prepend:${CUDAToolkit_BIN_DIR}")
-	endif()
 
 	add_test(NAME ${TARGET_NAME} COMMAND $<TARGET_FILE:${TARGET_NAME}>)
-	set_tests_properties(${TARGET_NAME} PROPERTIES
-		ENVIRONMENT_MODIFICATION "${_NBL_CUDA_INTEROP_SMOKE_PATH_MODS}"
-	)
 endfunction()
 
 nbl_add_cuda_interop_smoke(NblExtCUDAInteropPublicBoundarySmoke public_boundary.cpp)

From bf8eeb3509935dd7f0b5970e87a44ea88bf5a4fb Mon Sep 17 00:00:00 2001
From: Arkadiusz Lachowicz <areklachowicz@gmail.com>
Date: Wed, 6 May 2026 12:27:26 +0200
Subject: [PATCH 04/27] Clean CUDA interop smoke usage requirements

---
 src/nbl/CMakeLists.txt                           |  7 +++++--
 src/nbl/ext/CUDAInterop/smoke/CMakeLists.txt     | 16 ++--------------
 src/nbl/ext/CUDAInterop/smoke/opt_in.cpp         |  6 +-----
 .../ext/CUDAInterop/smoke/public_boundary.cpp    |  7 +++----
 4 files changed, 11 insertions(+), 25 deletions(-)

diff --git a/src/nbl/CMakeLists.txt b/src/nbl/CMakeLists.txt
index acbf4d4dda..bb96bdfc80 100644
--- a/src/nbl/CMakeLists.txt
+++ b/src/nbl/CMakeLists.txt
@@ -763,8 +763,11 @@ if(TARGET ngfx)
 	)
 endif()
 
-# on MSVC it won't compile without this option!
-target_compile_options(Nabla PUBLIC $<$<CXX_COMPILER_FRONTEND_VARIANT:MSVC>:/bigobj>)
+# on MSVC it won't compile without these options!
+target_compile_options(Nabla PUBLIC
+	$<$<CXX_COMPILER_FRONTEND_VARIANT:MSVC>:/bigobj>
+	$<$<CXX_COMPILER_FRONTEND_VARIANT:MSVC>:/Zc:preprocessor>
+)
 
 if(NBL_PCH)
 	target_precompile_headers(Nabla
diff --git a/src/nbl/ext/CUDAInterop/smoke/CMakeLists.txt b/src/nbl/ext/CUDAInterop/smoke/CMakeLists.txt
index 89dd821add..23dd6d5422 100644
--- a/src/nbl/ext/CUDAInterop/smoke/CMakeLists.txt
+++ b/src/nbl/ext/CUDAInterop/smoke/CMakeLists.txt
@@ -1,7 +1,5 @@
-if(CMAKE_SOURCE_DIR STREQUAL CMAKE_CURRENT_SOURCE_DIR)
-	cmake_minimum_required(VERSION 3.30)
-	project(NblExtCUDAInteropSmoke CXX)
-endif()
+cmake_minimum_required(VERSION 3.30)
+project(NblExtCUDAInteropSmoke CXX)
 
 if(NOT TARGET Nabla::Nabla)
 	find_package(Nabla REQUIRED CONFIG)
@@ -12,16 +10,6 @@ enable_testing()
 function(nbl_add_cuda_interop_smoke TARGET_NAME SOURCE_FILE)
 	add_executable(${TARGET_NAME} ${SOURCE_FILE})
 	target_compile_features(${TARGET_NAME} PRIVATE cxx_std_20)
-	if(MSVC)
-		target_compile_options(${TARGET_NAME} PRIVATE
-			/Gm-
-			/bigobj
-			/Zc:wchar_t
-			/Zc:preprocessor
-			/Zc:inline
-			/Zc:forScope
-		)
-	endif()
 
 	add_test(NAME ${TARGET_NAME} COMMAND $<TARGET_FILE:${TARGET_NAME}>)
 endfunction()
diff --git a/src/nbl/ext/CUDAInterop/smoke/opt_in.cpp b/src/nbl/ext/CUDAInterop/smoke/opt_in.cpp
index adcb48e6de..bc8c8952bd 100644
--- a/src/nbl/ext/CUDAInterop/smoke/opt_in.cpp
+++ b/src/nbl/ext/CUDAInterop/smoke/opt_in.cpp
@@ -90,16 +90,12 @@ class CUDAInteropOptInSmoke final : public nbl::system::IApplicationFramework
 public:
 	using base_t::base_t;
 
-	bool onAppInitialized(nbl::core::smart_refctd_ptr<nbl::system::ISystem>&& system) override
+	bool onAppInitialized(nbl::core::smart_refctd_ptr<nbl::system::ISystem>&&) override
 	{
-		static_cast<void>(system);
-
 		if (!isAPILoaded())
 			return false;
 
 		static_assert(std::is_same_v<decltype(std::declval<const nbl::video::CCUDADevice&>().getInternalObject()), CUdevice>);
-		CUdeviceptr devicePtr = 0;
-		static_cast<void>(devicePtr);
 
 		auto handler = nbl::video::CCUDAHandler::create(nullptr, nullptr);
 		if (!handler)
diff --git a/src/nbl/ext/CUDAInterop/smoke/public_boundary.cpp b/src/nbl/ext/CUDAInterop/smoke/public_boundary.cpp
index c39ba076d4..4f6cbebfb1 100644
--- a/src/nbl/ext/CUDAInterop/smoke/public_boundary.cpp
+++ b/src/nbl/ext/CUDAInterop/smoke/public_boundary.cpp
@@ -3,11 +3,11 @@
 #include "nbl/ext/CUDAInterop/CUDAInterop.h"
 
 #ifdef _NBL_COMPILE_WITH_CUDA_
-#error "Default Nabla consumers must not get the CUDA opt-in define."
+#error "Nabla consumers must not get the CUDA opt-in define."
 #endif
 
 #ifdef CUDA_VERSION
-#error "Default Nabla consumers must not include CUDA SDK headers."
+#error "Nabla consumers must not include CUDA SDK headers."
 #endif
 
 namespace
@@ -20,9 +20,8 @@ class CUDAInteropPublicBoundarySmoke final : public nbl::system::IApplicationFra
 public:
 	using base_t::base_t;
 
-	bool onAppInitialized(nbl::core::smart_refctd_ptr<nbl::system::ISystem>&& system) override
+	bool onAppInitialized(nbl::core::smart_refctd_ptr<nbl::system::ISystem>&&) override
 	{
-		static_cast<void>(system);
 		return isAPILoaded();
 	}
 

From f701ac63e83bea4bf743af80a6fe29af81d002c0 Mon Sep 17 00:00:00 2001
From: Arkadiusz Lachowicz <areklachowicz@gmail.com>
Date: Wed, 6 May 2026 13:28:07 +0200
Subject: [PATCH 05/27] Export CUDA interop package target

---
 cmake/NablaConfig.cmake.in                   | 33 ++++++++++++++++++++
 cmake/common.cmake                           | 19 +++++++++--
 src/nbl/CMakeLists.txt                       | 19 ++++++++++-
 src/nbl/ext/CUDAInterop/CMakeLists.txt       |  5 ++-
 src/nbl/ext/CUDAInterop/smoke/CMakeLists.txt |  2 +-
 5 files changed, 72 insertions(+), 6 deletions(-)

diff --git a/cmake/NablaConfig.cmake.in b/cmake/NablaConfig.cmake.in
index b22b3ad0d7..e88a25b0dd 100644
--- a/cmake/NablaConfig.cmake.in
+++ b/cmake/NablaConfig.cmake.in
@@ -6,6 +6,7 @@ set(Nabla_DXC_GIT_INFO_JSON_FILE "${PACKAGE_PREFIX_DIR}/include/dxc_git_info.jso
 
 set(_NBL_NABLA_LOAD_CORE OFF)
 set(_NBL_NABLA_LOAD_NSC OFF)
+set(_NBL_NABLA_LOAD_CUDA_INTEROP OFF)
 set(_NBL_NABLA_COMPONENTS ${Nabla_FIND_COMPONENTS})
 set(_NBL_NABLA_HAS_CORE_EXPORTS OFF)
 set(_NBL_NABLA_HAS_NSC_EXPORTS OFF)
@@ -25,6 +26,10 @@ if(_NBL_NABLA_COMPONENTS)
     elseif(_NBL_NABLA_COMPONENT STREQUAL "Core")
       set(_NBL_NABLA_LOAD_CORE ON)
       set(Nabla_Core_FOUND TRUE)
+    elseif(_NBL_NABLA_COMPONENT STREQUAL "CUDAInterop")
+      set(_NBL_NABLA_LOAD_CORE ON)
+      set(_NBL_NABLA_LOAD_CUDA_INTEROP ON)
+      set(Nabla_CUDAInterop_FOUND TRUE)
     else()
       set("Nabla_${_NBL_NABLA_COMPONENT}_FOUND" FALSE)
     endif()
@@ -80,6 +85,34 @@ if(_NBL_NABLA_LOAD_NSC)
   endif()
 endif()
 
+if(_NBL_NABLA_LOAD_CUDA_INTEROP)
+  include(CMakeFindDependencyMacro)
+
+  if(DEFINED Nabla_CUDA_TOOLKIT_ROOT AND NOT "${Nabla_CUDA_TOOLKIT_ROOT}" STREQUAL "")
+    set(CUDAToolkit_ROOT "${Nabla_CUDA_TOOLKIT_ROOT}")
+  endif()
+
+  find_dependency(CUDAToolkit REQUIRED)
+  if(CUDAToolkit_VERSION VERSION_LESS "13.0")
+    set(Nabla_CUDAInterop_FOUND FALSE)
+    if(Nabla_FIND_REQUIRED_CUDAInterop)
+      message(FATAL_ERROR "Nabla: CUDAInterop requires CUDA Toolkit 13.0 or newer. Set Nabla_CUDA_TOOLKIT_ROOT or CUDAToolkit_ROOT if multiple CUDA Toolkit installs are present.")
+    endif()
+  else()
+    _nbl_try_include_component("CUDAInterop" "NablaCUDAInteropExportTargets.cmake" _NBL_NABLA_CUDA_INTEROP_FOUND)
+    if(_NBL_NABLA_CUDA_INTEROP_FOUND AND TARGET Nabla::ext::CUDAInterop)
+      set(_NBL_NABLA_CUDA_INTEROP_INCLUDE_DIRS ${CUDAToolkit_INCLUDE_DIRS})
+      foreach(_NBL_NABLA_CUDA_INTEROP_INCLUDE_DIR IN LISTS CUDAToolkit_INCLUDE_DIRS)
+        if(EXISTS "${_NBL_NABLA_CUDA_INTEROP_INCLUDE_DIR}/cccl")
+          list(APPEND _NBL_NABLA_CUDA_INTEROP_INCLUDE_DIRS "${_NBL_NABLA_CUDA_INTEROP_INCLUDE_DIR}/cccl")
+        endif()
+      endforeach()
+      list(REMOVE_DUPLICATES _NBL_NABLA_CUDA_INTEROP_INCLUDE_DIRS)
+      target_include_directories(Nabla::ext::CUDAInterop INTERFACE ${_NBL_NABLA_CUDA_INTEROP_INCLUDE_DIRS})
+    endif()
+  endif()
+endif()
+
 check_required_components(Nabla)
 
 #
diff --git a/cmake/common.cmake b/cmake/common.cmake
index c50e1f6fb2..ae2264fda4 100755
--- a/cmake/common.cmake
+++ b/cmake/common.cmake
@@ -284,9 +284,22 @@ function(nbl_install_dir _DIR)
 endfunction()
 
 function(nbl_install_lib_spec _TARGETS _RELATIVE_DESTINATION)
-	install(TARGETS ${_TARGETS} ARCHIVE DESTINATION lib/${_RELATIVE_DESTINATION} CONFIGURATIONS Release COMPONENT Libraries)
-	install(TARGETS ${_TARGETS} ARCHIVE DESTINATION debug/lib/${_RELATIVE_DESTINATION} CONFIGURATIONS Debug COMPONENT Libraries)
-	install(TARGETS ${_TARGETS} ARCHIVE DESTINATION relwithdebinfo/lib/${_RELATIVE_DESTINATION} CONFIGURATIONS RelWithDebInfo COMPONENT Libraries)
+	cmake_parse_arguments(_NBL_INSTALL_LIB "" "EXPORT" "" ${ARGN})
+	if(_NBL_INSTALL_LIB_UNPARSED_ARGUMENTS)
+		message(FATAL_ERROR "Unexpected arguments for nbl_install_lib_spec: ${_NBL_INSTALL_LIB_UNPARSED_ARGUMENTS}")
+	endif()
+
+	if(_NBL_INSTALL_LIB_EXPORT)
+		install(TARGETS ${_TARGETS}
+			EXPORT ${_NBL_INSTALL_LIB_EXPORT}
+			ARCHIVE DESTINATION ${_NBL_CPACK_PACKAGE_RELATIVE_ENTRY_}/lib/${_RELATIVE_DESTINATION}
+			COMPONENT Libraries
+		)
+	else()
+		install(TARGETS ${_TARGETS} ARCHIVE DESTINATION lib/${_RELATIVE_DESTINATION} CONFIGURATIONS Release COMPONENT Libraries)
+		install(TARGETS ${_TARGETS} ARCHIVE DESTINATION debug/lib/${_RELATIVE_DESTINATION} CONFIGURATIONS Debug COMPONENT Libraries)
+		install(TARGETS ${_TARGETS} ARCHIVE DESTINATION relwithdebinfo/lib/${_RELATIVE_DESTINATION} CONFIGURATIONS RelWithDebInfo COMPONENT Libraries)
+	endif()
 endfunction()
 
 function(nbl_install_lib _TARGETS)
diff --git a/src/nbl/CMakeLists.txt b/src/nbl/CMakeLists.txt
index bb96bdfc80..6c3ab2606d 100644
--- a/src/nbl/CMakeLists.txt
+++ b/src/nbl/CMakeLists.txt
@@ -778,11 +778,28 @@ if(NBL_PCH)
 	)
 endif()
 
-# extensions
 start_tracking_variables_for_propagation_to_parent()
 add_subdirectory(ext EXCLUDE_FROM_ALL)
 propagate_changed_variables_to_parent_scope()
 
+if(DEFINED NBL_EXT_CUDA_INTEROP_LIB AND TARGET ${NBL_EXT_CUDA_INTEROP_LIB})
+	set_target_properties(${NBL_EXT_CUDA_INTEROP_LIB} PROPERTIES EXCLUDE_FROM_ALL OFF)
+
+	set(_NBL_EXT_CUDA_INTEROP_INSTALL_ARGS)
+	if(NBL_ENABLE_CONFIG_INSTALL AND NOT NBL_STATIC_BUILD)
+		list(APPEND _NBL_EXT_CUDA_INTEROP_INSTALL_ARGS EXPORT NablaCUDAInteropExportTargets)
+	endif()
+	nbl_install_lib_spec(${NBL_EXT_CUDA_INTEROP_LIB} "nbl/ext/CUDA_INTEROP" ${_NBL_EXT_CUDA_INTEROP_INSTALL_ARGS})
+
+	if(NBL_ENABLE_CONFIG_INSTALL AND NOT NBL_STATIC_BUILD)
+		install(EXPORT NablaCUDAInteropExportTargets
+			NAMESPACE Nabla::
+			DESTINATION cmake
+			COMPONENT Libraries
+		)
+	endif()
+endif()
+
 if(TARGET ${NBL_EXT_FULL_SCREEN_TRIANGLE_LIB})
 	set_target_properties(${NBL_EXT_FULL_SCREEN_TRIANGLE_LIB} PROPERTIES EXCLUDE_FROM_ALL OFF)
 	nbl_install_lib_spec(${NBL_EXT_FULL_SCREEN_TRIANGLE_LIB} "nbl/ext/FULL_SCREEN_TRIANGLE")
diff --git a/src/nbl/ext/CUDAInterop/CMakeLists.txt b/src/nbl/ext/CUDAInterop/CMakeLists.txt
index d3f8e85169..93b6bef8c1 100644
--- a/src/nbl/ext/CUDAInterop/CMakeLists.txt
+++ b/src/nbl/ext/CUDAInterop/CMakeLists.txt
@@ -39,7 +39,10 @@ if (NBL_COMPILE_WITH_CUDA)
 	list(APPEND NBL_EXT_CUDA_INTEROP_CUDA_INCLUDE_DIRS ${CUDAToolkit_INCLUDE_DIRS})
 	list(REMOVE_DUPLICATES NBL_EXT_CUDA_INTEROP_CUDA_INCLUDE_DIRS)
 
-	target_include_directories(${LIB_NAME} BEFORE PUBLIC ${NBL_EXT_CUDA_INTEROP_CUDA_INCLUDE_DIRS})
+	foreach(_NBL_EXT_CUDA_INTEROP_CUDA_INCLUDE_DIR IN LISTS NBL_EXT_CUDA_INTEROP_CUDA_INCLUDE_DIRS)
+		target_include_directories(${LIB_NAME} BEFORE PUBLIC $<BUILD_INTERFACE:${_NBL_EXT_CUDA_INTEROP_CUDA_INCLUDE_DIR}>)
+	endforeach()
+	set_target_properties(${LIB_NAME} PROPERTIES EXPORT_NAME "ext::CUDAInterop")
 	add_library(Nabla::ext::CUDAInterop ALIAS ${LIB_NAME})
 endif()
 
diff --git a/src/nbl/ext/CUDAInterop/smoke/CMakeLists.txt b/src/nbl/ext/CUDAInterop/smoke/CMakeLists.txt
index 23dd6d5422..cd9ba7b70e 100644
--- a/src/nbl/ext/CUDAInterop/smoke/CMakeLists.txt
+++ b/src/nbl/ext/CUDAInterop/smoke/CMakeLists.txt
@@ -2,7 +2,7 @@ cmake_minimum_required(VERSION 3.30)
 project(NblExtCUDAInteropSmoke CXX)
 
 if(NOT TARGET Nabla::Nabla)
-	find_package(Nabla REQUIRED CONFIG)
+	find_package(Nabla REQUIRED CONFIG COMPONENTS Core CUDAInterop)
 endif()
 
 enable_testing()

From a520d57a443c421d41e5f72c14cec70d29d6f175 Mon Sep 17 00:00:00 2001
From: Arkadiusz Lachowicz <areklachowicz@gmail.com>
Date: Wed, 6 May 2026 13:42:37 +0200
Subject: [PATCH 06/27] Use CUDAToolkit package targets

---
 cmake/NablaConfig.cmake.in             |  9 +--------
 src/nbl/ext/CUDAInterop/CMakeLists.txt | 14 +-------------
 2 files changed, 2 insertions(+), 21 deletions(-)

diff --git a/cmake/NablaConfig.cmake.in b/cmake/NablaConfig.cmake.in
index e88a25b0dd..ca32518244 100644
--- a/cmake/NablaConfig.cmake.in
+++ b/cmake/NablaConfig.cmake.in
@@ -101,14 +101,7 @@ if(_NBL_NABLA_LOAD_CUDA_INTEROP)
   else()
     _nbl_try_include_component("CUDAInterop" "NablaCUDAInteropExportTargets.cmake" _NBL_NABLA_CUDA_INTEROP_FOUND)
     if(_NBL_NABLA_CUDA_INTEROP_FOUND AND TARGET Nabla::ext::CUDAInterop)
-      set(_NBL_NABLA_CUDA_INTEROP_INCLUDE_DIRS ${CUDAToolkit_INCLUDE_DIRS})
-      foreach(_NBL_NABLA_CUDA_INTEROP_INCLUDE_DIR IN LISTS CUDAToolkit_INCLUDE_DIRS)
-        if(EXISTS "${_NBL_NABLA_CUDA_INTEROP_INCLUDE_DIR}/cccl")
-          list(APPEND _NBL_NABLA_CUDA_INTEROP_INCLUDE_DIRS "${_NBL_NABLA_CUDA_INTEROP_INCLUDE_DIR}/cccl")
-        endif()
-      endforeach()
-      list(REMOVE_DUPLICATES _NBL_NABLA_CUDA_INTEROP_INCLUDE_DIRS)
-      target_include_directories(Nabla::ext::CUDAInterop INTERFACE ${_NBL_NABLA_CUDA_INTEROP_INCLUDE_DIRS})
+      target_link_libraries(Nabla::ext::CUDAInterop INTERFACE CUDA::toolkit)
     endif()
   endif()
 endif()
diff --git a/src/nbl/ext/CUDAInterop/CMakeLists.txt b/src/nbl/ext/CUDAInterop/CMakeLists.txt
index 93b6bef8c1..7a69e62ad4 100644
--- a/src/nbl/ext/CUDAInterop/CMakeLists.txt
+++ b/src/nbl/ext/CUDAInterop/CMakeLists.txt
@@ -29,19 +29,7 @@ if (NBL_COMPILE_WITH_CUDA)
 		"_NBL_COMPILE_WITH_CUDA_"
 	)
 
-	set(NBL_EXT_CUDA_INTEROP_CUDA_INCLUDE_DIRS)
-	if(CUDAToolkit_ROOT)
-		list(APPEND NBL_EXT_CUDA_INTEROP_CUDA_INCLUDE_DIRS
-			"${CUDAToolkit_ROOT}/include"
-			"${CUDAToolkit_ROOT}/include/cccl"
-		)
-	endif()
-	list(APPEND NBL_EXT_CUDA_INTEROP_CUDA_INCLUDE_DIRS ${CUDAToolkit_INCLUDE_DIRS})
-	list(REMOVE_DUPLICATES NBL_EXT_CUDA_INTEROP_CUDA_INCLUDE_DIRS)
-
-	foreach(_NBL_EXT_CUDA_INTEROP_CUDA_INCLUDE_DIR IN LISTS NBL_EXT_CUDA_INTEROP_CUDA_INCLUDE_DIRS)
-		target_include_directories(${LIB_NAME} BEFORE PUBLIC $<BUILD_INTERFACE:${_NBL_EXT_CUDA_INTEROP_CUDA_INCLUDE_DIR}>)
-	endforeach()
+	target_link_libraries(${LIB_NAME} PUBLIC $<BUILD_INTERFACE:CUDA::toolkit>)
 	set_target_properties(${LIB_NAME} PROPERTIES EXPORT_NAME "ext::CUDAInterop")
 	add_library(Nabla::ext::CUDAInterop ALIAS ${LIB_NAME})
 endif()

From 4bddc571ade70a289036d87772a85b35870c5307 Mon Sep 17 00:00:00 2001
From: Arkadiusz Lachowicz <areklachowicz@gmail.com>
Date: Wed, 6 May 2026 14:29:27 +0200
Subject: [PATCH 07/27] Require CUDA version via CMake

---
 CMakeLists.txt                                 |  8 ++------
 cmake/NablaConfig.cmake.in                     | 15 ++++-----------
 .../ext/CUDAInterop/smoke/public_boundary.cpp  | 18 ++++++++++++++++++
 3 files changed, 24 insertions(+), 17 deletions(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index c5e1bfac20..14845789fc 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -77,12 +77,8 @@ if(NBL_COMPILE_WITH_CUDA)
 	if(NBL_CUDA_TOOLKIT_ROOT)
 		set(CUDAToolkit_ROOT "${NBL_CUDA_TOOLKIT_ROOT}")
 	endif()
-	find_package(CUDAToolkit REQUIRED)
-	if(${CUDAToolkit_VERSION} VERSION_GREATER_EQUAL "13.0")
-		message(STATUS "CUDA version ${CUDAToolkit_VERSION} found!")
-	else()
-		message(FATAL_ERROR "CUDA version 13.0+ needed for C++14 support!")
-	endif()
+	find_package(CUDAToolkit 13.0 REQUIRED)
+	message(STATUS "CUDA version ${CUDAToolkit_VERSION} found!")
 endif()
 
 get_filename_component(NBL_ROOT_PATH "${CMAKE_CURRENT_SOURCE_DIR}" ABSOLUTE)
diff --git a/cmake/NablaConfig.cmake.in b/cmake/NablaConfig.cmake.in
index ca32518244..8b9f62e548 100644
--- a/cmake/NablaConfig.cmake.in
+++ b/cmake/NablaConfig.cmake.in
@@ -92,17 +92,10 @@ if(_NBL_NABLA_LOAD_CUDA_INTEROP)
     set(CUDAToolkit_ROOT "${Nabla_CUDA_TOOLKIT_ROOT}")
   endif()
 
-  find_dependency(CUDAToolkit REQUIRED)
-  if(CUDAToolkit_VERSION VERSION_LESS "13.0")
-    set(Nabla_CUDAInterop_FOUND FALSE)
-    if(Nabla_FIND_REQUIRED_CUDAInterop)
-      message(FATAL_ERROR "Nabla: CUDAInterop requires CUDA Toolkit 13.0 or newer. Set Nabla_CUDA_TOOLKIT_ROOT or CUDAToolkit_ROOT if multiple CUDA Toolkit installs are present.")
-    endif()
-  else()
-    _nbl_try_include_component("CUDAInterop" "NablaCUDAInteropExportTargets.cmake" _NBL_NABLA_CUDA_INTEROP_FOUND)
-    if(_NBL_NABLA_CUDA_INTEROP_FOUND AND TARGET Nabla::ext::CUDAInterop)
-      target_link_libraries(Nabla::ext::CUDAInterop INTERFACE CUDA::toolkit)
-    endif()
+  find_dependency(CUDAToolkit 13.0 REQUIRED)
+  _nbl_try_include_component("CUDAInterop" "NablaCUDAInteropExportTargets.cmake" _NBL_NABLA_CUDA_INTEROP_FOUND)
+  if(_NBL_NABLA_CUDA_INTEROP_FOUND AND TARGET Nabla::ext::CUDAInterop)
+    target_link_libraries(Nabla::ext::CUDAInterop INTERFACE CUDA::toolkit)
   endif()
 endif()
 
diff --git a/src/nbl/ext/CUDAInterop/smoke/public_boundary.cpp b/src/nbl/ext/CUDAInterop/smoke/public_boundary.cpp
index 4f6cbebfb1..eb7061f0ee 100644
--- a/src/nbl/ext/CUDAInterop/smoke/public_boundary.cpp
+++ b/src/nbl/ext/CUDAInterop/smoke/public_boundary.cpp
@@ -1,5 +1,23 @@
 #include "nabla.h"
+
+#ifdef _NBL_COMPILE_WITH_CUDA_
+#error "Nabla consumers must not get the CUDA opt-in define."
+#endif
+
+#ifdef CUDA_VERSION
+#error "Nabla consumers must not include CUDA SDK headers."
+#endif
+
 #include "nbl/system/IApplicationFramework.h"
+
+#ifdef _NBL_COMPILE_WITH_CUDA_
+#error "Nabla consumers must not get the CUDA opt-in define."
+#endif
+
+#ifdef CUDA_VERSION
+#error "Nabla consumers must not include CUDA SDK headers."
+#endif
+
 #include "nbl/ext/CUDAInterop/CUDAInterop.h"
 
 #ifdef _NBL_COMPILE_WITH_CUDA_

From 6f68e6644eb222cc5c6a875a8a85e97650261537 Mon Sep 17 00:00:00 2001
From: Arkadiusz Lachowicz <areklachowicz@gmail.com>
Date: Wed, 6 May 2026 16:17:15 +0200
Subject: [PATCH 08/27] Split CUDA interop native surface

---
 cmake/NablaConfig.cmake.in                    |  17 +-
 examples_tests                                |   2 +-
 include/nbl/ext/CUDAInterop/CCUDADevice.h     |  43 +--
 .../ext/CUDAInterop/CCUDAExportableMemory.h   |  88 +++---
 include/nbl/ext/CUDAInterop/CCUDAHandler.h    | 274 ++----------------
 .../nbl/ext/CUDAInterop/CCUDAImportedMemory.h |  41 +--
 .../ext/CUDAInterop/CCUDAImportedSemaphore.h  |  47 ++-
 .../nbl/ext/CUDAInterop/CUDAInteropNative.h   | 211 ++++++++++++++
 src/nbl/CMakeLists.txt                        |  14 +
 src/nbl/ext/CMakeLists.txt                    |   4 +
 src/nbl/ext/CUDAInterop/CCUDADevice.cpp       |  85 ++++--
 .../ext/CUDAInterop/CCUDAExportableMemory.cpp |  34 ++-
 src/nbl/ext/CUDAInterop/CCUDAHandler.cpp      | 161 ++++++++--
 .../ext/CUDAInterop/CCUDAImportedMemory.cpp   |  32 +-
 .../CUDAInterop/CCUDAImportedSemaphore.cpp    |  24 +-
 src/nbl/ext/CUDAInterop/CMakeLists.txt        |  17 +-
 .../CUDAInterop/CUDAInteropNativeState.hpp    | 106 +++++++
 src/nbl/ext/CUDAInterop/README.md             |  23 ++
 src/nbl/ext/CUDAInterop/smoke/CMakeLists.txt  |  17 +-
 .../ext/CUDAInterop/smoke/clean_opt_in.cpp    |  42 +++
 .../smoke/{opt_in.cpp => native_opt_in.cpp}   |  25 +-
 21 files changed, 817 insertions(+), 490 deletions(-)
 create mode 100644 include/nbl/ext/CUDAInterop/CUDAInteropNative.h
 create mode 100644 src/nbl/ext/CUDAInterop/CUDAInteropNativeState.hpp
 create mode 100644 src/nbl/ext/CUDAInterop/README.md
 create mode 100644 src/nbl/ext/CUDAInterop/smoke/clean_opt_in.cpp
 rename src/nbl/ext/CUDAInterop/smoke/{opt_in.cpp => native_opt_in.cpp} (72%)

diff --git a/cmake/NablaConfig.cmake.in b/cmake/NablaConfig.cmake.in
index 8b9f62e548..afff3dcccc 100644
--- a/cmake/NablaConfig.cmake.in
+++ b/cmake/NablaConfig.cmake.in
@@ -7,6 +7,7 @@ set(Nabla_DXC_GIT_INFO_JSON_FILE "${PACKAGE_PREFIX_DIR}/include/dxc_git_info.jso
 set(_NBL_NABLA_LOAD_CORE OFF)
 set(_NBL_NABLA_LOAD_NSC OFF)
 set(_NBL_NABLA_LOAD_CUDA_INTEROP OFF)
+set(_NBL_NABLA_LOAD_CUDA_INTEROP_NATIVE OFF)
 set(_NBL_NABLA_COMPONENTS ${Nabla_FIND_COMPONENTS})
 set(_NBL_NABLA_HAS_CORE_EXPORTS OFF)
 set(_NBL_NABLA_HAS_NSC_EXPORTS OFF)
@@ -30,6 +31,12 @@ if(_NBL_NABLA_COMPONENTS)
       set(_NBL_NABLA_LOAD_CORE ON)
       set(_NBL_NABLA_LOAD_CUDA_INTEROP ON)
       set(Nabla_CUDAInterop_FOUND TRUE)
+    elseif(_NBL_NABLA_COMPONENT STREQUAL "CUDAInteropNative")
+      set(_NBL_NABLA_LOAD_CORE ON)
+      set(_NBL_NABLA_LOAD_CUDA_INTEROP ON)
+      set(_NBL_NABLA_LOAD_CUDA_INTEROP_NATIVE ON)
+      set(Nabla_CUDAInterop_FOUND TRUE)
+      set(Nabla_CUDAInteropNative_FOUND TRUE)
     else()
       set("Nabla_${_NBL_NABLA_COMPONENT}_FOUND" FALSE)
     endif()
@@ -86,6 +93,10 @@ if(_NBL_NABLA_LOAD_NSC)
 endif()
 
 if(_NBL_NABLA_LOAD_CUDA_INTEROP)
+  _nbl_try_include_component("CUDAInterop" "NablaCUDAInteropExportTargets.cmake" _NBL_NABLA_CUDA_INTEROP_FOUND)
+endif()
+
+if(_NBL_NABLA_LOAD_CUDA_INTEROP_NATIVE)
   include(CMakeFindDependencyMacro)
 
   if(DEFINED Nabla_CUDA_TOOLKIT_ROOT AND NOT "${Nabla_CUDA_TOOLKIT_ROOT}" STREQUAL "")
@@ -93,9 +104,9 @@ if(_NBL_NABLA_LOAD_CUDA_INTEROP)
   endif()
 
   find_dependency(CUDAToolkit 13.0 REQUIRED)
-  _nbl_try_include_component("CUDAInterop" "NablaCUDAInteropExportTargets.cmake" _NBL_NABLA_CUDA_INTEROP_FOUND)
-  if(_NBL_NABLA_CUDA_INTEROP_FOUND AND TARGET Nabla::ext::CUDAInterop)
-    target_link_libraries(Nabla::ext::CUDAInterop INTERFACE CUDA::toolkit)
+  _nbl_try_include_component("CUDAInteropNative" "NablaCUDAInteropNativeExportTargets.cmake" _NBL_NABLA_CUDA_INTEROP_NATIVE_FOUND)
+  if(_NBL_NABLA_CUDA_INTEROP_NATIVE_FOUND AND TARGET Nabla::ext::CUDAInteropNative)
+    target_link_libraries(Nabla::ext::CUDAInteropNative INTERFACE CUDA::toolkit)
   endif()
 endif()
 
diff --git a/examples_tests b/examples_tests
index 5c604d274b..7a2a4f604f 160000
--- a/examples_tests
+++ b/examples_tests
@@ -1 +1 @@
-Subproject commit 5c604d274b8aac99d8855f5b7aaf615910c8a5f6
+Subproject commit 7a2a4f604fd941984d6624e3059f7380cc6592a2
diff --git a/include/nbl/ext/CUDAInterop/CCUDADevice.h b/include/nbl/ext/CUDAInterop/CCUDADevice.h
index d7886a4c53..25c40e7ed6 100644
--- a/include/nbl/ext/CUDAInterop/CCUDADevice.h
+++ b/include/nbl/ext/CUDAInterop/CCUDADevice.h
@@ -4,37 +4,32 @@
 #ifndef _NBL_VIDEO_C_CUDA_DEVICE_H_
 #define _NBL_VIDEO_C_CUDA_DEVICE_H_
 
-
-#ifdef _NBL_COMPILE_WITH_CUDA_
-
 #include "nbl/video/declarations.h"
 #include "nbl/ext/CUDAInterop/CCUDAExportableMemory.h"
 #include "nbl/ext/CUDAInterop/CCUDAImportedMemory.h"
 #include "nbl/ext/CUDAInterop/CCUDAImportedSemaphore.h"
 
-#include "cuda.h"
-#include "nvrtc.h"
-#if CUDA_VERSION < 9000
-	#error "Need CUDA 9.0 SDK or higher."
-#endif
-
-// useful includes in the future
-//#include "cudaEGL.h"
-//#include "cudaVDPAU.h"
+#include <cstring>
+#include <memory>
+#include <vector>
 
 namespace nbl::video
 {
 class CCUDAHandler;
 
+namespace cuda_native
+{
+struct SAccess;
+}
+
 class CCUDADevice : public core::IReferenceCounted
 {
-  public:
+	public:
+		struct SNativeState;
 #ifdef _WIN32
 		static constexpr IDeviceMemoryAllocation::E_EXTERNAL_HANDLE_TYPE EXTERNAL_MEMORY_HANDLE_TYPE = IDeviceMemoryAllocation::EHT_OPAQUE_WIN32;
-		static constexpr CUmemAllocationHandleType ALLOCATION_HANDLE_TYPE = CU_MEM_HANDLE_TYPE_WIN32;
 #else
 		static constexpr IDeviceMemoryAllocation::E_EXTERNAL_HANDLE_TYPE EXTERNAL_MEMORY_HANDLE_TYPE = IDeviceMemoryAllocation::EHT_OPAQUE_FD;
-		static constexpr CUmemAllocationHandleType ALLOCATION_HANDLE_TYPE = CU_MEM_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR;
 #endif
 
 		enum E_VIRTUAL_ARCHITECTURE
@@ -73,22 +68,20 @@ class CCUDADevice : public core::IReferenceCounted
 		};
 		inline E_VIRTUAL_ARCHITECTURE getVirtualArchitecture() {return m_virtualArchitecture;}
 
-		CCUDADevice(core::smart_refctd_ptr<CVulkanConnection>&& vulkanConnection, IPhysicalDevice* const vulkanDevice, const E_VIRTUAL_ARCHITECTURE virtualArchitecture, CUdevice device, core::smart_refctd_ptr<CCUDAHandler>&& handler);
+		CCUDADevice(core::smart_refctd_ptr<CVulkanConnection>&& vulkanConnection, IPhysicalDevice* const vulkanDevice, const E_VIRTUAL_ARCHITECTURE virtualArchitecture, std::unique_ptr<SNativeState>&& nativeState, core::smart_refctd_ptr<CCUDAHandler>&& handler);
 
-		~CCUDADevice();
+		~CCUDADevice() override;
 
 		inline core::SRange<const char* const> geDefaultCompileOptions() const
 		{
 			return {m_defaultCompileOptions.data(),m_defaultCompileOptions.data()+m_defaultCompileOptions.size()};
 		}
 
-		CUdevice getInternalObject() const { return m_handle; }
-
 		const CCUDAHandler* getHandler() const { return m_handler.get();  }
 
 		bool isMatchingDevice(const IPhysicalDevice* device) { return device && !memcmp(device->getProperties().deviceUUID, m_physicalDevice->getProperties().deviceUUID, 16); }
 
-		size_t roundToGranularity(CUmemLocationType location, size_t size) const;
+		size_t roundToGranularity(ECUDAMemoryLocation location, size_t size) const;
 
 		core::smart_refctd_ptr<CCUDAExportableMemory> createExportableMemory(CCUDAExportableMemory::SCreationParams&& inParams);
 
@@ -97,24 +90,20 @@ class CCUDADevice : public core::IReferenceCounted
 		core::smart_refctd_ptr<CCUDAImportedSemaphore> importExternalSemaphore(core::smart_refctd_ptr<ISemaphore>&& sem);
 
 	private:
-		CUresult reserveAddressAndMapMemory(CUdeviceptr* outPtr, size_t size, size_t alignment, CUmemLocationType location, CUmemGenericAllocationHandle memory) const;
+		friend struct cuda_native::SAccess;
 
 		static constexpr auto CudaMemoryLocationCount = 5;
 
-    const system::logger_opt_ptr m_logger;
+		const system::logger_opt_ptr m_logger;
 		std::vector<const char*> m_defaultCompileOptions;
 		core::smart_refctd_ptr<CVulkanConnection> m_vulkanConnection;
 		IPhysicalDevice* const m_physicalDevice;
 		E_VIRTUAL_ARCHITECTURE m_virtualArchitecture;
 
 		core::smart_refctd_ptr<CCUDAHandler> m_handler;
-		CUdevice m_handle;
-		CUcontext m_context;
-		std::array<size_t, CudaMemoryLocationCount> m_allocationGranularity;
+		std::unique_ptr<SNativeState> m_native;
 };
 
 }
 
-#endif // _NBL_COMPILE_WITH_CUDA_
-
 #endif
diff --git a/include/nbl/ext/CUDAInterop/CCUDAExportableMemory.h b/include/nbl/ext/CUDAInterop/CCUDAExportableMemory.h
index 10bf911717..5973c31fac 100644
--- a/include/nbl/ext/CUDAInterop/CCUDAExportableMemory.h
+++ b/include/nbl/ext/CUDAInterop/CCUDAExportableMemory.h
@@ -4,64 +4,60 @@
 #ifndef _NBL_VIDEO_C_CUDA_EXPORTABLE_MEMORY_H_
 #define _NBL_VIDEO_C_CUDA_EXPORTABLE_MEMORY_H_
 
-
-#ifdef _NBL_COMPILE_WITH_CUDA_
-
 #include "nbl/video/declarations.h"
 
-#include "cuda.h"
-#include "nvrtc.h"
-#if CUDA_VERSION < 9000
-  #error "Need CUDA 9.0 SDK or higher."
-#endif
-
-// useful includes in the future
-//#include "cudaEGL.h"
-//#include "cudaVDPAU.h"
+#include <memory>
+#include <utility>
 
 namespace nbl::video
 {
-
 class CCUDADevice;
 
-class CCUDAExportableMemory : public core::IReferenceCounted
+namespace cuda_native
 {
-    public:
-
-        struct SCreationParams
-        {
-            size_t            size;
-            uint32_t          alignment;
-            CUmemLocationType location;
-        };
-
-        struct SCachedCreationParams : SCreationParams
-        {
-            size_t granularSize;
-            CUdeviceptr ptr;
-            external_handle_t externalHandle;
-        };
-
-        CCUDAExportableMemory(core::smart_refctd_ptr<CCUDADevice> device, SCachedCreationParams&& params)
-            : m_device(std::move(device))
-            , m_params(std::move(params))
-        {}
-        ~CCUDAExportableMemory() override;
-
-        CUdeviceptr getDeviceptr() const { return m_params.ptr;  }
-
-        const SCreationParams& getCreationParams() const { return m_params; }
-
-        core::smart_refctd_ptr<IDeviceMemoryAllocation> exportAsMemory(ILogicalDevice* device, IDeviceMemoryBacked* dedication = nullptr) const;
+struct SAccess;
+}
 
-    private:
+enum class ECUDAMemoryLocation : uint32_t // SDK-free replacement for the CUmemLocationType this header used to expose
+{
+	DEVICE = 1, // numeric values look chosen to match CU_MEM_LOCATION_TYPE_* in cuda.h -- TODO confirm against the native layer
+	HOST = 2,
+	HOST_NUMA = 3,
+	HOST_NUMA_CURRENT = 4 // note: no enumerator with value 0 is declared
+};
 
-        core::smart_refctd_ptr<CCUDADevice> m_device;
-        SCachedCreationParams m_params;
+class CCUDAExportableMemory : public core::IReferenceCounted
+{
+	public:
+		struct SNativeState;
+		struct SCreationParams
+		{
+			size_t size;
+			uint32_t alignment;
+			ECUDAMemoryLocation location;
+		};
+
+		struct SCachedCreationParams : SCreationParams
+		{
+			size_t granularSize;
+			external_handle_t externalHandle;
+		};
+
+		CCUDAExportableMemory(core::smart_refctd_ptr<CCUDADevice> device, SCachedCreationParams&& params, std::unique_ptr<SNativeState>&& nativeState);
+		~CCUDAExportableMemory() override;
+
+		const SCreationParams& getCreationParams() const { return m_params; }
+
+		core::smart_refctd_ptr<IDeviceMemoryAllocation> exportAsMemory(ILogicalDevice* device, IDeviceMemoryBacked* dedication = nullptr) const;
+
+	private:
+		friend struct cuda_native::SAccess;
+
+		core::smart_refctd_ptr<CCUDADevice> m_device;
+		SCachedCreationParams m_params;
+		std::unique_ptr<SNativeState> m_native;
 };
 
 }
 
-#endif // _NBL_COMPILE_WITH_CUDA_
-
 #endif
diff --git a/include/nbl/ext/CUDAInterop/CCUDAHandler.h b/include/nbl/ext/CUDAInterop/CCUDAHandler.h
index 5128aad575..063598a518 100644
--- a/include/nbl/ext/CUDAInterop/CCUDAHandler.h
+++ b/include/nbl/ext/CUDAInterop/CCUDAHandler.h
@@ -9,158 +9,30 @@
 
 #include "nbl/system/declarations.h"
 
-#include "nbl/ext/CUDAInterop/CCUDADevice.h"
+#include <array>
+#include <cstdint>
+#include <memory>
+#include <string>
 
-
-#ifdef _NBL_COMPILE_WITH_CUDA_
 namespace nbl::video
 {
+class CCUDADevice;
+class CVulkanConnection;
+class IPhysicalDevice;
 
+namespace cuda_native
+{
+struct SAccess;
+}
 
 class CCUDAHandler : public core::IReferenceCounted
 {
-		public:
-		static bool defaultHandleResult(CUresult result, const system::logger_opt_ptr& logger);
-
-		inline bool defaultHandleResult(CUresult result) const
-		{
-			core::smart_refctd_ptr<system::ILogger> logger = m_logger.get();
-			return defaultHandleResult(result,logger.get());
-		}
-
-		//
-		bool defaultHandleResult(nvrtcResult result);
-
-		//
-		template<typename T>
-		static T* cast_CUDA_ptr(CUdeviceptr ptr) { return reinterpret_cast<T*>(ptr); }
-
-		//
+	public:
+		struct SNativeState;
 		static core::smart_refctd_ptr<CCUDAHandler> create(system::ISystem* system, core::smart_refctd_ptr<system::ILogger>&& _logger);
 
-		//
-		using LibLoader = system::DefaultFuncPtrLoader;
-		NBL_SYSTEM_DECLARE_DYNAMIC_FUNCTION_CALLER_CLASS(CUDA,LibLoader
-			,cuCtxCreate_v4
-			,cuDevicePrimaryCtxRetain
-			,cuDevicePrimaryCtxRelease
-			,cuDevicePrimaryCtxSetFlags
-			,cuDevicePrimaryCtxGetState
-			,cuCtxDestroy_v2
-			,cuCtxEnablePeerAccess
-			,cuCtxGetApiVersion
-			,cuCtxGetCurrent
-			,cuCtxGetDevice
-			,cuCtxGetSharedMemConfig
-			,cuCtxPopCurrent_v2
-			,cuCtxPushCurrent_v2
-			,cuCtxSetCacheConfig
-			,cuCtxSetCurrent
-			,cuCtxSetSharedMemConfig
-			,cuCtxSynchronize
-			,cuDeviceComputeCapability
-			,cuDeviceCanAccessPeer
-			,cuDeviceGetCount
-			,cuDeviceGet
-			,cuDeviceGetAttribute
-			,cuDeviceGetLuid
-			,cuDeviceGetUuid_v2
-			,cuDeviceTotalMem_v2
-			,cuDeviceGetName
-			,cuDriverGetVersion
-			,cuEventCreate
-			,cuEventDestroy_v2
-			,cuEventElapsedTime
-			,cuEventQuery
-			,cuEventRecord
-			,cuEventSynchronize
-			,cuFuncGetAttribute
-			,cuFuncSetCacheConfig
-			,cuGetErrorName
-			,cuGetErrorString
-			,cuGraphicsMapResources
-			,cuGraphicsResourceGetMappedPointer_v2
-			,cuGraphicsResourceGetMappedMipmappedArray
-			,cuGraphicsSubResourceGetMappedArray
-			,cuGraphicsUnmapResources
-			,cuGraphicsUnregisterResource
-			,cuInit
-			,cuLaunchKernel
-			,cuMemAlloc_v2
-			,cuMemcpyDtoD_v2
-			,cuMemcpyDtoH_v2
-			,cuMemcpyHtoD_v2
-			,cuMemcpyDtoDAsync_v2
-			,cuMemcpyDtoHAsync_v2
-			,cuMemcpyHtoDAsync_v2
-			,cuMemGetAddressRange_v2
-			,cuMemFree_v2
-			,cuMemFreeHost
-			,cuMemGetInfo_v2
-			,cuMemHostAlloc
-			,cuMemHostRegister_v2
-			,cuMemHostUnregister
-			,cuMemsetD32_v2
-			,cuMemsetD32Async
-			,cuMemsetD8_v2
-			,cuMemsetD8Async
-			,cuModuleGetFunction
-			,cuModuleGetGlobal_v2
-			,cuModuleLoadDataEx
-			,cuModuleLoadFatBinary
-			,cuModuleUnload
-			,cuOccupancyMaxActiveBlocksPerMultiprocessor
-			,cuPointerGetAttribute
-			,cuStreamAddCallback
-			,cuStreamCreate
-			,cuStreamDestroy_v2
-			,cuStreamQuery
-			,cuStreamSynchronize
-			,cuStreamWaitEvent
-			,cuSurfObjectCreate
-			,cuSurfObjectDestroy
-			,cuTexObjectCreate
-			,cuTexObjectDestroy
-			,cuImportExternalMemory
-			,cuDestroyExternalMemory
-			,cuExternalMemoryGetMappedBuffer
-			,cuMemUnmap
-			,cuMemAddressFree
-			,cuMemGetAllocationGranularity
-			,cuMemAddressReserve
-			,cuMemCreate
-			,cuMemExportToShareableHandle
-			,cuMemMap
-			,cuMemRelease
-			,cuMemSetAccess
-			,cuMemImportFromShareableHandle
-			,cuLaunchHostFunc
-			,cuDestroyExternalSemaphore
-			,cuImportExternalSemaphore
-			,cuSignalExternalSemaphoresAsync
-			,cuWaitExternalSemaphoresAsync
-			,cuLogsRegisterCallback
-		);
-		const CUDA& getCUDAFunctionTable() const {return m_cuda;}
-
-		NBL_SYSTEM_DECLARE_DYNAMIC_FUNCTION_CALLER_CLASS(NVRTC,LibLoader,
-			nvrtcGetErrorString,
-			nvrtcVersion,
-			nvrtcAddNameExpression,
-			nvrtcCompileProgram,
-			nvrtcCreateProgram,
-			nvrtcDestroyProgram,
-			nvrtcGetLoweredName,
-			nvrtcGetPTX,
-			nvrtcGetPTXSize,
-			nvrtcGetProgramLog,
-			nvrtcGetProgramLogSize
-		);
-		const NVRTC& getNVRTCFunctionTable() const {return m_nvrtc;}
+		CCUDAHandler(std::unique_ptr<SNativeState>&& nativeState, core::vector<core::smart_refctd_ptr<system::IFile>>&& _headers, core::smart_refctd_ptr<system::ILogger>&& _logger, int _version);
 
-		CCUDAHandler(CUDA&& _cuda, NVRTC&& _nvrtc, core::vector<core::smart_refctd_ptr<system::IFile>>&& _headers, core::smart_refctd_ptr<system::ILogger>&& _logger, int _version);
-
-		//
 		inline core::SRange<system::IFile* const> getSTDHeaders()
 		{
 			auto begin = m_headers.empty() ? nullptr:(&m_headers[0].get());
@@ -169,29 +41,9 @@ class CCUDAHandler : public core::IReferenceCounted
 		inline const auto& getSTDHeaderContents() { return m_headerContents; }
 		inline const auto& getSTDHeaderNames() { return m_headerNames; }
 
-		//
-		nvrtcResult createProgram(nvrtcProgram* prog, std::string&& source, const char* name, const int headerCount=0, const char* const* headerContents=nullptr, const char* const* includeNames=nullptr);
-		inline nvrtcResult createProgram(nvrtcProgram* prog, const char* source, const char* name, const int headerCount=0, const char* const* headerContents=nullptr, const char* const* includeNames=nullptr)
-		{
-			return createProgram(prog,std::string(source),name,headerCount,headerContents,includeNames);
-		}
-		inline nvrtcResult createProgram(nvrtcProgram* prog, system::IFile* file, const int headerCount=0, const char* const* headerContents=nullptr, const char* const* includeNames=nullptr)
-		{
-			const auto filesize = file->getSize();
-			std::string source(filesize+1u,'0');
-
-			system::IFile::success_t bytesRead;
-			file->read(bytesRead,source.data(),0u,file->getSize());
-			source.resize(bytesRead.getBytesProcessed());
-
-			return createProgram(prog,std::move(source),file->getFileName().string().c_str(),headerCount,headerContents,includeNames);
-		}
-
 		struct SCUDADeviceInfo
 		{
-			CUdevice handle = {};
-			CUuuid uuid = {};
-			int attributes[CU_DEVICE_ATTRIBUTE_MAX] = {};
+			std::array<uint8_t,16> uuid = {};
 		};
 
 		inline core::vector<SCUDADeviceInfo> const& getAvailableDevices() const
@@ -199,93 +51,15 @@ class CCUDAHandler : public core::IReferenceCounted
 			return m_availableDevices;
 		}
 
-		//
-		inline nvrtcResult compileProgram(nvrtcProgram prog, core::SRange<const char* const> options)
-		{
-			return m_nvrtc.pnvrtcCompileProgram(prog,options.size(),options.begin());
-		}
-
-		//
-		nvrtcResult getProgramLog(nvrtcProgram prog, std::string& log);
-
-		//
-		struct ptx_and_nvrtcResult_t
-		{
-			core::smart_refctd_ptr<asset::ICPUBuffer> ptx;
-			nvrtcResult result;
-		};
-		ptx_and_nvrtcResult_t getPTX(nvrtcProgram prog);
-
-		//
-		inline ptx_and_nvrtcResult_t compileDirectlyToPTX(
-			std::string&& source, const char* filename, core::SRange<const char* const> nvrtcOptions,
-			const int headerCount=0, const char* const* headerContents=nullptr, const char* const* includeNames=nullptr,
-			std::string* log=nullptr
-		)
-		{
-			nvrtcProgram program = nullptr;
-			nvrtcResult result = NVRTC_ERROR_PROGRAM_CREATION_FAILURE;
-			auto cleanup = core::makeRAIIExiter([&]() -> void
-			{
-				if (result!=NVRTC_SUCCESS && program)
-					m_nvrtc.pnvrtcDestroyProgram(&program); // TODO: do we need to destroy the program if we successfully get PTX?
-			});
-
-			result = createProgram(&program,std::move(source),filename,headerCount,headerContents,includeNames);
-			return compileDirectlyToPTX_impl(result,program,nvrtcOptions,log);
-		}
-		inline ptx_and_nvrtcResult_t compileDirectlyToPTX(
-			const char* source, const char* filename, core::SRange<const char* const> nvrtcOptions,
-			const int headerCount=0, const char* const* headerContents=nullptr, const char* const* includeNames=nullptr,
-			std::string* log=nullptr
-		)
-		{
-			return compileDirectlyToPTX(std::string(source),filename,nvrtcOptions,headerCount,headerContents,includeNames,log);
-		}
-		inline ptx_and_nvrtcResult_t compileDirectlyToPTX(
-			system::IFile* file, core::SRange<const char* const> nvrtcOptions,
-			const int headerCount=0, const char* const* headerContents=nullptr, const char* const* includeNames=nullptr,
-			std::string* log=nullptr
-		)
-		{
-			nvrtcProgram program = nullptr;
-			nvrtcResult result = NVRTC_ERROR_PROGRAM_CREATION_FAILURE;
-			auto cleanup = core::makeRAIIExiter([&]() -> void
-			{
-				if (result!=NVRTC_SUCCESS && program)
-					m_nvrtc.pnvrtcDestroyProgram(&program); // TODO: do we need to destroy the program if we successfully get PTX?
-			});
-
-			result = createProgram(&program,file,headerCount,headerContents,includeNames);
-			return compileDirectlyToPTX_impl(result,program,nvrtcOptions,log);
-		}
-
 		core::smart_refctd_ptr<CCUDADevice> createDevice(core::smart_refctd_ptr<CVulkanConnection>&& vulkanConnection, IPhysicalDevice* physicalDevice);
 
 	protected:
+		~CCUDAHandler() override;
 
-		~CCUDAHandler() = default;
-		
-		//
-		inline ptx_and_nvrtcResult_t compileDirectlyToPTX_impl(nvrtcResult result, nvrtcProgram program, core::SRange<const char* const> nvrtcOptions, std::string* log)
-		{
-			if (result!=NVRTC_SUCCESS)
-				return {nullptr,result};
-
-			result = compileProgram(program,nvrtcOptions);
-			if (log)
-				getProgramLog(program,*log);
-			if (result!=NVRTC_SUCCESS)
-				return {nullptr,result};
-			
-			return getPTX(program);
-		}
-
-		// function tables
-		CUDA m_cuda;
-		NVRTC m_nvrtc;
+	private:
+		friend struct cuda_native::SAccess;
 
-		//
+		std::unique_ptr<SNativeState> m_native;
 		core::vector<SCUDADeviceInfo> m_availableDevices;
 		core::vector<core::smart_refctd_ptr<system::IFile>> m_headers;
 		core::vector<const char*> m_headerContents;
@@ -295,16 +69,6 @@ class CCUDAHandler : public core::IReferenceCounted
 		int m_version;
 };
 
-#define ASSERT_CUDA_SUCCESS(expr, handler) \
-    do { \
-        const auto cudaResult = (expr); \
-        if (!((handler)->defaultHandleResult(cudaResult))) { \
-            assert(false); \
-        } \
-    } while(0)
-
 }
 
-#endif // _NBL_COMPILE_WITH_CUDA_
-
 #endif
diff --git a/include/nbl/ext/CUDAInterop/CCUDAImportedMemory.h b/include/nbl/ext/CUDAInterop/CCUDAImportedMemory.h
index 5f885abd2d..8a24f83907 100644
--- a/include/nbl/ext/CUDAInterop/CCUDAImportedMemory.h
+++ b/include/nbl/ext/CUDAInterop/CCUDAImportedMemory.h
@@ -1,46 +1,37 @@
 #ifndef _NBL_EXT_CUDA_INTEROP_C_CUDA_IMPORTED_MEMORY_H_
 #define _NBL_EXT_CUDA_INTEROP_C_CUDA_IMPORTED_MEMORY_H_
 
-#ifdef _NBL_COMPILE_WITH_CUDA_
-
 #include "nbl/video/declarations.h"
 
-#include "cuda.h"
-#include "nvrtc.h"
-#if CUDA_VERSION < 9000
-  #error "Need CUDA 9.0 SDK or higher."
-#endif
+#include <memory>
+#include <utility>
 
 namespace nbl::video
 {
 
 class CCUDADevice;
 
-class CCUDAImportedMemory : public core::IReferenceCounted
+namespace cuda_native
 {
-    public:
-
-      CCUDAImportedMemory(core::smart_refctd_ptr<CCUDADevice> device, core::smart_refctd_ptr<nbl::video::IDeviceMemoryAllocation> src,
-        CUexternalMemory cuExtMem) : 
-        m_device(device),
-        m_src(src),
-        m_handle(cuExtMem) {}
-
-      ~CCUDAImportedMemory() override;
+struct SAccess;
+}
 
-      CUexternalMemory getInternalObject() const { return m_handle; }
-      CUresult getMappedBuffer(CUdeviceptr* mappedBuffer);
+class CCUDAImportedMemory : public core::IReferenceCounted
+{
+	public:
+		struct SNativeState;
+		CCUDAImportedMemory(core::smart_refctd_ptr<CCUDADevice> device, core::smart_refctd_ptr<nbl::video::IDeviceMemoryAllocation> src, std::unique_ptr<SNativeState>&& nativeState);
 
-    private:
+		~CCUDAImportedMemory() override;
 
-      core::smart_refctd_ptr<CCUDADevice> m_device;
-      core::smart_refctd_ptr<IDeviceMemoryAllocation> m_src;
-      CUexternalMemory m_handle;
+	private:
+		friend struct cuda_native::SAccess;
 
+		core::smart_refctd_ptr<CCUDADevice> m_device;
+		core::smart_refctd_ptr<IDeviceMemoryAllocation> m_src;
+		std::unique_ptr<SNativeState> m_native;
 };
 
 }
 
-#endif // _NBL_COMPILE_WITH_CUDA_
-
 #endif
diff --git a/include/nbl/ext/CUDAInterop/CCUDAImportedSemaphore.h b/include/nbl/ext/CUDAInterop/CCUDAImportedSemaphore.h
index 409ef1a676..3ee03fb045 100644
--- a/include/nbl/ext/CUDAInterop/CCUDAImportedSemaphore.h
+++ b/include/nbl/ext/CUDAInterop/CCUDAImportedSemaphore.h
@@ -4,47 +4,36 @@
 #ifndef _NBL_VIDEO_C_CUDA_IMPORTED_SEMAPHORE_H_
 #define _NBL_VIDEO_C_CUDA_IMPORTED_SEMAPHORE_H_
 
-#ifdef _NBL_COMPILE_WITH_CUDA_
-
 #include "nbl/video/declarations.h"
 
-#include "cuda.h"
-#include "nvrtc.h"
-#if CUDA_VERSION < 9000
-  #error "Need CUDA 9.0 SDK or higher."
-#endif
-
-// useful includes in the future
-//#include "cudaEGL.h"
-//#include "cudaVDPAU.h"
+#include <memory>
+#include <utility>
 
 namespace nbl::video
 {
 
 class CCUDADevice;
 
+namespace cuda_native
+{
+struct SAccess;
+}
+
 class CCUDAImportedSemaphore : public core::IReferenceCounted
 {
-    public:
-
-      CUexternalSemaphore getInternalObject() const { return m_handle; }
-      CCUDAImportedSemaphore(core::smart_refctd_ptr<CCUDADevice> device, 
-        core::smart_refctd_ptr<ISemaphore> src, 
-        CUexternalSemaphore semaphore)
-          : m_device(std::move(device))
-          , m_src(std::move(src))
-          , m_handle(semaphore)
-      {}
-      ~CCUDAImportedSemaphore() override;
-
-    private:
-      core::smart_refctd_ptr<CCUDADevice> m_device;
-      core::smart_refctd_ptr<ISemaphore> m_src;
-      CUexternalSemaphore m_handle;
+	public:
+		struct SNativeState;
+		CCUDAImportedSemaphore(core::smart_refctd_ptr<CCUDADevice> device, core::smart_refctd_ptr<ISemaphore> src, std::unique_ptr<SNativeState>&& nativeState);
+		~CCUDAImportedSemaphore() override;
+
+	private:
+		friend struct cuda_native::SAccess;
+
+		core::smart_refctd_ptr<CCUDADevice> m_device;
+		core::smart_refctd_ptr<ISemaphore> m_src;
+		std::unique_ptr<SNativeState> m_native;
 };
 
 }
 
-#endif // _NBL_COMPILE_WITH_CUDA_
-
 #endif
diff --git a/include/nbl/ext/CUDAInterop/CUDAInteropNative.h b/include/nbl/ext/CUDAInterop/CUDAInteropNative.h
new file mode 100644
index 0000000000..f913664122
--- /dev/null
+++ b/include/nbl/ext/CUDAInterop/CUDAInteropNative.h
@@ -0,0 +1,211 @@
+// Copyright (C) 2018-2020 - DevSH Graphics Programming Sp. z O.O.
+// This file is part of the "Nabla Engine".
+// For conditions of distribution and use, see copyright notice in nabla.h
+#ifndef _NBL_EXT_CUDA_INTEROP_NATIVE_H_INCLUDED_
+#define _NBL_EXT_CUDA_INTEROP_NATIVE_H_INCLUDED_
+
+#include "nbl/ext/CUDAInterop/CUDAInterop.h"
+
+#include "nbl/asset/ICPUBuffer.h"
+#include "nbl/system/DynamicFunctionCaller.h"
+
+#include "cuda.h"
+#include "nvrtc.h"
+#if CUDA_VERSION < 13000
+	#error "Need CUDA 13.0 SDK or higher."
+#endif
+
+namespace nbl::video::cuda_native
+{
+
+using LibLoader = system::DefaultFuncPtrLoader;
+
+NBL_SYSTEM_DECLARE_DYNAMIC_FUNCTION_CALLER_CLASS(CUDA,LibLoader
+	,cuCtxCreate_v4
+	,cuDevicePrimaryCtxRetain
+	,cuDevicePrimaryCtxRelease
+	,cuDevicePrimaryCtxSetFlags
+	,cuDevicePrimaryCtxGetState
+	,cuCtxDestroy_v2
+	,cuCtxEnablePeerAccess
+	,cuCtxGetApiVersion
+	,cuCtxGetCurrent
+	,cuCtxGetDevice
+	,cuCtxGetSharedMemConfig
+	,cuCtxPopCurrent_v2
+	,cuCtxPushCurrent_v2
+	,cuCtxSetCacheConfig
+	,cuCtxSetCurrent
+	,cuCtxSetSharedMemConfig
+	,cuCtxSynchronize
+	,cuDeviceComputeCapability
+	,cuDeviceCanAccessPeer
+	,cuDeviceGetCount
+	,cuDeviceGet
+	,cuDeviceGetAttribute
+	,cuDeviceGetLuid
+	,cuDeviceGetUuid_v2
+	,cuDeviceTotalMem_v2
+	,cuDeviceGetName
+	,cuDriverGetVersion
+	,cuEventCreate
+	,cuEventDestroy_v2
+	,cuEventElapsedTime
+	,cuEventQuery
+	,cuEventRecord
+	,cuEventSynchronize
+	,cuFuncGetAttribute
+	,cuFuncSetCacheConfig
+	,cuGetErrorName
+	,cuGetErrorString
+	,cuGraphicsMapResources
+	,cuGraphicsResourceGetMappedPointer_v2
+	,cuGraphicsResourceGetMappedMipmappedArray
+	,cuGraphicsSubResourceGetMappedArray
+	,cuGraphicsUnmapResources
+	,cuGraphicsUnregisterResource
+	,cuInit
+	,cuLaunchKernel
+	,cuMemAlloc_v2
+	,cuMemcpyDtoD_v2
+	,cuMemcpyDtoH_v2
+	,cuMemcpyHtoD_v2
+	,cuMemcpyDtoDAsync_v2
+	,cuMemcpyDtoHAsync_v2
+	,cuMemcpyHtoDAsync_v2
+	,cuMemGetAddressRange_v2
+	,cuMemFree_v2
+	,cuMemFreeHost
+	,cuMemGetInfo_v2
+	,cuMemHostAlloc
+	,cuMemHostRegister_v2
+	,cuMemHostUnregister
+	,cuMemsetD32_v2
+	,cuMemsetD32Async
+	,cuMemsetD8_v2
+	,cuMemsetD8Async
+	,cuModuleGetFunction
+	,cuModuleGetGlobal_v2
+	,cuModuleLoadDataEx
+	,cuModuleLoadFatBinary
+	,cuModuleUnload
+	,cuOccupancyMaxActiveBlocksPerMultiprocessor
+	,cuPointerGetAttribute
+	,cuStreamAddCallback
+	,cuStreamCreate
+	,cuStreamDestroy_v2
+	,cuStreamQuery
+	,cuStreamSynchronize
+	,cuStreamWaitEvent
+	,cuSurfObjectCreate
+	,cuSurfObjectDestroy
+	,cuTexObjectCreate
+	,cuTexObjectDestroy
+	,cuImportExternalMemory
+	,cuDestroyExternalMemory
+	,cuExternalMemoryGetMappedBuffer
+	,cuMemUnmap
+	,cuMemAddressFree
+	,cuMemGetAllocationGranularity
+	,cuMemAddressReserve
+	,cuMemCreate
+	,cuMemExportToShareableHandle
+	,cuMemMap
+	,cuMemRelease
+	,cuMemSetAccess
+	,cuMemImportFromShareableHandle
+	,cuLaunchHostFunc
+	,cuDestroyExternalSemaphore
+	,cuImportExternalSemaphore
+	,cuSignalExternalSemaphoresAsync
+	,cuWaitExternalSemaphoresAsync
+	,cuLogsRegisterCallback
+);
+
+NBL_SYSTEM_DECLARE_DYNAMIC_FUNCTION_CALLER_CLASS(NVRTC,LibLoader,
+	nvrtcGetErrorString,
+	nvrtcVersion,
+	nvrtcAddNameExpression,
+	nvrtcCompileProgram,
+	nvrtcCreateProgram,
+	nvrtcDestroyProgram,
+	nvrtcGetLoweredName,
+	nvrtcGetPTX,
+	nvrtcGetPTXSize,
+	nvrtcGetProgramLog,
+	nvrtcGetProgramLogSize
+);
+
+struct SCUDADeviceInfo // native (CUDA SDK typed) per-device record; distinct from the UUID-only CCUDAHandler::SCUDADeviceInfo
+{
+	CUdevice handle = {}; // CUDA driver device handle, value-initialised by default
+	CUuuid uuid = {}; // device UUID kept in the driver's own type
+	int attributes[CU_DEVICE_ATTRIBUTE_MAX] = {}; // all zero by default; presumably indexed by CUdevice_attribute -- confirm where it is filled
+};
+
+const CUDA& getCUDAFunctionTable(const CCUDAHandler& handler);
+const NVRTC& getNVRTCFunctionTable(const CCUDAHandler& handler);
+
+bool defaultHandleResult(CUresult result, const system::logger_opt_ptr& logger);
+bool defaultHandleResult(const CCUDAHandler& handler, CUresult result);
+bool defaultHandleResult(const CCUDAHandler& handler, nvrtcResult result);
+
+template<typename T>
+T* cast_CUDA_ptr(CUdeviceptr ptr) { return reinterpret_cast<T*>(ptr); } // plain reinterpret_cast of the CUdeviceptr value to T*; no checks of any kind
+
+const core::vector<SCUDADeviceInfo>& getAvailableDevices(const CCUDAHandler& handler);
+
+nvrtcResult createProgram(CCUDAHandler& handler, nvrtcProgram* prog, std::string&& source, const char* name, const int headerCount=0, const char* const* headerContents=nullptr, const char* const* includeNames=nullptr);
+inline nvrtcResult createProgram(CCUDAHandler& handler, nvrtcProgram* prog, const char* source, const char* name, const int headerCount=0, const char* const* headerContents=nullptr, const char* const* includeNames=nullptr)
+{
+	return createProgram(handler,prog,std::string(source),name,headerCount,headerContents,includeNames);
+}
+nvrtcResult createProgram(CCUDAHandler& handler, nvrtcProgram* prog, system::IFile* file, const int headerCount=0, const char* const* headerContents=nullptr, const char* const* includeNames=nullptr);
+nvrtcResult compileProgram(const CCUDAHandler& handler, nvrtcProgram prog, core::SRange<const char* const> options);
+nvrtcResult getProgramLog(const CCUDAHandler& handler, nvrtcProgram prog, std::string& log);
+
+struct ptx_and_nvrtcResult_t // return type of getPTX() and the compileDirectlyToPTX() overloads declared in this header
+{
+	core::smart_refctd_ptr<asset::ICPUBuffer> ptx; // presumably null whenever result is not NVRTC_SUCCESS -- confirm in the .cpp
+	nvrtcResult result; // no default member initialiser: whoever constructs this must set it explicitly
+};
+
+ptx_and_nvrtcResult_t getPTX(const CCUDAHandler& handler, nvrtcProgram prog);
+ptx_and_nvrtcResult_t compileDirectlyToPTX(
+	CCUDAHandler& handler, std::string&& source, const char* filename, core::SRange<const char* const> nvrtcOptions,
+	const int headerCount=0, const char* const* headerContents=nullptr, const char* const* includeNames=nullptr,
+	std::string* log=nullptr
+);
+inline ptx_and_nvrtcResult_t compileDirectlyToPTX(
+	CCUDAHandler& handler, const char* source, const char* filename, core::SRange<const char* const> nvrtcOptions,
+	const int headerCount=0, const char* const* headerContents=nullptr, const char* const* includeNames=nullptr,
+	std::string* log=nullptr
+)
+{
+	return compileDirectlyToPTX(handler,std::string(source),filename,nvrtcOptions,headerCount,headerContents,includeNames,log);
+}
+ptx_and_nvrtcResult_t compileDirectlyToPTX(
+	CCUDAHandler& handler, system::IFile* file, core::SRange<const char* const> nvrtcOptions,
+	const int headerCount=0, const char* const* headerContents=nullptr, const char* const* includeNames=nullptr,
+	std::string* log=nullptr
+);
+
+CUdevice getInternalObject(const CCUDADevice& device);
+CUcontext getContext(const CCUDADevice& device);
+size_t roundToGranularity(const CCUDADevice& device, CUmemLocationType location, size_t size);
+CUdeviceptr getDeviceptr(const CCUDAExportableMemory& memory);
+CUexternalMemory getInternalObject(const CCUDAImportedMemory& memory);
+CUresult getMappedBuffer(const CCUDAImportedMemory& memory, CUdeviceptr* mappedBuffer);
+CUexternalSemaphore getInternalObject(const CCUDAImportedSemaphore& semaphore);
+
+}
+
+#define ASSERT_CUDA_SUCCESS(expr, handler) /* evaluates expr exactly once; handler must be dereferenceable to a CCUDAHandler */ \
+	do { \
+		const auto nbl_assertCudaSuccess_result_ = (expr); /* macro-private name so a caller variable called cudaResult can appear in expr */ \
+		if (!nbl::video::cuda_native::defaultHandleResult(*(handler), nbl_assertCudaSuccess_result_)) { \
+			assert(false); /* compiled out under NDEBUG; the defaultHandleResult call above still runs */ \
+		} \
+	} while(0)
+
+#endif
diff --git a/src/nbl/CMakeLists.txt b/src/nbl/CMakeLists.txt
index 6c3ab2606d..ecf7f555c3 100644
--- a/src/nbl/CMakeLists.txt
+++ b/src/nbl/CMakeLists.txt
@@ -798,6 +798,20 @@ if(DEFINED NBL_EXT_CUDA_INTEROP_LIB AND TARGET ${NBL_EXT_CUDA_INTEROP_LIB})
 			COMPONENT Libraries
 		)
 	endif()
+
+	if(DEFINED NBL_EXT_CUDA_INTEROP_NATIVE_LIB AND TARGET ${NBL_EXT_CUDA_INTEROP_NATIVE_LIB}) # only when the variable names a target that was actually created
+		if(NBL_ENABLE_CONFIG_INSTALL AND NOT NBL_STATIC_BUILD) # install/export only for config-install, non-static builds
+			install(TARGETS ${NBL_EXT_CUDA_INTEROP_NATIVE_LIB} # registers the native interface target in its own export set
+				EXPORT NablaCUDAInteropNativeExportTargets
+				COMPONENT Libraries
+			)
+			install(EXPORT NablaCUDAInteropNativeExportTargets # writes the export file; imported names get the Nabla:: prefix
+				NAMESPACE Nabla::
+				DESTINATION cmake
+				COMPONENT Libraries
+			)
+		endif()
+	endif()
 endif()
 
 if(TARGET ${NBL_EXT_FULL_SCREEN_TRIANGLE_LIB})
diff --git a/src/nbl/ext/CMakeLists.txt b/src/nbl/ext/CMakeLists.txt
index 1f815413e8..59ae49285e 100644
--- a/src/nbl/ext/CMakeLists.txt
+++ b/src/nbl/ext/CMakeLists.txt
@@ -48,6 +48,10 @@ if (NBL_COMPILE_WITH_CUDA)
         ${NBL_EXT_CUDA_INTEROP_LIB}
         PARENT_SCOPE
     )
+    set(NBL_EXT_CUDA_INTEROP_NATIVE_LIB # re-export the native target name one directory scope up
+        ${NBL_EXT_CUDA_INTEROP_NATIVE_LIB}
+        PARENT_SCOPE
+    )
 endif()
 
 if (NBL_BUILD_IMGUI)
diff --git a/src/nbl/ext/CUDAInterop/CCUDADevice.cpp b/src/nbl/ext/CUDAInterop/CCUDADevice.cpp
index aa06c6e7bf..5f59545173 100644
--- a/src/nbl/ext/CUDAInterop/CCUDADevice.cpp
+++ b/src/nbl/ext/CUDAInterop/CCUDADevice.cpp
@@ -1,15 +1,12 @@
 // Copyright (C) 2018-2020 - DevSH Graphics Programming Sp. z O.O.
 // This file is part of the "Nabla Engine".
 // For conditions of distribution and use, see copyright notice in nabla.h
-#include "nbl/ext/CUDAInterop/CCUDADevice.h"
-#include "nbl/ext/CUDAInterop/CCUDAHandler.h"
+#include "CUDAInteropNativeState.hpp"
 
 #ifdef _WIN32
 #include <winternl.h>
 #endif
 
-#include "nbl/ext/CUDAInterop/CCUDAImportedMemory.h"
-
 #ifdef _NBL_COMPILE_WITH_CUDA_
 namespace nbl::video
 {
@@ -18,28 +15,27 @@ CCUDADevice::CCUDADevice(
 	core::smart_refctd_ptr<CVulkanConnection>&& vulkanConnection, 
 	IPhysicalDevice* const vulkanDevice, 
 	const E_VIRTUAL_ARCHITECTURE virtualArchitecture,
-	CUdevice device,
+	std::unique_ptr<SNativeState>&& nativeState,
 	core::smart_refctd_ptr<CCUDAHandler>&& handler) : 
 	m_logger(vulkanDevice->getDebugCallback()->getLogger()),
   m_defaultCompileOptions(), 
   m_vulkanConnection(std::move(vulkanConnection)), 
   m_physicalDevice(vulkanDevice), 
   m_virtualArchitecture(virtualArchitecture),
-	m_handle(device),
 	m_handler(std::move(handler)),
-	m_allocationGranularity{}
+	m_native(std::move(nativeState))
 {
 	m_defaultCompileOptions.push_back("--std=c++14");
 	m_defaultCompileOptions.push_back(virtualArchCompileOption[m_virtualArchitecture]);
 	m_defaultCompileOptions.push_back("-dc");
 	m_defaultCompileOptions.push_back("-use_fast_math");
 
-  const auto& cu = m_handler->getCUDAFunctionTable();
+  const auto& cu = cuda_native::getCUDAFunctionTable(*m_handler);
 	
-	ASSERT_CUDA_SUCCESS(cu.pcuCtxCreate_v4(&m_context, nullptr, 0, m_handle), m_handler);
-	ASSERT_CUDA_SUCCESS(cu.pcuCtxSetCurrent(m_context), m_handler);
+	ASSERT_CUDA_SUCCESS(cu.pcuCtxCreate_v4(&m_native->context, nullptr, 0, m_native->handle), m_handler);
+	ASSERT_CUDA_SUCCESS(cu.pcuCtxSetCurrent(m_native->context), m_handler);
 
-	for (uint32_t locationType = 0; locationType < m_allocationGranularity.size(); ++locationType)
+	for (uint32_t locationType = 0; locationType < m_native->allocationGranularity.size(); ++locationType)
 	{
 	
     #ifdef _WIN32
@@ -50,24 +46,47 @@ CCUDADevice::CCUDADevice(
 
 	  const auto prop = CUmemAllocationProp{
       .type = CU_MEM_ALLOCATION_TYPE_PINNED,
-      .requestedHandleTypes = ALLOCATION_HANDLE_TYPE,
-      .location = { .type = static_cast<CUmemLocationType>(locationType), .id = m_handle },
+      .requestedHandleTypes = cuda_native::getAllocationHandleType(),
+      .location = { .type = static_cast<CUmemLocationType>(locationType), .id = m_native->handle },
   #ifdef _WIN32
       .win32HandleMetaData = &metadata,
   #endif
     };
-		ASSERT_CUDA_SUCCESS(cu.pcuMemGetAllocationGranularity(&m_allocationGranularity[locationType], &prop, CU_MEM_ALLOC_GRANULARITY_MINIMUM), m_handler);
+		ASSERT_CUDA_SUCCESS(cu.pcuMemGetAllocationGranularity(&m_native->allocationGranularity[locationType], &prop, CU_MEM_ALLOC_GRANULARITY_MINIMUM), m_handler);
 	}
 }
 
-size_t CCUDADevice::roundToGranularity(CUmemLocationType location, size_t size) const
+size_t CCUDADevice::roundToGranularity(ECUDAMemoryLocation location, size_t size) const
+{
+	return cuda_native::roundToGranularity(*this,cuda_native::toNative(location),size);
+}
+
+namespace cuda_native
+{
+
+CUdevice getInternalObject(const CCUDADevice& device)
+{
+	return SAccess::native(device).handle;
+}
+
+CUcontext getContext(const CCUDADevice& device)
 {
-	return ((size - 1) / m_allocationGranularity[location] + 1) * m_allocationGranularity[location];
+	return SAccess::native(device).context;
+}
+
+size_t roundToGranularity(const CCUDADevice& device, CUmemLocationType location, size_t size) // rounds `size` up to a multiple of the granularity cached for `location`
+{
+	const auto& granularity = SAccess::native(device).allocationGranularity[location]; // filled in by the CCUDADevice constructor; may still be 0 if that query failed
+	return (size && granularity) ? ((size - 1) / granularity + 1) * granularity : size; // a zero size or zero granularity is returned as-is: avoids the `size - 1` underflow and a division by zero
+}
+
 }
 
-CUresult CCUDADevice::reserveAddressAndMapMemory(CUdeviceptr* outPtr, size_t size, size_t alignment, CUmemLocationType location, CUmemGenericAllocationHandle memory) const
+static CUresult reserveAddressAndMapMemory(const CCUDADevice& device, CUdeviceptr* outPtr, size_t size, size_t alignment, CUmemLocationType location, CUmemGenericAllocationHandle memory)
 {
-	const auto& cu = m_handler->getCUDAFunctionTable();
+	const auto handler = device.getHandler();
+	const auto& native = cuda_native::SAccess::native(device);
+	const auto& cu = cuda_native::getCUDAFunctionTable(*handler);
 	
 	CUdeviceptr ptr = 0;
 	if (const auto err = cu.pcuMemAddressReserve(&ptr, size, alignment, 0, 0); CUDA_SUCCESS != err)
@@ -75,19 +94,19 @@ CUresult CCUDADevice::reserveAddressAndMapMemory(CUdeviceptr* outPtr, size_t siz
 
 	if (const auto err = cu.pcuMemMap(ptr, size, 0, memory, 0); CUDA_SUCCESS != err)
 	{
-		ASSERT_CUDA_SUCCESS(cu.pcuMemAddressFree(ptr, size), m_handler);
+		ASSERT_CUDA_SUCCESS(cu.pcuMemAddressFree(ptr, size), handler);
 		return err;
 	}
 	
 	CUmemAccessDesc accessDesc = {
-		.location = { .type = location, .id = m_handle },
+		.location = { .type = location, .id = native.handle },
 		.flags = CU_MEM_ACCESS_FLAGS_PROT_READWRITE,
 	};
 
 	if (auto err = cu.pcuMemSetAccess(ptr, size, &accessDesc, 1); CUDA_SUCCESS != err)
 	{
-		ASSERT_CUDA_SUCCESS(cu.pcuMemUnmap(ptr, size), m_handler);
-		ASSERT_CUDA_SUCCESS(cu.pcuMemAddressFree(ptr, size), m_handler);
+		ASSERT_CUDA_SUCCESS(cu.pcuMemUnmap(ptr, size), handler);
+		ASSERT_CUDA_SUCCESS(cu.pcuMemAddressFree(ptr, size), handler);
 		return err;
 	}
 
@@ -100,7 +119,8 @@ core::smart_refctd_ptr<CCUDAExportableMemory> CCUDADevice::createExportableMemor
 {
 	CCUDAExportableMemory::SCachedCreationParams params = { inParams };
 
-	auto& cu = m_handler->getCUDAFunctionTable();
+	auto& cu = cuda_native::getCUDAFunctionTable(*m_handler);
+	const auto nativeLocation = cuda_native::toNative(params.location);
 	
 #ifdef _WIN32
 	OBJECT_ATTRIBUTES metadata = {
@@ -110,14 +130,15 @@ core::smart_refctd_ptr<CCUDAExportableMemory> CCUDADevice::createExportableMemor
 
 	 const auto prop = CUmemAllocationProp{
 		.type = CU_MEM_ALLOCATION_TYPE_PINNED,
-		.requestedHandleTypes = ALLOCATION_HANDLE_TYPE,
-		.location = { .type = params.location, .id = m_handle },
+		.requestedHandleTypes = cuda_native::getAllocationHandleType(),
+		.location = { .type = nativeLocation, .id = m_native->handle },
 #ifdef _WIN32
 		.win32HandleMetaData = &metadata,
 #endif
 	};
 
 	params.granularSize = roundToGranularity(params.location, params.size);
+	auto nativeState = std::make_unique<CCUDAExportableMemory::SNativeState>();
 
 	CUmemGenericAllocationHandle mem;
 	if(auto err = cu.pcuMemCreate(&mem, params.granularSize, &prop, 0); CUDA_SUCCESS != err)
@@ -133,7 +154,7 @@ core::smart_refctd_ptr<CCUDAExportableMemory> CCUDADevice::createExportableMemor
 		return nullptr;
 	}
 
-	if (const auto err = reserveAddressAndMapMemory(&params.ptr, params.granularSize, params.alignment, params.location, mem); CUDA_SUCCESS != err)
+	if (const auto err = reserveAddressAndMapMemory(*this,&nativeState->ptr, params.granularSize, params.alignment, nativeLocation, mem); CUDA_SUCCESS != err)
 	{
 		m_logger.log("Fail to reserve address and map memory!", system::ILogger::ELL_ERROR);
 
@@ -152,12 +173,12 @@ core::smart_refctd_ptr<CCUDAExportableMemory> CCUDADevice::createExportableMemor
 		return nullptr;
 	}
 	
-	return core::make_smart_refctd_ptr<CCUDAExportableMemory>(core::smart_refctd_ptr<CCUDADevice>(this), std::move(params));
+	return core::make_smart_refctd_ptr<CCUDAExportableMemory>(core::smart_refctd_ptr<CCUDADevice>(this), std::move(params), std::move(nativeState));
 }
 
 core::smart_refctd_ptr<CCUDAImportedMemory> CCUDADevice::importExternalMemory(core::smart_refctd_ptr<IDeviceMemoryAllocation>&& mem)
 {
-	const auto& cu = m_handler->getCUDAFunctionTable();
+	const auto& cu = cuda_native::getCUDAFunctionTable(*m_handler);
 	const auto handleType = mem->getCreationParams().externalHandleType;
 
 	if (!handleType) return nullptr;
@@ -180,12 +201,12 @@ core::smart_refctd_ptr<CCUDAImportedMemory> CCUDADevice::importExternalMemory(co
 		m_logger.log("Fail to import external memory into CUDA!", system::ILogger::ELL_ERROR);
 		return nullptr;
 	}
-	return core::make_smart_refctd_ptr<CCUDAImportedMemory>(core::smart_refctd_ptr<CCUDADevice>(this), std::move(mem), cuExtMem);
+	return core::make_smart_refctd_ptr<CCUDAImportedMemory>(core::smart_refctd_ptr<CCUDADevice>(this), std::move(mem), std::make_unique<CCUDAImportedMemory::SNativeState>(cuExtMem));
 }
 
 core::smart_refctd_ptr<CCUDAImportedSemaphore> CCUDADevice::importExternalSemaphore(core::smart_refctd_ptr<ISemaphore>&& sema)
 {
-	auto& cu = m_handler->getCUDAFunctionTable();
+	auto& cu = cuda_native::getCUDAFunctionTable(*m_handler);
 	auto handleType = sema->getCreationParams().externalHandleTypes.value;
 
 	if (!handleType)
@@ -210,12 +231,12 @@ core::smart_refctd_ptr<CCUDAImportedSemaphore> CCUDADevice::importExternalSemaph
 		return nullptr;
 	}
 	
-	return core::make_smart_refctd_ptr<CCUDAImportedSemaphore>(core::smart_refctd_ptr<CCUDADevice>(this), std::move(sema), cusema);
+	return core::make_smart_refctd_ptr<CCUDAImportedSemaphore>(core::smart_refctd_ptr<CCUDADevice>(this), std::move(sema), std::make_unique<CCUDAImportedSemaphore::SNativeState>(cusema));
 }
 
 CCUDADevice::~CCUDADevice()
 {
-	ASSERT_CUDA_SUCCESS(m_handler->getCUDAFunctionTable().pcuCtxDestroy_v2(m_context), m_handler);
+	ASSERT_CUDA_SUCCESS(cuda_native::getCUDAFunctionTable(*m_handler).pcuCtxDestroy_v2(m_native->context), m_handler);
 }
 
 }
diff --git a/src/nbl/ext/CUDAInterop/CCUDAExportableMemory.cpp b/src/nbl/ext/CUDAInterop/CCUDAExportableMemory.cpp
index 65afdca660..94d18c40bb 100644
--- a/src/nbl/ext/CUDAInterop/CCUDAExportableMemory.cpp
+++ b/src/nbl/ext/CUDAInterop/CCUDAExportableMemory.cpp
@@ -2,14 +2,18 @@
 // This file is part of the "Nabla Engine".
 // For conditions of distribution and use, see copyright notice in nabla.h
 
-#include "nbl/ext/CUDAInterop/CCUDAExportableMemory.h"
-#include "nbl/ext/CUDAInterop/CCUDADevice.h"
-#include "nbl/ext/CUDAInterop/CCUDAHandler.h"
+#include "CUDAInteropNativeState.hpp"
 
 #ifdef _NBL_COMPILE_WITH_CUDA_
 namespace nbl::video
 {
 
+CCUDAExportableMemory::CCUDAExportableMemory(core::smart_refctd_ptr<CCUDADevice> device, SCachedCreationParams&& params, std::unique_ptr<SNativeState>&& nativeState)
+	: m_device(std::move(device))
+	, m_params(std::move(params))
+	, m_native(std::move(nativeState))
+{}
+
 core::smart_refctd_ptr<IDeviceMemoryAllocation> CCUDAExportableMemory::exportAsMemory(ILogicalDevice* device, IDeviceMemoryBacked* dedication) const
 {
 	auto pd = device->getPhysicalDevice();
@@ -18,10 +22,10 @@ core::smart_refctd_ptr<IDeviceMemoryAllocation> CCUDAExportableMemory::exportAsM
 
 	switch (m_params.location)
 	{
-    case CU_MEM_LOCATION_TYPE_DEVICE: memoryTypeBits &=  vram; break;
-    case CU_MEM_LOCATION_TYPE_HOST_NUMA:
-    case CU_MEM_LOCATION_TYPE_HOST_NUMA_CURRENT:
-    case CU_MEM_LOCATION_TYPE_HOST:   memoryTypeBits &= ~vram; break;
+    case ECUDAMemoryLocation::DEVICE: memoryTypeBits &=  vram; break;
+    case ECUDAMemoryLocation::HOST_NUMA:
+    case ECUDAMemoryLocation::HOST_NUMA_CURRENT:
+    case ECUDAMemoryLocation::HOST:   memoryTypeBits &= ~vram; break;
     default: break;
 	}
 
@@ -40,15 +44,25 @@ core::smart_refctd_ptr<IDeviceMemoryAllocation> CCUDAExportableMemory::exportAsM
 
 CCUDAExportableMemory::~CCUDAExportableMemory()
 {
-	const auto& cu = m_device->getHandler()->getCUDAFunctionTable();
+	const auto& cu = cuda_native::getCUDAFunctionTable(*m_device->getHandler());
 
-  ASSERT_CUDA_SUCCESS(cu.pcuMemUnmap(m_params.ptr, m_params.granularSize), m_device->getHandler());
+  ASSERT_CUDA_SUCCESS(cu.pcuMemUnmap(m_native->ptr, m_params.granularSize), m_device->getHandler());
 
-	ASSERT_CUDA_SUCCESS(cu.pcuMemAddressFree(m_params.ptr, m_params.granularSize), m_device->getHandler());
+	ASSERT_CUDA_SUCCESS(cu.pcuMemAddressFree(m_native->ptr, m_params.granularSize), m_device->getHandler());
 
   bool closeSucceed = CloseExternalHandle(m_params.externalHandle);
 	assert(closeSucceed);
 
+}
+
+namespace cuda_native
+{
+
+CUdeviceptr getDeviceptr(const CCUDAExportableMemory& memory)
+{
+	return SAccess::native(memory).ptr;
+}
+
 }
 }
 
diff --git a/src/nbl/ext/CUDAInterop/CCUDAHandler.cpp b/src/nbl/ext/CUDAInterop/CCUDAHandler.cpp
index 748a88d1a1..49e36083d4 100644
--- a/src/nbl/ext/CUDAInterop/CCUDAHandler.cpp
+++ b/src/nbl/ext/CUDAInterop/CCUDAHandler.cpp
@@ -2,7 +2,7 @@
 // This file is part of the "Nabla Engine".
 // For conditions of distribution and use, see copyright notice in nabla.h
 
-#include "nbl/ext/CUDAInterop/CCUDAHandler.h"
+#include "CUDAInteropNativeState.hpp"
 #include "nbl/system/CFileView.h"
 
 #ifdef _NBL_COMPILE_WITH_CUDA_
@@ -13,13 +13,11 @@ namespace nbl::video
 {
 	
 CCUDAHandler::CCUDAHandler(
-	CUDA&& _cuda, 
-	NVRTC&& _nvrtc, 
+	std::unique_ptr<SNativeState>&& nativeState,
 	core::vector<core::smart_refctd_ptr<system::IFile>>&& _headers, 
 	core::smart_refctd_ptr<system::ILogger>&& _logger,
 	int _version)
-	: m_cuda(std::move(_cuda))
-	, m_nvrtc(std::move(_nvrtc))
+	: m_native(std::move(nativeState))
 	, m_headers(std::move(_headers))
 	, m_logger(std::move(_logger))
 	, m_version(_version)
@@ -32,29 +30,38 @@ CCUDAHandler::CCUDAHandler(
 	}
 
 	int deviceCount = 0;
-	if (m_cuda.pcuDeviceGetCount(&deviceCount) != CUDA_SUCCESS || deviceCount <= 0)
+	if (m_native->cuda.pcuDeviceGetCount(&deviceCount) != CUDA_SUCCESS || deviceCount <= 0)
 		return;
 
 	for (int device_i = 0; device_i < deviceCount; device_i++)
 	{
 		CUdevice handle = -1;
-		if (m_cuda.pcuDeviceGet(&handle, device_i) != CUDA_SUCCESS || handle < 0)
+		if (m_native->cuda.pcuDeviceGet(&handle, device_i) != CUDA_SUCCESS || handle < 0)
 			continue;
 
 		CUuuid uuid = {};
-		if (m_cuda.pcuDeviceGetUuid_v2(&uuid, handle) != CUDA_SUCCESS)
+		if (m_native->cuda.pcuDeviceGetUuid_v2(&uuid, handle) != CUDA_SUCCESS)
 			continue;
 
-		m_availableDevices.emplace_back(handle, uuid);
+		auto& nativeDevice = m_native->availableDevices.emplace_back();
+		nativeDevice.handle = handle;
+		nativeDevice.uuid = uuid;
+		auto& cleanDevice = m_availableDevices.emplace_back();
+		memcpy(cleanDevice.uuid.data(),&uuid,cleanDevice.uuid.size());
 
-		int* attributes = m_availableDevices.back().attributes;
+		int* attributes = nativeDevice.attributes;
 		for (int i = 0; i < CU_DEVICE_ATTRIBUTE_MAX; i++)
-			m_cuda.pcuDeviceGetAttribute(attributes + i, static_cast<CUdevice_attribute>(i), handle);
+			m_native->cuda.pcuDeviceGetAttribute(attributes + i, static_cast<CUdevice_attribute>(i), handle);
 
 	}
 }
 
-bool CCUDAHandler::defaultHandleResult(CUresult result, const system::logger_opt_ptr& logger)
+CCUDAHandler::~CCUDAHandler() = default;
+
+namespace cuda_native
+{
+
+bool defaultHandleResult(CUresult result, const system::logger_opt_ptr& logger)
 {
 	switch (result)
 	{
@@ -420,7 +427,12 @@ bool CCUDAHandler::defaultHandleResult(CUresult result, const system::logger_opt
 	return false;
 }
 
-bool CCUDAHandler::defaultHandleResult(nvrtcResult result)
+bool defaultHandleResult(const CCUDAHandler& handler, CUresult result)
+{
+	return defaultHandleResult(result,SAccess::logger(handler));
+}
+
+bool defaultHandleResult(const CCUDAHandler& handler, nvrtcResult result)
 {
 	switch (result)
 	{
@@ -428,19 +440,21 @@ bool CCUDAHandler::defaultHandleResult(nvrtcResult result)
 			return true;
 			break;
 		default:
-			if (m_nvrtc.pnvrtcGetErrorString)
-				m_logger.log("%s\n",system::ILogger::ELL_ERROR,m_nvrtc.pnvrtcGetErrorString(result));
+			if (SAccess::native(handler).nvrtc.pnvrtcGetErrorString)
+				SAccess::logger(handler).log("%s\n",system::ILogger::ELL_ERROR,SAccess::native(handler).nvrtc.pnvrtcGetErrorString(result));
 			else
-				m_logger.log(R"===(CudaHandler: `pnvrtcGetErrorString` is nullptr, the nvrtc library probably not found on the system.\n)===",system::ILogger::ELL_ERROR);
+				SAccess::logger(handler).log(R"===(CudaHandler: `pnvrtcGetErrorString` is nullptr, the nvrtc library probably not found on the system.\n)===",system::ILogger::ELL_ERROR);
 			break;
 	}
 	_NBL_DEBUG_BREAK_IF(true);
 	return false;
 }
 
+}
+
 core::smart_refctd_ptr<CCUDAHandler> CCUDAHandler::create(system::ISystem* system, core::smart_refctd_ptr<system::ILogger>&& _logger)
 {
-	CUDA cuda = CUDA(
+	cuda_native::CUDA cuda = cuda_native::CUDA(
 		#if defined(_NBL_WINDOWS_API_)
 			"nvcuda"
 		#elif defined(_NBL_POSIX_API_)
@@ -450,7 +464,7 @@ core::smart_refctd_ptr<CCUDAHandler> CCUDAHandler::create(system::ISystem* syste
 		#endif
 	);
 	
-	NVRTC nvrtc = {};
+	cuda_native::NVRTC nvrtc = {};
 	#if defined(_NBL_WINDOWS_API_)
 	// Perpetual TODO: any new CUDA releases we need to account for?
 	// Version List: https://developer.nvidia.com/cuda-toolkit-archive
@@ -468,7 +482,7 @@ core::smart_refctd_ptr<CCUDAHandler> CCUDAHandler::create(system::ISystem* syste
 		{
 			std::string path(*verpath);
 			path += *suffix;
-			nvrtc = NVRTC(path.c_str());
+			nvrtc = cuda_native::NVRTC(path.c_str());
 			if (nvrtc.pnvrtcVersion)
 				break;
 		}
@@ -476,7 +490,7 @@ core::smart_refctd_ptr<CCUDAHandler> CCUDAHandler::create(system::ISystem* syste
 			break;
 	}
 	#elif defined(_NBL_POSIX_API_)
-	nvrtc = NVRTC("nvrtc");
+	nvrtc = cuda_native::NVRTC("nvrtc");
 	//nvrtc_builtins = NVRTC("nvrtc-builtins");
 	#else
 	#error "Unsuported Platform"
@@ -526,10 +540,28 @@ core::smart_refctd_ptr<CCUDAHandler> CCUDAHandler::create(system::ISystem* syste
 		));
 	}
 
-	return core::make_smart_refctd_ptr<CCUDAHandler>(std::move(cuda),std::move(nvrtc), std::move(headers), std::move(_logger), cudaVersion);
+	return core::make_smart_refctd_ptr<CCUDAHandler>(std::make_unique<SNativeState>(std::move(cuda),std::move(nvrtc)), std::move(headers), std::move(_logger), cudaVersion);
+}
+
+namespace cuda_native
+{
+
+const CUDA& getCUDAFunctionTable(const CCUDAHandler& handler)
+{
+	return SAccess::native(handler).cuda;
+}
+
+const NVRTC& getNVRTCFunctionTable(const CCUDAHandler& handler)
+{
+	return SAccess::native(handler).nvrtc;
 }
 
-nvrtcResult CCUDAHandler::createProgram(nvrtcProgram* prog, std::string&& source, const char* name, const int headerCount, const char* const* headerContents, const char* const* includeNames)
+const core::vector<SCUDADeviceInfo>& getAvailableDevices(const CCUDAHandler& handler)
+{
+	return SAccess::native(handler).availableDevices;
+}
+
+nvrtcResult createProgram(CCUDAHandler& handler, nvrtcProgram* prog, std::string&& source, const char* name, const int headerCount, const char* const* headerContents, const char* const* includeNames)
 {
 #if defined(_NBL_WINDOWS_API_)
 	source.insert(0ull,"#ifndef _WIN64\n#define _WIN64\n#endif\n");
@@ -538,26 +570,43 @@ nvrtcResult CCUDAHandler::createProgram(nvrtcProgram* prog, std::string&& source
 #else
 #error "Unsuported Platform"
 #endif
-	return m_nvrtc.pnvrtcCreateProgram(prog,source.c_str(),name,headerCount,headerContents,includeNames);
+	return SAccess::native(handler).nvrtc.pnvrtcCreateProgram(prog,source.c_str(),name,headerCount,headerContents,includeNames);
+}
+
+nvrtcResult createProgram(CCUDAHandler& handler, nvrtcProgram* prog, system::IFile* file, const int headerCount, const char* const* headerContents, const char* const* includeNames)
+{
+	// scratch buffer one char larger than the file; trimmed below to the bytes actually read
+	std::string contents(file->getSize()+1u,'0');
+
+	system::IFile::success_t readResult;
+	file->read(readResult,contents.data(),0u,file->getSize());
+	contents.resize(readResult.getBytesProcessed());
+	const std::string sourceName = file->getFileName().string(); // the file's own name becomes the program name
+	return createProgram(handler,prog,std::move(contents),sourceName.c_str(),headerCount,headerContents,includeNames);
+}
+
+nvrtcResult compileProgram(const CCUDAHandler& handler, nvrtcProgram prog, core::SRange<const char* const> options)
+{
+	return SAccess::native(handler).nvrtc.pnvrtcCompileProgram(prog,options.size(),options.begin());
 }
 
-nvrtcResult CCUDAHandler::getProgramLog(nvrtcProgram prog, std::string& log)
+nvrtcResult getProgramLog(const CCUDAHandler& handler, nvrtcProgram prog, std::string& log)
 {
 	size_t _size = 0ull;
-	nvrtcResult sizeRes = m_nvrtc.pnvrtcGetProgramLogSize(prog, &_size);
+	nvrtcResult sizeRes = SAccess::native(handler).nvrtc.pnvrtcGetProgramLogSize(prog, &_size);
 	if (sizeRes != NVRTC_SUCCESS)
 		return sizeRes;
 	if (_size == 0ull)
 		return NVRTC_ERROR_INVALID_INPUT;
 
 	log.resize(_size);
-	return m_nvrtc.pnvrtcGetProgramLog(prog,log.data());
+	return SAccess::native(handler).nvrtc.pnvrtcGetProgramLog(prog,log.data());
 }
 
-CCUDAHandler::ptx_and_nvrtcResult_t CCUDAHandler::getPTX(nvrtcProgram prog)
+ptx_and_nvrtcResult_t getPTX(const CCUDAHandler& handler, nvrtcProgram prog)
 {
 	size_t _size = 0ull;
-	nvrtcResult sizeRes = m_nvrtc.pnvrtcGetPTXSize(prog,&_size);
+	nvrtcResult sizeRes = SAccess::native(handler).nvrtc.pnvrtcGetPTXSize(prog,&_size);
 	if (sizeRes!=NVRTC_SUCCESS)
 		return {nullptr,sizeRes};
 	if (_size==0ull)
@@ -567,7 +616,57 @@ CCUDAHandler::ptx_and_nvrtcResult_t CCUDAHandler::getPTX(nvrtcProgram prog)
 	ptxParams.size = _size;
 	auto ptx = asset::ICPUBuffer::create(std::move(ptxParams));
 	auto ptxPtr = static_cast<char*>(ptx->getPointer());
-	return {std::move(ptx),m_nvrtc.pnvrtcGetPTX(prog,ptxPtr)};
+	return {std::move(ptx),SAccess::native(handler).nvrtc.pnvrtcGetPTX(prog,ptxPtr)};
+}
+
+static ptx_and_nvrtcResult_t compileDirectlyToPTX_impl(CCUDAHandler& handler, nvrtcResult result, nvrtcProgram program, core::SRange<const char* const> nvrtcOptions, std::string* log)
+{
+	// `result` arrives holding the outcome of program creation; there is nothing to compile if that failed
+	if (result!=NVRTC_SUCCESS)
+		return {nullptr,result};
+
+	result = compileProgram(handler,program,nvrtcOptions);
+	if (log) // the log is fetched whether or not compilation succeeded
+		getProgramLog(handler,program,*log);
+
+	// PTX is only requested from a program that compiled
+	return result==NVRTC_SUCCESS ? getPTX(handler,program) : ptx_and_nvrtcResult_t{nullptr,result};
+}
+
+ptx_and_nvrtcResult_t compileDirectlyToPTX(
+	CCUDAHandler& handler, std::string&& source, const char* filename, core::SRange<const char* const> nvrtcOptions,
+	const int headerCount, const char* const* headerContents, const char* const* includeNames,
+	std::string* log) // one-shot helper: create the NVRTC program, compile it, return its PTX; the compile log goes to `*log` when given
+{
+	nvrtcProgram program = nullptr;
+	nvrtcResult result = NVRTC_ERROR_PROGRAM_CREATION_FAILURE;
+	auto cleanup = core::makeRAIIExiter([&]() -> void
+	{
+		if (program) // always release: the handle never leaves this function, and `result` only reflects creation, not compilation
+			SAccess::native(handler).nvrtc.pnvrtcDestroyProgram(&program);
+	});
+
+	result = createProgram(handler,&program,std::move(source),filename,headerCount,headerContents,includeNames);
+	return compileDirectlyToPTX_impl(handler,result,program,nvrtcOptions,log);
+}
+
+ptx_and_nvrtcResult_t compileDirectlyToPTX(
+	CCUDAHandler& handler, system::IFile* file, core::SRange<const char* const> nvrtcOptions,
+	const int headerCount, const char* const* headerContents, const char* const* includeNames,
+	std::string* log) // one-shot helper: read `file`, create and compile the NVRTC program, return its PTX; the compile log goes to `*log` when given
+{
+	nvrtcProgram program = nullptr;
+	nvrtcResult result = NVRTC_ERROR_PROGRAM_CREATION_FAILURE;
+	auto cleanup = core::makeRAIIExiter([&]() -> void
+	{
+		if (program) // always release: the handle never leaves this function, and `result` only reflects creation, not compilation
+			SAccess::native(handler).nvrtc.pnvrtcDestroyProgram(&program);
+	});
+
+	result = createProgram(handler,&program,file,headerCount,headerContents,includeNames);
+	return compileDirectlyToPTX_impl(handler,result,program,nvrtcOptions,log);
+}
+
 }
 
 core::smart_refctd_ptr<CCUDADevice> CCUDAHandler::createDevice(core::smart_refctd_ptr<CVulkanConnection>&& vulkanConnection, IPhysicalDevice* physicalDevice)
@@ -578,7 +677,7 @@ core::smart_refctd_ptr<CCUDADevice> CCUDAHandler::createDevice(core::smart_refct
 	if (std::find(devices.begin(),devices.end(),physicalDevice)==devices.end())
 		return nullptr;
 
-	for (const auto& device : m_availableDevices)
+	for (const auto& device : m_native->availableDevices)
 	{
 		if (!memcmp(&device.uuid,&physicalDevice->getProperties().deviceUUID,VK_UUID_SIZE))
 		{
@@ -662,7 +761,7 @@ core::smart_refctd_ptr<CCUDADevice> CCUDAHandler::createDevice(core::smart_refct
 			if (arch==CCUDADevice::EVA_COUNT)
 				continue;
 
-			return core::make_smart_refctd_ptr<CCUDADevice>(std::move(vulkanConnection), physicalDevice, arch, device.handle, core::smart_refctd_ptr<CCUDAHandler>(this));
+			return core::make_smart_refctd_ptr<CCUDADevice>(std::move(vulkanConnection), physicalDevice, arch, std::make_unique<CCUDADevice::SNativeState>(device.handle), core::smart_refctd_ptr<CCUDAHandler>(this));
 		}
 	}
 	return nullptr;
diff --git a/src/nbl/ext/CUDAInterop/CCUDAImportedMemory.cpp b/src/nbl/ext/CUDAInterop/CCUDAImportedMemory.cpp
index a785bad9b9..bbc65f91ab 100644
--- a/src/nbl/ext/CUDAInterop/CCUDAImportedMemory.cpp
+++ b/src/nbl/ext/CUDAInterop/CCUDAImportedMemory.cpp
@@ -2,30 +2,44 @@
 // This file is part of the "Nabla Engine".
 // For conditions of distribution and use, see copyright notice in nabla.h
 
-#include "nbl/ext/CUDAInterop/CCUDAImportedMemory.h"
-#include "nbl/ext/CUDAInterop/CCUDADevice.h"
-#include "nbl/ext/CUDAInterop/CCUDAHandler.h"
+#include "CUDAInteropNativeState.hpp"
 
 #ifdef _NBL_COMPILE_WITH_CUDA_
 
 namespace nbl::video
 {
 
-CUresult CCUDAImportedMemory::getMappedBuffer(CUdeviceptr* mappedBuffer)
+CCUDAImportedMemory::CCUDAImportedMemory(core::smart_refctd_ptr<CCUDADevice> device, core::smart_refctd_ptr<nbl::video::IDeviceMemoryAllocation> src, std::unique_ptr<SNativeState>&& nativeState)
+	: m_device(std::move(device))
+	, m_src(std::move(src))
+	, m_native(std::move(nativeState))
+{}
+
+namespace cuda_native
+{
+
+CUexternalMemory getInternalObject(const CCUDAImportedMemory& memory)
+{
+  return SAccess::native(memory).handle;
+}
+
+CUresult getMappedBuffer(const CCUDAImportedMemory& memory, CUdeviceptr* mappedBuffer)
 {
   CUDA_EXTERNAL_MEMORY_BUFFER_DESC bufferDesc = {};
   bufferDesc.offset = 0;
-  bufferDesc.size = m_src->getAllocationSize();
+  bufferDesc.size = SAccess::source(memory)->getAllocationSize();
 
-  auto& cu = m_device->getHandler()->getCUDAFunctionTable();
-  return cu.pcuExternalMemoryGetMappedBuffer(mappedBuffer, m_handle, &bufferDesc);
+  const auto& cu = getCUDAFunctionTable(*SAccess::device(memory)->getHandler());
+  return cu.pcuExternalMemoryGetMappedBuffer(mappedBuffer, SAccess::native(memory).handle, &bufferDesc);
   
 }
 
+}
+
 CCUDAImportedMemory::~CCUDAImportedMemory()
 {
-  auto& cu = m_device->getHandler()->getCUDAFunctionTable();
-  ASSERT_CUDA_SUCCESS(cu.pcuDestroyExternalMemory(m_handle), m_device->getHandler());
+  auto& cu = cuda_native::getCUDAFunctionTable(*m_device->getHandler());
+  ASSERT_CUDA_SUCCESS(cu.pcuDestroyExternalMemory(m_native->handle), m_device->getHandler());
 }
 
 }
diff --git a/src/nbl/ext/CUDAInterop/CCUDAImportedSemaphore.cpp b/src/nbl/ext/CUDAInterop/CCUDAImportedSemaphore.cpp
index 1ca4a34190..b6e3b319f7 100644
--- a/src/nbl/ext/CUDAInterop/CCUDAImportedSemaphore.cpp
+++ b/src/nbl/ext/CUDAInterop/CCUDAImportedSemaphore.cpp
@@ -2,17 +2,31 @@
 // This file is part of the "Nabla Engine".
 // For conditions of distribution and use, see copyright notice in nabla.h
 
-#include "nbl/ext/CUDAInterop/CCUDAImportedSemaphore.h"
-#include "nbl/ext/CUDAInterop/CCUDADevice.h"
-#include "nbl/ext/CUDAInterop/CCUDAHandler.h"
+#include "CUDAInteropNativeState.hpp"
 
 #ifdef _NBL_COMPILE_WITH_CUDA_
 namespace nbl::video
 {
+CCUDAImportedSemaphore::CCUDAImportedSemaphore(core::smart_refctd_ptr<CCUDADevice> device, core::smart_refctd_ptr<ISemaphore> src, std::unique_ptr<SNativeState>&& nativeState)
+	: m_device(std::move(device))
+	, m_src(std::move(src))
+	, m_native(std::move(nativeState))
+{}
+
+namespace cuda_native
+{
+
+CUexternalSemaphore getInternalObject(const CCUDAImportedSemaphore& semaphore)
+{
+	return SAccess::native(semaphore).handle;
+}
+
+}
+
 CCUDAImportedSemaphore::~CCUDAImportedSemaphore()
 {
-	auto& cu = m_device->getHandler()->getCUDAFunctionTable();
-	ASSERT_CUDA_SUCCESS(cu.pcuDestroyExternalSemaphore(m_handle), m_device->getHandler());
+	auto& cu = cuda_native::getCUDAFunctionTable(*m_device->getHandler());
+	ASSERT_CUDA_SUCCESS(cu.pcuDestroyExternalSemaphore(m_native->handle), m_device->getHandler());
 }
 }
 
diff --git a/src/nbl/ext/CUDAInterop/CMakeLists.txt b/src/nbl/ext/CUDAInterop/CMakeLists.txt
index 7a69e62ad4..973fbb232a 100644
--- a/src/nbl/ext/CUDAInterop/CMakeLists.txt
+++ b/src/nbl/ext/CUDAInterop/CMakeLists.txt
@@ -5,6 +5,7 @@ if (NBL_COMPILE_WITH_CUDA)
 
 	set(NBL_EXT_CUDA_INTEROP_H
 		${NBL_EXT_INTERNAL_INCLUDE_DIR}/CUDAInterop.h
+		${NBL_EXT_INTERNAL_INCLUDE_DIR}/CUDAInteropNative.h
 		${NBL_EXT_INTERNAL_INCLUDE_DIR}/CCUDADevice.h
 		${NBL_EXT_INTERNAL_INCLUDE_DIR}/CCUDAExportableMemory.h
 		${NBL_EXT_INTERNAL_INCLUDE_DIR}/CCUDAHandler.h
@@ -26,12 +27,24 @@ if (NBL_COMPILE_WITH_CUDA)
 		"${NBL_EXT_CUDA_INTEROP_SRC}"
 		""
 		""
-		"_NBL_COMPILE_WITH_CUDA_"
+		""
 	)
 
-	target_link_libraries(${LIB_NAME} PUBLIC $<BUILD_INTERFACE:CUDA::toolkit>)
+	target_compile_definitions(${LIB_NAME} PRIVATE _NBL_COMPILE_WITH_CUDA_)
+	target_include_directories(${LIB_NAME} PRIVATE ${CUDAToolkit_INCLUDE_DIRS})
 	set_target_properties(${LIB_NAME} PROPERTIES EXPORT_NAME "ext::CUDAInterop")
 	add_library(Nabla::ext::CUDAInterop ALIAS ${LIB_NAME})
+
+	set(NBL_EXT_CUDA_INTEROP_NATIVE_LIB "NblExtCUDA_INTEROP_NATIVE") # name of the opt-in target for code that uses the native CUDA API
+	add_library(${NBL_EXT_CUDA_INTEROP_NATIVE_LIB} INTERFACE) # interface-only: compiles nothing, just carries usage requirements
+	target_link_libraries(${NBL_EXT_CUDA_INTEROP_NATIVE_LIB} INTERFACE
+		$<BUILD_INTERFACE:${LIB_NAME}> # consumers inside this build link the extension target directly
+		$<INSTALL_INTERFACE:Nabla::ext::CUDAInterop> # consumers of the installed export link it by this name instead
+		CUDA::toolkit # consumers of this target also get the CUDA toolkit usage requirements
+	)
+	set_target_properties(${NBL_EXT_CUDA_INTEROP_NATIVE_LIB} PROPERTIES EXPORT_NAME "ext::CUDAInteropNative") # name used for the target when it is exported
+	add_library(Nabla::ext::CUDAInteropNative ALIAS ${NBL_EXT_CUDA_INTEROP_NATIVE_LIB}) # namespaced alias usable inside the build tree
+	set(NBL_EXT_CUDA_INTEROP_NATIVE_LIB "${NBL_EXT_CUDA_INTEROP_NATIVE_LIB}" PARENT_SCOPE) # hand the target name to the including directory
 endif()
 
 add_subdirectory(smoke)
diff --git a/src/nbl/ext/CUDAInterop/CUDAInteropNativeState.hpp b/src/nbl/ext/CUDAInterop/CUDAInteropNativeState.hpp
new file mode 100644
index 0000000000..2dc3c3bbca
--- /dev/null
+++ b/src/nbl/ext/CUDAInterop/CUDAInteropNativeState.hpp
@@ -0,0 +1,106 @@
+#ifndef _NBL_EXT_CUDA_INTEROP_NATIVE_STATE_H_INCLUDED_
+#define _NBL_EXT_CUDA_INTEROP_NATIVE_STATE_H_INCLUDED_
+
+#include "nbl/ext/CUDAInterop/CUDAInteropNative.h"
+
+#include <array>
+
+namespace nbl::video
+{
+// Private CUDA-side state of CCUDAHandler: the CUDA and NVRTC function tables plus the list of available devices.
+struct CCUDAHandler::SNativeState
+{
+	cuda_native::CUDA cuda;
+	cuda_native::NVRTC nvrtc;
+	core::vector<cuda_native::SCUDADeviceInfo> availableDevices;
+
+	SNativeState(cuda_native::CUDA&& _cuda, cuda_native::NVRTC&& _nvrtc) // moves both tables in; availableDevices is default-constructed
+		: cuda(std::move(_cuda))
+		, nvrtc(std::move(_nvrtc))
+	{}
+};
+// Private CUDA-side state of CCUDADevice: the driver device handle, a context, and an allocation-granularity table.
+struct CCUDADevice::SNativeState
+{
+	CUdevice handle = {};
+	CUcontext context = nullptr;
+	std::array<size_t,5> allocationGranularity = {}; // NOTE(review): presumably indexed by CUmemLocationType (5 slots) - confirm against the code that fills it
+
+	explicit SNativeState(CUdevice _handle) // sets only the handle; context and granularities keep their zero defaults
+		: handle(_handle)
+	{}
+};
+// Private CUDA-side state of CCUDAExportableMemory: a single CUDA device pointer, zero-initialized.
+struct CCUDAExportableMemory::SNativeState
+{
+	CUdeviceptr ptr = 0;
+};
+// Private CUDA-side state of CCUDAImportedMemory: the external-memory handle supplied at construction.
+struct CCUDAImportedMemory::SNativeState
+{
+	CUexternalMemory handle = nullptr;
+
+	explicit SNativeState(CUexternalMemory _handle) // stores the handle as given; no validation happens here
+		: handle(_handle)
+	{}
+};
+// Private CUDA-side state of CCUDAImportedSemaphore: the external-semaphore handle supplied at construction.
+struct CCUDAImportedSemaphore::SNativeState
+{
+	CUexternalSemaphore handle = nullptr;
+
+	explicit SNativeState(CUexternalSemaphore _handle) // stores the handle as given; no validation happens here
+		: handle(_handle)
+	{}
+};
+
+namespace cuda_native
+{
+// Maps the Nabla location enum to the CUDA driver enum by numeric value; assumes both enums share enumerator values - TODO confirm.
+inline CUmemLocationType toNative(ECUDAMemoryLocation location)
+{
+	return static_cast<CUmemLocationType>(static_cast<uint32_t>(location));
+}
+// Inverse of toNative: maps the CUDA driver enum back to the Nabla enum by numeric value, with no range check.
+inline ECUDAMemoryLocation toNabla(CUmemLocationType location)
+{
+	return static_cast<ECUDAMemoryLocation>(static_cast<uint32_t>(location));
+}
+// Picks the CUDA memory-handle type at compile time: WIN32 handles on _WIN32 builds, POSIX file descriptors otherwise.
+inline CUmemAllocationHandleType getAllocationHandleType()
+{
+#ifdef _WIN32
+	return CU_MEM_HANDLE_TYPE_WIN32;
+#else
+	return CU_MEM_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR;
+#endif
+}
+// Accessor shim handing out private members of the interop classes. NOTE(review): presumably a friend of those classes - the friend declarations are not visible here.
+struct SAccess
+{
+	static CCUDAHandler::SNativeState& native(CCUDAHandler& handler) { return *handler.m_native; } // all native() overloads dereference m_native without a null check
+	static const CCUDAHandler::SNativeState& native(const CCUDAHandler& handler) { return *handler.m_native; }
+
+	static CCUDADevice::SNativeState& native(CCUDADevice& device) { return *device.m_native; }
+	static const CCUDADevice::SNativeState& native(const CCUDADevice& device) { return *device.m_native; }
+
+	static CCUDAExportableMemory::SNativeState& native(CCUDAExportableMemory& memory) { return *memory.m_native; }
+	static const CCUDAExportableMemory::SNativeState& native(const CCUDAExportableMemory& memory) { return *memory.m_native; }
+
+	static CCUDAImportedMemory::SNativeState& native(CCUDAImportedMemory& memory) { return *memory.m_native; }
+	static const CCUDAImportedMemory::SNativeState& native(const CCUDAImportedMemory& memory) { return *memory.m_native; }
+
+	static CCUDAImportedSemaphore::SNativeState& native(CCUDAImportedSemaphore& semaphore) { return *semaphore.m_native; }
+	static const CCUDAImportedSemaphore::SNativeState& native(const CCUDAImportedSemaphore& semaphore) { return *semaphore.m_native; }
+
+	static system::logger_opt_ptr logger(const CCUDAHandler& handler) { return handler.m_logger.get().get(); }
+	static system::logger_opt_ptr logger(const CCUDADevice& device) { return device.m_logger; }
+	static const CCUDADevice* device(const CCUDAImportedMemory& memory) { return memory.m_device.get(); } // non-owning pointer taken from the memory's m_device
+	static IDeviceMemoryAllocation* source(const CCUDAImportedMemory& memory) { return memory.m_src.get(); } // non-owning pointer taken from the memory's m_src
+};
+
+}
+
+}
+
+#endif
diff --git a/src/nbl/ext/CUDAInterop/README.md b/src/nbl/ext/CUDAInterop/README.md
new file mode 100644
index 0000000000..1fd88d1b04
--- /dev/null
+++ b/src/nbl/ext/CUDAInterop/README.md
@@ -0,0 +1,23 @@
+# CUDA Interop Targets
+
+- `Nabla::Nabla` stays CUDA-free. `find_package(Nabla CONFIG)` does not require the CUDA SDK.
+- `Nabla::ext::CUDAInterop` is the clean Nabla interop target. Its public headers do not include `cuda.h` or `nvrtc.h`, so consumers can use a CUDA-enabled Nabla package without installing the CUDA SDK.
+- `Nabla::ext::CUDAInteropNative` is the explicit raw CUDA opt-in target. It exposes `CUDAInteropNative.h`, CUDA Driver API and NVRTC types, and requires `CUDAToolkit`.
+- Consumers can request native CUDA with `find_package(Nabla CONFIG COMPONENTS Core CUDAInteropNative)` and override the SDK root with `-DNabla_CUDA_TOOLKIT_ROOT=<cuda-root>`.
+- A consumer can use a newer compatible local CUDA SDK through `CUDAInteropNative` without rebuilding Nabla or the clean `CUDAInterop` target.
+- Rebuilds stay local: changing CUDA SDK headers affects only targets that include `CUDAInteropNative.h`.
+
+```cmake
+find_package(Nabla CONFIG REQUIRED)
+target_link_libraries(app PRIVATE Nabla::Nabla)
+```
+
+```cmake
+find_package(Nabla CONFIG REQUIRED COMPONENTS Core CUDAInterop)
+target_link_libraries(app PRIVATE Nabla::ext::CUDAInterop)
+```
+
+```cmake
+find_package(Nabla CONFIG REQUIRED COMPONENTS Core CUDAInteropNative)
+target_link_libraries(app PRIVATE Nabla::ext::CUDAInteropNative)
+```
diff --git a/src/nbl/ext/CUDAInterop/smoke/CMakeLists.txt b/src/nbl/ext/CUDAInterop/smoke/CMakeLists.txt
index cd9ba7b70e..71bdac260d 100644
--- a/src/nbl/ext/CUDAInterop/smoke/CMakeLists.txt
+++ b/src/nbl/ext/CUDAInterop/smoke/CMakeLists.txt
@@ -1,8 +1,14 @@
 cmake_minimum_required(VERSION 3.30)
 project(NblExtCUDAInteropSmoke CXX)
 
+option(NBL_CUDA_INTEROP_SMOKE_WITH_NATIVE "Build the CUDA native opt-in smoke from an installed Nabla package." OFF)
+# The option is consulted only on the find_package() path below, i.e. when Nabla::Nabla is not already a target in this build.
 if(NOT TARGET Nabla::Nabla)
-	find_package(Nabla REQUIRED CONFIG COMPONENTS Core CUDAInterop)
+	set(_NBL_CUDA_INTEROP_SMOKE_COMPONENTS Core CUDAInterop)
+	if(NBL_CUDA_INTEROP_SMOKE_WITH_NATIVE)
+		list(APPEND _NBL_CUDA_INTEROP_SMOKE_COMPONENTS CUDAInteropNative) # also request the raw-CUDA component
+	endif()
+	find_package(Nabla REQUIRED CONFIG COMPONENTS ${_NBL_CUDA_INTEROP_SMOKE_COMPONENTS})
 endif()
 
 enable_testing()
@@ -18,6 +24,11 @@ nbl_add_cuda_interop_smoke(NblExtCUDAInteropPublicBoundarySmoke public_boundary.
 target_link_libraries(NblExtCUDAInteropPublicBoundarySmoke PRIVATE Nabla::Nabla)
 
 if(TARGET Nabla::ext::CUDAInterop)
-	nbl_add_cuda_interop_smoke(NblExtCUDAInteropOptInSmoke opt_in.cpp)
-	target_link_libraries(NblExtCUDAInteropOptInSmoke PRIVATE Nabla::ext::CUDAInterop)
+	nbl_add_cuda_interop_smoke(NblExtCUDAInteropCleanOptInSmoke clean_opt_in.cpp) # clean_opt_in.cpp #errors if CUDA defines or headers leak through
+	target_link_libraries(NblExtCUDAInteropCleanOptInSmoke PRIVATE Nabla::ext::CUDAInterop)
+endif()
+# Raw-CUDA smoke: registered only when the native opt-in target is available.
+if(TARGET Nabla::ext::CUDAInteropNative)
+	nbl_add_cuda_interop_smoke(NblExtCUDAInteropNativeOptInSmoke native_opt_in.cpp)
+	target_link_libraries(NblExtCUDAInteropNativeOptInSmoke PRIVATE Nabla::ext::CUDAInteropNative)
 endif()
diff --git a/src/nbl/ext/CUDAInterop/smoke/clean_opt_in.cpp b/src/nbl/ext/CUDAInterop/smoke/clean_opt_in.cpp
new file mode 100644
index 0000000000..6952433f9e
--- /dev/null
+++ b/src/nbl/ext/CUDAInterop/smoke/clean_opt_in.cpp
@@ -0,0 +1,42 @@
+#include "nbl/ext/CUDAInterop/CUDAInterop.h"
+#include "nbl/system/IApplicationFramework.h"
+
+#include <type_traits>
+// Compile-time boundary checks: fail the build if the clean target leaks the CUDA build define or CUDA SDK headers.
+#ifdef _NBL_COMPILE_WITH_CUDA_
+#error "Nabla::ext::CUDAInterop must not propagate the CUDA build define."
+#endif
+
+#ifdef CUDA_VERSION // NOTE(review): presumably defined by cuda.h, so it acts as a "CUDA headers were included" probe - confirm
+#error "Nabla::ext::CUDAInterop must not require CUDA SDK headers."
+#endif
+
+namespace
+{
+// Smoke application for the clean target: touches only the CUDA-free public creation params and the Nabla location enum.
+class CUDAInteropCleanOptInSmoke final : public nbl::system::IApplicationFramework
+{
+	using base_t = nbl::system::IApplicationFramework;
+
+public:
+	using base_t::base_t;
+
+	bool onAppInitialized(nbl::core::smart_refctd_ptr<nbl::system::ISystem>&&) override
+	{
+		static_assert(std::is_same_v<decltype(nbl::video::CCUDAExportableMemory::SCreationParams{}.location), nbl::video::ECUDAMemoryLocation>); // the public field must be the Nabla enum
+
+		const nbl::video::CCUDAExportableMemory::SCreationParams params = {
+			.size = 4096,
+			.alignment = 4096,
+			.location = nbl::video::ECUDAMemoryLocation::DEVICE,
+		};
+		return isAPILoaded() && params.location==nbl::video::ECUDAMemoryLocation::DEVICE; // succeeds when the API is loaded; no CUDA call is made
+	}
+
+	void workLoopBody() override {}
+	bool keepRunning() override { return false; } // always false; presumably makes the framework exit right after initialization
+};
+
+}
+
+NBL_MAIN_FUNC(CUDAInteropCleanOptInSmoke)
diff --git a/src/nbl/ext/CUDAInterop/smoke/opt_in.cpp b/src/nbl/ext/CUDAInterop/smoke/native_opt_in.cpp
similarity index 72%
rename from src/nbl/ext/CUDAInterop/smoke/opt_in.cpp
rename to src/nbl/ext/CUDAInterop/smoke/native_opt_in.cpp
index bc8c8952bd..d868b2eaa7 100644
--- a/src/nbl/ext/CUDAInterop/smoke/opt_in.cpp
+++ b/src/nbl/ext/CUDAInterop/smoke/native_opt_in.cpp
@@ -1,4 +1,4 @@
-#include "nbl/ext/CUDAInterop/CUDAInterop.h"
+#include "nbl/ext/CUDAInterop/CUDAInteropNative.h"
 #include "nbl/system/IApplicationFramework.h"
 
 #include <algorithm>
@@ -7,8 +7,8 @@
 #include <type_traits>
 #include <utility>
 
-#ifndef _NBL_COMPILE_WITH_CUDA_
-#error "CUDA interop consumers must opt in through Nabla::ext::CUDAInterop."
+#ifndef CUDA_VERSION
+#error "Nabla::ext::CUDAInteropNative must expose CUDA SDK headers."
 #endif
 
 namespace
@@ -25,7 +25,7 @@ using namespace nbl::video;
 	auto cudaMemory = cudaDevice.createExportableMemory({
 		.size = 4096,
 		.alignment = 4096,
-		.location = CU_MEM_LOCATION_TYPE_DEVICE,
+		.location = ECUDAMemoryLocation::DEVICE,
 	});
 	if (!cudaMemory)
 		return false;
@@ -36,15 +36,16 @@ using namespace nbl::video;
 
 	CUdeviceptr mappedVulkanMemory = 0;
 	if (importedFromVulkan)
-		importedFromVulkan->getMappedBuffer(&mappedVulkanMemory);
+		cuda_native::getMappedBuffer(*importedFromVulkan,&mappedVulkanMemory);
 
-	const CUexternalSemaphore cudaSemaphore = importedSemaphore ? importedSemaphore->getInternalObject():nullptr;
-	return exportedToVulkan.get() && mappedVulkanMemory && cudaSemaphore;
+	const CUdeviceptr cudaDevicePtr = cuda_native::getDeviceptr(*cudaMemory);
+	const CUexternalSemaphore cudaSemaphore = importedSemaphore ? cuda_native::getInternalObject(*importedSemaphore):nullptr;
+	return exportedToVulkan.get() && mappedVulkanMemory && cudaDevicePtr && cudaSemaphore;
 }
 
 bool cudaDriverRoundtrip(CCUDAHandler& handler, CUdevice device)
 {
-	auto& cuda = handler.getCUDAFunctionTable();
+	auto& cuda = cuda_native::getCUDAFunctionTable(handler);
 
 	CUcontext context = nullptr;
 	if (cuda.pcuDevicePrimaryCtxRetain(&context, device)!=CUDA_SUCCESS)
@@ -83,7 +84,7 @@ bool cudaDriverRoundtrip(CCUDAHandler& handler, CUdevice device)
 }
 }
 
-class CUDAInteropOptInSmoke final : public nbl::system::IApplicationFramework
+class CUDAInteropNativeOptInSmoke final : public nbl::system::IApplicationFramework
 {
 	using base_t = nbl::system::IApplicationFramework;
 
@@ -95,13 +96,13 @@ class CUDAInteropOptInSmoke final : public nbl::system::IApplicationFramework
 		if (!isAPILoaded())
 			return false;
 
-		static_assert(std::is_same_v<decltype(std::declval<const nbl::video::CCUDADevice&>().getInternalObject()), CUdevice>);
+		static_assert(std::is_same_v<decltype(nbl::video::cuda_native::getInternalObject(std::declval<const nbl::video::CCUDADevice&>())), CUdevice>);
 
 		auto handler = nbl::video::CCUDAHandler::create(nullptr, nullptr);
 		if (!handler)
 			return true;
 
-		const auto& devices = handler->getAvailableDevices();
+		const auto& devices = nbl::video::cuda_native::getAvailableDevices(*handler);
 		if (devices.empty())
 			return true;
 
@@ -112,4 +113,4 @@ class CUDAInteropOptInSmoke final : public nbl::system::IApplicationFramework
 	bool keepRunning() override { return false; }
 };
 
-NBL_MAIN_FUNC(CUDAInteropOptInSmoke)
+NBL_MAIN_FUNC(CUDAInteropNativeOptInSmoke)

From 49bcb2cf6c96e7fca42a16142c28ddc83686c579 Mon Sep 17 00:00:00 2001
From: Arkadiusz Lachowicz <areklachowicz@gmail.com>
Date: Wed, 6 May 2026 16:40:53 +0200
Subject: [PATCH 09/27] Add native CUDA accessor overloads

---
 examples_tests                                |   2 +-
 .../nbl/ext/CUDAInterop/CUDAInteropNative.h   | 172 ++++++++++++++++++
 src/nbl/ext/CUDAInterop/README.md             |   1 +
 .../ext/CUDAInterop/smoke/native_opt_in.cpp   |   8 +-
 4 files changed, 178 insertions(+), 5 deletions(-)

diff --git a/examples_tests b/examples_tests
index 7a2a4f604f..dfa2b7ac39 160000
--- a/examples_tests
+++ b/examples_tests
@@ -1 +1 @@
-Subproject commit 7a2a4f604fd941984d6624e3059f7380cc6592a2
+Subproject commit dfa2b7ac39c6b9ae94ae2eb70c8f6ec251a9715e
diff --git a/include/nbl/ext/CUDAInterop/CUDAInteropNative.h b/include/nbl/ext/CUDAInterop/CUDAInteropNative.h
index f913664122..ea6313f26b 100644
--- a/include/nbl/ext/CUDAInterop/CUDAInteropNative.h
+++ b/include/nbl/ext/CUDAInterop/CUDAInteropNative.h
@@ -146,6 +146,26 @@ struct SCUDADeviceInfo
 const CUDA& getCUDAFunctionTable(const CCUDAHandler& handler);
 const NVRTC& getNVRTCFunctionTable(const CCUDAHandler& handler);
 
+inline const CUDA& getCUDAFunctionTable(const CCUDAHandler* handler) // convenience overload; handler is dereferenced without a null check
+{
+	return getCUDAFunctionTable(*handler);
+}
+// Same convenience for smart_refctd_ptr; the pointer is dereferenced without a null check, so it must not be empty.
+inline const CUDA& getCUDAFunctionTable(const core::smart_refctd_ptr<CCUDAHandler>& handler)
+{
+	return getCUDAFunctionTable(*handler);
+}
+// getNVRTCFunctionTable convenience overloads for raw and smart pointers; both dereference handler without a null check.
+inline const NVRTC& getNVRTCFunctionTable(const CCUDAHandler* handler)
+{
+	return getNVRTCFunctionTable(*handler);
+}
+
+inline const NVRTC& getNVRTCFunctionTable(const core::smart_refctd_ptr<CCUDAHandler>& handler)
+{
+	return getNVRTCFunctionTable(*handler);
+}
+
 bool defaultHandleResult(CUresult result, const system::logger_opt_ptr& logger);
 bool defaultHandleResult(const CCUDAHandler& handler, CUresult result);
 bool defaultHandleResult(const CCUDAHandler& handler, nvrtcResult result);
@@ -155,12 +175,46 @@ T* cast_CUDA_ptr(CUdeviceptr ptr) { return reinterpret_cast<T*>(ptr); }
 
 const core::vector<SCUDADeviceInfo>& getAvailableDevices(const CCUDAHandler& handler);
 
+inline const core::vector<SCUDADeviceInfo>& getAvailableDevices(const CCUDAHandler* handler) // convenience overload; handler is dereferenced without a null check
+{
+	return getAvailableDevices(*handler);
+}
+// Same convenience for smart_refctd_ptr; the pointer is dereferenced without a null check, so it must not be empty.
+inline const core::vector<SCUDADeviceInfo>& getAvailableDevices(const core::smart_refctd_ptr<CCUDAHandler>& handler)
+{
+	return getAvailableDevices(*handler);
+}
+
 nvrtcResult createProgram(CCUDAHandler& handler, nvrtcProgram* prog, std::string&& source, const char* name, const int headerCount=0, const char* const* headerContents=nullptr, const char* const* includeNames=nullptr);
 inline nvrtcResult createProgram(CCUDAHandler& handler, nvrtcProgram* prog, const char* source, const char* name, const int headerCount=0, const char* const* headerContents=nullptr, const char* const* includeNames=nullptr)
 {
 	return createProgram(handler,prog,std::string(source),name,headerCount,headerContents,includeNames);
 }
 nvrtcResult createProgram(CCUDAHandler& handler, nvrtcProgram* prog, system::IFile* file, const int headerCount=0, const char* const* headerContents=nullptr, const char* const* includeNames=nullptr);
+inline nvrtcResult createProgram(CCUDAHandler* handler, nvrtcProgram* prog, std::string&& source, const char* name, const int headerCount=0, const char* const* headerContents=nullptr, const char* const* includeNames=nullptr)
+{
+	return createProgram(*handler,prog,std::move(source),name,headerCount,headerContents,includeNames); // forwards to the CCUDAHandler& overload; handler is dereferenced without a null check
+}
+inline nvrtcResult createProgram(const core::smart_refctd_ptr<CCUDAHandler>& handler, nvrtcProgram* prog, std::string&& source, const char* name, const int headerCount=0, const char* const* headerContents=nullptr, const char* const* includeNames=nullptr)
+{
+	return createProgram(*handler,prog,std::move(source),name,headerCount,headerContents,includeNames); // smart-pointer variant of the above; the pointer must not be empty
+}
+inline nvrtcResult createProgram(CCUDAHandler* handler, nvrtcProgram* prog, const char* source, const char* name, const int headerCount=0, const char* const* headerContents=nullptr, const char* const* includeNames=nullptr)
+{
+	return createProgram(*handler,prog,source,name,headerCount,headerContents,includeNames); // reaches the const char* reference overload, which copies source into a std::string
+}
+inline nvrtcResult createProgram(const core::smart_refctd_ptr<CCUDAHandler>& handler, nvrtcProgram* prog, const char* source, const char* name, const int headerCount=0, const char* const* headerContents=nullptr, const char* const* includeNames=nullptr)
+{
+	return createProgram(*handler,prog,source,name,headerCount,headerContents,includeNames); // smart-pointer variant of the above; the pointer must not be empty
+}
+inline nvrtcResult createProgram(CCUDAHandler* handler, nvrtcProgram* prog, system::IFile* file, const int headerCount=0, const char* const* headerContents=nullptr, const char* const* includeNames=nullptr)
+{
+	return createProgram(*handler,prog,file,headerCount,headerContents,includeNames); // file-based variant; handler is dereferenced without a null check
+}
+inline nvrtcResult createProgram(const core::smart_refctd_ptr<CCUDAHandler>& handler, nvrtcProgram* prog, system::IFile* file, const int headerCount=0, const char* const* headerContents=nullptr, const char* const* includeNames=nullptr)
+{
+	return createProgram(*handler,prog,file,headerCount,headerContents,includeNames); // smart-pointer variant of the above; the pointer must not be empty
+}
 nvrtcResult compileProgram(const CCUDAHandler& handler, nvrtcProgram prog, core::SRange<const char* const> options);
 nvrtcResult getProgramLog(const CCUDAHandler& handler, nvrtcProgram prog, std::string& log);
 
@@ -189,6 +243,54 @@ ptx_and_nvrtcResult_t compileDirectlyToPTX(
 	const int headerCount=0, const char* const* headerContents=nullptr, const char* const* includeNames=nullptr,
 	std::string* log=nullptr
 );
+inline ptx_and_nvrtcResult_t compileDirectlyToPTX(
+	CCUDAHandler* handler, std::string&& source, const char* filename, core::SRange<const char* const> nvrtcOptions,
+	const int headerCount=0, const char* const* headerContents=nullptr, const char* const* includeNames=nullptr,
+	std::string* log=nullptr
+)
+{
+	return compileDirectlyToPTX(*handler,std::move(source),filename,nvrtcOptions,headerCount,headerContents,includeNames,log); // forwards to the overload taking the handler by reference; handler is dereferenced without a null check
+}
+inline ptx_and_nvrtcResult_t compileDirectlyToPTX(
+	const core::smart_refctd_ptr<CCUDAHandler>& handler, std::string&& source, const char* filename, core::SRange<const char* const> nvrtcOptions,
+	const int headerCount=0, const char* const* headerContents=nullptr, const char* const* includeNames=nullptr,
+	std::string* log=nullptr
+)
+{
+	return compileDirectlyToPTX(*handler,std::move(source),filename,nvrtcOptions,headerCount,headerContents,includeNames,log); // smart-pointer variant of the above; the pointer must not be empty
+}
+inline ptx_and_nvrtcResult_t compileDirectlyToPTX(
+	CCUDAHandler* handler, const char* source, const char* filename, core::SRange<const char* const> nvrtcOptions,
+	const int headerCount=0, const char* const* headerContents=nullptr, const char* const* includeNames=nullptr,
+	std::string* log=nullptr
+)
+{
+	return compileDirectlyToPTX(*handler,source,filename,nvrtcOptions,headerCount,headerContents,includeNames,log); // C-string source variant; handler is dereferenced without a null check
+}
+inline ptx_and_nvrtcResult_t compileDirectlyToPTX(
+	const core::smart_refctd_ptr<CCUDAHandler>& handler, const char* source, const char* filename, core::SRange<const char* const> nvrtcOptions,
+	const int headerCount=0, const char* const* headerContents=nullptr, const char* const* includeNames=nullptr,
+	std::string* log=nullptr
+)
+{
+	return compileDirectlyToPTX(*handler,source,filename,nvrtcOptions,headerCount,headerContents,includeNames,log); // smart-pointer variant of the above; the pointer must not be empty
+}
+inline ptx_and_nvrtcResult_t compileDirectlyToPTX(
+	CCUDAHandler* handler, system::IFile* file, core::SRange<const char* const> nvrtcOptions,
+	const int headerCount=0, const char* const* headerContents=nullptr, const char* const* includeNames=nullptr,
+	std::string* log=nullptr
+)
+{
+	return compileDirectlyToPTX(*handler,file,nvrtcOptions,headerCount,headerContents,includeNames,log); // file-based variant; handler is dereferenced without a null check
+}
+inline ptx_and_nvrtcResult_t compileDirectlyToPTX(
+	const core::smart_refctd_ptr<CCUDAHandler>& handler, system::IFile* file, core::SRange<const char* const> nvrtcOptions,
+	const int headerCount=0, const char* const* headerContents=nullptr, const char* const* includeNames=nullptr,
+	std::string* log=nullptr
+)
+{
+	return compileDirectlyToPTX(*handler,file,nvrtcOptions,headerCount,headerContents,includeNames,log); // smart-pointer variant of the above; the pointer must not be empty
+}
 
 CUdevice getInternalObject(const CCUDADevice& device);
 CUcontext getContext(const CCUDADevice& device);
@@ -198,6 +300,76 @@ CUexternalMemory getInternalObject(const CCUDAImportedMemory& memory);
 CUresult getMappedBuffer(const CCUDAImportedMemory& memory, CUdeviceptr* mappedBuffer);
 CUexternalSemaphore getInternalObject(const CCUDAImportedSemaphore& semaphore);
 
+inline CUdevice getInternalObject(const CCUDADevice* device) // convenience overload; device is dereferenced without a null check
+{
+	return getInternalObject(*device);
+}
+// Same convenience for smart_refctd_ptr; the pointer is dereferenced without a null check, so it must not be empty.
+inline CUdevice getInternalObject(const core::smart_refctd_ptr<CCUDADevice>& device)
+{
+	return getInternalObject(*device);
+}
+// getContext convenience overloads for raw and smart pointers; both dereference device without a null check.
+inline CUcontext getContext(const CCUDADevice* device)
+{
+	return getContext(*device);
+}
+
+inline CUcontext getContext(const core::smart_refctd_ptr<CCUDADevice>& device)
+{
+	return getContext(*device);
+}
+// roundToGranularity convenience overloads for raw and smart pointers; both forward location and size unchanged and dereference device without a null check.
+inline size_t roundToGranularity(const CCUDADevice* device, CUmemLocationType location, size_t size)
+{
+	return roundToGranularity(*device,location,size);
+}
+
+inline size_t roundToGranularity(const core::smart_refctd_ptr<CCUDADevice>& device, CUmemLocationType location, size_t size)
+{
+	return roundToGranularity(*device,location,size);
+}
+// getDeviceptr convenience overloads for raw and smart pointers; both dereference memory without a null check.
+inline CUdeviceptr getDeviceptr(const CCUDAExportableMemory* memory)
+{
+	return getDeviceptr(*memory);
+}
+
+inline CUdeviceptr getDeviceptr(const core::smart_refctd_ptr<CCUDAExportableMemory>& memory)
+{
+	return getDeviceptr(*memory);
+}
+// getInternalObject convenience overloads for imported memory (raw and smart pointers); both dereference memory without a null check.
+inline CUexternalMemory getInternalObject(const CCUDAImportedMemory* memory)
+{
+	return getInternalObject(*memory);
+}
+
+inline CUexternalMemory getInternalObject(const core::smart_refctd_ptr<CCUDAImportedMemory>& memory)
+{
+	return getInternalObject(*memory);
+}
+// getMappedBuffer convenience overloads for raw and smart pointers; both pass mappedBuffer through and dereference memory without a null check.
+inline CUresult getMappedBuffer(const CCUDAImportedMemory* memory, CUdeviceptr* mappedBuffer)
+{
+	return getMappedBuffer(*memory,mappedBuffer);
+}
+
+inline CUresult getMappedBuffer(const core::smart_refctd_ptr<CCUDAImportedMemory>& memory, CUdeviceptr* mappedBuffer)
+{
+	return getMappedBuffer(*memory,mappedBuffer);
+}
+// getInternalObject convenience overloads for imported semaphores (raw and smart pointers); both dereference semaphore without a null check.
+inline CUexternalSemaphore getInternalObject(const CCUDAImportedSemaphore* semaphore)
+{
+	return getInternalObject(*semaphore);
+}
+
+inline CUexternalSemaphore getInternalObject(const core::smart_refctd_ptr<CCUDAImportedSemaphore>& semaphore)
+{
+	return getInternalObject(*semaphore);
+}
+
 }
 
 #define ASSERT_CUDA_SUCCESS(expr, handler) \
diff --git a/src/nbl/ext/CUDAInterop/README.md b/src/nbl/ext/CUDAInterop/README.md
index 1fd88d1b04..623c07ec9e 100644
--- a/src/nbl/ext/CUDAInterop/README.md
+++ b/src/nbl/ext/CUDAInterop/README.md
@@ -6,6 +6,7 @@
 - Consumers can request native CUDA with `find_package(Nabla CONFIG COMPONENTS Core CUDAInteropNative)` and override the SDK root with `-DNabla_CUDA_TOOLKIT_ROOT=<cuda-root>`.
 - A consumer can use a newer compatible local CUDA SDK through `CUDAInteropNative` without rebuilding Nabla or the clean `CUDAInterop` target.
 - Rebuilds stay local: changing CUDA SDK headers affects only targets that include `CUDAInteropNative.h`.
+- Native accessors accept Nabla objects, raw pointers, and `smart_refctd_ptr`, so opt-in code can keep CUDA usage terse without moving CUDA types into clean headers.
 
 ```cmake
 find_package(Nabla CONFIG REQUIRED)
diff --git a/src/nbl/ext/CUDAInterop/smoke/native_opt_in.cpp b/src/nbl/ext/CUDAInterop/smoke/native_opt_in.cpp
index d868b2eaa7..4c001ab6ce 100644
--- a/src/nbl/ext/CUDAInterop/smoke/native_opt_in.cpp
+++ b/src/nbl/ext/CUDAInterop/smoke/native_opt_in.cpp
@@ -36,10 +36,10 @@ using namespace nbl::video;
 
 	CUdeviceptr mappedVulkanMemory = 0;
 	if (importedFromVulkan)
-		cuda_native::getMappedBuffer(*importedFromVulkan,&mappedVulkanMemory);
+		cuda_native::getMappedBuffer(importedFromVulkan,&mappedVulkanMemory);
 
-	const CUdeviceptr cudaDevicePtr = cuda_native::getDeviceptr(*cudaMemory);
-	const CUexternalSemaphore cudaSemaphore = importedSemaphore ? cuda_native::getInternalObject(*importedSemaphore):nullptr;
+	const CUdeviceptr cudaDevicePtr = cuda_native::getDeviceptr(cudaMemory);
+	const CUexternalSemaphore cudaSemaphore = importedSemaphore ? cuda_native::getInternalObject(importedSemaphore):nullptr;
 	return exportedToVulkan.get() && mappedVulkanMemory && cudaDevicePtr && cudaSemaphore;
 }
 
@@ -102,7 +102,7 @@ class CUDAInteropNativeOptInSmoke final : public nbl::system::IApplicationFramew
 		if (!handler)
 			return true;
 
-		const auto& devices = nbl::video::cuda_native::getAvailableDevices(*handler);
+		const auto& devices = nbl::video::cuda_native::getAvailableDevices(handler);
 		if (devices.empty())
 			return true;
 

From d85657e381ecd537aa20a16ab227aa38754083d4 Mon Sep 17 00:00:00 2001
From: Arkadiusz Lachowicz <areklachowicz@gmail.com>
Date: Wed, 6 May 2026 16:48:06 +0200
Subject: [PATCH 10/27] Document CUDA interop target split

---
 src/nbl/ext/CUDAInterop/README.md | 15 +++++++++++++++
 1 file changed, 15 insertions(+)

diff --git a/src/nbl/ext/CUDAInterop/README.md b/src/nbl/ext/CUDAInterop/README.md
index 623c07ec9e..a73b9d9c21 100644
--- a/src/nbl/ext/CUDAInterop/README.md
+++ b/src/nbl/ext/CUDAInterop/README.md
@@ -1,5 +1,8 @@
 # CUDA Interop Targets
 
+This extension keeps CUDA interop available without making CUDA a default public
+compile-time dependency of Nabla.
+
 - `Nabla::Nabla` stays CUDA-free. `find_package(Nabla CONFIG)` does not require the CUDA SDK.
 - `Nabla::ext::CUDAInterop` is the clean Nabla interop target. Its public headers do not include `cuda.h` or `nvrtc.h`, so consumers can use a CUDA-enabled Nabla package without installing the CUDA SDK.
 - `Nabla::ext::CUDAInteropNative` is the explicit raw CUDA opt-in target. It exposes `CUDAInteropNative.h`, CUDA Driver API and NVRTC types, and requires `CUDAToolkit`.
@@ -8,6 +11,18 @@
 - Rebuilds stay local: changing CUDA SDK headers affects only targets that include `CUDAInteropNative.h`.
 - Native accessors accept Nabla objects, raw pointers, and `smart_refctd_ptr`, so opt-in code can keep CUDA usage terse without moving CUDA types into clean headers.
 
+## Design
+
+- The default Nabla package remains relocatable and usable on machines without the CUDA SDK.
+- CUDA is used privately to build the interop library. CUDA SDK headers become visible to consumers only when `CUDAInteropNative` is requested.
+- Clean interop headers expose Nabla concepts such as devices, exported memory, imported memory, and imported semaphores.
+- Native interop headers expose raw CUDA Driver API and NVRTC types for examples and applications that need direct CUDA work.
+- The split is intentionally similar to the OpenCV CUDA shape: common CUDA-facing headers stay clean, while raw CUDA access lives behind explicit opt-in accessor/native headers.
+- This avoids a transitive public compile-time dependency on CUDA while preserving the low-level workflow for kernels, `CUdeviceptr`, `CUmodule`, `CUfunction`, external memory, and external semaphores.
+- Package consumers can pick their own compatible CUDA SDK for native code without rebuilding Nabla or the clean interop library.
+
+## Usage
+
 ```cmake
 find_package(Nabla CONFIG REQUIRED)
 target_link_libraries(app PRIVATE Nabla::Nabla)

From 6e8c4f99399b3111c2800a8ffd5f36cd9b17c418 Mon Sep 17 00:00:00 2001
From: Arkadiusz Lachowicz <areklachowicz@gmail.com>
Date: Wed, 6 May 2026 16:58:35 +0200
Subject: [PATCH 11/27] Trim CUDA interop README wording

---
 src/nbl/ext/CUDAInterop/README.md | 31 ++++++++++++++-----------------
 1 file changed, 14 insertions(+), 17 deletions(-)

diff --git a/src/nbl/ext/CUDAInterop/README.md b/src/nbl/ext/CUDAInterop/README.md
index a73b9d9c21..104f7f2eca 100644
--- a/src/nbl/ext/CUDAInterop/README.md
+++ b/src/nbl/ext/CUDAInterop/README.md
@@ -1,25 +1,22 @@
 # CUDA Interop Targets
 
-This extension keeps CUDA interop available without making CUDA a default public
-compile-time dependency of Nabla.
-
-- `Nabla::Nabla` stays CUDA-free. `find_package(Nabla CONFIG)` does not require the CUDA SDK.
-- `Nabla::ext::CUDAInterop` is the clean Nabla interop target. Its public headers do not include `cuda.h` or `nvrtc.h`, so consumers can use a CUDA-enabled Nabla package without installing the CUDA SDK.
-- `Nabla::ext::CUDAInteropNative` is the explicit raw CUDA opt-in target. It exposes `CUDAInteropNative.h`, CUDA Driver API and NVRTC types, and requires `CUDAToolkit`.
-- Consumers can request native CUDA with `find_package(Nabla CONFIG COMPONENTS Core CUDAInteropNative)` and override the SDK root with `-DNabla_CUDA_TOOLKIT_ROOT=<cuda-root>`.
-- A consumer can use a newer compatible local CUDA SDK through `CUDAInteropNative` without rebuilding Nabla or the clean `CUDAInterop` target.
-- Rebuilds stay local: changing CUDA SDK headers affects only targets that include `CUDAInteropNative.h`.
-- Native accessors accept Nabla objects, raw pointers, and `smart_refctd_ptr`, so opt-in code can keep CUDA usage terse without moving CUDA types into clean headers.
+- `Nabla::Nabla` does not require the CUDA SDK.
+- `Nabla::ext::CUDAInterop` provides Nabla CUDA interop types. Its public headers do not include `cuda.h` or `nvrtc.h`.
+- `Nabla::ext::CUDAInteropNative` provides raw CUDA Driver API and NVRTC access through `CUDAInteropNative.h`.
+- `CUDAInteropNative` requires `CUDAToolkit`. `CUDAInterop` does not expose that requirement to consumers.
+- Consumers can override the SDK root with `-DNabla_CUDA_TOOLKIT_ROOT=<cuda-root>` when requesting `CUDAInteropNative`.
+- Consumers can build native CUDA code against a compatible local SDK without rebuilding Nabla or `CUDAInterop`.
+- Changing CUDA SDK headers affects only targets that include `CUDAInteropNative.h`.
+- Native accessors accept Nabla objects, raw pointers, and `smart_refctd_ptr`.
 
 ## Design
 
-- The default Nabla package remains relocatable and usable on machines without the CUDA SDK.
-- CUDA is used privately to build the interop library. CUDA SDK headers become visible to consumers only when `CUDAInteropNative` is requested.
-- Clean interop headers expose Nabla concepts such as devices, exported memory, imported memory, and imported semaphores.
-- Native interop headers expose raw CUDA Driver API and NVRTC types for examples and applications that need direct CUDA work.
-- The split is intentionally similar to the OpenCV CUDA shape: common CUDA-facing headers stay clean, while raw CUDA access lives behind explicit opt-in accessor/native headers.
-- This avoids a transitive public compile-time dependency on CUDA while preserving the low-level workflow for kernels, `CUdeviceptr`, `CUmodule`, `CUfunction`, external memory, and external semaphores.
-- Package consumers can pick their own compatible CUDA SDK for native code without rebuilding Nabla or the clean interop library.
+- CUDA is used privately while building the interop library.
+- CUDA SDK headers become visible to consumers only through `CUDAInteropNative`.
+- `CUDAInterop` exposes Nabla concepts such as devices, exported memory, imported memory, and imported semaphores.
+- `CUDAInteropNative` exposes CUDA types such as `CUdeviceptr`, `CUmodule`, `CUfunction`, external memory, external semaphores, and NVRTC objects.
+- The target split follows the same general dependency shape used by libraries such as OpenCV: common CUDA-facing APIs do not force raw CUDA headers on every consumer, while raw CUDA access is available through an explicit opt-in header.
+- This avoids a transitive public compile-time dependency on CUDA from `Nabla::Nabla`.
 
 ## Usage
 

From 881e9b83c19388647336d56ef438f07b66781641 Mon Sep 17 00:00:00 2001
From: Arkadiusz Lachowicz <areklachowicz@gmail.com>
Date: Wed, 6 May 2026 17:43:35 +0200
Subject: [PATCH 12/27] Move CUDA interop into Nabla

---
 cmake/NablaConfig.cmake.in                    |  17 +--
 examples_tests                                |   2 +-
 include/nbl/ext/CUDAInterop/CCUDADevice.h     |   2 +-
 .../ext/CUDAInterop/CCUDAExportableMemory.h   |   2 +-
 include/nbl/ext/CUDAInterop/CCUDAHandler.h    |   2 +-
 .../nbl/ext/CUDAInterop/CCUDAImportedMemory.h |   2 +-
 .../ext/CUDAInterop/CCUDAImportedSemaphore.h  |   2 +-
 .../nbl/ext/CUDAInterop/CUDAInteropNative.h   |  40 +++----
 src/nbl/CMakeLists.txt                        |  46 ++++----
 src/nbl/ext/CMakeLists.txt                    |   8 --
 src/nbl/ext/CUDAInterop/CMakeLists.txt        |  51 ++-------
 src/nbl/ext/CUDAInterop/CUDAInteropStubs.cpp  | 100 ++++++++++++++++++
 src/nbl/ext/CUDAInterop/README.md             |  28 +++--
 src/nbl/ext/CUDAInterop/smoke/CMakeLists.txt  |  14 ++-
 .../ext/CUDAInterop/smoke/clean_opt_in.cpp    |   4 +-
 .../ext/CUDAInterop/smoke/native_opt_in.cpp   |   2 +-
 16 files changed, 183 insertions(+), 139 deletions(-)
 create mode 100644 src/nbl/ext/CUDAInterop/CUDAInteropStubs.cpp

diff --git a/cmake/NablaConfig.cmake.in b/cmake/NablaConfig.cmake.in
index afff3dcccc..8b9f62e548 100644
--- a/cmake/NablaConfig.cmake.in
+++ b/cmake/NablaConfig.cmake.in
@@ -7,7 +7,6 @@ set(Nabla_DXC_GIT_INFO_JSON_FILE "${PACKAGE_PREFIX_DIR}/include/dxc_git_info.jso
 set(_NBL_NABLA_LOAD_CORE OFF)
 set(_NBL_NABLA_LOAD_NSC OFF)
 set(_NBL_NABLA_LOAD_CUDA_INTEROP OFF)
-set(_NBL_NABLA_LOAD_CUDA_INTEROP_NATIVE OFF)
 set(_NBL_NABLA_COMPONENTS ${Nabla_FIND_COMPONENTS})
 set(_NBL_NABLA_HAS_CORE_EXPORTS OFF)
 set(_NBL_NABLA_HAS_NSC_EXPORTS OFF)
@@ -31,12 +30,6 @@ if(_NBL_NABLA_COMPONENTS)
       set(_NBL_NABLA_LOAD_CORE ON)
       set(_NBL_NABLA_LOAD_CUDA_INTEROP ON)
       set(Nabla_CUDAInterop_FOUND TRUE)
-    elseif(_NBL_NABLA_COMPONENT STREQUAL "CUDAInteropNative")
-      set(_NBL_NABLA_LOAD_CORE ON)
-      set(_NBL_NABLA_LOAD_CUDA_INTEROP ON)
-      set(_NBL_NABLA_LOAD_CUDA_INTEROP_NATIVE ON)
-      set(Nabla_CUDAInterop_FOUND TRUE)
-      set(Nabla_CUDAInteropNative_FOUND TRUE)
     else()
       set("Nabla_${_NBL_NABLA_COMPONENT}_FOUND" FALSE)
     endif()
@@ -93,10 +86,6 @@ if(_NBL_NABLA_LOAD_NSC)
 endif()
 
 if(_NBL_NABLA_LOAD_CUDA_INTEROP)
-  _nbl_try_include_component("CUDAInterop" "NablaCUDAInteropExportTargets.cmake" _NBL_NABLA_CUDA_INTEROP_FOUND)
-endif()
-
-if(_NBL_NABLA_LOAD_CUDA_INTEROP_NATIVE)
   include(CMakeFindDependencyMacro)
 
   if(DEFINED Nabla_CUDA_TOOLKIT_ROOT AND NOT "${Nabla_CUDA_TOOLKIT_ROOT}" STREQUAL "")
@@ -104,9 +93,9 @@ if(_NBL_NABLA_LOAD_CUDA_INTEROP_NATIVE)
   endif()
 
   find_dependency(CUDAToolkit 13.0 REQUIRED)
-  _nbl_try_include_component("CUDAInteropNative" "NablaCUDAInteropNativeExportTargets.cmake" _NBL_NABLA_CUDA_INTEROP_NATIVE_FOUND)
-  if(_NBL_NABLA_CUDA_INTEROP_NATIVE_FOUND AND TARGET Nabla::ext::CUDAInteropNative)
-    target_link_libraries(Nabla::ext::CUDAInteropNative INTERFACE CUDA::toolkit)
+  _nbl_try_include_component("CUDAInterop" "NablaCUDAInteropExportTargets.cmake" _NBL_NABLA_CUDA_INTEROP_FOUND)
+  if(_NBL_NABLA_CUDA_INTEROP_FOUND AND TARGET Nabla::ext::CUDAInterop)
+    target_link_libraries(Nabla::ext::CUDAInterop INTERFACE CUDA::toolkit)
   endif()
 endif()
 
diff --git a/examples_tests b/examples_tests
index dfa2b7ac39..3b59c9bc05 160000
--- a/examples_tests
+++ b/examples_tests
@@ -1 +1 @@
-Subproject commit dfa2b7ac39c6b9ae94ae2eb70c8f6ec251a9715e
+Subproject commit 3b59c9bc05d8784277d3a18e11f423dcb8ae2b74
diff --git a/include/nbl/ext/CUDAInterop/CCUDADevice.h b/include/nbl/ext/CUDAInterop/CCUDADevice.h
index 25c40e7ed6..7b994e053f 100644
--- a/include/nbl/ext/CUDAInterop/CCUDADevice.h
+++ b/include/nbl/ext/CUDAInterop/CCUDADevice.h
@@ -22,7 +22,7 @@ namespace cuda_native
 struct SAccess;
 }
 
-class CCUDADevice : public core::IReferenceCounted
+class NBL_API2 CCUDADevice : public core::IReferenceCounted
 {
 	public:
 		struct SNativeState;
diff --git a/include/nbl/ext/CUDAInterop/CCUDAExportableMemory.h b/include/nbl/ext/CUDAInterop/CCUDAExportableMemory.h
index 5973c31fac..b331d6a258 100644
--- a/include/nbl/ext/CUDAInterop/CCUDAExportableMemory.h
+++ b/include/nbl/ext/CUDAInterop/CCUDAExportableMemory.h
@@ -26,7 +26,7 @@ enum class ECUDAMemoryLocation : uint32_t
 	HOST_NUMA_CURRENT = 4
 };
 
-class CCUDAExportableMemory : public core::IReferenceCounted
+class NBL_API2 CCUDAExportableMemory : public core::IReferenceCounted
 {
 	public:
 		struct SNativeState;
diff --git a/include/nbl/ext/CUDAInterop/CCUDAHandler.h b/include/nbl/ext/CUDAInterop/CCUDAHandler.h
index 063598a518..6a3cc6c496 100644
--- a/include/nbl/ext/CUDAInterop/CCUDAHandler.h
+++ b/include/nbl/ext/CUDAInterop/CCUDAHandler.h
@@ -25,7 +25,7 @@ namespace cuda_native
 struct SAccess;
 }
 
-class CCUDAHandler : public core::IReferenceCounted
+class NBL_API2 CCUDAHandler : public core::IReferenceCounted
 {
 	public:
 		struct SNativeState;
diff --git a/include/nbl/ext/CUDAInterop/CCUDAImportedMemory.h b/include/nbl/ext/CUDAInterop/CCUDAImportedMemory.h
index 8a24f83907..adb803f12c 100644
--- a/include/nbl/ext/CUDAInterop/CCUDAImportedMemory.h
+++ b/include/nbl/ext/CUDAInterop/CCUDAImportedMemory.h
@@ -16,7 +16,7 @@ namespace cuda_native
 struct SAccess;
 }
 
-class CCUDAImportedMemory : public core::IReferenceCounted
+class NBL_API2 CCUDAImportedMemory : public core::IReferenceCounted
 {
 	public:
 		struct SNativeState;
diff --git a/include/nbl/ext/CUDAInterop/CCUDAImportedSemaphore.h b/include/nbl/ext/CUDAInterop/CCUDAImportedSemaphore.h
index 3ee03fb045..894f2444c0 100644
--- a/include/nbl/ext/CUDAInterop/CCUDAImportedSemaphore.h
+++ b/include/nbl/ext/CUDAInterop/CCUDAImportedSemaphore.h
@@ -19,7 +19,7 @@ namespace cuda_native
 struct SAccess;
 }
 
-class CCUDAImportedSemaphore : public core::IReferenceCounted
+class NBL_API2 CCUDAImportedSemaphore : public core::IReferenceCounted
 {
 	public:
 		struct SNativeState;
diff --git a/include/nbl/ext/CUDAInterop/CUDAInteropNative.h b/include/nbl/ext/CUDAInterop/CUDAInteropNative.h
index ea6313f26b..b73f2ae252 100644
--- a/include/nbl/ext/CUDAInterop/CUDAInteropNative.h
+++ b/include/nbl/ext/CUDAInterop/CUDAInteropNative.h
@@ -143,8 +143,8 @@ struct SCUDADeviceInfo
 	int attributes[CU_DEVICE_ATTRIBUTE_MAX] = {};
 };
 
-const CUDA& getCUDAFunctionTable(const CCUDAHandler& handler);
-const NVRTC& getNVRTCFunctionTable(const CCUDAHandler& handler);
+NBL_API2 const CUDA& getCUDAFunctionTable(const CCUDAHandler& handler);
+NBL_API2 const NVRTC& getNVRTCFunctionTable(const CCUDAHandler& handler);
 
 inline const CUDA& getCUDAFunctionTable(const CCUDAHandler* handler)
 {
@@ -166,14 +166,14 @@ inline const NVRTC& getNVRTCFunctionTable(const core::smart_refctd_ptr<CCUDAHand
 	return getNVRTCFunctionTable(*handler);
 }
 
-bool defaultHandleResult(CUresult result, const system::logger_opt_ptr& logger);
-bool defaultHandleResult(const CCUDAHandler& handler, CUresult result);
-bool defaultHandleResult(const CCUDAHandler& handler, nvrtcResult result);
+NBL_API2 bool defaultHandleResult(CUresult result, const system::logger_opt_ptr& logger);
+NBL_API2 bool defaultHandleResult(const CCUDAHandler& handler, CUresult result);
+NBL_API2 bool defaultHandleResult(const CCUDAHandler& handler, nvrtcResult result);
 
 template<typename T>
 T* cast_CUDA_ptr(CUdeviceptr ptr) { return reinterpret_cast<T*>(ptr); }
 
-const core::vector<SCUDADeviceInfo>& getAvailableDevices(const CCUDAHandler& handler);
+NBL_API2 const core::vector<SCUDADeviceInfo>& getAvailableDevices(const CCUDAHandler& handler);
 
 inline const core::vector<SCUDADeviceInfo>& getAvailableDevices(const CCUDAHandler* handler)
 {
@@ -185,12 +185,12 @@ inline const core::vector<SCUDADeviceInfo>& getAvailableDevices(const core::smar
 	return getAvailableDevices(*handler);
 }
 
-nvrtcResult createProgram(CCUDAHandler& handler, nvrtcProgram* prog, std::string&& source, const char* name, const int headerCount=0, const char* const* headerContents=nullptr, const char* const* includeNames=nullptr);
+NBL_API2 nvrtcResult createProgram(CCUDAHandler& handler, nvrtcProgram* prog, std::string&& source, const char* name, const int headerCount=0, const char* const* headerContents=nullptr, const char* const* includeNames=nullptr);
 inline nvrtcResult createProgram(CCUDAHandler& handler, nvrtcProgram* prog, const char* source, const char* name, const int headerCount=0, const char* const* headerContents=nullptr, const char* const* includeNames=nullptr)
 {
 	return createProgram(handler,prog,std::string(source),name,headerCount,headerContents,includeNames);
 }
-nvrtcResult createProgram(CCUDAHandler& handler, nvrtcProgram* prog, system::IFile* file, const int headerCount=0, const char* const* headerContents=nullptr, const char* const* includeNames=nullptr);
+NBL_API2 nvrtcResult createProgram(CCUDAHandler& handler, nvrtcProgram* prog, system::IFile* file, const int headerCount=0, const char* const* headerContents=nullptr, const char* const* includeNames=nullptr);
 inline nvrtcResult createProgram(CCUDAHandler* handler, nvrtcProgram* prog, std::string&& source, const char* name, const int headerCount=0, const char* const* headerContents=nullptr, const char* const* includeNames=nullptr)
 {
 	return createProgram(*handler,prog,std::move(source),name,headerCount,headerContents,includeNames);
@@ -215,8 +215,8 @@ inline nvrtcResult createProgram(const core::smart_refctd_ptr<CCUDAHandler>& han
 {
 	return createProgram(*handler,prog,file,headerCount,headerContents,includeNames);
 }
-nvrtcResult compileProgram(const CCUDAHandler& handler, nvrtcProgram prog, core::SRange<const char* const> options);
-nvrtcResult getProgramLog(const CCUDAHandler& handler, nvrtcProgram prog, std::string& log);
+NBL_API2 nvrtcResult compileProgram(const CCUDAHandler& handler, nvrtcProgram prog, core::SRange<const char* const> options);
+NBL_API2 nvrtcResult getProgramLog(const CCUDAHandler& handler, nvrtcProgram prog, std::string& log);
 
 struct ptx_and_nvrtcResult_t
 {
@@ -224,8 +224,8 @@ struct ptx_and_nvrtcResult_t
 	nvrtcResult result;
 };
 
-ptx_and_nvrtcResult_t getPTX(const CCUDAHandler& handler, nvrtcProgram prog);
-ptx_and_nvrtcResult_t compileDirectlyToPTX(
+NBL_API2 ptx_and_nvrtcResult_t getPTX(const CCUDAHandler& handler, nvrtcProgram prog);
+NBL_API2 ptx_and_nvrtcResult_t compileDirectlyToPTX(
 	CCUDAHandler& handler, std::string&& source, const char* filename, core::SRange<const char* const> nvrtcOptions,
 	const int headerCount=0, const char* const* headerContents=nullptr, const char* const* includeNames=nullptr,
 	std::string* log=nullptr
@@ -238,7 +238,7 @@ inline ptx_and_nvrtcResult_t compileDirectlyToPTX(
 {
 	return compileDirectlyToPTX(handler,std::string(source),filename,nvrtcOptions,headerCount,headerContents,includeNames,log);
 }
-ptx_and_nvrtcResult_t compileDirectlyToPTX(
+NBL_API2 ptx_and_nvrtcResult_t compileDirectlyToPTX(
 	CCUDAHandler& handler, system::IFile* file, core::SRange<const char* const> nvrtcOptions,
 	const int headerCount=0, const char* const* headerContents=nullptr, const char* const* includeNames=nullptr,
 	std::string* log=nullptr
@@ -292,13 +292,13 @@ inline ptx_and_nvrtcResult_t compileDirectlyToPTX(
 	return compileDirectlyToPTX(*handler,file,nvrtcOptions,headerCount,headerContents,includeNames,log);
 }
 
-CUdevice getInternalObject(const CCUDADevice& device);
-CUcontext getContext(const CCUDADevice& device);
-size_t roundToGranularity(const CCUDADevice& device, CUmemLocationType location, size_t size);
-CUdeviceptr getDeviceptr(const CCUDAExportableMemory& memory);
-CUexternalMemory getInternalObject(const CCUDAImportedMemory& memory);
-CUresult getMappedBuffer(const CCUDAImportedMemory& memory, CUdeviceptr* mappedBuffer);
-CUexternalSemaphore getInternalObject(const CCUDAImportedSemaphore& semaphore);
+NBL_API2 CUdevice getInternalObject(const CCUDADevice& device);
+NBL_API2 CUcontext getContext(const CCUDADevice& device);
+NBL_API2 size_t roundToGranularity(const CCUDADevice& device, CUmemLocationType location, size_t size);
+NBL_API2 CUdeviceptr getDeviceptr(const CCUDAExportableMemory& memory);
+NBL_API2 CUexternalMemory getInternalObject(const CCUDAImportedMemory& memory);
+NBL_API2 CUresult getMappedBuffer(const CCUDAImportedMemory& memory, CUdeviceptr* mappedBuffer);
+NBL_API2 CUexternalSemaphore getInternalObject(const CCUDAImportedSemaphore& semaphore);
 
 inline CUdevice getInternalObject(const CCUDADevice* device)
 {
diff --git a/src/nbl/CMakeLists.txt b/src/nbl/CMakeLists.txt
index ecf7f555c3..f0f7b275c0 100644
--- a/src/nbl/CMakeLists.txt
+++ b/src/nbl/CMakeLists.txt
@@ -124,6 +124,20 @@ set(NBL_CORE_SOURCES
 	core/alloc/refctd_memory_resource.cpp
 	core/hash/blake.cpp
 )
+
+set(NBL_CUDA_INTEROP_SOURCES
+	ext/CUDAInterop/CUDAInteropStubs.cpp
+)
+if(NBL_COMPILE_WITH_CUDA)
+	set(NBL_CUDA_INTEROP_SOURCES
+		ext/CUDAInterop/CCUDADevice.cpp
+		ext/CUDAInterop/CCUDAExportableMemory.cpp
+		ext/CUDAInterop/CCUDAHandler.cpp
+		ext/CUDAInterop/CCUDAImportedMemory.cpp
+		ext/CUDAInterop/CCUDAImportedSemaphore.cpp
+	)
+endif()
+
 set(NBL_SYSTEM_SOURCES
 	system/DefaultFuncPtrLoader.cpp
 	system/IFileBase.cpp
@@ -306,6 +320,7 @@ set(NABLA_SRCS_COMMON
 	${NBL_VIDEO_SOURCES}
 	${NBL_SCENE_SOURCES}
 	${NBL_META_SOURCES}
+	${NBL_CUDA_INTEROP_SOURCES}
 )
 
 if(MSVC)
@@ -416,6 +431,11 @@ if(NBL_CPACK_NO_BUILD_DIRECTORY_MODULES)
 	target_compile_definitions(Nabla PUBLIC NBL_CPACK_NO_BUILD_DIRECTORY_MODULES)
 endif()
 
+if(NBL_COMPILE_WITH_CUDA)
+	target_compile_definitions(Nabla PRIVATE _NBL_COMPILE_WITH_CUDA_)
+	target_include_directories(Nabla PRIVATE ${CUDAToolkit_INCLUDE_DIRS})
+endif()
+
 set(INTERFACE_BUILD_DEFINITIONS
 	_DXC_DLL_="${DXC_DLL}"
 )
@@ -783,35 +803,17 @@ add_subdirectory(ext EXCLUDE_FROM_ALL)
 propagate_changed_variables_to_parent_scope()
 
 if(DEFINED NBL_EXT_CUDA_INTEROP_LIB AND TARGET ${NBL_EXT_CUDA_INTEROP_LIB})
-	set_target_properties(${NBL_EXT_CUDA_INTEROP_LIB} PROPERTIES EXCLUDE_FROM_ALL OFF)
-
-	set(_NBL_EXT_CUDA_INTEROP_INSTALL_ARGS)
-	if(NBL_ENABLE_CONFIG_INSTALL AND NOT NBL_STATIC_BUILD)
-		list(APPEND _NBL_EXT_CUDA_INTEROP_INSTALL_ARGS EXPORT NablaCUDAInteropExportTargets)
-	endif()
-	nbl_install_lib_spec(${NBL_EXT_CUDA_INTEROP_LIB} "nbl/ext/CUDA_INTEROP" ${_NBL_EXT_CUDA_INTEROP_INSTALL_ARGS})
-
 	if(NBL_ENABLE_CONFIG_INSTALL AND NOT NBL_STATIC_BUILD)
+		install(TARGETS ${NBL_EXT_CUDA_INTEROP_LIB}
+			EXPORT NablaCUDAInteropExportTargets
+			COMPONENT Libraries
+		)
 		install(EXPORT NablaCUDAInteropExportTargets
 			NAMESPACE Nabla::
 			DESTINATION cmake
 			COMPONENT Libraries
 		)
 	endif()
-
-	if(DEFINED NBL_EXT_CUDA_INTEROP_NATIVE_LIB AND TARGET ${NBL_EXT_CUDA_INTEROP_NATIVE_LIB})
-		if(NBL_ENABLE_CONFIG_INSTALL AND NOT NBL_STATIC_BUILD)
-			install(TARGETS ${NBL_EXT_CUDA_INTEROP_NATIVE_LIB}
-				EXPORT NablaCUDAInteropNativeExportTargets
-				COMPONENT Libraries
-			)
-			install(EXPORT NablaCUDAInteropNativeExportTargets
-				NAMESPACE Nabla::
-				DESTINATION cmake
-				COMPONENT Libraries
-			)
-		endif()
-	endif()
 endif()
 
 if(TARGET ${NBL_EXT_FULL_SCREEN_TRIANGLE_LIB})
diff --git a/src/nbl/ext/CMakeLists.txt b/src/nbl/ext/CMakeLists.txt
index 59ae49285e..264cfc7c2d 100644
--- a/src/nbl/ext/CMakeLists.txt
+++ b/src/nbl/ext/CMakeLists.txt
@@ -40,18 +40,10 @@ endif()
 
 add_subdirectory(CUDAInterop)
 if (NBL_COMPILE_WITH_CUDA)
-    set(NBL_EXT_CUDA_INTEROP_INCLUDE_DIRS
-        ${NBL_EXT_CUDA_INTEROP_INCLUDE_DIRS}
-        PARENT_SCOPE
-    )
     set(NBL_EXT_CUDA_INTEROP_LIB
         ${NBL_EXT_CUDA_INTEROP_LIB}
         PARENT_SCOPE
     )
-    set(NBL_EXT_CUDA_INTEROP_NATIVE_LIB
-        ${NBL_EXT_CUDA_INTEROP_NATIVE_LIB}
-        PARENT_SCOPE
-    )
 endif()
 
 if (NBL_BUILD_IMGUI)
diff --git a/src/nbl/ext/CUDAInterop/CMakeLists.txt b/src/nbl/ext/CUDAInterop/CMakeLists.txt
index 973fbb232a..438ab51d8f 100644
--- a/src/nbl/ext/CUDAInterop/CMakeLists.txt
+++ b/src/nbl/ext/CUDAInterop/CMakeLists.txt
@@ -1,50 +1,17 @@
 include(${NBL_ROOT_PATH}/cmake/common.cmake)
 
 if (NBL_COMPILE_WITH_CUDA)
-	set(NBL_EXT_INTERNAL_INCLUDE_DIR "${NBL_ROOT_PATH}/include/nbl/ext/CUDAInterop")
+	set(NBL_EXT_CUDA_INTEROP_LIB "NblExtCUDA_INTEROP")
 
-	set(NBL_EXT_CUDA_INTEROP_H
-		${NBL_EXT_INTERNAL_INCLUDE_DIR}/CUDAInterop.h
-		${NBL_EXT_INTERNAL_INCLUDE_DIR}/CUDAInteropNative.h
-		${NBL_EXT_INTERNAL_INCLUDE_DIR}/CCUDADevice.h
-		${NBL_EXT_INTERNAL_INCLUDE_DIR}/CCUDAExportableMemory.h
-		${NBL_EXT_INTERNAL_INCLUDE_DIR}/CCUDAHandler.h
-		${NBL_EXT_INTERNAL_INCLUDE_DIR}/CCUDAImportedMemory.h
-		${NBL_EXT_INTERNAL_INCLUDE_DIR}/CCUDAImportedSemaphore.h
+	add_library(${NBL_EXT_CUDA_INTEROP_LIB} INTERFACE)
+	target_link_libraries(${NBL_EXT_CUDA_INTEROP_LIB} INTERFACE
+		$<BUILD_INTERFACE:Nabla>
+		$<BUILD_INTERFACE:CUDA::toolkit>
+		$<INSTALL_INTERFACE:Nabla::Nabla>
 	)
-
-	set(NBL_EXT_CUDA_INTEROP_SRC
-		CCUDADevice.cpp
-		CCUDAExportableMemory.cpp
-		CCUDAHandler.cpp
-		CCUDAImportedMemory.cpp
-		CCUDAImportedSemaphore.cpp
-	)
-
-	nbl_create_ext_library_project(
-		CUDA_INTEROP
-		"${NBL_EXT_CUDA_INTEROP_H}"
-		"${NBL_EXT_CUDA_INTEROP_SRC}"
-		""
-		""
-		""
-	)
-
-	target_compile_definitions(${LIB_NAME} PRIVATE _NBL_COMPILE_WITH_CUDA_)
-	target_include_directories(${LIB_NAME} PRIVATE ${CUDAToolkit_INCLUDE_DIRS})
-	set_target_properties(${LIB_NAME} PROPERTIES EXPORT_NAME "ext::CUDAInterop")
-	add_library(Nabla::ext::CUDAInterop ALIAS ${LIB_NAME})
-
-	set(NBL_EXT_CUDA_INTEROP_NATIVE_LIB "NblExtCUDA_INTEROP_NATIVE")
-	add_library(${NBL_EXT_CUDA_INTEROP_NATIVE_LIB} INTERFACE)
-	target_link_libraries(${NBL_EXT_CUDA_INTEROP_NATIVE_LIB} INTERFACE
-		$<BUILD_INTERFACE:${LIB_NAME}>
-		$<INSTALL_INTERFACE:Nabla::ext::CUDAInterop>
-		CUDA::toolkit
-	)
-	set_target_properties(${NBL_EXT_CUDA_INTEROP_NATIVE_LIB} PROPERTIES EXPORT_NAME "ext::CUDAInteropNative")
-	add_library(Nabla::ext::CUDAInteropNative ALIAS ${NBL_EXT_CUDA_INTEROP_NATIVE_LIB})
-	set(NBL_EXT_CUDA_INTEROP_NATIVE_LIB "${NBL_EXT_CUDA_INTEROP_NATIVE_LIB}" PARENT_SCOPE)
+	set_target_properties(${NBL_EXT_CUDA_INTEROP_LIB} PROPERTIES EXPORT_NAME "ext::CUDAInterop")
+	add_library(Nabla::ext::CUDAInterop ALIAS ${NBL_EXT_CUDA_INTEROP_LIB})
+	set(NBL_EXT_CUDA_INTEROP_LIB "${NBL_EXT_CUDA_INTEROP_LIB}" PARENT_SCOPE)
 endif()
 
 add_subdirectory(smoke)
diff --git a/src/nbl/ext/CUDAInterop/CUDAInteropStubs.cpp b/src/nbl/ext/CUDAInterop/CUDAInteropStubs.cpp
new file mode 100644
index 0000000000..db2b068391
--- /dev/null
+++ b/src/nbl/ext/CUDAInterop/CUDAInteropStubs.cpp
@@ -0,0 +1,100 @@
+#include "nbl/ext/CUDAInterop/CUDAInterop.h"
+
+namespace nbl::video
+{
+
+struct CCUDAHandler::SNativeState {};
+struct CCUDADevice::SNativeState {};
+struct CCUDAExportableMemory::SNativeState {};
+struct CCUDAImportedMemory::SNativeState {};
+struct CCUDAImportedSemaphore::SNativeState {};
+
+CCUDAHandler::CCUDAHandler(
+	std::unique_ptr<SNativeState>&& nativeState,
+	core::vector<core::smart_refctd_ptr<system::IFile>>&& _headers,
+	core::smart_refctd_ptr<system::ILogger>&& _logger,
+	int _version)
+	: m_native(std::move(nativeState))
+	, m_headers(std::move(_headers))
+	, m_logger(std::move(_logger))
+	, m_version(_version)
+{}
+
+CCUDAHandler::~CCUDAHandler() = default;
+
+core::smart_refctd_ptr<CCUDAHandler> CCUDAHandler::create(system::ISystem*, core::smart_refctd_ptr<system::ILogger>&&)
+{
+	return nullptr;
+}
+
+core::smart_refctd_ptr<CCUDADevice> CCUDAHandler::createDevice(core::smart_refctd_ptr<CVulkanConnection>&&, IPhysicalDevice*)
+{
+	return nullptr;
+}
+
+CCUDADevice::CCUDADevice(
+	core::smart_refctd_ptr<CVulkanConnection>&& vulkanConnection,
+	IPhysicalDevice* const vulkanDevice,
+	const E_VIRTUAL_ARCHITECTURE virtualArchitecture,
+	std::unique_ptr<SNativeState>&& nativeState,
+	core::smart_refctd_ptr<CCUDAHandler>&& handler)
+	: m_logger(nullptr)
+	, m_vulkanConnection(std::move(vulkanConnection))
+	, m_physicalDevice(vulkanDevice)
+	, m_virtualArchitecture(virtualArchitecture)
+	, m_handler(std::move(handler))
+	, m_native(std::move(nativeState))
+{}
+
+CCUDADevice::~CCUDADevice() = default;
+
+size_t CCUDADevice::roundToGranularity(ECUDAMemoryLocation, size_t size) const
+{
+	return size;
+}
+
+core::smart_refctd_ptr<CCUDAExportableMemory> CCUDADevice::createExportableMemory(CCUDAExportableMemory::SCreationParams&&)
+{
+	return nullptr;
+}
+
+core::smart_refctd_ptr<CCUDAImportedMemory> CCUDADevice::importExternalMemory(core::smart_refctd_ptr<IDeviceMemoryAllocation>&&)
+{
+	return nullptr;
+}
+
+core::smart_refctd_ptr<CCUDAImportedSemaphore> CCUDADevice::importExternalSemaphore(core::smart_refctd_ptr<ISemaphore>&&)
+{
+	return nullptr;
+}
+
+CCUDAExportableMemory::CCUDAExportableMemory(core::smart_refctd_ptr<CCUDADevice> device, SCachedCreationParams&& params, std::unique_ptr<SNativeState>&& nativeState)
+	: m_device(std::move(device))
+	, m_params(std::move(params))
+	, m_native(std::move(nativeState))
+{}
+
+CCUDAExportableMemory::~CCUDAExportableMemory() = default;
+
+core::smart_refctd_ptr<IDeviceMemoryAllocation> CCUDAExportableMemory::exportAsMemory(ILogicalDevice*, IDeviceMemoryBacked*) const
+{
+	return nullptr;
+}
+
+CCUDAImportedMemory::CCUDAImportedMemory(core::smart_refctd_ptr<CCUDADevice> device, core::smart_refctd_ptr<IDeviceMemoryAllocation> src, std::unique_ptr<SNativeState>&& nativeState)
+	: m_device(std::move(device))
+	, m_src(std::move(src))
+	, m_native(std::move(nativeState))
+{}
+
+CCUDAImportedMemory::~CCUDAImportedMemory() = default;
+
+CCUDAImportedSemaphore::CCUDAImportedSemaphore(core::smart_refctd_ptr<CCUDADevice> device, core::smart_refctd_ptr<ISemaphore> src, std::unique_ptr<SNativeState>&& nativeState)
+	: m_device(std::move(device))
+	, m_src(std::move(src))
+	, m_native(std::move(nativeState))
+{}
+
+CCUDAImportedSemaphore::~CCUDAImportedSemaphore() = default;
+
+}
diff --git a/src/nbl/ext/CUDAInterop/README.md b/src/nbl/ext/CUDAInterop/README.md
index 104f7f2eca..6eee617714 100644
--- a/src/nbl/ext/CUDAInterop/README.md
+++ b/src/nbl/ext/CUDAInterop/README.md
@@ -1,21 +1,22 @@
 # CUDA Interop Targets
 
 - `Nabla::Nabla` does not require the CUDA SDK.
-- `Nabla::ext::CUDAInterop` provides Nabla CUDA interop types. Its public headers do not include `cuda.h` or `nvrtc.h`.
-- `Nabla::ext::CUDAInteropNative` provides raw CUDA Driver API and NVRTC access through `CUDAInteropNative.h`.
-- `CUDAInteropNative` requires `CUDAToolkit`. `CUDAInterop` does not expose that requirement to consumers.
-- Consumers can override the SDK root with `-DNabla_CUDA_TOOLKIT_ROOT=<cuda-root>` when requesting `CUDAInteropNative`.
-- Consumers can build native CUDA code against a compatible local SDK without rebuilding Nabla or `CUDAInterop`.
+- `Nabla::Nabla` always provides the Nabla CUDA interop types; when the package was built without CUDA support they are stubs whose creation/import functions return `nullptr`.
+- Nabla CUDA interop public headers do not include `cuda.h` or `nvrtc.h`.
+- `Nabla::ext::CUDAInterop` is the raw CUDA Driver API and NVRTC opt-in target.
+- `Nabla::ext::CUDAInterop` requires `CUDAToolkit` and exposes `CUDAInteropNative.h`.
+- Consumers can override the SDK root with `-DNabla_CUDA_TOOLKIT_ROOT=<cuda-root>` when requesting `CUDAInterop`.
+- Consumers can build native CUDA code against a compatible local SDK without rebuilding Nabla.
 - Changing CUDA SDK headers affects only targets that include `CUDAInteropNative.h`.
 - Native accessors accept Nabla objects, raw pointers, and `smart_refctd_ptr`.
 
 ## Design
 
-- CUDA is used privately while building the interop library.
-- CUDA SDK headers become visible to consumers only through `CUDAInteropNative`.
-- `CUDAInterop` exposes Nabla concepts such as devices, exported memory, imported memory, and imported semaphores.
-- `CUDAInteropNative` exposes CUDA types such as `CUdeviceptr`, `CUmodule`, `CUfunction`, external memory, external semaphores, and NVRTC objects.
-- The target split follows the same general dependency shape used by libraries such as OpenCV: common CUDA-facing APIs do not force raw CUDA headers on every consumer, while raw CUDA access is available through an explicit opt-in header.
+- CUDA is used privately while building `Nabla::Nabla`.
+- CUDA SDK headers become visible to consumers only through `Nabla::ext::CUDAInterop`.
+- `Nabla::Nabla` exposes Nabla concepts such as devices, exported memory, imported memory, and imported semaphores.
+- `Nabla::ext::CUDAInterop` exposes CUDA types such as `CUdeviceptr`, `CUmodule`, `CUfunction`, external memory, external semaphores, and NVRTC objects.
+- The dependency shape follows the same general model used by libraries such as OpenCV: common CUDA-facing APIs do not force raw CUDA headers on every consumer, while raw CUDA access is available through an explicit opt-in header.
 - This avoids a transitive public compile-time dependency on CUDA from `Nabla::Nabla`.
 
 ## Usage
@@ -27,10 +28,5 @@ target_link_libraries(app PRIVATE Nabla::Nabla)
 
 ```cmake
 find_package(Nabla CONFIG REQUIRED COMPONENTS Core CUDAInterop)
-target_link_libraries(app PRIVATE Nabla::ext::CUDAInterop)
-```
-
-```cmake
-find_package(Nabla CONFIG REQUIRED COMPONENTS Core CUDAInteropNative)
-target_link_libraries(app PRIVATE Nabla::ext::CUDAInteropNative)
+target_link_libraries(native_app PRIVATE Nabla::ext::CUDAInterop)
 ```
diff --git a/src/nbl/ext/CUDAInterop/smoke/CMakeLists.txt b/src/nbl/ext/CUDAInterop/smoke/CMakeLists.txt
index 71bdac260d..bdda95fb03 100644
--- a/src/nbl/ext/CUDAInterop/smoke/CMakeLists.txt
+++ b/src/nbl/ext/CUDAInterop/smoke/CMakeLists.txt
@@ -4,9 +4,9 @@ project(NblExtCUDAInteropSmoke CXX)
 option(NBL_CUDA_INTEROP_SMOKE_WITH_NATIVE "Build the CUDA native opt-in smoke from an installed Nabla package." OFF)
 
 if(NOT TARGET Nabla::Nabla)
-	set(_NBL_CUDA_INTEROP_SMOKE_COMPONENTS Core CUDAInterop)
+	set(_NBL_CUDA_INTEROP_SMOKE_COMPONENTS Core)
 	if(NBL_CUDA_INTEROP_SMOKE_WITH_NATIVE)
-		list(APPEND _NBL_CUDA_INTEROP_SMOKE_COMPONENTS CUDAInteropNative)
+		list(APPEND _NBL_CUDA_INTEROP_SMOKE_COMPONENTS CUDAInterop)
 	endif()
 	find_package(Nabla REQUIRED CONFIG COMPONENTS ${_NBL_CUDA_INTEROP_SMOKE_COMPONENTS})
 endif()
@@ -23,12 +23,10 @@ endfunction()
 nbl_add_cuda_interop_smoke(NblExtCUDAInteropPublicBoundarySmoke public_boundary.cpp)
 target_link_libraries(NblExtCUDAInteropPublicBoundarySmoke PRIVATE Nabla::Nabla)
 
-if(TARGET Nabla::ext::CUDAInterop)
-	nbl_add_cuda_interop_smoke(NblExtCUDAInteropCleanOptInSmoke clean_opt_in.cpp)
-	target_link_libraries(NblExtCUDAInteropCleanOptInSmoke PRIVATE Nabla::ext::CUDAInterop)
-endif()
+nbl_add_cuda_interop_smoke(NblExtCUDAInteropCleanNablaSmoke clean_opt_in.cpp)
+target_link_libraries(NblExtCUDAInteropCleanNablaSmoke PRIVATE Nabla::Nabla)
 
-if(TARGET Nabla::ext::CUDAInteropNative)
+if(TARGET Nabla::ext::CUDAInterop)
 	nbl_add_cuda_interop_smoke(NblExtCUDAInteropNativeOptInSmoke native_opt_in.cpp)
-	target_link_libraries(NblExtCUDAInteropNativeOptInSmoke PRIVATE Nabla::ext::CUDAInteropNative)
+	target_link_libraries(NblExtCUDAInteropNativeOptInSmoke PRIVATE Nabla::ext::CUDAInterop)
 endif()
diff --git a/src/nbl/ext/CUDAInterop/smoke/clean_opt_in.cpp b/src/nbl/ext/CUDAInterop/smoke/clean_opt_in.cpp
index 6952433f9e..348caa766e 100644
--- a/src/nbl/ext/CUDAInterop/smoke/clean_opt_in.cpp
+++ b/src/nbl/ext/CUDAInterop/smoke/clean_opt_in.cpp
@@ -4,11 +4,11 @@
 #include <type_traits>
 
 #ifdef _NBL_COMPILE_WITH_CUDA_
-#error "Nabla::ext::CUDAInterop must not propagate the CUDA build define."
+#error "Nabla::Nabla must not propagate the CUDA build define."
 #endif
 
 #ifdef CUDA_VERSION
-#error "Nabla::ext::CUDAInterop must not require CUDA SDK headers."
+#error "Nabla::Nabla must not require CUDA SDK headers."
 #endif
 
 namespace
diff --git a/src/nbl/ext/CUDAInterop/smoke/native_opt_in.cpp b/src/nbl/ext/CUDAInterop/smoke/native_opt_in.cpp
index 4c001ab6ce..a78f710040 100644
--- a/src/nbl/ext/CUDAInterop/smoke/native_opt_in.cpp
+++ b/src/nbl/ext/CUDAInterop/smoke/native_opt_in.cpp
@@ -8,7 +8,7 @@
 #include <utility>
 
 #ifndef CUDA_VERSION
-#error "Nabla::ext::CUDAInteropNative must expose CUDA SDK headers."
+#error "Nabla::ext::CUDAInterop must expose CUDA SDK headers."
 #endif
 
 namespace

From 5dd1134ffc7d144e24f0ee3a55a283025b01fed8 Mon Sep 17 00:00:00 2001
From: Arkadiusz Lachowicz <areklachowicz@gmail.com>
Date: Thu, 7 May 2026 06:15:03 +0200
Subject: [PATCH 13/27] Document CUDA interop accessor model

---
 src/nbl/ext/CUDAInterop/README.md | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/src/nbl/ext/CUDAInterop/README.md b/src/nbl/ext/CUDAInterop/README.md
index 6eee617714..a7c1e654be 100644
--- a/src/nbl/ext/CUDAInterop/README.md
+++ b/src/nbl/ext/CUDAInterop/README.md
@@ -19,6 +19,14 @@
 - The dependency shape follows the same general model used by libraries such as OpenCV: common CUDA-facing APIs do not force raw CUDA headers on every consumer, while raw CUDA access is available through an explicit opt-in header.
 - This avoids a transitive public compile-time dependency on CUDA from `Nabla::Nabla`.
 
+## OpenCV Reference
+
+- OpenCV's common CUDA header includes OpenCV headers, not raw CUDA SDK headers: [`cuda.hpp`](https://github.com/opencv/opencv/blob/808d2d596c475d95fedb6025c9ed425d62bba04c/modules/core/include/opencv2/core/cuda.hpp#L51-L52).
+- OpenCV keeps the public stream type as an OpenCV abstraction and grants access through `StreamAccessor`: [`cuda.hpp`](https://github.com/opencv/opencv/blob/808d2d596c475d95fedb6025c9ed425d62bba04c/modules/core/include/opencv2/core/cuda.hpp#L916-L979).
+- OpenCV's raw CUDA opt-in header says it is the only header that depends on the CUDA Runtime API, then includes `<cuda_runtime.h>` and exposes accessor types: [`cuda_stream_accessor.hpp`](https://github.com/opencv/opencv/blob/808d2d596c475d95fedb6025c9ed425d62bba04c/modules/core/include/opencv2/core/cuda_stream_accessor.hpp#L50-L79).
+- OpenCV also keeps implementation CUDA headers private and includes `<cuda.h>` / `<cuda_runtime.h>` there: [`private.cuda.hpp`](https://github.com/opencv/opencv/blob/808d2d596c475d95fedb6025c9ed425d62bba04c/modules/core/include/opencv2/core/private.cuda.hpp#L47-L61).
+- The same split is used here: Nabla CUDA objects stay in `Nabla::Nabla`, and raw CUDA handles/functions are available only after including `CUDAInteropNative.h` and linking `Nabla::ext::CUDAInterop`.
+
 ## Usage
 
 ```cmake

From e514df7f505bbc168f13ccc50750a62c2e6680bf Mon Sep 17 00:00:00 2001
From: Arkadiusz Lachowicz <areklachowicz@gmail.com>
Date: Thu, 7 May 2026 06:41:06 +0200
Subject: [PATCH 14/27] Inline CUDA interop stubs

---
 src/nbl/CMakeLists.txt                        |  15 +--
 src/nbl/ext/CUDAInterop/CCUDADevice.cpp       |  50 ++++++++-
 .../ext/CUDAInterop/CCUDAExportableMemory.cpp |  27 ++++-
 src/nbl/ext/CUDAInterop/CCUDAHandler.cpp      |  38 ++++++-
 .../ext/CUDAInterop/CCUDAImportedMemory.cpp   |  21 +++-
 .../CUDAInterop/CCUDAImportedSemaphore.cpp    |  22 +++-
 src/nbl/ext/CUDAInterop/CUDAInteropStubs.cpp  | 100 ------------------
 src/nbl/ext/CUDAInterop/README.md             |  12 +++
 8 files changed, 169 insertions(+), 116 deletions(-)
 delete mode 100644 src/nbl/ext/CUDAInterop/CUDAInteropStubs.cpp

diff --git a/src/nbl/CMakeLists.txt b/src/nbl/CMakeLists.txt
index f0f7b275c0..ccb600ca32 100644
--- a/src/nbl/CMakeLists.txt
+++ b/src/nbl/CMakeLists.txt
@@ -126,17 +126,12 @@ set(NBL_CORE_SOURCES
 )
 
 set(NBL_CUDA_INTEROP_SOURCES
-	ext/CUDAInterop/CUDAInteropStubs.cpp
+	ext/CUDAInterop/CCUDADevice.cpp
+	ext/CUDAInterop/CCUDAExportableMemory.cpp
+	ext/CUDAInterop/CCUDAHandler.cpp
+	ext/CUDAInterop/CCUDAImportedMemory.cpp
+	ext/CUDAInterop/CCUDAImportedSemaphore.cpp
 )
-if(NBL_COMPILE_WITH_CUDA)
-	set(NBL_CUDA_INTEROP_SOURCES
-		ext/CUDAInterop/CCUDADevice.cpp
-		ext/CUDAInterop/CCUDAExportableMemory.cpp
-		ext/CUDAInterop/CCUDAHandler.cpp
-		ext/CUDAInterop/CCUDAImportedMemory.cpp
-		ext/CUDAInterop/CCUDAImportedSemaphore.cpp
-	)
-endif()
 
 set(NBL_SYSTEM_SOURCES
 	system/DefaultFuncPtrLoader.cpp
diff --git a/src/nbl/ext/CUDAInterop/CCUDADevice.cpp b/src/nbl/ext/CUDAInterop/CCUDADevice.cpp
index 5f59545173..7d002c86ca 100644
--- a/src/nbl/ext/CUDAInterop/CCUDADevice.cpp
+++ b/src/nbl/ext/CUDAInterop/CCUDADevice.cpp
@@ -1,13 +1,15 @@
 // Copyright (C) 2018-2020 - DevSH Graphics Programming Sp. z O.O.
 // This file is part of the "Nabla Engine".
 // For conditions of distribution and use, see copyright notice in nabla.h
+#include "nbl/ext/CUDAInterop/CUDAInterop.h"
+
+#ifdef _NBL_COMPILE_WITH_CUDA_
 #include "CUDAInteropNativeState.hpp"
 
 #ifdef _WIN32
 #include <winternl.h>
 #endif
 
-#ifdef _NBL_COMPILE_WITH_CUDA_
 namespace nbl::video
 {
 
@@ -241,4 +243,50 @@ CCUDADevice::~CCUDADevice()
 
 }
 
+#else
+
+namespace nbl::video
+{
+
+// CUDA OFF stub keeps the clean public API linkable and reports feature absence with nullptr instead of unresolved symbols.
+struct CCUDADevice::SNativeState {};
+
+CCUDADevice::CCUDADevice(
+	core::smart_refctd_ptr<CVulkanConnection>&& vulkanConnection,
+	IPhysicalDevice* const vulkanDevice,
+	const E_VIRTUAL_ARCHITECTURE virtualArchitecture,
+	std::unique_ptr<SNativeState>&& nativeState,
+	core::smart_refctd_ptr<CCUDAHandler>&& handler)
+	: m_logger(nullptr)
+	, m_vulkanConnection(std::move(vulkanConnection))
+	, m_physicalDevice(vulkanDevice)
+	, m_virtualArchitecture(virtualArchitecture)
+	, m_handler(std::move(handler))
+	, m_native(std::move(nativeState))
+{}
+
+CCUDADevice::~CCUDADevice() = default;
+
+size_t CCUDADevice::roundToGranularity(ECUDAMemoryLocation, size_t size) const
+{
+	return size;
+}
+
+core::smart_refctd_ptr<CCUDAExportableMemory> CCUDADevice::createExportableMemory(CCUDAExportableMemory::SCreationParams&&)
+{
+	return nullptr;
+}
+
+core::smart_refctd_ptr<CCUDAImportedMemory> CCUDADevice::importExternalMemory(core::smart_refctd_ptr<IDeviceMemoryAllocation>&&)
+{
+	return nullptr;
+}
+
+core::smart_refctd_ptr<CCUDAImportedSemaphore> CCUDADevice::importExternalSemaphore(core::smart_refctd_ptr<ISemaphore>&&)
+{
+	return nullptr;
+}
+
+}
+
 #endif // _NBL_COMPILE_WITH_CUDA_
diff --git a/src/nbl/ext/CUDAInterop/CCUDAExportableMemory.cpp b/src/nbl/ext/CUDAInterop/CCUDAExportableMemory.cpp
index 94d18c40bb..a89e42b2f6 100644
--- a/src/nbl/ext/CUDAInterop/CCUDAExportableMemory.cpp
+++ b/src/nbl/ext/CUDAInterop/CCUDAExportableMemory.cpp
@@ -2,9 +2,11 @@
 // This file is part of the "Nabla Engine".
 // For conditions of distribution and use, see copyright notice in nabla.h
 
-#include "CUDAInteropNativeState.hpp"
+#include "nbl/ext/CUDAInterop/CUDAInterop.h"
 
 #ifdef _NBL_COMPILE_WITH_CUDA_
+#include "CUDAInteropNativeState.hpp"
+
 namespace nbl::video
 {
 
@@ -66,4 +68,27 @@ CUdeviceptr getDeviceptr(const CCUDAExportableMemory& memory)
 }
 }
 
+#else
+
+namespace nbl::video
+{
+
+// CUDA OFF stub keeps the clean public API linkable and reports feature absence with nullptr instead of unresolved symbols.
+struct CCUDAExportableMemory::SNativeState {};
+
+CCUDAExportableMemory::CCUDAExportableMemory(core::smart_refctd_ptr<CCUDADevice> device, SCachedCreationParams&& params, std::unique_ptr<SNativeState>&& nativeState)
+	: m_device(std::move(device))
+	, m_params(std::move(params))
+	, m_native(std::move(nativeState))
+{}
+
+CCUDAExportableMemory::~CCUDAExportableMemory() = default;
+
+core::smart_refctd_ptr<IDeviceMemoryAllocation> CCUDAExportableMemory::exportAsMemory(ILogicalDevice*, IDeviceMemoryBacked*) const
+{
+	return nullptr;
+}
+
+}
+
 #endif // _NBL_COMPILE_WITH_CUDA_
diff --git a/src/nbl/ext/CUDAInterop/CCUDAHandler.cpp b/src/nbl/ext/CUDAInterop/CCUDAHandler.cpp
index 49e36083d4..51f0656f6c 100644
--- a/src/nbl/ext/CUDAInterop/CCUDAHandler.cpp
+++ b/src/nbl/ext/CUDAInterop/CCUDAHandler.cpp
@@ -2,10 +2,11 @@
 // This file is part of the "Nabla Engine".
 // For conditions of distribution and use, see copyright notice in nabla.h
 
-#include "CUDAInteropNativeState.hpp"
-#include "nbl/system/CFileView.h"
+#include "nbl/ext/CUDAInterop/CUDAInterop.h"
 
 #ifdef _NBL_COMPILE_WITH_CUDA_
+#include "CUDAInteropNativeState.hpp"
+#include "nbl/system/CFileView.h"
 #include "jitify/jitify.hpp"
 
 
@@ -769,4 +770,37 @@ core::smart_refctd_ptr<CCUDADevice> CCUDAHandler::createDevice(core::smart_refct
 
 }
 
+#else
+
+namespace nbl::video
+{
+
+// CUDA OFF stub keeps the clean public API linkable and reports feature absence with nullptr instead of unresolved symbols.
+struct CCUDAHandler::SNativeState {};
+
+CCUDAHandler::CCUDAHandler(
+	std::unique_ptr<SNativeState>&& nativeState,
+	core::vector<core::smart_refctd_ptr<system::IFile>>&& _headers,
+	core::smart_refctd_ptr<system::ILogger>&& _logger,
+	int _version)
+	: m_native(std::move(nativeState))
+	, m_headers(std::move(_headers))
+	, m_logger(std::move(_logger))
+	, m_version(_version)
+{}
+
+CCUDAHandler::~CCUDAHandler() = default;
+
+core::smart_refctd_ptr<CCUDAHandler> CCUDAHandler::create(system::ISystem*, core::smart_refctd_ptr<system::ILogger>&&)
+{
+	return nullptr;
+}
+
+core::smart_refctd_ptr<CCUDADevice> CCUDAHandler::createDevice(core::smart_refctd_ptr<CVulkanConnection>&&, IPhysicalDevice*)
+{
+	return nullptr;
+}
+
+}
+
 #endif // _NBL_COMPILE_WITH_CUDA_
diff --git a/src/nbl/ext/CUDAInterop/CCUDAImportedMemory.cpp b/src/nbl/ext/CUDAInterop/CCUDAImportedMemory.cpp
index bbc65f91ab..8de3ce3e63 100644
--- a/src/nbl/ext/CUDAInterop/CCUDAImportedMemory.cpp
+++ b/src/nbl/ext/CUDAInterop/CCUDAImportedMemory.cpp
@@ -2,9 +2,10 @@
 // This file is part of the "Nabla Engine".
 // For conditions of distribution and use, see copyright notice in nabla.h
 
-#include "CUDAInteropNativeState.hpp"
+#include "nbl/ext/CUDAInterop/CUDAInterop.h"
 
 #ifdef _NBL_COMPILE_WITH_CUDA_
+#include "CUDAInteropNativeState.hpp"
 
 namespace nbl::video
 {
@@ -44,4 +45,22 @@ CCUDAImportedMemory::~CCUDAImportedMemory()
 
 }
 
+#else
+
+namespace nbl::video
+{
+
+// CUDA OFF stub keeps the clean public API linkable and reports feature absence with nullptr instead of unresolved symbols.
+struct CCUDAImportedMemory::SNativeState {};
+
+CCUDAImportedMemory::CCUDAImportedMemory(core::smart_refctd_ptr<CCUDADevice> device, core::smart_refctd_ptr<IDeviceMemoryAllocation> src, std::unique_ptr<SNativeState>&& nativeState)
+	: m_device(std::move(device))
+	, m_src(std::move(src))
+	, m_native(std::move(nativeState))
+{}
+
+CCUDAImportedMemory::~CCUDAImportedMemory() = default;
+
+}
+
 #endif
diff --git a/src/nbl/ext/CUDAInterop/CCUDAImportedSemaphore.cpp b/src/nbl/ext/CUDAInterop/CCUDAImportedSemaphore.cpp
index b6e3b319f7..fdbb56b0cf 100644
--- a/src/nbl/ext/CUDAInterop/CCUDAImportedSemaphore.cpp
+++ b/src/nbl/ext/CUDAInterop/CCUDAImportedSemaphore.cpp
@@ -2,9 +2,11 @@
 // This file is part of the "Nabla Engine".
 // For conditions of distribution and use, see copyright notice in nabla.h
 
-#include "CUDAInteropNativeState.hpp"
+#include "nbl/ext/CUDAInterop/CUDAInterop.h"
 
 #ifdef _NBL_COMPILE_WITH_CUDA_
+#include "CUDAInteropNativeState.hpp"
+
 namespace nbl::video
 {
 CCUDAImportedSemaphore::CCUDAImportedSemaphore(core::smart_refctd_ptr<CCUDADevice> device, core::smart_refctd_ptr<ISemaphore> src, std::unique_ptr<SNativeState>&& nativeState)
@@ -30,4 +32,22 @@ CCUDAImportedSemaphore::~CCUDAImportedSemaphore()
 }
 }
 
+#else
+
+namespace nbl::video
+{
+
+// CUDA OFF stub keeps the clean public API linkable and reports feature absence with nullptr instead of unresolved symbols.
+struct CCUDAImportedSemaphore::SNativeState {};
+
+CCUDAImportedSemaphore::CCUDAImportedSemaphore(core::smart_refctd_ptr<CCUDADevice> device, core::smart_refctd_ptr<ISemaphore> src, std::unique_ptr<SNativeState>&& nativeState)
+	: m_device(std::move(device))
+	, m_src(std::move(src))
+	, m_native(std::move(nativeState))
+{}
+
+CCUDAImportedSemaphore::~CCUDAImportedSemaphore() = default;
+
+}
+
 #endif // _NBL_COMPILE_WITH_CUDA_
diff --git a/src/nbl/ext/CUDAInterop/CUDAInteropStubs.cpp b/src/nbl/ext/CUDAInterop/CUDAInteropStubs.cpp
deleted file mode 100644
index db2b068391..0000000000
--- a/src/nbl/ext/CUDAInterop/CUDAInteropStubs.cpp
+++ /dev/null
@@ -1,100 +0,0 @@
-#include "nbl/ext/CUDAInterop/CUDAInterop.h"
-
-namespace nbl::video
-{
-
-struct CCUDAHandler::SNativeState {};
-struct CCUDADevice::SNativeState {};
-struct CCUDAExportableMemory::SNativeState {};
-struct CCUDAImportedMemory::SNativeState {};
-struct CCUDAImportedSemaphore::SNativeState {};
-
-CCUDAHandler::CCUDAHandler(
-	std::unique_ptr<SNativeState>&& nativeState,
-	core::vector<core::smart_refctd_ptr<system::IFile>>&& _headers,
-	core::smart_refctd_ptr<system::ILogger>&& _logger,
-	int _version)
-	: m_native(std::move(nativeState))
-	, m_headers(std::move(_headers))
-	, m_logger(std::move(_logger))
-	, m_version(_version)
-{}
-
-CCUDAHandler::~CCUDAHandler() = default;
-
-core::smart_refctd_ptr<CCUDAHandler> CCUDAHandler::create(system::ISystem*, core::smart_refctd_ptr<system::ILogger>&&)
-{
-	return nullptr;
-}
-
-core::smart_refctd_ptr<CCUDADevice> CCUDAHandler::createDevice(core::smart_refctd_ptr<CVulkanConnection>&&, IPhysicalDevice*)
-{
-	return nullptr;
-}
-
-CCUDADevice::CCUDADevice(
-	core::smart_refctd_ptr<CVulkanConnection>&& vulkanConnection,
-	IPhysicalDevice* const vulkanDevice,
-	const E_VIRTUAL_ARCHITECTURE virtualArchitecture,
-	std::unique_ptr<SNativeState>&& nativeState,
-	core::smart_refctd_ptr<CCUDAHandler>&& handler)
-	: m_logger(nullptr)
-	, m_vulkanConnection(std::move(vulkanConnection))
-	, m_physicalDevice(vulkanDevice)
-	, m_virtualArchitecture(virtualArchitecture)
-	, m_handler(std::move(handler))
-	, m_native(std::move(nativeState))
-{}
-
-CCUDADevice::~CCUDADevice() = default;
-
-size_t CCUDADevice::roundToGranularity(ECUDAMemoryLocation, size_t size) const
-{
-	return size;
-}
-
-core::smart_refctd_ptr<CCUDAExportableMemory> CCUDADevice::createExportableMemory(CCUDAExportableMemory::SCreationParams&&)
-{
-	return nullptr;
-}
-
-core::smart_refctd_ptr<CCUDAImportedMemory> CCUDADevice::importExternalMemory(core::smart_refctd_ptr<IDeviceMemoryAllocation>&&)
-{
-	return nullptr;
-}
-
-core::smart_refctd_ptr<CCUDAImportedSemaphore> CCUDADevice::importExternalSemaphore(core::smart_refctd_ptr<ISemaphore>&&)
-{
-	return nullptr;
-}
-
-CCUDAExportableMemory::CCUDAExportableMemory(core::smart_refctd_ptr<CCUDADevice> device, SCachedCreationParams&& params, std::unique_ptr<SNativeState>&& nativeState)
-	: m_device(std::move(device))
-	, m_params(std::move(params))
-	, m_native(std::move(nativeState))
-{}
-
-CCUDAExportableMemory::~CCUDAExportableMemory() = default;
-
-core::smart_refctd_ptr<IDeviceMemoryAllocation> CCUDAExportableMemory::exportAsMemory(ILogicalDevice*, IDeviceMemoryBacked*) const
-{
-	return nullptr;
-}
-
-CCUDAImportedMemory::CCUDAImportedMemory(core::smart_refctd_ptr<CCUDADevice> device, core::smart_refctd_ptr<IDeviceMemoryAllocation> src, std::unique_ptr<SNativeState>&& nativeState)
-	: m_device(std::move(device))
-	, m_src(std::move(src))
-	, m_native(std::move(nativeState))
-{}
-
-CCUDAImportedMemory::~CCUDAImportedMemory() = default;
-
-CCUDAImportedSemaphore::CCUDAImportedSemaphore(core::smart_refctd_ptr<CCUDADevice> device, core::smart_refctd_ptr<ISemaphore> src, std::unique_ptr<SNativeState>&& nativeState)
-	: m_device(std::move(device))
-	, m_src(std::move(src))
-	, m_native(std::move(nativeState))
-{}
-
-CCUDAImportedSemaphore::~CCUDAImportedSemaphore() = default;
-
-}
diff --git a/src/nbl/ext/CUDAInterop/README.md b/src/nbl/ext/CUDAInterop/README.md
index a7c1e654be..407b5e81b3 100644
--- a/src/nbl/ext/CUDAInterop/README.md
+++ b/src/nbl/ext/CUDAInterop/README.md
@@ -38,3 +38,15 @@ target_link_libraries(app PRIVATE Nabla::Nabla)
 find_package(Nabla CONFIG REQUIRED COMPONENTS Core CUDAInterop)
 target_link_libraries(native_app PRIVATE Nabla::ext::CUDAInterop)
 ```
+
+## Properties
+
+- `Nabla::Nabla` can be built with CUDA support without making CUDA SDK headers a public compile-time requirement.
+- Consumers that only link `Nabla::Nabla` do not need a CUDA SDK to parse Nabla headers.
+- Consumers that need raw CUDA include `CUDAInteropNative.h` and link `Nabla::ext::CUDAInterop` explicitly.
+- Raw CUDA access is not wrapped away. Native code can use CUDA Driver API types, NVRTC types, and Nabla native accessors in the opt-in path.
+- The Nabla source list is stable. CUDA interop `.cpp` files stay visible in IDE projects for CUDA ON and CUDA OFF builds.
+- CUDA OFF implementations are local stubs in the same `.cpp` files. Clean API entry points stay linkable and return `nullptr` for unavailable CUDA features instead of producing unresolved symbols.
+- CUDA implementation headers and SDK includes stay behind `_NBL_COMPILE_WITH_CUDA_`, so CUDA OFF builds do not need `cuda.h` or `nvrtc.h`.
+- A package built with CUDA support can be consumed without a local CUDA SDK unless the `CUDAInterop` component is requested.
+- A consumer can use a compatible local CUDA SDK for native interop without rebuilding Nabla.

From e53c838207aaf5f15513e6e622038d852154cfbb Mon Sep 17 00:00:00 2001
From: Arkadiusz Lachowicz <areklachowicz@gmail.com>
Date: Thu, 7 May 2026 09:25:29 +0200
Subject: [PATCH 15/27] Refine CUDA interop boundary

---
 examples_tests                                |  2 +-
 include/nbl/ext/CUDAInterop/CCUDADevice.h     |  4 --
 .../ext/CUDAInterop/CCUDAExportableMemory.h   | 18 +-----
 .../nbl/ext/CUDAInterop/CUDAInteropNative.h   | 21 +++++++
 src/nbl/ext/CUDAInterop/CCUDADevice.cpp       | 60 +++++++++---------
 .../ext/CUDAInterop/CCUDAExportableMemory.cpp | 12 ++--
 src/nbl/ext/CUDAInterop/CCUDAHandler.cpp      | 52 ++++++++++++++-
 .../CUDAInterop/CUDAInteropNativeState.hpp    | 10 ---
 src/nbl/ext/CUDAInterop/README.md             | 63 ++++++++++---------
 .../ext/CUDAInterop/smoke/clean_opt_in.cpp    | 13 ++--
 .../ext/CUDAInterop/smoke/native_opt_in.cpp   |  4 +-
 11 files changed, 147 insertions(+), 112 deletions(-)

diff --git a/examples_tests b/examples_tests
index 3b59c9bc05..fbb82d36e0 160000
--- a/examples_tests
+++ b/examples_tests
@@ -1 +1 @@
-Subproject commit 3b59c9bc05d8784277d3a18e11f423dcb8ae2b74
+Subproject commit fbb82d36e0f767e867a477a9d1a7035c7cbd56ca
diff --git a/include/nbl/ext/CUDAInterop/CCUDADevice.h b/include/nbl/ext/CUDAInterop/CCUDADevice.h
index 7b994e053f..12465f40f4 100644
--- a/include/nbl/ext/CUDAInterop/CCUDADevice.h
+++ b/include/nbl/ext/CUDAInterop/CCUDADevice.h
@@ -81,10 +81,6 @@ class NBL_API2 CCUDADevice : public core::IReferenceCounted
 
 		bool isMatchingDevice(const IPhysicalDevice* device) { return device && !memcmp(device->getProperties().deviceUUID, m_physicalDevice->getProperties().deviceUUID, 16); }
 
-		size_t roundToGranularity(ECUDAMemoryLocation location, size_t size) const;
-
-		core::smart_refctd_ptr<CCUDAExportableMemory> createExportableMemory(CCUDAExportableMemory::SCreationParams&& inParams);
-
 		core::smart_refctd_ptr<CCUDAImportedMemory> importExternalMemory(core::smart_refctd_ptr<IDeviceMemoryAllocation>&& mem);
 
 		core::smart_refctd_ptr<CCUDAImportedSemaphore> importExternalSemaphore(core::smart_refctd_ptr<ISemaphore>&& sem);
diff --git a/include/nbl/ext/CUDAInterop/CCUDAExportableMemory.h b/include/nbl/ext/CUDAInterop/CCUDAExportableMemory.h
index b331d6a258..80a9b3630a 100644
--- a/include/nbl/ext/CUDAInterop/CCUDAExportableMemory.h
+++ b/include/nbl/ext/CUDAInterop/CCUDAExportableMemory.h
@@ -18,36 +18,22 @@ namespace cuda_native
 struct SAccess;
 }
 
-enum class ECUDAMemoryLocation : uint32_t
-{
-	DEVICE = 1,
-	HOST = 2,
-	HOST_NUMA = 3,
-	HOST_NUMA_CURRENT = 4
-};
-
 class NBL_API2 CCUDAExportableMemory : public core::IReferenceCounted
 {
 	public:
 		struct SNativeState;
-		struct SCreationParams
+		struct SCachedCreationParams
 		{
 			size_t size;
 			uint32_t alignment;
-			ECUDAMemoryLocation location;
-		};
-
-		struct SCachedCreationParams : SCreationParams
-		{
 			size_t granularSize;
 			external_handle_t externalHandle;
+			bool deviceLocal;
 		};
 
 		CCUDAExportableMemory(core::smart_refctd_ptr<CCUDADevice> device, SCachedCreationParams&& params, std::unique_ptr<SNativeState>&& nativeState);
 		~CCUDAExportableMemory() override;
 
-		const SCreationParams& getCreationParams() const { return m_params; }
-
 		core::smart_refctd_ptr<IDeviceMemoryAllocation> exportAsMemory(ILogicalDevice* device, IDeviceMemoryBacked* dedication = nullptr) const;
 
 	private:
diff --git a/include/nbl/ext/CUDAInterop/CUDAInteropNative.h b/include/nbl/ext/CUDAInterop/CUDAInteropNative.h
index b73f2ae252..dd87d93e43 100644
--- a/include/nbl/ext/CUDAInterop/CUDAInteropNative.h
+++ b/include/nbl/ext/CUDAInterop/CUDAInteropNative.h
@@ -18,6 +18,9 @@
 namespace nbl::video::cuda_native
 {
 
+inline constexpr int MinimumCUDADriverVersion = 13000;
+inline constexpr int MinimumNVRTCMajorVersion = MinimumCUDADriverVersion/1000;
+
 using LibLoader = system::DefaultFuncPtrLoader;
 
 NBL_SYSTEM_DECLARE_DYNAMIC_FUNCTION_CALLER_CLASS(CUDA,LibLoader
@@ -143,6 +146,13 @@ struct SCUDADeviceInfo
 	int attributes[CU_DEVICE_ATTRIBUTE_MAX] = {};
 };
 
+struct SExportableMemoryCreationParams
+{
+	size_t size;
+	uint32_t alignment;
+	CUmemLocationType location;
+};
+
 NBL_API2 const CUDA& getCUDAFunctionTable(const CCUDAHandler& handler);
 NBL_API2 const NVRTC& getNVRTCFunctionTable(const CCUDAHandler& handler);
 
@@ -295,6 +305,7 @@ inline ptx_and_nvrtcResult_t compileDirectlyToPTX(
 NBL_API2 CUdevice getInternalObject(const CCUDADevice& device);
 NBL_API2 CUcontext getContext(const CCUDADevice& device);
 NBL_API2 size_t roundToGranularity(const CCUDADevice& device, CUmemLocationType location, size_t size);
+NBL_API2 core::smart_refctd_ptr<CCUDAExportableMemory> createExportableMemory(CCUDADevice& device, SExportableMemoryCreationParams&& params);
 NBL_API2 CUdeviceptr getDeviceptr(const CCUDAExportableMemory& memory);
 NBL_API2 CUexternalMemory getInternalObject(const CCUDAImportedMemory& memory);
 NBL_API2 CUresult getMappedBuffer(const CCUDAImportedMemory& memory, CUdeviceptr* mappedBuffer);
@@ -330,6 +341,16 @@ inline size_t roundToGranularity(const core::smart_refctd_ptr<CCUDADevice>& devi
 	return roundToGranularity(*device,location,size);
 }
 
+inline core::smart_refctd_ptr<CCUDAExportableMemory> createExportableMemory(CCUDADevice* device, SExportableMemoryCreationParams&& params)
+{
+	return createExportableMemory(*device,std::move(params));
+}
+
+inline core::smart_refctd_ptr<CCUDAExportableMemory> createExportableMemory(const core::smart_refctd_ptr<CCUDADevice>& device, SExportableMemoryCreationParams&& params)
+{
+	return createExportableMemory(*device,std::move(params));
+}
+
 inline CUdeviceptr getDeviceptr(const CCUDAExportableMemory* memory)
 {
 	return getDeviceptr(*memory);
diff --git a/src/nbl/ext/CUDAInterop/CCUDADevice.cpp b/src/nbl/ext/CUDAInterop/CCUDADevice.cpp
index 7d002c86ca..ebac00b7b4 100644
--- a/src/nbl/ext/CUDAInterop/CCUDADevice.cpp
+++ b/src/nbl/ext/CUDAInterop/CCUDADevice.cpp
@@ -58,11 +58,6 @@ CCUDADevice::CCUDADevice(
 	}
 }
 
-size_t CCUDADevice::roundToGranularity(ECUDAMemoryLocation location, size_t size) const
-{
-	return cuda_native::roundToGranularity(*this,cuda_native::toNative(location),size);
-}
-
 namespace cuda_native
 {
 
@@ -84,6 +79,11 @@ size_t roundToGranularity(const CCUDADevice& device, CUmemLocationType location,
 
 }
 
+static bool isDeviceLocal(CUmemLocationType location)
+{
+	return location==CU_MEM_LOCATION_TYPE_DEVICE;
+}
+
 static CUresult reserveAddressAndMapMemory(const CCUDADevice& device, CUdeviceptr* outPtr, size_t size, size_t alignment, CUmemLocationType location, CUmemGenericAllocationHandle memory)
 {
 	const auto handler = device.getHandler();
@@ -117,12 +117,23 @@ static CUresult reserveAddressAndMapMemory(const CCUDADevice& device, CUdevicept
 	return CUDA_SUCCESS;
 }
 
-core::smart_refctd_ptr<CCUDAExportableMemory> CCUDADevice::createExportableMemory(CCUDAExportableMemory::SCreationParams&& inParams)
+namespace cuda_native
 {
-	CCUDAExportableMemory::SCachedCreationParams params = { inParams };
 
-	auto& cu = cuda_native::getCUDAFunctionTable(*m_handler);
-	const auto nativeLocation = cuda_native::toNative(params.location);
+core::smart_refctd_ptr<CCUDAExportableMemory> createExportableMemory(CCUDADevice& device, SExportableMemoryCreationParams&& inParams)
+{
+	const auto handler = device.getHandler();
+	auto& native = SAccess::native(device);
+	auto logger = SAccess::logger(device);
+
+	CCUDAExportableMemory::SCachedCreationParams params = {
+		.size = inParams.size,
+		.alignment = inParams.alignment,
+		.granularSize = roundToGranularity(device, inParams.location, inParams.size),
+		.deviceLocal = isDeviceLocal(inParams.location)
+	};
+
+	auto& cu = getCUDAFunctionTable(*handler);
 	
 #ifdef _WIN32
 	OBJECT_ATTRIBUTES metadata = {
@@ -132,35 +143,34 @@ core::smart_refctd_ptr<CCUDAExportableMemory> CCUDADevice::createExportableMemor
 
 	 const auto prop = CUmemAllocationProp{
 		.type = CU_MEM_ALLOCATION_TYPE_PINNED,
-		.requestedHandleTypes = cuda_native::getAllocationHandleType(),
-		.location = { .type = nativeLocation, .id = m_native->handle },
+		.requestedHandleTypes = getAllocationHandleType(),
+		.location = { .type = inParams.location, .id = native.handle },
 #ifdef _WIN32
 		.win32HandleMetaData = &metadata,
 #endif
 	};
 
-	params.granularSize = roundToGranularity(params.location, params.size);
 	auto nativeState = std::make_unique<CCUDAExportableMemory::SNativeState>();
 
 	CUmemGenericAllocationHandle mem;
 	if(auto err = cu.pcuMemCreate(&mem, params.granularSize, &prop, 0); CUDA_SUCCESS != err)
 	{
-		m_logger.log("Fail to create memory handle!", system::ILogger::ELL_ERROR);
+		logger.log("Fail to create memory handle!", system::ILogger::ELL_ERROR);
 		return nullptr;
 	}
 	
 	if (auto err = cu.pcuMemExportToShareableHandle(&params.externalHandle, mem, prop.requestedHandleTypes, 0); CUDA_SUCCESS != err)
 	{
-		m_logger.log("Fail to create externalHandle!", system::ILogger::ELL_ERROR);
-		ASSERT_CUDA_SUCCESS(cu.pcuMemRelease(mem), m_handler);
+		logger.log("Fail to create externalHandle!", system::ILogger::ELL_ERROR);
+		ASSERT_CUDA_SUCCESS(cu.pcuMemRelease(mem), handler);
 		return nullptr;
 	}
 
-	if (const auto err = reserveAddressAndMapMemory(*this,&nativeState->ptr, params.granularSize, params.alignment, nativeLocation, mem); CUDA_SUCCESS != err)
+	if (const auto err = reserveAddressAndMapMemory(device,&nativeState->ptr, params.granularSize, params.alignment, inParams.location, mem); CUDA_SUCCESS != err)
 	{
-		m_logger.log("Fail to reserve address and map memory!", system::ILogger::ELL_ERROR);
+		logger.log("Fail to reserve address and map memory!", system::ILogger::ELL_ERROR);
 
-		ASSERT_CUDA_SUCCESS(cu.pcuMemRelease(mem), m_handler);
+		ASSERT_CUDA_SUCCESS(cu.pcuMemRelease(mem), handler);
 
 		bool closeSucceed = CloseExternalHandle(params.externalHandle);
 		assert(closeSucceed);
@@ -175,7 +185,9 @@ core::smart_refctd_ptr<CCUDAExportableMemory> CCUDADevice::createExportableMemor
 		return nullptr;
 	}
 	
-	return core::make_smart_refctd_ptr<CCUDAExportableMemory>(core::smart_refctd_ptr<CCUDADevice>(this), std::move(params), std::move(nativeState));
+	return core::make_smart_refctd_ptr<CCUDAExportableMemory>(core::smart_refctd_ptr<CCUDADevice>(&device), std::move(params), std::move(nativeState));
+}
+
 }
 
 core::smart_refctd_ptr<CCUDAImportedMemory> CCUDADevice::importExternalMemory(core::smart_refctd_ptr<IDeviceMemoryAllocation>&& mem)
@@ -267,16 +279,6 @@ CCUDADevice::CCUDADevice(
 
 CCUDADevice::~CCUDADevice() = default;
 
-size_t CCUDADevice::roundToGranularity(ECUDAMemoryLocation, size_t size) const
-{
-	return size;
-}
-
-core::smart_refctd_ptr<CCUDAExportableMemory> CCUDADevice::createExportableMemory(CCUDAExportableMemory::SCreationParams&&)
-{
-	return nullptr;
-}
-
 core::smart_refctd_ptr<CCUDAImportedMemory> CCUDADevice::importExternalMemory(core::smart_refctd_ptr<IDeviceMemoryAllocation>&&)
 {
 	return nullptr;
diff --git a/src/nbl/ext/CUDAInterop/CCUDAExportableMemory.cpp b/src/nbl/ext/CUDAInterop/CCUDAExportableMemory.cpp
index a89e42b2f6..a65d1b680c 100644
--- a/src/nbl/ext/CUDAInterop/CCUDAExportableMemory.cpp
+++ b/src/nbl/ext/CUDAInterop/CCUDAExportableMemory.cpp
@@ -22,14 +22,10 @@ core::smart_refctd_ptr<IDeviceMemoryAllocation> CCUDAExportableMemory::exportAsM
 	uint32_t memoryTypeBits = (1 << pd->getMemoryProperties().memoryTypeCount) - 1;
 	uint32_t vram = pd->getDeviceLocalMemoryTypeBits();
 
-	switch (m_params.location)
-	{
-    case ECUDAMemoryLocation::DEVICE: memoryTypeBits &=  vram; break;
-    case ECUDAMemoryLocation::HOST_NUMA:
-    case ECUDAMemoryLocation::HOST_NUMA_CURRENT:
-    case ECUDAMemoryLocation::HOST:   memoryTypeBits &= ~vram; break;
-    default: break;
-	}
+	if (m_params.deviceLocal)
+		memoryTypeBits &= vram;
+	else
+		memoryTypeBits &= ~vram;
 
 	IDeviceMemoryBacked::SDeviceMemoryRequirements req = {};
 	req.size = m_params.granularSize;
diff --git a/src/nbl/ext/CUDAInterop/CCUDAHandler.cpp b/src/nbl/ext/CUDAInterop/CCUDAHandler.cpp
index 51f0656f6c..777a1db14a 100644
--- a/src/nbl/ext/CUDAInterop/CCUDAHandler.cpp
+++ b/src/nbl/ext/CUDAInterop/CCUDAHandler.cpp
@@ -12,6 +12,21 @@
 
 namespace nbl::video
 {
+
+namespace
+{
+
+int cudaVersionMajor(int version)
+{
+	return version/1000;
+}
+
+int cudaVersionMinor(int version)
+{
+	return (version%1000)/10;
+}
+
+}
 	
 CCUDAHandler::CCUDAHandler(
 	std::unique_ptr<SNativeState>&& nativeState,
@@ -455,6 +470,8 @@ bool defaultHandleResult(const CCUDAHandler& handler, nvrtcResult result)
 
 core::smart_refctd_ptr<CCUDAHandler> CCUDAHandler::create(system::ISystem* system, core::smart_refctd_ptr<system::ILogger>&& _logger)
 {
+	const system::logger_opt_ptr logger(_logger.get());
+
 	cuda_native::CUDA cuda = cuda_native::CUDA(
 		#if defined(_NBL_WINDOWS_API_)
 			"nvcuda"
@@ -502,18 +519,32 @@ core::smart_refctd_ptr<CCUDAHandler> CCUDAHandler::create(system::ISystem* syste
 	#define SAFE_CUDA_CALL(FUNC,...) \
 	{\
 		if (!cuda.p ## FUNC)\
+		{\
+			logger.log("CCUDAHandler: CUDA Driver API function %s was not found. Need CUDA driver runtime %d.%d or newer.",system::ILogger::ELL_ERROR,#FUNC,cudaVersionMajor(cuda_native::MinimumCUDADriverVersion),cudaVersionMinor(cuda_native::MinimumCUDADriverVersion));\
 			return nullptr;\
+		}\
 		auto result = cuda.p ## FUNC(__VA_ARGS__);\
 		if (result!=CUDA_SUCCESS)\
+		{\
+			logger.log("CCUDAHandler: %s failed with CUDA error code %d.",system::ILogger::ELL_ERROR,#FUNC,static_cast<int>(result));\
 			return nullptr;\
+		}\
 	}
 	
 	SAFE_CUDA_CALL(cuInit,0)
 				
 	int cudaVersion = 0;
 	SAFE_CUDA_CALL(cuDriverGetVersion,&cudaVersion)
-	if (cudaVersion<13000)
+	if (cudaVersion<cuda_native::MinimumCUDADriverVersion)
+	{
+		logger.log(
+			"CCUDAHandler: CUDA driver runtime %d.%d is below required %d.%d.",
+			system::ILogger::ELL_ERROR,
+			cudaVersionMajor(cudaVersion),cudaVersionMinor(cudaVersion),
+			cudaVersionMajor(cuda_native::MinimumCUDADriverVersion),cudaVersionMinor(cuda_native::MinimumCUDADriverVersion)
+		);
 		return nullptr;
+	}
 
 	// stop the pollution
 	#undef SAFE_CUDA_CALL
@@ -521,11 +552,26 @@ core::smart_refctd_ptr<CCUDAHandler> CCUDAHandler::create(system::ISystem* syste
 
 	// check nvrtc existence and compatibility
 	if (!nvrtc.pnvrtcVersion)
+	{
+		logger.log("CCUDAHandler: NVRTC runtime was not found. Need NVRTC %d.x or newer.",system::ILogger::ELL_ERROR,cuda_native::MinimumNVRTCMajorVersion);
 		return nullptr;
+	}
 	int nvrtcVersion[2] = { -1,-1 };
-	nvrtc.pnvrtcVersion(nvrtcVersion+0,nvrtcVersion+1);
-	if (nvrtcVersion[0]<9)
+	const auto nvrtcVersionResult = nvrtc.pnvrtcVersion(nvrtcVersion+0,nvrtcVersion+1);
+	if (nvrtcVersionResult!=NVRTC_SUCCESS)
+	{
+		logger.log("CCUDAHandler: nvrtcVersion failed with NVRTC error code %d.",system::ILogger::ELL_ERROR,static_cast<int>(nvrtcVersionResult));
 		return nullptr;
+	}
+	if (nvrtcVersion[0]<cuda_native::MinimumNVRTCMajorVersion)
+	{
+		logger.log(
+			"CCUDAHandler: NVRTC runtime %d.%d is below required %d.x.",
+			system::ILogger::ELL_ERROR,
+			nvrtcVersion[0],nvrtcVersion[1],cuda_native::MinimumNVRTCMajorVersion
+		);
+		return nullptr;
+	}
 
 	// add headers
 	core::vector<core::smart_refctd_ptr<system::IFile>> headers;
diff --git a/src/nbl/ext/CUDAInterop/CUDAInteropNativeState.hpp b/src/nbl/ext/CUDAInterop/CUDAInteropNativeState.hpp
index 2dc3c3bbca..47701359ba 100644
--- a/src/nbl/ext/CUDAInterop/CUDAInteropNativeState.hpp
+++ b/src/nbl/ext/CUDAInterop/CUDAInteropNativeState.hpp
@@ -57,16 +57,6 @@ struct CCUDAImportedSemaphore::SNativeState
 namespace cuda_native
 {
 
-inline CUmemLocationType toNative(ECUDAMemoryLocation location)
-{
-	return static_cast<CUmemLocationType>(static_cast<uint32_t>(location));
-}
-
-inline ECUDAMemoryLocation toNabla(CUmemLocationType location)
-{
-	return static_cast<ECUDAMemoryLocation>(static_cast<uint32_t>(location));
-}
-
 inline CUmemAllocationHandleType getAllocationHandleType()
 {
 #ifdef _WIN32
diff --git a/src/nbl/ext/CUDAInterop/README.md b/src/nbl/ext/CUDAInterop/README.md
index 407b5e81b3..cf3a89cdd1 100644
--- a/src/nbl/ext/CUDAInterop/README.md
+++ b/src/nbl/ext/CUDAInterop/README.md
@@ -1,32 +1,12 @@
 # CUDA Interop Targets
 
-- `Nabla::Nabla` does not require the CUDA SDK.
-- `Nabla::Nabla` provides Nabla CUDA interop types when the package was built with CUDA support.
-- Nabla CUDA interop public headers do not include `cuda.h` or `nvrtc.h`.
-- `Nabla::ext::CUDAInterop` is the raw CUDA Driver API and NVRTC opt-in target.
+- `Nabla::Nabla` owns the CUDA interop implementation and exported symbols.
+- `Nabla::Nabla` public headers do not include `cuda.h` or `nvrtc.h`.
+- `Nabla::ext::CUDAInterop` is the explicit raw CUDA Driver API and NVRTC opt-in target.
 - `Nabla::ext::CUDAInterop` requires `CUDAToolkit` and exposes `CUDAInteropNative.h`.
 - Consumers can override the SDK root with `-DNabla_CUDA_TOOLKIT_ROOT=<cuda-root>` when requesting `CUDAInterop`.
-- Consumers can build native CUDA code against a compatible local SDK without rebuilding Nabla.
-- Changing CUDA SDK headers affects only targets that include `CUDAInteropNative.h`.
 - Native accessors accept Nabla objects, raw pointers, and `smart_refctd_ptr`.
 
-## Design
-
-- CUDA is used privately while building `Nabla::Nabla`.
-- CUDA SDK headers become visible to consumers only through `Nabla::ext::CUDAInterop`.
-- `Nabla::Nabla` exposes Nabla concepts such as devices, exported memory, imported memory, and imported semaphores.
-- `Nabla::ext::CUDAInterop` exposes CUDA types such as `CUdeviceptr`, `CUmodule`, `CUfunction`, external memory, external semaphores, and NVRTC objects.
-- The dependency shape follows the same general model used by libraries such as OpenCV: common CUDA-facing APIs do not force raw CUDA headers on every consumer, while raw CUDA access is available through an explicit opt-in header.
-- This avoids a transitive public compile-time dependency on CUDA from `Nabla::Nabla`.
-
-## OpenCV Reference
-
-- OpenCV's common CUDA header includes OpenCV headers, not raw CUDA SDK headers: [`cuda.hpp`](https://github.com/opencv/opencv/blob/808d2d596c475d95fedb6025c9ed425d62bba04c/modules/core/include/opencv2/core/cuda.hpp#L51-L52).
-- OpenCV keeps the public stream type as an OpenCV abstraction and grants access through `StreamAccessor`: [`cuda.hpp`](https://github.com/opencv/opencv/blob/808d2d596c475d95fedb6025c9ed425d62bba04c/modules/core/include/opencv2/core/cuda.hpp#L916-L979).
-- OpenCV's raw CUDA opt-in header says it is the only header that depends on the CUDA Runtime API, then includes `<cuda_runtime.h>` and exposes accessor types: [`cuda_stream_accessor.hpp`](https://github.com/opencv/opencv/blob/808d2d596c475d95fedb6025c9ed425d62bba04c/modules/core/include/opencv2/core/cuda_stream_accessor.hpp#L50-L79).
-- OpenCV also keeps implementation CUDA headers private and includes `<cuda.h>` / `<cuda_runtime.h>` there: [`private.cuda.hpp`](https://github.com/opencv/opencv/blob/808d2d596c475d95fedb6025c9ed425d62bba04c/modules/core/include/opencv2/core/private.cuda.hpp#L47-L61).
-- The same split is used here: Nabla CUDA objects stay in `Nabla::Nabla`, and raw CUDA handles/functions are available only after including `CUDAInteropNative.h` and linking `Nabla::ext::CUDAInterop`.
-
 ## Usage
 
 ```cmake
@@ -35,18 +15,39 @@ target_link_libraries(app PRIVATE Nabla::Nabla)
 ```
 
 ```cmake
-find_package(Nabla CONFIG REQUIRED COMPONENTS Core CUDAInterop)
+find_package(Nabla CONFIG REQUIRED COMPONENTS CUDAInterop)
 target_link_libraries(native_app PRIVATE Nabla::ext::CUDAInterop)
 ```
 
+```cpp
+#include "nbl/ext/CUDAInterop/CUDAInteropNative.h"
+
+auto memory = nbl::video::cuda_native::createExportableMemory(cudaDevice, {
+    .size = size,
+    .alignment = alignment,
+    .location = CU_MEM_LOCATION_TYPE_DEVICE,
+});
+```
+
 ## Properties
 
-- `Nabla::Nabla` can be built with CUDA support without making CUDA SDK headers a public compile-time requirement.
-- Consumers that only link `Nabla::Nabla` do not need a CUDA SDK to parse Nabla headers.
-- Consumers that need raw CUDA include `CUDAInteropNative.h` and link `Nabla::ext::CUDAInterop` explicitly.
-- Raw CUDA access is not wrapped away. Native code can use CUDA Driver API types, NVRTC types, and Nabla native accessors in the opt-in path.
+- Consumers that only link `Nabla::Nabla` do not need CUDA SDK headers to parse Nabla headers.
+- Consumers that need raw CUDA include `CUDAInteropNative.h` and link `Nabla::ext::CUDAInterop`.
+- Raw CUDA access is not wrapped away in the native opt-in path. Native code uses CUDA Driver API and NVRTC types directly.
+- CUDA SDK structs with version-sensitive layout are kept out of exported Nabla ABI.
+- The exported native ABI uses stable CUDA Driver API handles/enums and small Nabla-owned parameter structs.
+- A package built with one compatible CUDA SDK can be consumed by native interop code built with another compatible SDK without rebuilding Nabla.
+- `CCUDAHandler::create` validates the loaded CUDA driver and NVRTC runtime. It returns `nullptr` when the runtime is missing or below the required CUDA 13.0 / NVRTC 13.x floor.
 - The Nabla source list is stable. CUDA interop `.cpp` files stay visible in IDE projects for CUDA ON and CUDA OFF builds.
-- CUDA OFF implementations are local stubs in the same `.cpp` files. Clean API entry points stay linkable and return `nullptr` for unavailable CUDA features instead of producing unresolved symbols.
+- CUDA OFF implementations are local stubs in the same `.cpp` files. SDK-free API entry points stay linkable and return `nullptr` for unavailable CUDA features instead of producing unresolved symbols.
 - CUDA implementation headers and SDK includes stay behind `_NBL_COMPILE_WITH_CUDA_`, so CUDA OFF builds do not need `cuda.h` or `nvrtc.h`.
-- A package built with CUDA support can be consumed without a local CUDA SDK unless the `CUDAInterop` component is requested.
-- A consumer can use a compatible local CUDA SDK for native interop without rebuilding Nabla.
+
+## Related Designs
+
+- OpenCV keeps common CUDA-facing headers independent from CUDA Runtime API and exposes raw `cudaStream_t` / `cudaEvent_t` through a separate accessor header: [`cuda_stream_accessor.hpp`](https://github.com/opencv/opencv/blob/808d2d596c475d95fedb6025c9ed425d62bba04c/modules/core/include/opencv2/core/cuda_stream_accessor.hpp#L50-L79).
+- OpenCV keeps CUDA implementation headers private and includes `cuda.h`, `cuda_runtime.h`, and NPP there: [`private.cuda.hpp`](https://github.com/opencv/opencv/blob/808d2d596c475d95fedb6025c9ed425d62bba04c/modules/core/include/opencv2/core/private.cuda.hpp#L47-L61).
+- Blender/Cycles exposes a CUDA device boundary without CUDA SDK headers in the boundary header: [`device.h`](https://github.com/blender/blender/blob/794c527e8595a9f448e0143a217d0ceb648c5e7e/intern/cycles/device/cuda/device.h#L7-L27).
+- Blender/Cycles keeps `CUdevice`, `CUcontext`, `cuda.h`, and `cuew.h` in the CUDA implementation header/source: [`device_impl.h`](https://github.com/blender/blender/blob/794c527e8595a9f448e0143a217d0ceb648c5e7e/intern/cycles/device/cuda/device_impl.h#L12-L30), [`device.cpp`](https://github.com/blender/blender/blob/794c527e8595a9f448e0143a217d0ceb648c5e7e/intern/cycles/device/cuda/device.cpp#L10-L48).
+- ONNX Runtime keeps accelerator dependencies behind execution providers and supports provider shared libraries loaded only when requested: [`Build with Execution Providers`](https://onnxruntime.ai/docs/build/eps.html#execution-provider-shared-libraries).
+- ggml/llama.cpp keeps the generic backend API separate from CUDA and builds CUDA as an explicit backend target with CUDA libraries linked to that backend: [`ggml-backend.h`](https://github.com/ggml-org/llama.cpp/blob/master/ggml/include/ggml-backend.h#L1488-L1499), [`ggml-cuda CMakeLists.txt`](https://github.com/ggml-org/llama.cpp/blob/master/ggml/src/ggml-cuda/CMakeLists.txt#L982-L1072).
+- TensorFlow PluggableDevice uses separate device plugin packages so accelerator toolchains and dependencies do not become core TensorFlow requirements: [`PluggableDevice`](https://blog.tensorflow.org/2021/06/pluggabledevice-device-plugins-for-TensorFlow.html).
diff --git a/src/nbl/ext/CUDAInterop/smoke/clean_opt_in.cpp b/src/nbl/ext/CUDAInterop/smoke/clean_opt_in.cpp
index 348caa766e..e36fe65701 100644
--- a/src/nbl/ext/CUDAInterop/smoke/clean_opt_in.cpp
+++ b/src/nbl/ext/CUDAInterop/smoke/clean_opt_in.cpp
@@ -23,14 +23,11 @@ class CUDAInteropCleanOptInSmoke final : public nbl::system::IApplicationFramewo
 
 	bool onAppInitialized(nbl::core::smart_refctd_ptr<nbl::system::ISystem>&&) override
 	{
-		static_assert(std::is_same_v<decltype(nbl::video::CCUDAExportableMemory::SCreationParams{}.location), nbl::video::ECUDAMemoryLocation>);
-
-		const nbl::video::CCUDAExportableMemory::SCreationParams params = {
-			.size = 4096,
-			.alignment = 4096,
-			.location = nbl::video::ECUDAMemoryLocation::DEVICE,
-		};
-		return isAPILoaded() && params.location==nbl::video::ECUDAMemoryLocation::DEVICE;
+		static_assert(std::is_class_v<nbl::video::CCUDADevice>);
+		static_assert(std::is_class_v<nbl::video::CCUDAExportableMemory>);
+		static_assert(std::is_class_v<nbl::video::CCUDAImportedMemory>);
+		static_assert(std::is_class_v<nbl::video::CCUDAImportedSemaphore>);
+		return isAPILoaded();
 	}
 
 	void workLoopBody() override {}
diff --git a/src/nbl/ext/CUDAInterop/smoke/native_opt_in.cpp b/src/nbl/ext/CUDAInterop/smoke/native_opt_in.cpp
index a78f710040..6dda3d275e 100644
--- a/src/nbl/ext/CUDAInterop/smoke/native_opt_in.cpp
+++ b/src/nbl/ext/CUDAInterop/smoke/native_opt_in.cpp
@@ -22,10 +22,10 @@ using namespace nbl::video;
 	core::smart_refctd_ptr<IDeviceMemoryAllocation> vulkanMemory,
 	core::smart_refctd_ptr<ISemaphore> vulkanSemaphore)
 {
-	auto cudaMemory = cudaDevice.createExportableMemory({
+	auto cudaMemory = cuda_native::createExportableMemory(cudaDevice, {
 		.size = 4096,
 		.alignment = 4096,
-		.location = ECUDAMemoryLocation::DEVICE,
+		.location = CU_MEM_LOCATION_TYPE_DEVICE,
 	});
 	if (!cudaMemory)
 		return false;

From 141790523f61caa5fbbf45223ba4cfa0bade78c9 Mon Sep 17 00:00:00 2001
From: Arkadiusz Lachowicz <areklachowicz@gmail.com>
Date: Thu, 7 May 2026 11:58:20 +0200
Subject: [PATCH 16/27] Add CUDA interop runtime header discovery

---
 CMakeLists.txt                                |   1 +
 cmake/NablaCUDAInteropHelpers.cmake           | 182 ++++++++++
 cmake/NablaConfig.cmake.in                    |   3 +
 examples_tests                                |   2 +-
 include/nbl/ext/CUDAInterop/CCUDAHandler.h    |  16 +
 src/nbl/ext/CUDAInterop/CCUDAHandler.cpp      | 331 +++++++++++++++++-
 src/nbl/ext/CUDAInterop/CMakeLists.txt        |  19 +-
 src/nbl/ext/CUDAInterop/README.md             |  48 ++-
 src/nbl/ext/CUDAInterop/smoke/CMakeLists.txt  |   8 +-
 .../ext/CUDAInterop/smoke/native_opt_in.cpp   |  42 +++
 10 files changed, 641 insertions(+), 11 deletions(-)
 create mode 100644 cmake/NablaCUDAInteropHelpers.cmake

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 14845789fc..9251a3ee68 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -312,6 +312,7 @@ if(NBL_ENABLE_CONFIG_INSTALL)
 		set(_NBL_NABLA_CONFIG_FILES
 			"${CMAKE_CURRENT_BINARY_DIR}/NablaConfig.cmake"
 			"${CMAKE_CURRENT_BINARY_DIR}/NablaConfigVersion.cmake"
+			"${CMAKE_CURRENT_LIST_DIR}/cmake/NablaCUDAInteropHelpers.cmake"
 		)
 
 		install(EXPORT NablaExportTargets
diff --git a/cmake/NablaCUDAInteropHelpers.cmake b/cmake/NablaCUDAInteropHelpers.cmake
new file mode 100644
index 0000000000..6486789aeb
--- /dev/null
+++ b/cmake/NablaCUDAInteropHelpers.cmake
@@ -0,0 +1,182 @@
+function(_nbl_cuda_interop_collect_runtime_include_dirs _OUT_INCLUDE_DIRS) # Returns the extra arguments (ARGN) plus CUDA toolkit include directories, de-duplicated, in <_OUT_INCLUDE_DIRS>.
+	set(_include_dirs ${ARGN}) # caller-supplied directories stay at the front of the list
+
+	if(DEFINED CUDAToolkit_INCLUDE_DIRS AND NOT "${CUDAToolkit_INCLUDE_DIRS}" STREQUAL "")
+		list(APPEND _include_dirs ${CUDAToolkit_INCLUDE_DIRS})
+	endif()
+
+	if(TARGET CUDA::toolkit) # also take the imported target's interface include directories
+		get_target_property(_cuda_toolkit_include_dirs CUDA::toolkit INTERFACE_INCLUDE_DIRECTORIES)
+		if(_cuda_toolkit_include_dirs AND NOT _cuda_toolkit_include_dirs STREQUAL "NOTFOUND") # skip the not-found result of an unset property
+			list(APPEND _include_dirs ${_cuda_toolkit_include_dirs})
+		endif()
+	endif()
+
+	if(_include_dirs) # only de-duplicate a non-empty list
+		list(REMOVE_DUPLICATES _include_dirs)
+	endif()
+
+	set(${_OUT_INCLUDE_DIRS} ${_include_dirs} PARENT_SCOPE)
+endfunction()
+
+function(_nbl_cuda_interop_make_runtime_paths_json _OUT_CONTENT) # Builds the JSON text {"cudaRuntimeIncludeDirs": [...]} from the extra arguments (ARGN) and returns it in <_OUT_CONTENT>.
+	set(_include_dirs ${ARGN})
+	set(_json "{\n  \"cudaRuntimeIncludeDirs\": [")
+	set(_first ON) # stays ON until the first array element has been written
+
+	foreach(_include_dir IN LISTS _include_dirs)
+		if("${_include_dir}" STREQUAL "") # empty entries are not emitted
+			continue()
+		endif()
+
+		file(TO_CMAKE_PATH "${_include_dir}" _include_dir_json) # CMake-style path with forward slashes
+		string(REPLACE "\"" "\\\"" _include_dir_json "${_include_dir_json}") # escape embedded double quotes for JSON
+
+		if(_first)
+			string(APPEND _json "\n")
+			set(_first OFF)
+		else()
+			string(APPEND _json ",\n") # separator between consecutive elements
+		endif()
+		string(APPEND _json "    \"${_include_dir_json}\"")
+	endforeach()
+
+	if(NOT _first) # at least one element was written: close the array on its own line
+		string(APPEND _json "\n  ]\n}\n")
+	else() # no elements: emit an empty array "[]"
+		string(APPEND _json "]\n}\n")
+	endif()
+
+	set(${_OUT_CONTENT} "${_json}" PARENT_SCOPE)
+endfunction()
+
+function(_nbl_cuda_interop_collect_configs _OUT_CONFIGS) # Returns the list of build configurations to generate files for in <_OUT_CONFIGS>.
+	if(CMAKE_CONFIGURATION_TYPES) # set by multi-config generators
+		set(_configs ${CMAKE_CONFIGURATION_TYPES})
+	elseif(CMAKE_BUILD_TYPE) # single-config generator with an explicit build type
+		set(_configs "${CMAKE_BUILD_TYPE}")
+	else() # neither is set: fall back to a single Debug entry
+		set(_configs Debug)
+	endif()
+
+	list(REMOVE_DUPLICATES _configs)
+	set(${_OUT_CONFIGS} ${_configs} PARENT_SCOPE)
+endfunction()
+
+function(_nbl_cuda_interop_collect_target_runtime_jsons TARGET_NAME _OUT_FILES _OVERRIDE_OUTPUT) # Computes one absolute, normalized runtime JSON path per configuration and returns the de-duplicated list in <_OUT_FILES>.
+	_nbl_cuda_interop_collect_configs(_configs)
+	set(_runtime_jsons "")
+
+	if(NOT "${_OVERRIDE_OUTPUT}" STREQUAL "") # caller-provided location: expand $<CONFIG> per configuration and skip the target inspection below
+		foreach(_config IN LISTS _configs)
+			set(_runtime_paths_json "${_OVERRIDE_OUTPUT}")
+			string(REPLACE "$<CONFIG>" "${_config}" _runtime_paths_json "${_runtime_paths_json}")
+			if(_runtime_paths_json MATCHES "\\$<") # any generator expression left after the substitution is rejected
+				message(FATAL_ERROR "Nabla: CUDA interop runtime JSON path supports only plain paths or $<CONFIG>.")
+			endif()
+			cmake_path(IS_ABSOLUTE _runtime_paths_json _is_abs)
+			if(NOT _is_abs) # relative paths are anchored at the current binary directory
+				cmake_path(ABSOLUTE_PATH _runtime_paths_json BASE_DIRECTORY "${CMAKE_CURRENT_BINARY_DIR}" OUTPUT_VARIABLE _runtime_paths_json)
+			endif()
+			cmake_path(NORMAL_PATH _runtime_paths_json OUTPUT_VARIABLE _runtime_paths_json)
+			list(APPEND _runtime_jsons "${_runtime_paths_json}")
+		endforeach()
+		list(REMOVE_DUPLICATES _runtime_jsons)
+		set(${_OUT_FILES} ${_runtime_jsons} PARENT_SCOPE)
+		return()
+	endif()
+
+	foreach(_config IN LISTS _configs) # lookup order: per-config target property, generic target property, per-config CMAKE variable, generic CMAKE variable, current binary directory
+		string(TOUPPER "${_config}" _config_upper)
+		get_target_property(_runtime_output_dir "${TARGET_NAME}" "RUNTIME_OUTPUT_DIRECTORY_${_config_upper}")
+
+		if(NOT _runtime_output_dir OR _runtime_output_dir STREQUAL "NOTFOUND")
+			get_target_property(_runtime_output_dir "${TARGET_NAME}" RUNTIME_OUTPUT_DIRECTORY)
+		endif()
+		if((NOT _runtime_output_dir OR _runtime_output_dir STREQUAL "NOTFOUND") AND DEFINED CMAKE_RUNTIME_OUTPUT_DIRECTORY_${_config_upper})
+			set(_runtime_output_dir "${CMAKE_RUNTIME_OUTPUT_DIRECTORY_${_config_upper}}")
+		endif()
+		if((NOT _runtime_output_dir OR _runtime_output_dir STREQUAL "NOTFOUND") AND DEFINED CMAKE_RUNTIME_OUTPUT_DIRECTORY)
+			set(_runtime_output_dir "${CMAKE_RUNTIME_OUTPUT_DIRECTORY}")
+		endif()
+		if(NOT _runtime_output_dir OR _runtime_output_dir STREQUAL "NOTFOUND")
+			if(CMAKE_CONFIGURATION_TYPES) # multi-config fallback uses a per-config subdirectory
+				set(_runtime_output_dir "${CMAKE_CURRENT_BINARY_DIR}/${_config}")
+			else()
+				set(_runtime_output_dir "${CMAKE_CURRENT_BINARY_DIR}")
+			endif()
+		endif()
+
+		string(REPLACE "$<CONFIG>" "${_config}" _runtime_output_dir "${_runtime_output_dir}")
+		if(_runtime_output_dir MATCHES "\\$<") # any other generator expression in the output directory is rejected
+			message(FATAL_ERROR "Nabla: nbl_configure_cuda_interop_runtime supports only plain runtime output directories or $<CONFIG>.")
+		endif()
+
+		cmake_path(IS_ABSOLUTE _runtime_output_dir _is_abs)
+		if(NOT _is_abs) # relative output directories are anchored at the current binary directory
+			cmake_path(ABSOLUTE_PATH _runtime_output_dir BASE_DIRECTORY "${CMAKE_CURRENT_BINARY_DIR}" OUTPUT_VARIABLE _runtime_output_dir)
+		endif()
+		cmake_path(NORMAL_PATH _runtime_output_dir OUTPUT_VARIABLE _runtime_output_dir)
+
+		list(APPEND _runtime_jsons "${_runtime_output_dir}/nbl_cuda_interop_runtime.json")
+	endforeach()
+
+	list(REMOVE_DUPLICATES _runtime_jsons) # configurations sharing one output directory yield a single file
+	set(${_OUT_FILES} ${_runtime_jsons} PARENT_SCOPE)
+endfunction()
+
+function(nbl_configure_cuda_interop_runtime TARGET_NAME) # Usage: nbl_configure_cuda_interop_runtime(<target> [RUNTIME_JSON <path>] [INCLUDE_DIRS <dir>...]); generates the runtime JSON file(s) for <target> and attaches them as sources.
+	cmake_parse_arguments(_NBL_CUDA_INTEROP "" "RUNTIME_JSON" "INCLUDE_DIRS" ${ARGN})
+
+	if(_NBL_CUDA_INTEROP_UNPARSED_ARGUMENTS)
+		message(FATAL_ERROR "Nabla: unexpected arguments for nbl_configure_cuda_interop_runtime: ${_NBL_CUDA_INTEROP_UNPARSED_ARGUMENTS}")
+	endif()
+
+	if(NOT TARGET "${TARGET_NAME}")
+		message(FATAL_ERROR "Nabla: target \"${TARGET_NAME}\" does not exist")
+	endif()
+
+	_nbl_cuda_interop_collect_runtime_include_dirs(_include_dirs ${_NBL_CUDA_INTEROP_INCLUDE_DIRS})
+
+	_nbl_cuda_interop_make_runtime_paths_json(_runtime_paths_json_content ${_include_dirs})
+	_nbl_cuda_interop_collect_target_runtime_jsons("${TARGET_NAME}" _runtime_paths_jsons "${_NBL_CUDA_INTEROP_RUNTIME_JSON}")
+
+	foreach(_runtime_paths_json IN LISTS _runtime_paths_jsons) # the same content is written to every configuration's path
+		file(GENERATE OUTPUT "${_runtime_paths_json}" CONTENT "${_runtime_paths_json_content}" TARGET "${TARGET_NAME}")
+	endforeach()
+
+	set_source_files_properties(${_runtime_paths_jsons} TARGET_DIRECTORY "${TARGET_NAME}" PROPERTIES GENERATED TRUE HEADER_FILE_ONLY TRUE) # source file properties are directory-scoped: set them where the target was created, which may differ from the calling directory
+	target_sources("${TARGET_NAME}" PRIVATE ${_runtime_paths_jsons})
+endfunction()
+
+function(nbl_target_link_cuda_interop TARGET_NAME) # Usage: nbl_target_link_cuda_interop(<target> [PRIVATE|PUBLIC|INTERFACE] [RUNTIME_JSON <path>] [INCLUDE_DIRS <dir>...]); links Nabla::ext::CUDAInterop and configures the runtime JSON.
+	set(_args ${ARGN})
+	set(_scope PRIVATE) # default link scope when the first extra argument is not a scope keyword
+
+	if(_args)
+		list(GET _args 0 _first_arg)
+		if(_first_arg MATCHES "^(PRIVATE|PUBLIC|INTERFACE)$") # consume the optional leading scope keyword before keyword parsing
+			set(_scope "${_first_arg}")
+			list(REMOVE_AT _args 0)
+		endif()
+	endif()
+
+	cmake_parse_arguments(_NBL_CUDA_INTEROP "" "RUNTIME_JSON" "INCLUDE_DIRS" ${_args})
+
+	if(_NBL_CUDA_INTEROP_UNPARSED_ARGUMENTS)
+		message(FATAL_ERROR "Nabla: unexpected arguments for nbl_target_link_cuda_interop: ${_NBL_CUDA_INTEROP_UNPARSED_ARGUMENTS}")
+	endif()
+
+	if(NOT TARGET "${TARGET_NAME}")
+		message(FATAL_ERROR "Nabla: target \"${TARGET_NAME}\" does not exist")
+	endif()
+	if(NOT TARGET Nabla::ext::CUDAInterop)
+		message(FATAL_ERROR "Nabla: Nabla::ext::CUDAInterop is not available. Request the CUDAInterop package component or enable NBL_COMPILE_WITH_CUDA.")
+	endif()
+
+	target_link_libraries("${TARGET_NAME}" ${_scope} Nabla::ext::CUDAInterop)
+	nbl_configure_cuda_interop_runtime("${TARGET_NAME}" # forwards the parsed options unchanged
+		RUNTIME_JSON "${_NBL_CUDA_INTEROP_RUNTIME_JSON}"
+		INCLUDE_DIRS ${_NBL_CUDA_INTEROP_INCLUDE_DIRS}
+	)
+endfunction()
diff --git a/cmake/NablaConfig.cmake.in b/cmake/NablaConfig.cmake.in
index 8b9f62e548..0464340ce3 100644
--- a/cmake/NablaConfig.cmake.in
+++ b/cmake/NablaConfig.cmake.in
@@ -96,6 +96,9 @@ if(_NBL_NABLA_LOAD_CUDA_INTEROP)
   _nbl_try_include_component("CUDAInterop" "NablaCUDAInteropExportTargets.cmake" _NBL_NABLA_CUDA_INTEROP_FOUND)
   if(_NBL_NABLA_CUDA_INTEROP_FOUND AND TARGET Nabla::ext::CUDAInterop)
     target_link_libraries(Nabla::ext::CUDAInterop INTERFACE CUDA::toolkit)
+    if(EXISTS "${CMAKE_CURRENT_LIST_DIR}/NablaCUDAInteropHelpers.cmake")
+      include("${CMAKE_CURRENT_LIST_DIR}/NablaCUDAInteropHelpers.cmake")
+    endif()
   endif()
 endif()
 
diff --git a/examples_tests b/examples_tests
index fbb82d36e0..b2c639c8b7 160000
--- a/examples_tests
+++ b/examples_tests
@@ -1 +1 @@
-Subproject commit fbb82d36e0f767e867a477a9d1a7035c7cbd56ca
+Subproject commit b2c639c8b71c3b860418dc4b3e46ad147ba5f256
diff --git a/include/nbl/ext/CUDAInterop/CCUDAHandler.h b/include/nbl/ext/CUDAInterop/CCUDAHandler.h
index 6a3cc6c496..bed4f9a31c 100644
--- a/include/nbl/ext/CUDAInterop/CCUDAHandler.h
+++ b/include/nbl/ext/CUDAInterop/CCUDAHandler.h
@@ -8,6 +8,7 @@
 #include "nbl/core/definitions.h"
 
 #include "nbl/system/declarations.h"
+#include "nbl/system/path.h"
 
 #include <array>
 #include <cstdint>
@@ -25,6 +26,21 @@ namespace cuda_native
 struct SAccess;
 }
 
+namespace cuda_interop
+{
+inline constexpr const char* RuntimePathsFileName = "nbl_cuda_interop_runtime.json"; // file name of the runtime include-path JSON
+
+struct SRuntimeCompileEnvironment
+{
+	core::vector<system::path> includeDirs; // include directories for runtime compilation
+	core::vector<system::path> runtimePathFiles; // runtime JSON files associated with this environment
+};
+
+NBL_API2 SRuntimeCompileEnvironment findRuntimeCompileEnvironment(core::vector<system::path> explicitIncludeDirs = {}); // overload without explicit runtime JSON files
+NBL_API2 SRuntimeCompileEnvironment findRuntimeCompileEnvironment(core::vector<system::path> explicitIncludeDirs, core::vector<system::path> runtimePathFiles); // overload taking explicit runtime JSON files as well
+NBL_API2 core::vector<std::string> makeNVRTCIncludeOptions(const SRuntimeCompileEnvironment& environment); // NVRTC include options derived from an environment
+}
+
 class NBL_API2 CCUDAHandler : public core::IReferenceCounted
 {
 	public:
diff --git a/src/nbl/ext/CUDAInterop/CCUDAHandler.cpp b/src/nbl/ext/CUDAInterop/CCUDAHandler.cpp
index 777a1db14a..fce7fd2b5a 100644
--- a/src/nbl/ext/CUDAInterop/CCUDAHandler.cpp
+++ b/src/nbl/ext/CUDAInterop/CCUDAHandler.cpp
@@ -3,6 +3,324 @@
 // For conditions of distribution and use, see copyright notice in nabla.h
 
 #include "nbl/ext/CUDAInterop/CUDAInterop.h"
+#include "nbl/system/ModuleLookupUtils.h"
+
+#include <algorithm>
+#include <cstdlib>
+#include <fstream>
+#include <sstream>
+#include <string_view>
+#include <system_error>
+
+namespace nbl::video::cuda_interop
+{
+namespace
+{
+
+std::string readEnvironmentVariable(std::string_view name) // returns the variable's value, or an empty string when it cannot be read
+{
+	#if defined(_NBL_PLATFORM_WINDOWS_)
+	char* value = nullptr;
+	size_t size = 0;
+	if (_dupenv_s(&value,&size,std::string(name).c_str()) || !value) // non-zero return or null buffer: treat as unset
+		return {};
+	std::string result(value);
+	std::free(value); // the buffer handed out by _dupenv_s is released here
+	return result;
+	#else
+	if (const char* value = std::getenv(std::string(name).c_str())) // the std::string copy provides a null-terminated name
+		return value;
+	return {};
+	#endif
+}
+
+bool isDirectory(const system::path& path) // true when path exists and is a directory; uses the error_code overloads, so it does not throw on filesystem errors
+{
+	std::error_code error;
+	return std::filesystem::exists(path,error) && std::filesystem::is_directory(path,error);
+}
+
+bool isRegularFile(const system::path& path) // true when path exists and is a regular file; uses the error_code overloads, so it does not throw on filesystem errors
+{
+	std::error_code error;
+	return std::filesystem::exists(path,error) && std::filesystem::is_regular_file(path,error);
+}
+
+system::path normalizedAbsolute(system::path path) // returns the lexically normalized absolute form of path
+{
+	std::error_code error;
+	auto absolute = std::filesystem::absolute(path,error);
+	if (error) // absolute() failed: keep the path as given
+		absolute = std::move(path);
+	return absolute.lexically_normal();
+}
+
+bool looksLikeCUDAIncludeDir(const system::path& path) // heuristic: an existing directory that contains at least one of the files probed below
+{
+	if (!isDirectory(path))
+		return false;
+
+	return isRegularFile(path/"cuda_fp16.h") ||
+		isRegularFile(path/"cuda_runtime_api.h") ||
+		isRegularFile(path/"vector_types.h") ||
+		isRegularFile(path/"cuda.h") ||
+		isRegularFile(path/"nv"/"target");
+}
+
+void appendIncludeDir(core::vector<system::path>& includeDirs, system::path path) // appends path once, and only when it passes looksLikeCUDAIncludeDir
+{
+	if (path.empty() || !looksLikeCUDAIncludeDir(path)) // candidates failing the heuristic are dropped silently
+		return;
+
+	path = normalizedAbsolute(std::move(path)); // stored entries are absolute and lexically normalized
+	const auto pathString = path.generic_string();
+	const auto alreadyAdded = std::find_if(includeDirs.begin(),includeDirs.end(),[&](const system::path& existing) {
+		return existing.generic_string()==pathString; // duplicates are detected by comparing generic strings
+	});
+	if (alreadyAdded==includeDirs.end())
+		includeDirs.push_back(std::move(path));
+}
+
+void appendCUDAIncludeRoot(core::vector<system::path>& includeDirs, const system::path& root) // offers both root and root/"include" as include directory candidates
+{
+	if (root.empty())
+		return;
+
+	appendIncludeDir(includeDirs,root);
+	appendIncludeDir(includeDirs,root/"include");
+}
+
+core::vector<std::string> parseStringArray(std::string_view text, std::string_view key) // collects the quoted strings of the first [...] array after "key"; empty result when the key or either bracket is missing
+{
+	core::vector<std::string> values;
+	const std::string quotedKey = "\"" + std::string(key) + "\"";
+	const auto keyPos = text.find(quotedKey);
+	if (keyPos==std::string_view::npos)
+		return values;
+
+	const auto arrayBegin = text.find('[',keyPos+quotedKey.size());
+	if (arrayBegin==std::string_view::npos)
+		return values;
+	const auto arrayEnd = text.find(']',arrayBegin+1); // NOTE(review): the first ']' ends the array even when it sits inside a quoted value
+	if (arrayEnd==std::string_view::npos)
+		return values;
+
+	for (auto pos = arrayBegin+1; pos<arrayEnd;)
+	{
+		const auto quoteBegin = text.find('"',pos);
+		if (quoteBegin==std::string_view::npos || quoteBegin>=arrayEnd) // no further string inside the array
+			break;
+
+		std::string value;
+		auto cursor = quoteBegin+1;
+		for (; cursor<arrayEnd; ++cursor)
+		{
+			const char c = text[cursor];
+			if (c=='\\') // backslash escape: the following character is copied verbatim, no JSON escape decoding
+			{
+				if (++cursor<arrayEnd)
+					value.push_back(text[cursor]);
+				continue;
+			}
+			if (c=='"') // unescaped quote closes the string
+				break;
+			value.push_back(c);
+		}
+
+		values.push_back(std::move(value));
+		pos = cursor+1; // resume after the closing quote
+	}
+
+	return values;
+}
+
+void appendRuntimePathsConfig(core::vector<system::path>& includeDirs, const system::path& configFile) // reads configFile and appends every "cudaRuntimeIncludeDirs" entry; missing or unreadable files are ignored
+{
+	if (!isRegularFile(configFile))
+		return;
+
+	std::ifstream input(configFile);
+	if (!input)
+		return;
+
+	std::stringstream buffer;
+	buffer << input.rdbuf(); // read the whole file into memory
+	for (const auto& path : parseStringArray(buffer.str(),"cudaRuntimeIncludeDirs"))
+		appendIncludeDir(includeDirs,system::path(path));
+}
+
+void appendRuntimePathsConfigEnv(core::vector<system::path>& includeDirs, std::string_view name) // treats environment variable `name` as a separator-delimited list of runtime JSON files and loads each one
+{
+	const auto value = readEnvironmentVariable(name);
+	if (value.empty())
+		return;
+
+	#if defined(_NBL_PLATFORM_WINDOWS_)
+	constexpr char Separator = ';';
+	#else
+	constexpr char Separator = ':';
+	#endif
+
+	size_t begin = 0;
+	while (begin<value.size())
+	{
+		const auto end = value.find(Separator,begin);
+		const auto segment = value.substr(begin,end==std::string::npos ? std::string::npos:end-begin); // last segment runs to the end of the string
+		appendRuntimePathsConfig(includeDirs,system::path(segment));
+		if (end==std::string::npos)
+			break;
+		begin = end+1;
+	}
+}
+
+void appendRuntimePathsConfigs(core::vector<system::path>& includeDirs, const core::vector<system::path>& explicitRuntimePathFiles) // loads runtime JSON files in order: explicit files, environment variables, executable directory, then (Windows) Nabla module directories
+{
+	for (const auto& runtimePathFile : explicitRuntimePathFiles)
+		appendRuntimePathsConfig(includeDirs,runtimePathFile);
+
+	appendRuntimePathsConfigEnv(includeDirs,"NBL_CUDA_INTEROP_RUNTIME_JSON");
+	appendRuntimePathsConfigEnv(includeDirs,"Nabla_CUDA_INTEROP_RUNTIME_JSON");
+
+	const auto exeDir = system::executableDirectory();
+	if (!exeDir.empty())
+		appendRuntimePathsConfig(includeDirs,exeDir/RuntimePathsFileName);
+
+	#if defined(_NBL_PLATFORM_WINDOWS_)
+	const auto releaseModuleDir = system::loadedModuleDirectory("Nabla.dll"); // presumably the directory of the loaded Nabla.dll; helper is defined outside this view
+	if (!releaseModuleDir.empty())
+		appendRuntimePathsConfig(includeDirs,releaseModuleDir/RuntimePathsFileName);
+	const auto debugModuleDir = system::loadedModuleDirectory("Nabla_debug.dll");
+	if (!debugModuleDir.empty())
+		appendRuntimePathsConfig(includeDirs,debugModuleDir/RuntimePathsFileName);
+	#endif
+}
+
+void appendAppLocalIncludeDirs(core::vector<system::path>& includeDirs) // probes fixed include locations relative to the executable directory
+{
+	const auto exeDir = system::executableDirectory();
+	if (exeDir.empty())
+		return;
+
+	appendIncludeDir(includeDirs,exeDir/"cuda"/"include");
+	appendIncludeDir(includeDirs,exeDir/"nvidia"/"cu13"/"include");
+	appendIncludeDir(includeDirs,exeDir/"Libraries"/"cuda"/"include");
+	appendIncludeDir(includeDirs,exeDir.parent_path()/"cuda"/"include"); // one level above the executable directory
+}
+
+void appendPythonPackageIncludeDirs(core::vector<system::path>& includeDirs, const system::path& root) // probes fixed include locations under an environment prefix (callers pass VIRTUAL_ENV and CONDA_PREFIX)
+{
+	if (root.empty())
+		return;
+
+	appendIncludeDir(includeDirs,root/"Lib"/"site-packages"/"nvidia"/"cu13"/"include");
+	appendIncludeDir(includeDirs,root/"lib"/"site-packages"/"nvidia"/"cu13"/"include");
+	appendIncludeDir(includeDirs,root/"Library"/"include");
+	appendIncludeDir(includeDirs,root/"include");
+}
+
+void appendPathListEnv(core::vector<system::path>& includeDirs, std::string_view name) // treats environment variable `name` as a separator-delimited list of include directories and offers each one
+{
+	const auto value = readEnvironmentVariable(name);
+	if (value.empty())
+		return;
+
+	#if defined(_NBL_PLATFORM_WINDOWS_)
+	constexpr char Separator = ';';
+	#else
+	constexpr char Separator = ':';
+	#endif
+
+	size_t begin = 0;
+	while (begin<value.size())
+	{
+		const auto end = value.find(Separator,begin);
+		const auto segment = value.substr(begin,end==std::string::npos ? std::string::npos:end-begin); // last segment runs to the end of the string
+		appendIncludeDir(includeDirs,system::path(segment)); // empty segments are rejected inside appendIncludeDir
+		if (end==std::string::npos)
+			break;
+		begin = end+1;
+	}
+}
+
+void appendEnvironmentIncludeDirs(core::vector<system::path>& includeDirs) // environment-driven candidates: explicit directory lists, toolkit root variables, then Python/conda prefixes
+{
+	appendPathListEnv(includeDirs,"NBL_CUDA_RUNTIME_INCLUDE_DIRS");
+	appendPathListEnv(includeDirs,"Nabla_CUDA_RUNTIME_INCLUDE_DIRS");
+
+	appendCUDAIncludeRoot(includeDirs,readEnvironmentVariable("CUDA_PATH"));
+	appendCUDAIncludeRoot(includeDirs,readEnvironmentVariable("CUDA_HOME"));
+	appendCUDAIncludeRoot(includeDirs,readEnvironmentVariable("CUDA_ROOT"));
+	appendCUDAIncludeRoot(includeDirs,readEnvironmentVariable("CUDAToolkit_ROOT"));
+
+	appendPythonPackageIncludeDirs(includeDirs,readEnvironmentVariable("VIRTUAL_ENV"));
+	appendPythonPackageIncludeDirs(includeDirs,readEnvironmentVariable("CONDA_PREFIX"));
+}
+
+void appendCUDAInstallRoots(core::vector<system::path>& includeDirs, const system::path& root) // offers <root>/<child>/include for every child directory of root, highest name first
+{
+	if (!isDirectory(root))
+		return;
+
+	core::vector<system::path> candidates;
+	std::error_code error;
+	// Explicit iterator: increment(error) reports traversal failures without throwing, unlike the operator++ used by range-for.
+	for (std::filesystem::directory_iterator entry(root,error), last; !error && entry!=last; entry.increment(error))
+	{
+		std::error_code entryError; // a status failure on one entry (e.g. dangling symlink) must not end the whole scan
+		if (!entry->is_directory(entryError))
+			continue;
+		candidates.push_back(entry->path()/"include");
+	}
+
+	std::sort(candidates.begin(),candidates.end(),[](const system::path& lhs, const system::path& rhs) {
+		return lhs.generic_string()>rhs.generic_string(); // descending lexical order; NOTE(review): "v9.0" therefore sorts above "v13.0"
+	});
+	for (const auto& candidate : candidates)
+		appendIncludeDir(includeDirs,candidate);
+}
+
+void appendSystemIncludeDirs(core::vector<system::path>& includeDirs) // last-resort candidates: hard-coded system-wide install locations per platform
+{
+	#if defined(_NBL_PLATFORM_WINDOWS_)
+	appendCUDAInstallRoots(includeDirs,"C:/Program Files/NVIDIA GPU Computing Toolkit/CUDA");
+	#else
+	appendIncludeDir(includeDirs,"/usr/local/cuda/include");
+	appendCUDAInstallRoots(includeDirs,"/usr/local"); // scans every child directory of /usr/local for an include folder that passes the heuristic
+	appendIncludeDir(includeDirs,"/usr/include");
+	#endif
+}
+
+}
+
+SRuntimeCompileEnvironment findRuntimeCompileEnvironment(core::vector<system::path> explicitIncludeDirs, core::vector<system::path> runtimePathFiles) // builds the include directory list; earlier sources come first and later duplicates are skipped by appendIncludeDir
+{
+	SRuntimeCompileEnvironment environment;
+	environment.runtimePathFiles = std::move(runtimePathFiles); // only the caller-supplied files are recorded here
+	for (auto& includeDir : explicitIncludeDirs) // explicit directories are considered before any discovered ones
+		appendIncludeDir(environment.includeDirs,std::move(includeDir));
+
+	appendRuntimePathsConfigs(environment.includeDirs,environment.runtimePathFiles);
+	appendAppLocalIncludeDirs(environment.includeDirs);
+	appendEnvironmentIncludeDirs(environment.includeDirs);
+	appendSystemIncludeDirs(environment.includeDirs);
+
+	return environment;
+}
+
+SRuntimeCompileEnvironment findRuntimeCompileEnvironment(core::vector<system::path> explicitIncludeDirs) // convenience overload: no explicit runtime JSON files
+{
+	return findRuntimeCompileEnvironment(std::move(explicitIncludeDirs),{});
+}
+
+core::vector<std::string> makeNVRTCIncludeOptions(const SRuntimeCompileEnvironment& environment) // returns one "-I<dir>" string per include directory, in list order
+{
+	core::vector<std::string> options;
+	for (const auto& includeDir : environment.includeDirs)
+		options.push_back("-I" + includeDir.generic_string()); // generic_string() yields forward-slash separators
+	return options;
+}
+
+}
 
 #ifdef _NBL_COMPILE_WITH_CUDA_
 #include "CUDAInteropNativeState.hpp"
@@ -671,7 +989,18 @@ static ptx_and_nvrtcResult_t compileDirectlyToPTX_impl(CCUDAHandler& handler, nv
 	if (result!=NVRTC_SUCCESS)
 		return {nullptr,result};
 
-	result = compileProgram(handler,program,nvrtcOptions);
+	const auto runtimeEnvironment = cuda_interop::findRuntimeCompileEnvironment();
+	const auto runtimeIncludeOptions = cuda_interop::makeNVRTCIncludeOptions(runtimeEnvironment);
+	core::vector<const char*> options;
+	options.reserve(nvrtcOptions.size()+runtimeIncludeOptions.size());
+	for (const auto option : nvrtcOptions)
+		options.push_back(option);
+	for (const auto& option : runtimeIncludeOptions)
+		options.push_back(option.c_str());
+
+	const auto* optionsBegin = options.empty() ? nullptr:options.data();
+	const auto* optionsEnd = options.empty() ? nullptr:optionsBegin+options.size();
+	result = compileProgram(handler,program,{optionsBegin,optionsEnd});
 	if (log)
 		getProgramLog(handler,program,*log);
 	if (result!=NVRTC_SUCCESS)
diff --git a/src/nbl/ext/CUDAInterop/CMakeLists.txt b/src/nbl/ext/CUDAInterop/CMakeLists.txt
index 438ab51d8f..a9e1663fa9 100644
--- a/src/nbl/ext/CUDAInterop/CMakeLists.txt
+++ b/src/nbl/ext/CUDAInterop/CMakeLists.txt
@@ -1,13 +1,22 @@
-include(${NBL_ROOT_PATH}/cmake/common.cmake)
+include(common)
+include(NablaCUDAInteropHelpers)
 
 if (NBL_COMPILE_WITH_CUDA)
 	set(NBL_EXT_CUDA_INTEROP_LIB "NblExtCUDA_INTEROP")
 
-	add_library(${NBL_EXT_CUDA_INTEROP_LIB} INTERFACE)
+	file(GLOB NBL_EXT_CUDA_INTEROP_IDE_HEADERS CONFIGURE_DEPENDS "${NBL_ROOT_PATH}/include/nbl/ext/CUDAInterop/*.h")
+	set(NBL_EXT_CUDA_INTEROP_IDE_SOURCES
+		${NBL_EXT_CUDA_INTEROP_IDE_HEADERS}
+		CMakeLists.txt
+		README.md
+	)
+	set_source_files_properties(${NBL_EXT_CUDA_INTEROP_IDE_SOURCES} PROPERTIES HEADER_FILE_ONLY TRUE)
+
+	# Header-only opt-in target. It builds no artifact and adds CUDA SDK usage requirements only for native interop consumers.
+	add_library(${NBL_EXT_CUDA_INTEROP_LIB} INTERFACE EXCLUDE_FROM_ALL ${NBL_EXT_CUDA_INTEROP_IDE_SOURCES})
 	target_link_libraries(${NBL_EXT_CUDA_INTEROP_LIB} INTERFACE
-		$<BUILD_INTERFACE:Nabla>
-		$<BUILD_INTERFACE:CUDA::toolkit>
-		$<INSTALL_INTERFACE:Nabla::Nabla>
+		Nabla
+		CUDA::toolkit
 	)
 	set_target_properties(${NBL_EXT_CUDA_INTEROP_LIB} PROPERTIES EXPORT_NAME "ext::CUDAInterop")
 	add_library(Nabla::ext::CUDAInterop ALIAS ${NBL_EXT_CUDA_INTEROP_LIB})
diff --git a/src/nbl/ext/CUDAInterop/README.md b/src/nbl/ext/CUDAInterop/README.md
index cf3a89cdd1..837f3ab28e 100644
--- a/src/nbl/ext/CUDAInterop/README.md
+++ b/src/nbl/ext/CUDAInterop/README.md
@@ -2,8 +2,12 @@
 
 - `Nabla::Nabla` owns the CUDA interop implementation and exported symbols.
 - `Nabla::Nabla` public headers do not include `cuda.h` or `nvrtc.h`.
+- The SDK-free interop headers stay stable for CUDA ON and CUDA OFF Nabla builds.
 - `Nabla::ext::CUDAInterop` is the explicit raw CUDA Driver API and NVRTC opt-in target.
+- `Nabla::ext::CUDAInterop` is an `INTERFACE` target. It does not build a library or executable artifact.
+- The target only carries usage requirements and IDE-visible sources.
 - `Nabla::ext::CUDAInterop` requires `CUDAToolkit` and exposes `CUDAInteropNative.h`.
+- `CUDAInteropNative.h` is the small opt-in header that includes CUDA SDK headers such as `cuda.h` and `nvrtc.h`.
 - Consumers can override the SDK root with `-DNabla_CUDA_TOOLKIT_ROOT=<cuda-root>` when requesting `CUDAInterop`.
 - Native accessors accept Nabla objects, raw pointers, and `smart_refctd_ptr`.
 
@@ -16,12 +20,28 @@ target_link_libraries(app PRIVATE Nabla::Nabla)
 
 ```cmake
 find_package(Nabla CONFIG REQUIRED COMPONENTS CUDAInterop)
-target_link_libraries(native_app PRIVATE Nabla::ext::CUDAInterop)
+nbl_target_link_cuda_interop(native_app PRIVATE)
+```
+
+```cmake
+find_package(Nabla CONFIG REQUIRED COMPONENTS CUDAInterop)
+nbl_target_link_cuda_interop(native_app PRIVATE
+    INCLUDE_DIRS "${cuda_runtime_headers}"
+)
+```
+
+```cmake
+nbl_target_link_cuda_interop(native_app PRIVATE
+    RUNTIME_JSON "${CMAKE_CURRENT_BINARY_DIR}/$<CONFIG>/my_cuda_runtime.json"
+)
 ```
 
 ```cpp
 #include "nbl/ext/CUDAInterop/CUDAInteropNative.h"
 
+auto runtimeEnv = nbl::video::cuda_interop::findRuntimeCompileEnvironment();
+auto includeOptions = nbl::video::cuda_interop::makeNVRTCIncludeOptions(runtimeEnv);
+
 auto memory = nbl::video::cuda_native::createExportableMemory(cudaDevice, {
     .size = size,
     .alignment = alignment,
@@ -29,6 +49,23 @@ auto memory = nbl::video::cuda_native::createExportableMemory(cudaDevice, {
 });
 ```
 
+## Runtime Header Discovery
+
+- `nbl_target_link_cuda_interop(<target> <scope>)` links `Nabla::ext::CUDAInterop` and configures runtime include discovery for that target.
+- The helper is defined once in `NablaCUDAInteropHelpers.cmake` and is available from the source tree and installed `NablaConfig.cmake`.
+- For each target it writes `nbl_cuda_interop_runtime.json` next to the executable during CMake generation.
+- `RUNTIME_JSON <path>` overrides the generated JSON location. Plain paths and `$<CONFIG>` are supported.
+- `cuda_interop::findRuntimeCompileEnvironment` can also receive explicit JSON paths at runtime.
+- `NBL_CUDA_INTEROP_RUNTIME_JSON` can point runtime discovery at custom JSON files without rebuilding the application.
+- The JSON is a build artifact. Nabla packages do not install JSON files with host-specific CUDA paths.
+- Package consumers generate their own JSON when they call `nbl_target_link_cuda_interop`.
+- Runtime lookup reads `nbl_cuda_interop_runtime.json` first, then checks app-local include bundles, explicit environment variables, `CUDA_PATH`-style toolkit roots, Python/conda package layouts, and common system install roots.
+- `cuda_native::compileDirectlyToPTX` appends discovered include directories to the NVRTC option list.
+- Production machines do not need the full CUDA SDK just because Nabla was built with CUDA.
+- If an application compiles CUDA source with NVRTC and includes headers such as `cuda_fp16.h`, it must provide those runtime headers through the generated JSON path, an app-local bundle, a runtime/header package, or an installed toolkit.
+- `CUDA_PATH` is a developer fallback. It is not required for packaged applications.
+- Direct `target_link_libraries(app PRIVATE Nabla::ext::CUDAInterop)` remains possible, but it only adds compile/link usage requirements and does not create the runtime discovery JSON.
+
 ## Properties
 
 - Consumers that only link `Nabla::Nabla` do not need CUDA SDK headers to parse Nabla headers.
@@ -38,12 +75,17 @@ auto memory = nbl::video::cuda_native::createExportableMemory(cudaDevice, {
 - The exported native ABI uses stable CUDA Driver API handles/enums and small Nabla-owned parameter structs.
 - A package built with one compatible CUDA SDK can be consumed by native interop code built with another compatible SDK without rebuilding Nabla.
 - `CCUDAHandler::create` validates the loaded CUDA driver and NVRTC runtime. It returns `nullptr` when the runtime is missing or below the required CUDA 13.0 / NVRTC 13.x floor.
+- Runtime CUDA header discovery is independent from the CUDA SDK used to build Nabla.
+- Native consumers can use a newer compatible CUDA SDK or a runtime/header package without rebuilding Nabla.
+- Toggling Nabla CUDA support does not change SDK-free public header parse requirements for consumers.
 - The Nabla source list is stable. CUDA interop `.cpp` files stay visible in IDE projects for CUDA ON and CUDA OFF builds.
-- CUDA OFF implementations are local stubs in the same `.cpp` files. SDK-free API entry points stay linkable and return `nullptr` for unavailable CUDA features instead of producing unresolved symbols.
-- CUDA implementation headers and SDK includes stay behind `_NBL_COMPILE_WITH_CUDA_`, so CUDA OFF builds do not need `cuda.h` or `nvrtc.h`.
+- CUDA OFF implementations are local stubs in the same `.cpp` files. SDK-free API entry points stay linkable and factory/import/export paths return `nullptr` for unavailable CUDA features instead of producing unresolved symbols.
+- CUDA implementation headers and SDK includes stay behind `_NBL_COMPILE_WITH_CUDA_`.
 
 ## Related Designs
 
+This split follows the same public-boundary pattern used by mature GPU projects: SDK-free default headers, native access through an explicit opt-in path, and SDK-dependent implementation details outside the default public API.
+
 - OpenCV keeps common CUDA-facing headers independent from CUDA Runtime API and exposes raw `cudaStream_t` / `cudaEvent_t` through a separate accessor header: [`cuda_stream_accessor.hpp`](https://github.com/opencv/opencv/blob/808d2d596c475d95fedb6025c9ed425d62bba04c/modules/core/include/opencv2/core/cuda_stream_accessor.hpp#L50-L79).
 - OpenCV keeps CUDA implementation headers private and includes `cuda.h`, `cuda_runtime.h`, and NPP there: [`private.cuda.hpp`](https://github.com/opencv/opencv/blob/808d2d596c475d95fedb6025c9ed425d62bba04c/modules/core/include/opencv2/core/private.cuda.hpp#L47-L61).
 - Blender/Cycles exposes a CUDA device boundary without CUDA SDK headers in the boundary header: [`device.h`](https://github.com/blender/blender/blob/794c527e8595a9f448e0143a217d0ceb648c5e7e/intern/cycles/device/cuda/device.h#L7-L27).
diff --git a/src/nbl/ext/CUDAInterop/smoke/CMakeLists.txt b/src/nbl/ext/CUDAInterop/smoke/CMakeLists.txt
index bdda95fb03..7118eeff09 100644
--- a/src/nbl/ext/CUDAInterop/smoke/CMakeLists.txt
+++ b/src/nbl/ext/CUDAInterop/smoke/CMakeLists.txt
@@ -2,6 +2,7 @@ cmake_minimum_required(VERSION 3.30)
 project(NblExtCUDAInteropSmoke CXX)
 
 option(NBL_CUDA_INTEROP_SMOKE_WITH_NATIVE "Build the CUDA native opt-in smoke from an installed Nabla package." OFF)
+set(NBL_CUDA_INTEROP_SMOKE_RUNTIME_JSON "" CACHE FILEPATH "Optional CUDA interop runtime JSON path used by the native smoke.")
 
 if(NOT TARGET Nabla::Nabla)
 	set(_NBL_CUDA_INTEROP_SMOKE_COMPONENTS Core)
@@ -28,5 +29,10 @@ target_link_libraries(NblExtCUDAInteropCleanNablaSmoke PRIVATE Nabla::Nabla)
 
 if(TARGET Nabla::ext::CUDAInterop)
 	nbl_add_cuda_interop_smoke(NblExtCUDAInteropNativeOptInSmoke native_opt_in.cpp)
-	target_link_libraries(NblExtCUDAInteropNativeOptInSmoke PRIVATE Nabla::ext::CUDAInterop)
+	set(_nbl_cuda_interop_smoke_args PRIVATE)
+	if(NBL_CUDA_INTEROP_SMOKE_RUNTIME_JSON)
+		list(APPEND _nbl_cuda_interop_smoke_args RUNTIME_JSON "${NBL_CUDA_INTEROP_SMOKE_RUNTIME_JSON}")
+		target_compile_definitions(NblExtCUDAInteropNativeOptInSmoke PRIVATE NBL_CUDA_INTEROP_SMOKE_RUNTIME_JSON="${NBL_CUDA_INTEROP_SMOKE_RUNTIME_JSON}")
+	endif()
+	nbl_target_link_cuda_interop(NblExtCUDAInteropNativeOptInSmoke ${_nbl_cuda_interop_smoke_args})
 endif()
diff --git a/src/nbl/ext/CUDAInterop/smoke/native_opt_in.cpp b/src/nbl/ext/CUDAInterop/smoke/native_opt_in.cpp
index 6dda3d275e..3b799a56cf 100644
--- a/src/nbl/ext/CUDAInterop/smoke/native_opt_in.cpp
+++ b/src/nbl/ext/CUDAInterop/smoke/native_opt_in.cpp
@@ -4,6 +4,7 @@
 #include <algorithm>
 #include <array>
 #include <cstdint>
+#include <filesystem>
 #include <type_traits>
 #include <utility>
 
@@ -82,6 +83,30 @@ bool cudaDriverRoundtrip(CCUDAHandler& handler, CUdevice device)
 	releaseContext();
 	return ok && std::ranges::equal(input, output);
 }
+
+bool cudaFp16HeaderCompileProbe(CCUDAHandler& handler) // NVRTC probe: true only when a kernel that includes <cuda_fp16.h> compiles to non-empty PTX
+{
+	constexpr const char* Source = R"cuda(
+		#include <cuda_fp16.h>
+		extern "C" __global__ void fp16_probe(unsigned short* out)
+		{
+			out[0] = sizeof(__half);
+		}
+	)cuda";
+
+	std::string log; // passed below as the compile-log sink; its contents are never inspected in this function
+	auto [ptx, result] = cuda_native::compileDirectlyToPTX(
+		handler,
+		Source,
+		"cuda_fp16_discovery_probe.cu",
+		{nullptr,nullptr}, // NOTE(review): presumably an empty NVRTC option range, so finding the header relies on the include dirs compileDirectlyToPTX appends itself - confirm
+		0, // headerCount: no in-memory headers are supplied, hence the two null arrays that follow
+		nullptr,
+		nullptr,
+		&log
+	);
+	return result==NVRTC_SUCCESS && ptx && ptx->getSize()>0u; // requires the success code AND a non-null, non-empty PTX blob
+}
 }
 
 class CUDAInteropNativeOptInSmoke final : public nbl::system::IApplicationFramework
@@ -98,10 +123,27 @@ class CUDAInteropNativeOptInSmoke final : public nbl::system::IApplicationFramew
 
 		static_assert(std::is_same_v<decltype(nbl::video::cuda_native::getInternalObject(std::declval<const nbl::video::CCUDADevice&>())), CUdevice>);
 
+		#ifdef NBL_CUDA_INTEROP_SMOKE_RUNTIME_JSON
+		const auto runtimeEnvironment = nbl::video::cuda_interop::findRuntimeCompileEnvironment({}, {NBL_CUDA_INTEROP_SMOKE_RUNTIME_JSON});
+		if (!std::filesystem::exists(NBL_CUDA_INTEROP_SMOKE_RUNTIME_JSON))
+			return false;
+		#else
+		const auto runtimeEnvironment = nbl::video::cuda_interop::findRuntimeCompileEnvironment();
+		#endif
+		const auto includeOptions = nbl::video::cuda_interop::makeNVRTCIncludeOptions(runtimeEnvironment);
+		const auto hasRuntimeHeaders = std::find_if(runtimeEnvironment.includeDirs.begin(),runtimeEnvironment.includeDirs.end(),[](const auto& includeDir) {
+			return std::filesystem::exists(includeDir/"cuda_fp16.h") || std::filesystem::exists(includeDir/"cuda_runtime_api.h");
+		})!=runtimeEnvironment.includeDirs.end();
+		if (includeOptions.empty() || !hasRuntimeHeaders)
+			return false;
+
 		auto handler = nbl::video::CCUDAHandler::create(nullptr, nullptr);
 		if (!handler)
 			return true;
 
+		if (!cudaFp16HeaderCompileProbe(*handler))
+			return false;
+
 		const auto& devices = nbl::video::cuda_native::getAvailableDevices(handler);
 		if (devices.empty())
 			return true;

From 045432e616810403aa55d1232cd57fbbcc6dc8d1 Mon Sep 17 00:00:00 2001
From: Arkadiusz Lachowicz <areklachowicz@gmail.com>
Date: Thu, 7 May 2026 12:47:01 +0200
Subject: [PATCH 17/27] Tighten CUDA interop native helpers

---
 cmake/NablaCUDAInteropHelpers.cmake           |  30 +--
 .../nbl/ext/CUDAInterop/CUDAInteropNative.h   | 255 ++++++++----------
 src/nbl/ext/CUDAInterop/CCUDAHandler.cpp      |  81 ++----
 src/nbl/ext/CUDAInterop/README.md             |  27 ++
 4 files changed, 178 insertions(+), 215 deletions(-)

diff --git a/cmake/NablaCUDAInteropHelpers.cmake b/cmake/NablaCUDAInteropHelpers.cmake
index 6486789aeb..9c1ac657d4 100644
--- a/cmake/NablaCUDAInteropHelpers.cmake
+++ b/cmake/NablaCUDAInteropHelpers.cmake
@@ -21,8 +21,7 @@ endfunction()
 
 function(_nbl_cuda_interop_make_runtime_paths_json _OUT_CONTENT)
 	set(_include_dirs ${ARGN})
-	set(_json "{\n  \"cudaRuntimeIncludeDirs\": [")
-	set(_first ON)
+	set(_cuda_runtime_include_dir_entries "")
 
 	foreach(_include_dir IN LISTS _include_dirs)
 		if("${_include_dir}" STREQUAL "")
@@ -32,21 +31,22 @@ function(_nbl_cuda_interop_make_runtime_paths_json _OUT_CONTENT)
 		file(TO_CMAKE_PATH "${_include_dir}" _include_dir_json)
 		string(REPLACE "\"" "\\\"" _include_dir_json "${_include_dir_json}")
 
-		if(_first)
-			string(APPEND _json "\n")
-			set(_first OFF)
-		else()
-			string(APPEND _json ",\n")
-		endif()
-		string(APPEND _json "    \"${_include_dir_json}\"")
+		list(APPEND _cuda_runtime_include_dir_entries "    \"${_include_dir_json}\"")
 	endforeach()
 
-	if(NOT _first)
-		string(APPEND _json "\n  ]\n}\n")
-	else()
-		string(APPEND _json "]\n}\n")
-	endif()
-
+	set(_json_entry_separator [=[
+,
+]=])
+	list(JOIN _cuda_runtime_include_dir_entries "${_json_entry_separator}" _cuda_runtime_include_dirs)
+
+	set(_json [=[
+{
+  "cudaRuntimeIncludeDirs": [
+@_cuda_runtime_include_dirs@
+  ]
+}
+]=])
+	string(CONFIGURE "${_json}" _json @ONLY)
 	set(${_OUT_CONTENT} "${_json}" PARENT_SCOPE)
 endfunction()
 
diff --git a/include/nbl/ext/CUDAInterop/CUDAInteropNative.h b/include/nbl/ext/CUDAInterop/CUDAInteropNative.h
index dd87d93e43..6833ad8189 100644
--- a/include/nbl/ext/CUDAInterop/CUDAInteropNative.h
+++ b/include/nbl/ext/CUDAInterop/CUDAInteropNative.h
@@ -9,6 +9,11 @@
 #include "nbl/asset/ICPUBuffer.h"
 #include "nbl/system/DynamicFunctionCaller.h"
 
+#include <concepts>
+#include <string>
+#include <type_traits>
+#include <utility>
+
 #include "cuda.h"
 #include "nvrtc.h"
 #if CUDA_VERSION < 13000
@@ -153,27 +158,64 @@ struct SExportableMemoryCreationParams
 	CUmemLocationType location;
 };
 
-NBL_API2 const CUDA& getCUDAFunctionTable(const CCUDAHandler& handler);
-NBL_API2 const NVRTC& getNVRTCFunctionTable(const CCUDAHandler& handler);
+namespace detail
+{
+
+template<typename>
+struct is_smart_refctd_ptr : std::false_type {};
+
+template<typename T>
+struct is_smart_refctd_ptr<core::smart_refctd_ptr<T>> : std::true_type {};
+
+template<typename T>
+inline constexpr bool is_smart_refctd_ptr_v = is_smart_refctd_ptr<std::remove_cvref_t<T>>::value;
 
-inline const CUDA& getCUDAFunctionTable(const CCUDAHandler* handler)
+template<typename T>
+inline constexpr bool is_indirect_object_v = std::is_pointer_v<std::remove_cvref_t<T>> || is_smart_refctd_ptr_v<T>;
+
+template<typename Object>
+decltype(auto) as_ref(Object&& object)
 {
-	return getCUDAFunctionTable(*handler);
+	using object_t = std::remove_cvref_t<Object>;
+	if constexpr (std::is_pointer_v<object_t>)
+		return *object;
+	else if constexpr (is_smart_refctd_ptr_v<Object>)
+		return *object;
+	else
+		return std::forward<Object>(object);
 }
 
-inline const CUDA& getCUDAFunctionTable(const core::smart_refctd_ptr<CCUDAHandler>& handler)
-{
-	return getCUDAFunctionTable(*handler);
+template<typename Object, typename Target>
+concept object_like = is_indirect_object_v<Object> && requires(Object&& object) {
+	{ as_ref(std::forward<Object>(object)) } -> std::convertible_to<Target&>;
+};
+
+template<typename Object, typename Target>
+concept const_object_like = is_indirect_object_v<Object> && requires(Object&& object) {
+	{ as_ref(std::forward<Object>(object)) } -> std::convertible_to<const Target&>;
+};
+
+template<typename Source>
+concept program_text_source = std::same_as<std::remove_cvref_t<Source>, std::string> ||
+	std::convertible_to<Source, const char*>;
+
 }
 
-inline const NVRTC& getNVRTCFunctionTable(const CCUDAHandler* handler)
+NBL_API2 const CUDA& getCUDAFunctionTable(const CCUDAHandler& handler);
+NBL_API2 const NVRTC& getNVRTCFunctionTable(const CCUDAHandler& handler);
+
+template<typename Handler>
+requires detail::const_object_like<Handler, CCUDAHandler>
+inline const CUDA& getCUDAFunctionTable(Handler&& handler)
 {
-	return getNVRTCFunctionTable(*handler);
+	return getCUDAFunctionTable(detail::as_ref(std::forward<Handler>(handler)));
 }
 
-inline const NVRTC& getNVRTCFunctionTable(const core::smart_refctd_ptr<CCUDAHandler>& handler)
+template<typename Handler>
+requires detail::const_object_like<Handler, CCUDAHandler>
+inline const NVRTC& getNVRTCFunctionTable(Handler&& handler)
 {
-	return getNVRTCFunctionTable(*handler);
+	return getNVRTCFunctionTable(detail::as_ref(std::forward<Handler>(handler)));
 }
 
 NBL_API2 bool defaultHandleResult(CUresult result, const system::logger_opt_ptr& logger);
@@ -185,14 +227,11 @@ T* cast_CUDA_ptr(CUdeviceptr ptr) { return reinterpret_cast<T*>(ptr); }
 
 NBL_API2 const core::vector<SCUDADeviceInfo>& getAvailableDevices(const CCUDAHandler& handler);
 
-inline const core::vector<SCUDADeviceInfo>& getAvailableDevices(const CCUDAHandler* handler)
-{
-	return getAvailableDevices(*handler);
-}
-
-inline const core::vector<SCUDADeviceInfo>& getAvailableDevices(const core::smart_refctd_ptr<CCUDAHandler>& handler)
+template<typename Handler>
+requires detail::const_object_like<Handler, CCUDAHandler>
+inline const core::vector<SCUDADeviceInfo>& getAvailableDevices(Handler&& handler)
 {
-	return getAvailableDevices(*handler);
+	return getAvailableDevices(detail::as_ref(std::forward<Handler>(handler)));
 }
 
 NBL_API2 nvrtcResult createProgram(CCUDAHandler& handler, nvrtcProgram* prog, std::string&& source, const char* name, const int headerCount=0, const char* const* headerContents=nullptr, const char* const* includeNames=nullptr);
@@ -201,29 +240,26 @@ inline nvrtcResult createProgram(CCUDAHandler& handler, nvrtcProgram* prog, cons
 	return createProgram(handler,prog,std::string(source),name,headerCount,headerContents,includeNames);
 }
 NBL_API2 nvrtcResult createProgram(CCUDAHandler& handler, nvrtcProgram* prog, system::IFile* file, const int headerCount=0, const char* const* headerContents=nullptr, const char* const* includeNames=nullptr);
-inline nvrtcResult createProgram(CCUDAHandler* handler, nvrtcProgram* prog, std::string&& source, const char* name, const int headerCount=0, const char* const* headerContents=nullptr, const char* const* includeNames=nullptr)
-{
-	return createProgram(*handler,prog,std::move(source),name,headerCount,headerContents,includeNames);
-}
-inline nvrtcResult createProgram(const core::smart_refctd_ptr<CCUDAHandler>& handler, nvrtcProgram* prog, std::string&& source, const char* name, const int headerCount=0, const char* const* headerContents=nullptr, const char* const* includeNames=nullptr)
-{
-	return createProgram(*handler,prog,std::move(source),name,headerCount,headerContents,includeNames);
-}
-inline nvrtcResult createProgram(CCUDAHandler* handler, nvrtcProgram* prog, const char* source, const char* name, const int headerCount=0, const char* const* headerContents=nullptr, const char* const* includeNames=nullptr)
-{
-	return createProgram(*handler,prog,source,name,headerCount,headerContents,includeNames);
-}
-inline nvrtcResult createProgram(const core::smart_refctd_ptr<CCUDAHandler>& handler, nvrtcProgram* prog, const char* source, const char* name, const int headerCount=0, const char* const* headerContents=nullptr, const char* const* includeNames=nullptr)
-{
-	return createProgram(*handler,prog,source,name,headerCount,headerContents,includeNames);
-}
-inline nvrtcResult createProgram(CCUDAHandler* handler, nvrtcProgram* prog, system::IFile* file, const int headerCount=0, const char* const* headerContents=nullptr, const char* const* includeNames=nullptr)
+
+template<typename Handler, typename Source>
+requires detail::object_like<Handler, CCUDAHandler> && detail::program_text_source<Source>
+inline nvrtcResult createProgram(Handler&& handler, nvrtcProgram* prog, Source&& source, const char* name, const int headerCount=0, const char* const* headerContents=nullptr, const char* const* includeNames=nullptr)
 {
-	return createProgram(*handler,prog,file,headerCount,headerContents,includeNames);
+	auto& handlerRef = detail::as_ref(std::forward<Handler>(handler));
+	if constexpr (std::same_as<std::remove_cvref_t<Source>, std::string>)
+		return createProgram(handlerRef,prog,std::string(std::forward<Source>(source)),name,headerCount,headerContents,includeNames);
+	else
+	{
+		const char* sourceText = source;
+		return createProgram(handlerRef,prog,sourceText,name,headerCount,headerContents,includeNames);
+	}
 }
-inline nvrtcResult createProgram(const core::smart_refctd_ptr<CCUDAHandler>& handler, nvrtcProgram* prog, system::IFile* file, const int headerCount=0, const char* const* headerContents=nullptr, const char* const* includeNames=nullptr)
+
+template<typename Handler, typename File>
+requires detail::object_like<Handler, CCUDAHandler> && std::convertible_to<File, system::IFile*>
+inline nvrtcResult createProgram(Handler&& handler, nvrtcProgram* prog, File file, const int headerCount=0, const char* const* headerContents=nullptr, const char* const* includeNames=nullptr)
 {
-	return createProgram(*handler,prog,file,headerCount,headerContents,includeNames);
+	return createProgram(detail::as_ref(std::forward<Handler>(handler)),prog,static_cast<system::IFile*>(file),headerCount,headerContents,includeNames);
 }
 NBL_API2 nvrtcResult compileProgram(const CCUDAHandler& handler, nvrtcProgram prog, core::SRange<const char* const> options);
 NBL_API2 nvrtcResult getProgramLog(const CCUDAHandler& handler, nvrtcProgram prog, std::string& log);
@@ -253,53 +289,34 @@ NBL_API2 ptx_and_nvrtcResult_t compileDirectlyToPTX(
 	const int headerCount=0, const char* const* headerContents=nullptr, const char* const* includeNames=nullptr,
 	std::string* log=nullptr
 );
+
+template<typename Handler, typename Source>
+requires detail::object_like<Handler, CCUDAHandler> && detail::program_text_source<Source>
 inline ptx_and_nvrtcResult_t compileDirectlyToPTX(
-	CCUDAHandler* handler, std::string&& source, const char* filename, core::SRange<const char* const> nvrtcOptions,
-	const int headerCount=0, const char* const* headerContents=nullptr, const char* const* includeNames=nullptr,
-	std::string* log=nullptr
-)
-{
-	return compileDirectlyToPTX(*handler,std::move(source),filename,nvrtcOptions,headerCount,headerContents,includeNames,log);
-}
-inline ptx_and_nvrtcResult_t compileDirectlyToPTX(
-	const core::smart_refctd_ptr<CCUDAHandler>& handler, std::string&& source, const char* filename, core::SRange<const char* const> nvrtcOptions,
-	const int headerCount=0, const char* const* headerContents=nullptr, const char* const* includeNames=nullptr,
-	std::string* log=nullptr
-)
-{
-	return compileDirectlyToPTX(*handler,std::move(source),filename,nvrtcOptions,headerCount,headerContents,includeNames,log);
-}
-inline ptx_and_nvrtcResult_t compileDirectlyToPTX(
-	CCUDAHandler* handler, const char* source, const char* filename, core::SRange<const char* const> nvrtcOptions,
-	const int headerCount=0, const char* const* headerContents=nullptr, const char* const* includeNames=nullptr,
-	std::string* log=nullptr
-)
-{
-	return compileDirectlyToPTX(*handler,source,filename,nvrtcOptions,headerCount,headerContents,includeNames,log);
-}
-inline ptx_and_nvrtcResult_t compileDirectlyToPTX(
-	const core::smart_refctd_ptr<CCUDAHandler>& handler, const char* source, const char* filename, core::SRange<const char* const> nvrtcOptions,
-	const int headerCount=0, const char* const* headerContents=nullptr, const char* const* includeNames=nullptr,
-	std::string* log=nullptr
-)
-{
-	return compileDirectlyToPTX(*handler,source,filename,nvrtcOptions,headerCount,headerContents,includeNames,log);
-}
-inline ptx_and_nvrtcResult_t compileDirectlyToPTX(
-	CCUDAHandler* handler, system::IFile* file, core::SRange<const char* const> nvrtcOptions,
+	Handler&& handler, Source&& source, const char* filename, core::SRange<const char* const> nvrtcOptions,
 	const int headerCount=0, const char* const* headerContents=nullptr, const char* const* includeNames=nullptr,
 	std::string* log=nullptr
 )
 {
-	return compileDirectlyToPTX(*handler,file,nvrtcOptions,headerCount,headerContents,includeNames,log);
+	auto& handlerRef = detail::as_ref(std::forward<Handler>(handler));
+	if constexpr (std::same_as<std::remove_cvref_t<Source>, std::string>)
+		return compileDirectlyToPTX(handlerRef,std::string(std::forward<Source>(source)),filename,nvrtcOptions,headerCount,headerContents,includeNames,log);
+	else
+	{
+		const char* sourceText = source;
+		return compileDirectlyToPTX(handlerRef,sourceText,filename,nvrtcOptions,headerCount,headerContents,includeNames,log);
+	}
 }
+
+template<typename Handler, typename File>
+requires detail::object_like<Handler, CCUDAHandler> && std::convertible_to<File, system::IFile*>
 inline ptx_and_nvrtcResult_t compileDirectlyToPTX(
-	const core::smart_refctd_ptr<CCUDAHandler>& handler, system::IFile* file, core::SRange<const char* const> nvrtcOptions,
+	Handler&& handler, File file, core::SRange<const char* const> nvrtcOptions,
 	const int headerCount=0, const char* const* headerContents=nullptr, const char* const* includeNames=nullptr,
 	std::string* log=nullptr
 )
 {
-	return compileDirectlyToPTX(*handler,file,nvrtcOptions,headerCount,headerContents,includeNames,log);
+	return compileDirectlyToPTX(detail::as_ref(std::forward<Handler>(handler)),static_cast<system::IFile*>(file),nvrtcOptions,headerCount,headerContents,includeNames,log);
 }
 
 NBL_API2 CUdevice getInternalObject(const CCUDADevice& device);
@@ -311,84 +328,50 @@ NBL_API2 CUexternalMemory getInternalObject(const CCUDAImportedMemory& memory);
 NBL_API2 CUresult getMappedBuffer(const CCUDAImportedMemory& memory, CUdeviceptr* mappedBuffer);
 NBL_API2 CUexternalSemaphore getInternalObject(const CCUDAImportedSemaphore& semaphore);
 
-inline CUdevice getInternalObject(const CCUDADevice* device)
-{
-	return getInternalObject(*device);
-}
-
-inline CUdevice getInternalObject(const core::smart_refctd_ptr<CCUDADevice>& device)
-{
-	return getInternalObject(*device);
-}
-
-inline CUcontext getContext(const CCUDADevice* device)
-{
-	return getContext(*device);
-}
-
-inline CUcontext getContext(const core::smart_refctd_ptr<CCUDADevice>& device)
-{
-	return getContext(*device);
-}
-
-inline size_t roundToGranularity(const CCUDADevice* device, CUmemLocationType location, size_t size)
-{
-	return roundToGranularity(*device,location,size);
-}
-
-inline size_t roundToGranularity(const core::smart_refctd_ptr<CCUDADevice>& device, CUmemLocationType location, size_t size)
-{
-	return roundToGranularity(*device,location,size);
-}
-
-inline core::smart_refctd_ptr<CCUDAExportableMemory> createExportableMemory(CCUDADevice* device, SExportableMemoryCreationParams&& params)
-{
-	return createExportableMemory(*device,std::move(params));
-}
-
-inline core::smart_refctd_ptr<CCUDAExportableMemory> createExportableMemory(const core::smart_refctd_ptr<CCUDADevice>& device, SExportableMemoryCreationParams&& params)
-{
-	return createExportableMemory(*device,std::move(params));
-}
-
-inline CUdeviceptr getDeviceptr(const CCUDAExportableMemory* memory)
-{
-	return getDeviceptr(*memory);
-}
-
-inline CUdeviceptr getDeviceptr(const core::smart_refctd_ptr<CCUDAExportableMemory>& memory)
-{
-	return getDeviceptr(*memory);
-}
-
-inline CUexternalMemory getInternalObject(const CCUDAImportedMemory* memory)
+template<typename Object>
+requires (
+	detail::const_object_like<Object, CCUDADevice> ||
+	detail::const_object_like<Object, CCUDAImportedMemory> ||
+	detail::const_object_like<Object, CCUDAImportedSemaphore>
+)
+inline auto getInternalObject(Object&& object)
 {
-	return getInternalObject(*memory);
+	return getInternalObject(detail::as_ref(std::forward<Object>(object)));
 }
 
-inline CUexternalMemory getInternalObject(const core::smart_refctd_ptr<CCUDAImportedMemory>& memory)
+template<typename Device>
+requires detail::const_object_like<Device, CCUDADevice>
+inline CUcontext getContext(Device&& device)
 {
-	return getInternalObject(*memory);
+	return getContext(detail::as_ref(std::forward<Device>(device)));
 }
 
-inline CUresult getMappedBuffer(const CCUDAImportedMemory* memory, CUdeviceptr* mappedBuffer)
+template<typename Device>
+requires detail::const_object_like<Device, CCUDADevice>
+inline size_t roundToGranularity(Device&& device, CUmemLocationType location, size_t size)
 {
-	return getMappedBuffer(*memory,mappedBuffer);
+	return roundToGranularity(detail::as_ref(std::forward<Device>(device)),location,size);
 }
 
-inline CUresult getMappedBuffer(const core::smart_refctd_ptr<CCUDAImportedMemory>& memory, CUdeviceptr* mappedBuffer)
+template<typename Device>
+requires detail::object_like<Device, CCUDADevice>
+inline core::smart_refctd_ptr<CCUDAExportableMemory> createExportableMemory(Device&& device, SExportableMemoryCreationParams&& params)
 {
-	return getMappedBuffer(*memory,mappedBuffer);
+	return createExportableMemory(detail::as_ref(std::forward<Device>(device)),std::move(params));
 }
 
-inline CUexternalSemaphore getInternalObject(const CCUDAImportedSemaphore* semaphore)
+template<typename Memory>
+requires detail::const_object_like<Memory, CCUDAExportableMemory>
+inline CUdeviceptr getDeviceptr(Memory&& memory)
 {
-	return getInternalObject(*semaphore);
+	return getDeviceptr(detail::as_ref(std::forward<Memory>(memory)));
 }
 
-inline CUexternalSemaphore getInternalObject(const core::smart_refctd_ptr<CCUDAImportedSemaphore>& semaphore)
+template<typename Memory>
+requires detail::const_object_like<Memory, CCUDAImportedMemory>
+inline CUresult getMappedBuffer(Memory&& memory, CUdeviceptr* mappedBuffer)
 {
-	return getInternalObject(*semaphore);
+	return getMappedBuffer(detail::as_ref(std::forward<Memory>(memory)),mappedBuffer);
 }
 
 }
diff --git a/src/nbl/ext/CUDAInterop/CCUDAHandler.cpp b/src/nbl/ext/CUDAInterop/CCUDAHandler.cpp
index fce7fd2b5a..13046d6d1e 100644
--- a/src/nbl/ext/CUDAInterop/CCUDAHandler.cpp
+++ b/src/nbl/ext/CUDAInterop/CCUDAHandler.cpp
@@ -5,11 +5,11 @@
 #include "nbl/ext/CUDAInterop/CUDAInterop.h"
 #include "nbl/system/ModuleLookupUtils.h"
 
+#include "nlohmann/json.hpp"
+
 #include <algorithm>
 #include <cstdlib>
 #include <fstream>
-#include <sstream>
-#include <string_view>
 #include <system_error>
 
 namespace nbl::video::cuda_interop
@@ -17,21 +17,11 @@ namespace nbl::video::cuda_interop
 namespace
 {
 
-std::string readEnvironmentVariable(std::string_view name)
+std::string readEnvironmentVariable(const char* name)
 {
-	#if defined(_NBL_PLATFORM_WINDOWS_)
-	char* value = nullptr;
-	size_t size = 0;
-	if (_dupenv_s(&value,&size,std::string(name).c_str()) || !value)
-		return {};
-	std::string result(value);
-	std::free(value);
-	return result;
-	#else
-	if (const char* value = std::getenv(std::string(name).c_str()))
+	if (const char* value = std::getenv(name))
 		return value;
 	return {};
-	#endif
 }
 
 bool isDirectory(const system::path& path)
@@ -90,50 +80,6 @@ void appendCUDAIncludeRoot(core::vector<system::path>& includeDirs, const system
 	appendIncludeDir(includeDirs,root/"include");
 }
 
-core::vector<std::string> parseStringArray(std::string_view text, std::string_view key)
-{
-	core::vector<std::string> values;
-	const std::string quotedKey = "\"" + std::string(key) + "\"";
-	const auto keyPos = text.find(quotedKey);
-	if (keyPos==std::string_view::npos)
-		return values;
-
-	const auto arrayBegin = text.find('[',keyPos+quotedKey.size());
-	if (arrayBegin==std::string_view::npos)
-		return values;
-	const auto arrayEnd = text.find(']',arrayBegin+1);
-	if (arrayEnd==std::string_view::npos)
-		return values;
-
-	for (auto pos = arrayBegin+1; pos<arrayEnd;)
-	{
-		const auto quoteBegin = text.find('"',pos);
-		if (quoteBegin==std::string_view::npos || quoteBegin>=arrayEnd)
-			break;
-
-		std::string value;
-		auto cursor = quoteBegin+1;
-		for (; cursor<arrayEnd; ++cursor)
-		{
-			const char c = text[cursor];
-			if (c=='\\')
-			{
-				if (++cursor<arrayEnd)
-					value.push_back(text[cursor]);
-				continue;
-			}
-			if (c=='"')
-				break;
-			value.push_back(c);
-		}
-
-		values.push_back(std::move(value));
-		pos = cursor+1;
-	}
-
-	return values;
-}
-
 void appendRuntimePathsConfig(core::vector<system::path>& includeDirs, const system::path& configFile)
 {
 	if (!isRegularFile(configFile))
@@ -143,13 +89,20 @@ void appendRuntimePathsConfig(core::vector<system::path>& includeDirs, const sys
 	if (!input)
 		return;
 
-	std::stringstream buffer;
-	buffer << input.rdbuf();
-	for (const auto& path : parseStringArray(buffer.str(),"cudaRuntimeIncludeDirs"))
-		appendIncludeDir(includeDirs,system::path(path));
+	const auto json = nlohmann::json::parse(input,nullptr,false);
+	if (json.is_discarded())
+		return;
+
+	const auto paths = json.find("cudaRuntimeIncludeDirs");
+	if (paths==json.end() || !paths->is_array())
+		return;
+
+	for (const auto& path : *paths)
+		if (path.is_string())
+			appendIncludeDir(includeDirs,system::path(path.get<std::string>()));
 }
 
-void appendRuntimePathsConfigEnv(core::vector<system::path>& includeDirs, std::string_view name)
+void appendRuntimePathsConfigEnv(core::vector<system::path>& includeDirs, const char* name)
 {
 	const auto value = readEnvironmentVariable(name);
 	if (value.empty())
@@ -218,7 +171,7 @@ void appendPythonPackageIncludeDirs(core::vector<system::path>& includeDirs, con
 	appendIncludeDir(includeDirs,root/"include");
 }
 
-void appendPathListEnv(core::vector<system::path>& includeDirs, std::string_view name)
+void appendPathListEnv(core::vector<system::path>& includeDirs, const char* name)
 {
 	const auto value = readEnvironmentVariable(name);
 	if (value.empty())
diff --git a/src/nbl/ext/CUDAInterop/README.md b/src/nbl/ext/CUDAInterop/README.md
index 837f3ab28e..c75300016e 100644
--- a/src/nbl/ext/CUDAInterop/README.md
+++ b/src/nbl/ext/CUDAInterop/README.md
@@ -66,6 +66,33 @@ auto memory = nbl::video::cuda_native::createExportableMemory(cudaDevice, {
 - `CUDA_PATH` is a developer fallback. It is not required for packaged applications.
 - Direct `target_link_libraries(app PRIVATE Nabla::ext::CUDAInterop)` remains possible, but it only adds compile/link usage requirements and does not create the runtime discovery JSON.
 
+## Runtime Header Distribution
+
+Nabla packages do not ship CUDA runtime headers. This is a packaging choice rather than a legal restriction on applications that need NVRTC runtime compilation.
+
+The NVIDIA CUDA EULA limits CUDA redistribution to selected components. Its distribution section says: "The portions of the SDK that are distributable under the Agreement are listed in Attachment A." Attachment A then lists the CUDA Toolkit files that may be redistributed with applications. See:
+
+- https://docs.nvidia.com/cuda/eula/#distribution
+- https://docs.nvidia.com/cuda/eula/#attachment-a
+
+Relevant Attachment A header entries include:
+
+- `nvrtc.h` under `NVIDIA Runtime Compilation Library and Header`.
+- `cuda_occupancy.h` under `CUDA Occupancy Calculation Header Library`.
+- `cuda_fp16.h`, `cuda_fp16.hpp`, `cuda_bf16.h`, `cuda_bf16.hpp`, `cuda_fp8.h`, `cuda_fp8.hpp`, `cuda_fp6.h`, `cuda_fp6.hpp`, `cuda_fp4.h`, `cuda_fp4.hpp` under `CUDA Floating Point Type Headers`.
+- `crt/host_defines.h`, `cuComplex.h`, `cuda_awbarrier_helpers.h`, `cuda_awbarrier_primitives.h`, `cuda_awbarrier.h`, `cuda_pipeline_helpers.h`, `cuda_pipeline_primitives.h`, `cuda_pipeline.h`, `cuda_runtime_api.h`, `cuda.h`, `cuda/std/tuple`, `cuda/std/type_traits`, `cuda/std/utility`, `device_types.h`, `vector_functions.h`, `vector_types.h` under `CUDA Headers for Runtime Compilation`.
+
+CuPy documents the same runtime-compile problem. Their install docs say: "On CUDA 12.2 or later, CUDA Runtime header files are required to compile kernels in CuPy." They also show the common `vector_types.h` failure and recommend `nvidia-cuda-runtime-cu12` for PyPI installs or `cuda-cudart-dev` from system packages:
+
+- https://docs.cupy.dev/en/v13.5.0/install.html#cupy-always-raises-nvrtc-error-compilation-6
+- https://github.com/cupy/cupy/issues/8466
+
+For Nabla consumers this means:
+
+- The default Nabla package stays SDK-free for consumers that only link `Nabla::Nabla`.
+- Native interop consumers can install CUDA runtime headers through an official package, point `NBL_CUDA_INTEROP_RUNTIME_JSON` at their own JSON, pass `INCLUDE_DIRS` to `nbl_target_link_cuda_interop`, or ship an app-local header bundle if their distribution model allows it.
+- Shipping such headers is a consumer packaging decision. Nabla runtime discovery supports it, but Nabla does not install host-specific CUDA header paths or redistribute CUDA headers by default.
+
 ## Properties
 
 - Consumers that only link `Nabla::Nabla` do not need CUDA SDK headers to parse Nabla headers.

From 8a119dda501a7f9c6f979ee7e6d98e6840c04d35 Mon Sep 17 00:00:00 2001
From: Arkadiusz Lachowicz <areklachowicz@gmail.com>
Date: Thu, 7 May 2026 13:24:10 +0200
Subject: [PATCH 18/27] Hide CUDA interop native state construction

---
 include/nbl/ext/CUDAInterop/CCUDADevice.h     |  7 ++--
 .../ext/CUDAInterop/CCUDAExportableMemory.h   |  6 ++-
 include/nbl/ext/CUDAInterop/CCUDAHandler.h    |  6 +--
 .../nbl/ext/CUDAInterop/CCUDAImportedMemory.h |  7 ++--
 .../ext/CUDAInterop/CCUDAImportedSemaphore.h  |  6 ++-
 src/nbl/ext/CUDAInterop/CCUDADevice.cpp       | 22 ++++++++---
 .../ext/CUDAInterop/CCUDAExportableMemory.cpp | 24 +++++++++++-
 src/nbl/ext/CUDAInterop/CCUDAHandler.cpp      | 16 ++++++--
 .../ext/CUDAInterop/CCUDAImportedMemory.cpp   |  8 +++-
 .../CUDAInterop/CCUDAImportedSemaphore.cpp    |  8 +++-
 .../CUDAInterop/CUDAInteropNativeState.hpp    |  9 +++++
 src/nbl/ext/CUDAInterop/README.md             | 38 +++++++++++++++++--
 12 files changed, 126 insertions(+), 31 deletions(-)

diff --git a/include/nbl/ext/CUDAInterop/CCUDADevice.h b/include/nbl/ext/CUDAInterop/CCUDADevice.h
index 12465f40f4..94eb450802 100644
--- a/include/nbl/ext/CUDAInterop/CCUDADevice.h
+++ b/include/nbl/ext/CUDAInterop/CCUDADevice.h
@@ -25,7 +25,6 @@ struct SAccess;
 class NBL_API2 CCUDADevice : public core::IReferenceCounted
 {
 	public:
-		struct SNativeState;
 #ifdef _WIN32
 		static constexpr IDeviceMemoryAllocation::E_EXTERNAL_HANDLE_TYPE EXTERNAL_MEMORY_HANDLE_TYPE = IDeviceMemoryAllocation::EHT_OPAQUE_WIN32;
 #else
@@ -68,8 +67,6 @@ class NBL_API2 CCUDADevice : public core::IReferenceCounted
 		};
 		inline E_VIRTUAL_ARCHITECTURE getVirtualArchitecture() {return m_virtualArchitecture;}
 
-		CCUDADevice(core::smart_refctd_ptr<CVulkanConnection>&& vulkanConnection, IPhysicalDevice* const vulkanDevice, const E_VIRTUAL_ARCHITECTURE virtualArchitecture, std::unique_ptr<SNativeState>&& nativeState, core::smart_refctd_ptr<CCUDAHandler>&& handler);
-
 		~CCUDADevice() override;
 
 		inline core::SRange<const char* const> geDefaultCompileOptions() const
@@ -86,8 +83,12 @@ class NBL_API2 CCUDADevice : public core::IReferenceCounted
 		core::smart_refctd_ptr<CCUDAImportedSemaphore> importExternalSemaphore(core::smart_refctd_ptr<ISemaphore>&& sem);
 
 	private:
+		friend class CCUDAHandler;
 		friend struct cuda_native::SAccess;
 
+		struct SNativeState;
+		CCUDADevice(core::smart_refctd_ptr<CVulkanConnection>&& vulkanConnection, IPhysicalDevice* const vulkanDevice, const E_VIRTUAL_ARCHITECTURE virtualArchitecture, std::unique_ptr<SNativeState>&& nativeState, core::smart_refctd_ptr<CCUDAHandler>&& handler);
+
 		static constexpr auto CudaMemoryLocationCount = 5;
 
 		const system::logger_opt_ptr m_logger;
diff --git a/include/nbl/ext/CUDAInterop/CCUDAExportableMemory.h b/include/nbl/ext/CUDAInterop/CCUDAExportableMemory.h
index 80a9b3630a..6d29739408 100644
--- a/include/nbl/ext/CUDAInterop/CCUDAExportableMemory.h
+++ b/include/nbl/ext/CUDAInterop/CCUDAExportableMemory.h
@@ -21,7 +21,6 @@ struct SAccess;
 class NBL_API2 CCUDAExportableMemory : public core::IReferenceCounted
 {
 	public:
-		struct SNativeState;
 		struct SCachedCreationParams
 		{
 			size_t size;
@@ -31,7 +30,6 @@ class NBL_API2 CCUDAExportableMemory : public core::IReferenceCounted
 			bool deviceLocal;
 		};
 
-		CCUDAExportableMemory(core::smart_refctd_ptr<CCUDADevice> device, SCachedCreationParams&& params, std::unique_ptr<SNativeState>&& nativeState);
 		~CCUDAExportableMemory() override;
 
 		core::smart_refctd_ptr<IDeviceMemoryAllocation> exportAsMemory(ILogicalDevice* device, IDeviceMemoryBacked* dedication = nullptr) const;
@@ -39,6 +37,10 @@ class NBL_API2 CCUDAExportableMemory : public core::IReferenceCounted
 	private:
 		friend struct cuda_native::SAccess;
 
+		struct SNativeState;
+		CCUDAExportableMemory(core::smart_refctd_ptr<CCUDADevice> device, SCachedCreationParams&& params, std::unique_ptr<SNativeState>&& nativeState);
+		static core::smart_refctd_ptr<CCUDAExportableMemory> create(core::smart_refctd_ptr<CCUDADevice> device, SCachedCreationParams&& params, std::unique_ptr<SNativeState>&& nativeState);
+
 		core::smart_refctd_ptr<CCUDADevice> m_device;
 		SCachedCreationParams m_params;
 		std::unique_ptr<SNativeState> m_native;
diff --git a/include/nbl/ext/CUDAInterop/CCUDAHandler.h b/include/nbl/ext/CUDAInterop/CCUDAHandler.h
index bed4f9a31c..f6b5d578a8 100644
--- a/include/nbl/ext/CUDAInterop/CCUDAHandler.h
+++ b/include/nbl/ext/CUDAInterop/CCUDAHandler.h
@@ -44,11 +44,8 @@ NBL_API2 core::vector<std::string> makeNVRTCIncludeOptions(const SRuntimeCompile
 class NBL_API2 CCUDAHandler : public core::IReferenceCounted
 {
 	public:
-		struct SNativeState;
 		static core::smart_refctd_ptr<CCUDAHandler> create(system::ISystem* system, core::smart_refctd_ptr<system::ILogger>&& _logger);
 
-		CCUDAHandler(std::unique_ptr<SNativeState>&& nativeState, core::vector<core::smart_refctd_ptr<system::IFile>>&& _headers, core::smart_refctd_ptr<system::ILogger>&& _logger, int _version);
-
 		inline core::SRange<system::IFile* const> getSTDHeaders()
 		{
 			auto begin = m_headers.empty() ? nullptr:(&m_headers[0].get());
@@ -75,6 +72,9 @@ class NBL_API2 CCUDAHandler : public core::IReferenceCounted
 	private:
 		friend struct cuda_native::SAccess;
 
+		struct SNativeState;
+		CCUDAHandler(std::unique_ptr<SNativeState>&& nativeState, core::vector<core::smart_refctd_ptr<system::IFile>>&& _headers, core::smart_refctd_ptr<system::ILogger>&& _logger, int _version);
+
 		std::unique_ptr<SNativeState> m_native;
 		core::vector<SCUDADeviceInfo> m_availableDevices;
 		core::vector<core::smart_refctd_ptr<system::IFile>> m_headers;
diff --git a/include/nbl/ext/CUDAInterop/CCUDAImportedMemory.h b/include/nbl/ext/CUDAInterop/CCUDAImportedMemory.h
index adb803f12c..87f804ce76 100644
--- a/include/nbl/ext/CUDAInterop/CCUDAImportedMemory.h
+++ b/include/nbl/ext/CUDAInterop/CCUDAImportedMemory.h
@@ -19,14 +19,15 @@ struct SAccess;
 class NBL_API2 CCUDAImportedMemory : public core::IReferenceCounted
 {
 	public:
-		struct SNativeState;
-		CCUDAImportedMemory(core::smart_refctd_ptr<CCUDADevice> device, core::smart_refctd_ptr<nbl::video::IDeviceMemoryAllocation> src, std::unique_ptr<SNativeState>&& nativeState);
-
 		~CCUDAImportedMemory() override;
 
 	private:
+		friend class CCUDADevice;
 		friend struct cuda_native::SAccess;
 
+		struct SNativeState;
+		CCUDAImportedMemory(core::smart_refctd_ptr<CCUDADevice> device, core::smart_refctd_ptr<nbl::video::IDeviceMemoryAllocation> src, std::unique_ptr<SNativeState>&& nativeState);
+
 		core::smart_refctd_ptr<CCUDADevice> m_device;
 		core::smart_refctd_ptr<IDeviceMemoryAllocation> m_src;
 		std::unique_ptr<SNativeState> m_native;
diff --git a/include/nbl/ext/CUDAInterop/CCUDAImportedSemaphore.h b/include/nbl/ext/CUDAInterop/CCUDAImportedSemaphore.h
index 894f2444c0..c8bf77313e 100644
--- a/include/nbl/ext/CUDAInterop/CCUDAImportedSemaphore.h
+++ b/include/nbl/ext/CUDAInterop/CCUDAImportedSemaphore.h
@@ -22,13 +22,15 @@ struct SAccess;
 class NBL_API2 CCUDAImportedSemaphore : public core::IReferenceCounted
 {
 	public:
-		struct SNativeState;
-		CCUDAImportedSemaphore(core::smart_refctd_ptr<CCUDADevice> device, core::smart_refctd_ptr<ISemaphore> src, std::unique_ptr<SNativeState>&& nativeState);
 		~CCUDAImportedSemaphore() override;
 
 	private:
+		friend class CCUDADevice;
 		friend struct cuda_native::SAccess;
 
+		struct SNativeState;
+		CCUDAImportedSemaphore(core::smart_refctd_ptr<CCUDADevice> device, core::smart_refctd_ptr<ISemaphore> src, std::unique_ptr<SNativeState>&& nativeState);
+
 		core::smart_refctd_ptr<CCUDADevice> m_device;
 		core::smart_refctd_ptr<ISemaphore> m_src;
 		std::unique_ptr<SNativeState> m_native;
diff --git a/src/nbl/ext/CUDAInterop/CCUDADevice.cpp b/src/nbl/ext/CUDAInterop/CCUDADevice.cpp
index ebac00b7b4..8e696d0827 100644
--- a/src/nbl/ext/CUDAInterop/CCUDADevice.cpp
+++ b/src/nbl/ext/CUDAInterop/CCUDADevice.cpp
@@ -27,6 +27,8 @@ CCUDADevice::CCUDADevice(
 	m_handler(std::move(handler)),
 	m_native(std::move(nativeState))
 {
+	assert(m_native);
+
 	m_defaultCompileOptions.push_back("--std=c++14");
 	m_defaultCompileOptions.push_back(virtualArchCompileOption[m_virtualArchitecture]);
 	m_defaultCompileOptions.push_back("-dc");
@@ -150,7 +152,7 @@ core::smart_refctd_ptr<CCUDAExportableMemory> createExportableMemory(CCUDADevice
 #endif
 	};
 
-	auto nativeState = std::make_unique<CCUDAExportableMemory::SNativeState>();
+	auto nativeState = SAccess::makeExportableMemoryNativeState();
 
 	CUmemGenericAllocationHandle mem;
 	if(auto err = cu.pcuMemCreate(&mem, params.granularSize, &prop, 0); CUDA_SUCCESS != err)
@@ -166,7 +168,7 @@ core::smart_refctd_ptr<CCUDAExportableMemory> createExportableMemory(CCUDADevice
 		return nullptr;
 	}
 
-	if (const auto err = reserveAddressAndMapMemory(device,&nativeState->ptr, params.granularSize, params.alignment, inParams.location, mem); CUDA_SUCCESS != err)
+	if (const auto err = reserveAddressAndMapMemory(device,&SAccess::deviceptr(*nativeState), params.granularSize, params.alignment, inParams.location, mem); CUDA_SUCCESS != err)
 	{
 		logger.log("Fail to reserve address and map memory!", system::ILogger::ELL_ERROR);
 
@@ -185,7 +187,7 @@ core::smart_refctd_ptr<CCUDAExportableMemory> createExportableMemory(CCUDADevice
 		return nullptr;
 	}
 	
-	return core::make_smart_refctd_ptr<CCUDAExportableMemory>(core::smart_refctd_ptr<CCUDADevice>(&device), std::move(params), std::move(nativeState));
+	return SAccess::makeExportableMemory(core::smart_refctd_ptr<CCUDADevice>(&device),std::move(params),std::move(nativeState));
 }
 
 }
@@ -215,7 +217,10 @@ core::smart_refctd_ptr<CCUDAImportedMemory> CCUDADevice::importExternalMemory(co
 		m_logger.log("Fail to import external memory into CUDA!", system::ILogger::ELL_ERROR);
 		return nullptr;
 	}
-	return core::make_smart_refctd_ptr<CCUDAImportedMemory>(core::smart_refctd_ptr<CCUDADevice>(this), std::move(mem), std::make_unique<CCUDAImportedMemory::SNativeState>(cuExtMem));
+	return core::smart_refctd_ptr<CCUDAImportedMemory>(
+		new CCUDAImportedMemory(core::smart_refctd_ptr<CCUDADevice>(this),std::move(mem),std::make_unique<CCUDAImportedMemory::SNativeState>(cuExtMem)),
+		core::dont_grab
+	);
 }
 
 core::smart_refctd_ptr<CCUDAImportedSemaphore> CCUDADevice::importExternalSemaphore(core::smart_refctd_ptr<ISemaphore>&& sema)
@@ -245,7 +250,10 @@ core::smart_refctd_ptr<CCUDAImportedSemaphore> CCUDADevice::importExternalSemaph
 		return nullptr;
 	}
 	
-	return core::make_smart_refctd_ptr<CCUDAImportedSemaphore>(core::smart_refctd_ptr<CCUDADevice>(this), std::move(sema), std::make_unique<CCUDAImportedSemaphore::SNativeState>(cusema));
+	return core::smart_refctd_ptr<CCUDAImportedSemaphore>(
+		new CCUDAImportedSemaphore(core::smart_refctd_ptr<CCUDADevice>(this),std::move(sema),std::make_unique<CCUDAImportedSemaphore::SNativeState>(cusema)),
+		core::dont_grab
+	);
 }
 
 CCUDADevice::~CCUDADevice()
@@ -275,7 +283,9 @@ CCUDADevice::CCUDADevice(
 	, m_virtualArchitecture(virtualArchitecture)
 	, m_handler(std::move(handler))
 	, m_native(std::move(nativeState))
-{}
+{
+	assert(m_native);
+}
 
 CCUDADevice::~CCUDADevice() = default;
 
diff --git a/src/nbl/ext/CUDAInterop/CCUDAExportableMemory.cpp b/src/nbl/ext/CUDAInterop/CCUDAExportableMemory.cpp
index a65d1b680c..7d5483af04 100644
--- a/src/nbl/ext/CUDAInterop/CCUDAExportableMemory.cpp
+++ b/src/nbl/ext/CUDAInterop/CCUDAExportableMemory.cpp
@@ -14,7 +14,17 @@ CCUDAExportableMemory::CCUDAExportableMemory(core::smart_refctd_ptr<CCUDADevice>
 	: m_device(std::move(device))
 	, m_params(std::move(params))
 	, m_native(std::move(nativeState))
-{}
+{
+	assert(m_native);
+}
+
+core::smart_refctd_ptr<CCUDAExportableMemory> CCUDAExportableMemory::create(core::smart_refctd_ptr<CCUDADevice> device, SCachedCreationParams&& params, std::unique_ptr<SNativeState>&& nativeState)
+{ // Private factory: the constructor is private, so the object is built with `new` here and wrapped directly.
+	return core::smart_refctd_ptr<CCUDAExportableMemory>(
+		new CCUDAExportableMemory(std::move(device),std::move(params),std::move(nativeState)),
+		core::dont_grab // NOTE(review): presumably adopts the reference the new object already holds instead of adding one - confirm
+	);
+}
 
 core::smart_refctd_ptr<IDeviceMemoryAllocation> CCUDAExportableMemory::exportAsMemory(ILogicalDevice* device, IDeviceMemoryBacked* dedication) const
 {
@@ -76,7 +86,17 @@ CCUDAExportableMemory::CCUDAExportableMemory(core::smart_refctd_ptr<CCUDADevice>
 	: m_device(std::move(device))
 	, m_params(std::move(params))
 	, m_native(std::move(nativeState))
-{}
+{
+	assert(m_native);
+}
+
+core::smart_refctd_ptr<CCUDAExportableMemory> CCUDAExportableMemory::create(core::smart_refctd_ptr<CCUDADevice> device, SCachedCreationParams&& params, std::unique_ptr<SNativeState>&& nativeState)
+{ // Private factory: the constructor is private, so the object is built with `new` here and wrapped directly.
+	return core::smart_refctd_ptr<CCUDAExportableMemory>(
+		new CCUDAExportableMemory(std::move(device),std::move(params),std::move(nativeState)),
+		core::dont_grab // NOTE(review): presumably adopts the reference the new object already holds instead of adding one - confirm
+	);
+}
 
 CCUDAExportableMemory::~CCUDAExportableMemory() = default;
 
diff --git a/src/nbl/ext/CUDAInterop/CCUDAHandler.cpp b/src/nbl/ext/CUDAInterop/CCUDAHandler.cpp
index 13046d6d1e..229a27cfac 100644
--- a/src/nbl/ext/CUDAInterop/CCUDAHandler.cpp
+++ b/src/nbl/ext/CUDAInterop/CCUDAHandler.cpp
@@ -309,6 +309,8 @@ CCUDAHandler::CCUDAHandler(
 	, m_logger(std::move(_logger))
 	, m_version(_version)
 {
+	assert(m_native);
+
 	for (auto& header : m_headers)
 	{
 		m_headerContents.push_back(reinterpret_cast<const char*>(header->getMappedPointer()));
@@ -858,7 +860,10 @@ core::smart_refctd_ptr<CCUDAHandler> CCUDAHandler::create(system::ISystem* syste
 		));
 	}
 
-	return core::make_smart_refctd_ptr<CCUDAHandler>(std::make_unique<SNativeState>(std::move(cuda),std::move(nvrtc)), std::move(headers), std::move(_logger), cudaVersion);
+	return core::smart_refctd_ptr<CCUDAHandler>(
+		new CCUDAHandler(std::make_unique<SNativeState>(std::move(cuda),std::move(nvrtc)),std::move(headers),std::move(_logger),cudaVersion),
+		core::dont_grab
+	);
 }
 
 namespace cuda_native
@@ -1090,7 +1095,10 @@ core::smart_refctd_ptr<CCUDADevice> CCUDAHandler::createDevice(core::smart_refct
 			if (arch==CCUDADevice::EVA_COUNT)
 				continue;
 
-			return core::make_smart_refctd_ptr<CCUDADevice>(std::move(vulkanConnection), physicalDevice, arch, std::make_unique<CCUDADevice::SNativeState>(device.handle), core::smart_refctd_ptr<CCUDAHandler>(this));
+			return core::smart_refctd_ptr<CCUDADevice>(
+				new CCUDADevice(std::move(vulkanConnection),physicalDevice,arch,std::make_unique<CCUDADevice::SNativeState>(device.handle),core::smart_refctd_ptr<CCUDAHandler>(this)),
+				core::dont_grab
+			);
 		}
 	}
 	return nullptr;
@@ -1115,7 +1123,9 @@ CCUDAHandler::CCUDAHandler(
 	, m_headers(std::move(_headers))
 	, m_logger(std::move(_logger))
 	, m_version(_version)
-{}
+{
+	assert(m_native);
+}
 
 CCUDAHandler::~CCUDAHandler() = default;
 
diff --git a/src/nbl/ext/CUDAInterop/CCUDAImportedMemory.cpp b/src/nbl/ext/CUDAInterop/CCUDAImportedMemory.cpp
index 8de3ce3e63..3a8ed56371 100644
--- a/src/nbl/ext/CUDAInterop/CCUDAImportedMemory.cpp
+++ b/src/nbl/ext/CUDAInterop/CCUDAImportedMemory.cpp
@@ -14,7 +14,9 @@ CCUDAImportedMemory::CCUDAImportedMemory(core::smart_refctd_ptr<CCUDADevice> dev
 	: m_device(std::move(device))
 	, m_src(std::move(src))
 	, m_native(std::move(nativeState))
-{}
+{
+	assert(m_native);
+}
 
 namespace cuda_native
 {
@@ -57,7 +59,9 @@ CCUDAImportedMemory::CCUDAImportedMemory(core::smart_refctd_ptr<CCUDADevice> dev
 	: m_device(std::move(device))
 	, m_src(std::move(src))
 	, m_native(std::move(nativeState))
-{}
+{
+	assert(m_native);
+}
 
 CCUDAImportedMemory::~CCUDAImportedMemory() = default;
 
diff --git a/src/nbl/ext/CUDAInterop/CCUDAImportedSemaphore.cpp b/src/nbl/ext/CUDAInterop/CCUDAImportedSemaphore.cpp
index fdbb56b0cf..6d980ed126 100644
--- a/src/nbl/ext/CUDAInterop/CCUDAImportedSemaphore.cpp
+++ b/src/nbl/ext/CUDAInterop/CCUDAImportedSemaphore.cpp
@@ -13,7 +13,9 @@ CCUDAImportedSemaphore::CCUDAImportedSemaphore(core::smart_refctd_ptr<CCUDADevic
 	: m_device(std::move(device))
 	, m_src(std::move(src))
 	, m_native(std::move(nativeState))
-{}
+{
+	assert(m_native);
+}
 
 namespace cuda_native
 {
@@ -44,7 +46,9 @@ CCUDAImportedSemaphore::CCUDAImportedSemaphore(core::smart_refctd_ptr<CCUDADevic
 	: m_device(std::move(device))
 	, m_src(std::move(src))
 	, m_native(std::move(nativeState))
-{}
+{
+	assert(m_native);
+}
 
 CCUDAImportedSemaphore::~CCUDAImportedSemaphore() = default;
 
diff --git a/src/nbl/ext/CUDAInterop/CUDAInteropNativeState.hpp b/src/nbl/ext/CUDAInterop/CUDAInteropNativeState.hpp
index 47701359ba..74cb7823d5 100644
--- a/src/nbl/ext/CUDAInterop/CUDAInteropNativeState.hpp
+++ b/src/nbl/ext/CUDAInterop/CUDAInteropNativeState.hpp
@@ -76,6 +76,15 @@ struct SAccess
 
 	static CCUDAExportableMemory::SNativeState& native(CCUDAExportableMemory& memory) { return *memory.m_native; }
 	static const CCUDAExportableMemory::SNativeState& native(const CCUDAExportableMemory& memory) { return *memory.m_native; }
+	static std::unique_ptr<CCUDAExportableMemory::SNativeState> makeExportableMemoryNativeState()
+	{ // Creates a value-initialized native state for code that cannot name the private SNativeState type itself.
+		return std::unique_ptr<CCUDAExportableMemory::SNativeState>(new CCUDAExportableMemory::SNativeState());
+	}
+	static CUdeviceptr& deviceptr(CCUDAExportableMemory::SNativeState& nativeState) { return nativeState.ptr; } // Mutable access to the state's CUDA device pointer field.
+	static core::smart_refctd_ptr<CCUDAExportableMemory> makeExportableMemory(core::smart_refctd_ptr<CCUDADevice> device, CCUDAExportableMemory::SCachedCreationParams&& params, std::unique_ptr<CCUDAExportableMemory::SNativeState>&& nativeState)
+	{ // Forwards to the private CCUDAExportableMemory::create factory on behalf of non-friend callers.
+		return CCUDAExportableMemory::create(std::move(device),std::move(params),std::move(nativeState));
+	}
 
 	static CCUDAImportedMemory::SNativeState& native(CCUDAImportedMemory& memory) { return *memory.m_native; }
 	static const CCUDAImportedMemory::SNativeState& native(const CCUDAImportedMemory& memory) { return *memory.m_native; }
diff --git a/src/nbl/ext/CUDAInterop/README.md b/src/nbl/ext/CUDAInterop/README.md
index c75300016e..214d5add14 100644
--- a/src/nbl/ext/CUDAInterop/README.md
+++ b/src/nbl/ext/CUDAInterop/README.md
@@ -11,18 +11,26 @@
 - Consumers can override the SDK root with `-DNabla_CUDA_TOOLKIT_ROOT=<cuda-root>` when requesting `CUDAInterop`.
 - Native accessors accept Nabla objects, raw pointers, and `smart_refctd_ptr`.
 
-## Usage
+## Basic Usage
 
 ```cmake
 find_package(Nabla CONFIG REQUIRED)
 target_link_libraries(app PRIVATE Nabla::Nabla)
 ```
 
+This path does not require CUDA SDK headers on the consuming project.
+
+## Native Opt-In
+
+Use the native opt-in path only in targets that include `CUDAInteropNative.h` or use raw CUDA Driver API/NVRTC types.
+
 ```cmake
 find_package(Nabla CONFIG REQUIRED COMPONENTS CUDAInterop)
 nbl_target_link_cuda_interop(native_app PRIVATE)
 ```
 
+`nbl_target_link_cuda_interop` links `Nabla::ext::CUDAInterop` and writes runtime CUDA header discovery JSON for `native_app`.
+
 ```cmake
 find_package(Nabla CONFIG REQUIRED COMPONENTS CUDAInterop)
 nbl_target_link_cuda_interop(native_app PRIVATE
@@ -36,19 +44,42 @@ nbl_target_link_cuda_interop(native_app PRIVATE
 )
 ```
 
+Pseudo flow:
+
 ```cpp
 #include "nbl/ext/CUDAInterop/CUDAInteropNative.h"
 
-auto runtimeEnv = nbl::video::cuda_interop::findRuntimeCompileEnvironment();
-auto includeOptions = nbl::video::cuda_interop::makeNVRTCIncludeOptions(runtimeEnv);
+auto handler = nbl::video::CCUDAHandler::create(system, std::move(logger));
+auto cudaDevice = handler->createDevice(std::move(vulkanConnection), physicalDevice);
 
 auto memory = nbl::video::cuda_native::createExportableMemory(cudaDevice, {
     .size = size,
     .alignment = alignment,
     .location = CU_MEM_LOCATION_TYPE_DEVICE,
 });
+
+std::string log;
+auto [ptx, result] = nbl::video::cuda_native::compileDirectlyToPTX(
+    handler,
+    cudaSource,
+    "kernel.cu",
+    cudaDevice->geDefaultCompileOptions(),
+    0,
+    nullptr,
+    nullptr,
+    &log
+);
 ```
 
+`compileDirectlyToPTX` performs runtime CUDA header discovery internally. Code that drives NVRTC manually can call `cuda_interop::findRuntimeCompileEnvironment` and `cuda_interop::makeNVRTCIncludeOptions` directly.
+
+Reference smoke tests:
+
+- CMake target setup: `src/nbl/ext/CUDAInterop/smoke/CMakeLists.txt`
+- SDK-free package boundary check: `src/nbl/ext/CUDAInterop/smoke/public_boundary.cpp`
+- Default Nabla package usage without native opt-in: `src/nbl/ext/CUDAInterop/smoke/clean_opt_in.cpp`
+- Native CUDA opt-in, runtime header discovery, `cuda_fp16.h`, NVRTC and raw interop usage: `src/nbl/ext/CUDAInterop/smoke/native_opt_in.cpp`
+
 ## Runtime Header Discovery
 
 - `nbl_target_link_cuda_interop(<target> <scope>)` links `Nabla::ext::CUDAInterop` and configures runtime include discovery for that target.
@@ -100,6 +131,7 @@ For Nabla consumers this means:
 - Raw CUDA access is not wrapped away in the native opt-in path. Native code uses CUDA Driver API and NVRTC types directly.
 - CUDA SDK structs with version-sensitive layout are kept out of exported Nabla ABI.
 - The exported native ABI uses stable CUDA Driver API handles/enums and small Nabla-owned parameter structs.
+- Native state is PIMPL-owned by Nabla. Consumers cannot construct CUDA wrapper objects with arbitrary internal state.
 - A package built with one compatible CUDA SDK can be consumed by native interop code built with another compatible SDK without rebuilding Nabla.
 - `CCUDAHandler::create` validates the loaded CUDA driver and NVRTC runtime. It returns `nullptr` when the runtime is missing or below the required CUDA 13.0 / NVRTC 13.x floor.
 - Runtime CUDA header discovery is independent from the CUDA SDK used to build Nabla.

From e018545fb659ee74400a2635f93f502cd1d0f4f3 Mon Sep 17 00:00:00 2001
From: Arkadiusz Lachowicz <areklachowicz@gmail.com>
Date: Thu, 7 May 2026 13:49:22 +0200
Subject: [PATCH 19/27] Clean up CUDA runtime header discovery

---
 src/nbl/ext/CUDAInterop/CCUDAHandler.cpp | 112 ++++++++++++-----------
 src/nbl/ext/CUDAInterop/README.md        |   5 +-
 2 files changed, 64 insertions(+), 53 deletions(-)

diff --git a/src/nbl/ext/CUDAInterop/CCUDAHandler.cpp b/src/nbl/ext/CUDAInterop/CCUDAHandler.cpp
index 229a27cfac..de7f14b58f 100644
--- a/src/nbl/ext/CUDAInterop/CCUDAHandler.cpp
+++ b/src/nbl/ext/CUDAInterop/CCUDAHandler.cpp
@@ -17,6 +17,12 @@ namespace nbl::video::cuda_interop
 namespace
 {
 
+#if defined(_NBL_PLATFORM_WINDOWS_)
+inline constexpr char EnvironmentPathListSeparator = ';';
+#else
+inline constexpr char EnvironmentPathListSeparator = ':';
+#endif
+
 std::string readEnvironmentVariable(const char* name)
 {
 	if (const char* value = std::getenv(name))
@@ -71,6 +77,39 @@ void appendIncludeDir(core::vector<system::path>& includeDirs, system::path path
 		includeDirs.push_back(std::move(path));
 }
 
+// Recursively collects CUDA include directories found at or below `root`, descending at most `maxDepth` levels.
+// A branch stops at the first directory that looks like a CUDA include dir; missing roots are silently ignored.
+void appendCUDAIncludeDirsBelow(core::vector<system::path>& includeDirs, const system::path& root, uint32_t maxDepth)
+{
+	if (!isDirectory(root))
+		return;
+
+	if (looksLikeCUDAIncludeDir(root))
+	{
+		appendIncludeDir(includeDirs,root);
+		return;
+	}
+	if (maxDepth==0u)
+		return;
+
+	core::vector<system::path> candidates;
+	std::error_code error;
+	// Advance with increment(error): a range-for uses the throwing operator++, which best-effort discovery must avoid.
+	for (std::filesystem::directory_iterator it(root,error), end; !error && it!=end; it.increment(error))
+	{
+		std::error_code entryError;
+		if (it->is_directory(entryError))
+			candidates.push_back(it->path());
+	}
+
+	// Visit children in descending lexicographic order, so e.g. `cu13` is probed before `cu12`.
+	std::sort(candidates.begin(),candidates.end(),[](const system::path& lhs, const system::path& rhs) {
+		return lhs.generic_string()>rhs.generic_string();
+	});
+	for (const auto& candidate : candidates)
+		appendCUDAIncludeDirsBelow(includeDirs,candidate,maxDepth-1u);
+}
+
 void appendCUDAIncludeRoot(core::vector<system::path>& includeDirs, const system::path& root)
 {
 	if (root.empty())
@@ -102,24 +141,20 @@ void appendRuntimePathsConfig(core::vector<system::path>& includeDirs, const sys
 			appendIncludeDir(includeDirs,system::path(path.get<std::string>()));
 }
 
-void appendRuntimePathsConfigEnv(core::vector<system::path>& includeDirs, const char* name)
+template<typename Append>
+void appendPathListEnv(const char* name, Append append)
 {
 	const auto value = readEnvironmentVariable(name);
 	if (value.empty())
 		return;
 
-	#if defined(_NBL_PLATFORM_WINDOWS_)
-	constexpr char Separator = ';';
-	#else
-	constexpr char Separator = ':';
-	#endif
-
 	size_t begin = 0;
 	while (begin<value.size())
 	{
-		const auto end = value.find(Separator,begin);
+		const auto end = value.find(EnvironmentPathListSeparator,begin);
 		const auto segment = value.substr(begin,end==std::string::npos ? std::string::npos:end-begin);
-		appendRuntimePathsConfig(includeDirs,system::path(segment));
+		if (!segment.empty())
+			append(system::path(segment));
 		if (end==std::string::npos)
 			break;
 		begin = end+1;
@@ -131,21 +166,13 @@ void appendRuntimePathsConfigs(core::vector<system::path>& includeDirs, const co
 	for (const auto& runtimePathFile : explicitRuntimePathFiles)
 		appendRuntimePathsConfig(includeDirs,runtimePathFile);
 
-	appendRuntimePathsConfigEnv(includeDirs,"NBL_CUDA_INTEROP_RUNTIME_JSON");
-	appendRuntimePathsConfigEnv(includeDirs,"Nabla_CUDA_INTEROP_RUNTIME_JSON");
+	const auto appendConfig = [&](const system::path& path) { appendRuntimePathsConfig(includeDirs,path); };
+	appendPathListEnv("NBL_CUDA_INTEROP_RUNTIME_JSON",appendConfig);
+	appendPathListEnv("Nabla_CUDA_INTEROP_RUNTIME_JSON",appendConfig);
 
 	const auto exeDir = system::executableDirectory();
 	if (!exeDir.empty())
 		appendRuntimePathsConfig(includeDirs,exeDir/RuntimePathsFileName);
-
-	#if defined(_NBL_PLATFORM_WINDOWS_)
-	const auto releaseModuleDir = system::loadedModuleDirectory("Nabla.dll");
-	if (!releaseModuleDir.empty())
-		appendRuntimePathsConfig(includeDirs,releaseModuleDir/RuntimePathsFileName);
-	const auto debugModuleDir = system::loadedModuleDirectory("Nabla_debug.dll");
-	if (!debugModuleDir.empty())
-		appendRuntimePathsConfig(includeDirs,debugModuleDir/RuntimePathsFileName);
-	#endif
 }
 
 void appendAppLocalIncludeDirs(core::vector<system::path>& includeDirs)
@@ -155,9 +182,10 @@ void appendAppLocalIncludeDirs(core::vector<system::path>& includeDirs)
 		return;
 
 	appendIncludeDir(includeDirs,exeDir/"cuda"/"include");
-	appendIncludeDir(includeDirs,exeDir/"nvidia"/"cu13"/"include");
+	appendCUDAIncludeDirsBelow(includeDirs,exeDir/"nvidia",4u);
 	appendIncludeDir(includeDirs,exeDir/"Libraries"/"cuda"/"include");
 	appendIncludeDir(includeDirs,exeDir.parent_path()/"cuda"/"include");
+	appendCUDAIncludeDirsBelow(includeDirs,exeDir.parent_path()/"nvidia",4u);
 }
 
 void appendPythonPackageIncludeDirs(core::vector<system::path>& includeDirs, const system::path& root)
@@ -165,40 +193,17 @@ void appendPythonPackageIncludeDirs(core::vector<system::path>& includeDirs, con
 	if (root.empty())
 		return;
 
-	appendIncludeDir(includeDirs,root/"Lib"/"site-packages"/"nvidia"/"cu13"/"include");
-	appendIncludeDir(includeDirs,root/"lib"/"site-packages"/"nvidia"/"cu13"/"include");
+	appendCUDAIncludeDirsBelow(includeDirs,root/"Lib"/"site-packages"/"nvidia",4u);
+	appendCUDAIncludeDirsBelow(includeDirs,root/"lib"/"site-packages"/"nvidia",4u);
 	appendIncludeDir(includeDirs,root/"Library"/"include");
 	appendIncludeDir(includeDirs,root/"include");
 }
 
-void appendPathListEnv(core::vector<system::path>& includeDirs, const char* name)
-{
-	const auto value = readEnvironmentVariable(name);
-	if (value.empty())
-		return;
-
-	#if defined(_NBL_PLATFORM_WINDOWS_)
-	constexpr char Separator = ';';
-	#else
-	constexpr char Separator = ':';
-	#endif
-
-	size_t begin = 0;
-	while (begin<value.size())
-	{
-		const auto end = value.find(Separator,begin);
-		const auto segment = value.substr(begin,end==std::string::npos ? std::string::npos:end-begin);
-		appendIncludeDir(includeDirs,system::path(segment));
-		if (end==std::string::npos)
-			break;
-		begin = end+1;
-	}
-}
-
 void appendEnvironmentIncludeDirs(core::vector<system::path>& includeDirs)
 {
-	appendPathListEnv(includeDirs,"NBL_CUDA_RUNTIME_INCLUDE_DIRS");
-	appendPathListEnv(includeDirs,"Nabla_CUDA_RUNTIME_INCLUDE_DIRS");
+	const auto appendInclude = [&](const system::path& path) { appendIncludeDir(includeDirs,path); };
+	appendPathListEnv("NBL_CUDA_RUNTIME_INCLUDE_DIRS",appendInclude);
+	appendPathListEnv("Nabla_CUDA_RUNTIME_INCLUDE_DIRS",appendInclude);
 
 	appendCUDAIncludeRoot(includeDirs,readEnvironmentVariable("CUDA_PATH"));
 	appendCUDAIncludeRoot(includeDirs,readEnvironmentVariable("CUDA_HOME"));
@@ -942,13 +947,18 @@ ptx_and_nvrtcResult_t getPTX(const CCUDAHandler& handler, nvrtcProgram prog)
 	return {std::move(ptx),SAccess::native(handler).nvrtc.pnvrtcGetPTX(prog,ptxPtr)};
 }
 
+static const core::vector<std::string>& getDefaultRuntimeIncludeOptions()
+{ // Returns the NVRTC include options for the default runtime environment; built on the first call and reused afterwards.
+	static const auto RuntimeIncludeOptions = cuda_interop::makeNVRTCIncludeOptions(cuda_interop::findRuntimeCompileEnvironment());
+	return RuntimeIncludeOptions;
+}
+
 static ptx_and_nvrtcResult_t compileDirectlyToPTX_impl(CCUDAHandler& handler, nvrtcResult result, nvrtcProgram program, core::SRange<const char* const> nvrtcOptions, std::string* log)
 {
 	if (result!=NVRTC_SUCCESS)
 		return {nullptr,result};
 
-	const auto runtimeEnvironment = cuda_interop::findRuntimeCompileEnvironment();
-	const auto runtimeIncludeOptions = cuda_interop::makeNVRTCIncludeOptions(runtimeEnvironment);
+	const auto& runtimeIncludeOptions = getDefaultRuntimeIncludeOptions();
 	core::vector<const char*> options;
 	options.reserve(nvrtcOptions.size()+runtimeIncludeOptions.size());
 	for (const auto option : nvrtcOptions)
diff --git a/src/nbl/ext/CUDAInterop/README.md b/src/nbl/ext/CUDAInterop/README.md
index 214d5add14..0d7b01a033 100644
--- a/src/nbl/ext/CUDAInterop/README.md
+++ b/src/nbl/ext/CUDAInterop/README.md
@@ -90,8 +90,9 @@ Reference smoke:
 - `NBL_CUDA_INTEROP_RUNTIME_JSON` can point runtime discovery at custom JSON files without rebuilding the application.
 - The JSON is a build artifact. Nabla packages do not install JSON files with host-specific CUDA paths.
 - Package consumers generate their own JSON when they call `nbl_target_link_cuda_interop`.
-- Runtime lookup reads `nbl_cuda_interop_runtime.json` first, then checks app-local include bundles, explicit environment variables, `CUDA_PATH` style toolkit roots, Python/conda package layouts, and common system install roots.
-- `cuda_native::compileDirectlyToPTX` appends discovered include directories to the NVRTC option list.
+- Runtime lookup reads explicit JSON paths and `NBL_CUDA_INTEROP_RUNTIME_JSON` first, then checks executable-local `nbl_cuda_interop_runtime.json`, app-local include bundles, explicit include-dir environment variables, `CUDA_PATH` style toolkit roots, Python/conda package layouts, and common system install roots.
+- App-local and Python/conda package probing looks for directories that contain CUDA runtime headers. It does not hardcode a CUDA major version in the path.
+- `cuda_native::compileDirectlyToPTX` appends discovered include directories to the NVRTC option list and caches the default discovery result after first use.
 - Production machines do not need the full CUDA SDK just because Nabla was built with CUDA.
 - If an application compiles CUDA source with NVRTC and includes headers such as `cuda_fp16.h`, it must provide those runtime headers through the generated JSON path, an app-local bundle, a runtime/header package, or an installed toolkit.
 - `CUDA_PATH` is a developer fallback. It is not required for packaged applications.

From c6ef6eea004ceeb2b25378f1312deb79cd21f283 Mon Sep 17 00:00:00 2001
From: Arkadiusz Lachowicz <areklachowicz@gmail.com>
Date: Thu, 7 May 2026 14:30:58 +0200
Subject: [PATCH 20/27] Move CUDA interop API back into video

---
 include/nbl/ext/CUDAInterop/CUDAInterop.h     |  13 --
 .../nbl/ext/CUDAInterop/CUDAInteropNative.h   |   2 +-
 include/nbl/ext/OptiX/IDenoiser.h             |   2 +-
 .../{ext/CUDAInterop => video}/CCUDADevice.h  |   6 +-
 .../CCUDAExportableMemory.h                   |   0
 .../{ext/CUDAInterop => video}/CCUDAHandler.h |   0
 .../CCUDAImportedMemory.h                     |   4 +-
 .../CCUDAImportedSemaphore.h                  |   0
 include/nbl/video/CUDAInterop.h               |  13 ++
 src/nbl/CMakeLists.txt                        |  10 +-
 src/nbl/ext/CUDAInterop/README.md             | 138 ++++++++----------
 .../ext/CUDAInterop/smoke/clean_opt_in.cpp    |   2 +-
 .../ext/CUDAInterop/smoke/public_boundary.cpp |   2 +-
 .../CUDAInterop => video}/CCUDADevice.cpp     |   2 +-
 .../CCUDAExportableMemory.cpp                 |   2 +-
 .../CUDAInterop => video}/CCUDAHandler.cpp    |   2 +-
 .../CCUDAImportedMemory.cpp                   |   2 +-
 .../CCUDAImportedSemaphore.cpp                |   2 +-
 .../CUDAInteropNativeState.hpp                |   4 +-
 19 files changed, 91 insertions(+), 115 deletions(-)
 delete mode 100644 include/nbl/ext/CUDAInterop/CUDAInterop.h
 rename include/nbl/{ext/CUDAInterop => video}/CCUDADevice.h (94%)
 rename include/nbl/{ext/CUDAInterop => video}/CCUDAExportableMemory.h (100%)
 rename include/nbl/{ext/CUDAInterop => video}/CCUDAHandler.h (100%)
 rename include/nbl/{ext/CUDAInterop => video}/CCUDAImportedMemory.h (86%)
 rename include/nbl/{ext/CUDAInterop => video}/CCUDAImportedSemaphore.h (100%)
 create mode 100644 include/nbl/video/CUDAInterop.h
 rename src/nbl/{ext/CUDAInterop => video}/CCUDADevice.cpp (99%)
 rename src/nbl/{ext/CUDAInterop => video}/CCUDAExportableMemory.cpp (98%)
 rename src/nbl/{ext/CUDAInterop => video}/CCUDAHandler.cpp (99%)
 rename src/nbl/{ext/CUDAInterop => video}/CCUDAImportedMemory.cpp (97%)
 rename src/nbl/{ext/CUDAInterop => video}/CCUDAImportedSemaphore.cpp (97%)
 rename src/nbl/{ext/CUDAInterop => video}/CUDAInteropNativeState.hpp (96%)

diff --git a/include/nbl/ext/CUDAInterop/CUDAInterop.h b/include/nbl/ext/CUDAInterop/CUDAInterop.h
deleted file mode 100644
index 06d9016dc8..0000000000
--- a/include/nbl/ext/CUDAInterop/CUDAInterop.h
+++ /dev/null
@@ -1,13 +0,0 @@
-// Copyright (C) 2018-2020 - DevSH Graphics Programming Sp. z O.O.
-// This file is part of the "Nabla Engine".
-// For conditions of distribution and use, see copyright notice in nabla.h
-#ifndef _NBL_EXT_CUDA_INTEROP_H_INCLUDED_
-#define _NBL_EXT_CUDA_INTEROP_H_INCLUDED_
-
-#include "nbl/ext/CUDAInterop/CCUDADevice.h"
-#include "nbl/ext/CUDAInterop/CCUDAExportableMemory.h"
-#include "nbl/ext/CUDAInterop/CCUDAHandler.h"
-#include "nbl/ext/CUDAInterop/CCUDAImportedMemory.h"
-#include "nbl/ext/CUDAInterop/CCUDAImportedSemaphore.h"
-
-#endif
diff --git a/include/nbl/ext/CUDAInterop/CUDAInteropNative.h b/include/nbl/ext/CUDAInterop/CUDAInteropNative.h
index 6833ad8189..9d23fcb4ef 100644
--- a/include/nbl/ext/CUDAInterop/CUDAInteropNative.h
+++ b/include/nbl/ext/CUDAInterop/CUDAInteropNative.h
@@ -4,7 +4,7 @@
 #ifndef _NBL_EXT_CUDA_INTEROP_NATIVE_H_INCLUDED_
 #define _NBL_EXT_CUDA_INTEROP_NATIVE_H_INCLUDED_
 
-#include "nbl/ext/CUDAInterop/CUDAInterop.h"
+#include "nbl/video/CUDAInterop.h"
 
 #include "nbl/asset/ICPUBuffer.h"
 #include "nbl/system/DynamicFunctionCaller.h"
diff --git a/include/nbl/ext/OptiX/IDenoiser.h b/include/nbl/ext/OptiX/IDenoiser.h
index 496383d92d..bb0677657d 100644
--- a/include/nbl/ext/OptiX/IDenoiser.h
+++ b/include/nbl/ext/OptiX/IDenoiser.h
@@ -5,7 +5,7 @@
 #ifndef __NBL_EXT_OPTIX_DENOISER_H_INCLUDED__
 #define __NBL_EXT_OPTIX_DENOISER_H_INCLUDED__
 
-#include "nbl/ext/CUDAInterop/CCUDAHandler.h"
+#include "nbl/video/CCUDAHandler.h"
 
 #include <optix.h>
 #include <optix_denoiser_tiling.h>
diff --git a/include/nbl/ext/CUDAInterop/CCUDADevice.h b/include/nbl/video/CCUDADevice.h
similarity index 94%
rename from include/nbl/ext/CUDAInterop/CCUDADevice.h
rename to include/nbl/video/CCUDADevice.h
index 94eb450802..bc1931e363 100644
--- a/include/nbl/ext/CUDAInterop/CCUDADevice.h
+++ b/include/nbl/video/CCUDADevice.h
@@ -5,9 +5,9 @@
 #define _NBL_VIDEO_C_CUDA_DEVICE_H_
 
 #include "nbl/video/declarations.h"
-#include "nbl/ext/CUDAInterop/CCUDAExportableMemory.h"
-#include "nbl/ext/CUDAInterop/CCUDAImportedMemory.h"
-#include "nbl/ext/CUDAInterop/CCUDAImportedSemaphore.h"
+#include "nbl/video/CCUDAExportableMemory.h"
+#include "nbl/video/CCUDAImportedMemory.h"
+#include "nbl/video/CCUDAImportedSemaphore.h"
 
 #include <cstring>
 #include <memory>
diff --git a/include/nbl/ext/CUDAInterop/CCUDAExportableMemory.h b/include/nbl/video/CCUDAExportableMemory.h
similarity index 100%
rename from include/nbl/ext/CUDAInterop/CCUDAExportableMemory.h
rename to include/nbl/video/CCUDAExportableMemory.h
diff --git a/include/nbl/ext/CUDAInterop/CCUDAHandler.h b/include/nbl/video/CCUDAHandler.h
similarity index 100%
rename from include/nbl/ext/CUDAInterop/CCUDAHandler.h
rename to include/nbl/video/CCUDAHandler.h
diff --git a/include/nbl/ext/CUDAInterop/CCUDAImportedMemory.h b/include/nbl/video/CCUDAImportedMemory.h
similarity index 86%
rename from include/nbl/ext/CUDAInterop/CCUDAImportedMemory.h
rename to include/nbl/video/CCUDAImportedMemory.h
index 87f804ce76..ac41c110a2 100644
--- a/include/nbl/ext/CUDAInterop/CCUDAImportedMemory.h
+++ b/include/nbl/video/CCUDAImportedMemory.h
@@ -1,5 +1,5 @@
-#ifndef _NBL_EXT_CUDA_INTEROP_C_CUDA_IMPORTED_MEMORY_H_
-#define _NBL_EXT_CUDA_INTEROP_C_CUDA_IMPORTED_MEMORY_H_
+#ifndef _NBL_VIDEO_C_CUDA_IMPORTED_MEMORY_H_
+#define _NBL_VIDEO_C_CUDA_IMPORTED_MEMORY_H_
 
 #include "nbl/video/declarations.h"
 
diff --git a/include/nbl/ext/CUDAInterop/CCUDAImportedSemaphore.h b/include/nbl/video/CCUDAImportedSemaphore.h
similarity index 100%
rename from include/nbl/ext/CUDAInterop/CCUDAImportedSemaphore.h
rename to include/nbl/video/CCUDAImportedSemaphore.h
diff --git a/include/nbl/video/CUDAInterop.h b/include/nbl/video/CUDAInterop.h
new file mode 100644
index 0000000000..57e92ae647
--- /dev/null
+++ b/include/nbl/video/CUDAInterop.h
@@ -0,0 +1,13 @@
+// Copyright (C) 2018-2020 - DevSH Graphics Programming Sp. z O.O.
+// This file is part of the "Nabla Engine".
+// For conditions of distribution and use, see copyright notice in nabla.h
+#ifndef _NBL_VIDEO_CUDA_INTEROP_H_INCLUDED_
+#define _NBL_VIDEO_CUDA_INTEROP_H_INCLUDED_
+
+#include "nbl/video/CCUDADevice.h"
+#include "nbl/video/CCUDAExportableMemory.h"
+#include "nbl/video/CCUDAHandler.h"
+#include "nbl/video/CCUDAImportedMemory.h"
+#include "nbl/video/CCUDAImportedSemaphore.h"
+
+#endif
diff --git a/src/nbl/CMakeLists.txt b/src/nbl/CMakeLists.txt
index ccb600ca32..d56c223e34 100644
--- a/src/nbl/CMakeLists.txt
+++ b/src/nbl/CMakeLists.txt
@@ -126,11 +126,11 @@ set(NBL_CORE_SOURCES
 )
 
 set(NBL_CUDA_INTEROP_SOURCES
-	ext/CUDAInterop/CCUDADevice.cpp
-	ext/CUDAInterop/CCUDAExportableMemory.cpp
-	ext/CUDAInterop/CCUDAHandler.cpp
-	ext/CUDAInterop/CCUDAImportedMemory.cpp
-	ext/CUDAInterop/CCUDAImportedSemaphore.cpp
+	video/CCUDADevice.cpp
+	video/CCUDAExportableMemory.cpp
+	video/CCUDAHandler.cpp
+	video/CCUDAImportedMemory.cpp
+	video/CCUDAImportedSemaphore.cpp
 )
 
 set(NBL_SYSTEM_SOURCES
diff --git a/src/nbl/ext/CUDAInterop/README.md b/src/nbl/ext/CUDAInterop/README.md
index 0d7b01a033..e99edd82c0 100644
--- a/src/nbl/ext/CUDAInterop/README.md
+++ b/src/nbl/ext/CUDAInterop/README.md
@@ -1,50 +1,50 @@
-# CUDA Interop Targets
+# CUDA Interop
 
-- `Nabla::Nabla` owns the CUDA interop implementation and exported symbols.
-- `Nabla::Nabla` public headers do not include `cuda.h` or `nvrtc.h`.
-- The SDK-free interop headers stay stable for CUDA ON and CUDA OFF Nabla builds.
-- `Nabla::ext::CUDAInterop` is the explicit raw CUDA Driver API and NVRTC opt-in target.
-- `Nabla::ext::CUDAInterop` is an `INTERFACE` target. It does not build a library or executable artifact.
-- The target only carries usage requirements and IDE-visible sources.
-- `Nabla::ext::CUDAInterop` requires `CUDAToolkit` and exposes `CUDAInteropNative.h`.
-- `CUDAInteropNative.h` is the small opt-in header that includes CUDA SDK headers such as `cuda.h` and `nvrtc.h`.
-- Consumers can override the SDK root with `-DNabla_CUDA_TOOLKIT_ROOT=<cuda-root>` when requesting `CUDAInterop`.
-- Native accessors accept Nabla objects, raw pointers, and `smart_refctd_ptr`.
+## Layout
 
-## Basic Usage
+- `Nabla::Nabla` owns the SDK-free CUDA interop API in `nbl/video/CCUDA*.h` and its implementation in `src/nbl/video/CCUDA*.cpp`.
+- Those headers do not include CUDA SDK headers. Consumers that only link `Nabla::Nabla` do not need `cuda.h`, `nvrtc.h`, or a CUDA SDK install just to parse Nabla headers.
+- `Nabla::ext::CUDAInterop` is an `INTERFACE` target for native CUDA opt-in. It builds no library. It only adds `CUDAInteropNative.h`, `CUDA::toolkit`, and runtime-header discovery setup to targets that ask for raw CUDA interop.
+- `CUDAInteropNative.h` is the only public opt-in header that includes CUDA SDK headers and exposes `cuda_native::*` accessors for CUDA Driver API and NVRTC types.
+
+## CMake Usage
+
+Default Nabla usage stays SDK-free:
 
 ```cmake
 find_package(Nabla CONFIG REQUIRED)
 target_link_libraries(app PRIVATE Nabla::Nabla)
 ```
 
-This path does not require CUDA SDK headers on the consuming project.
-
-## Native Opt-In
-
-Use the native opt-in path only in targets that include `CUDAInteropNative.h` or use raw CUDA Driver API/NVRTC types.
+Native CUDA interop is explicit:
 
 ```cmake
 find_package(Nabla CONFIG REQUIRED COMPONENTS CUDAInterop)
 nbl_target_link_cuda_interop(native_app PRIVATE)
 ```
 
-`nbl_target_link_cuda_interop` links `Nabla::ext::CUDAInterop` and writes runtime CUDA header discovery JSON for `native_app`.
+`nbl_target_link_cuda_interop` links `Nabla::ext::CUDAInterop` and writes `nbl_cuda_interop_runtime.json` next to the target executable during CMake generation.
+
+Optional overrides:
 
 ```cmake
 find_package(Nabla CONFIG REQUIRED COMPONENTS CUDAInterop)
 nbl_target_link_cuda_interop(native_app PRIVATE
     INCLUDE_DIRS "${cuda_runtime_headers}"
 )
-```
 
-```cmake
 nbl_target_link_cuda_interop(native_app PRIVATE
     RUNTIME_JSON "${CMAKE_CURRENT_BINARY_DIR}/$<CONFIG>/my_cuda_runtime.json"
 )
 ```
 
-Pseudo flow:
+Consumers can also choose the SDK used for native compilation with:
+
+```sh
+cmake -S . -B build -DNabla_CUDA_TOOLKIT_ROOT=<cuda-root>
+```
+
+## Native Usage
 
 ```cpp
 #include "nbl/ext/CUDAInterop/CUDAInteropNative.h"
@@ -71,85 +71,61 @@ auto [ptx, result] = nbl::video::cuda_native::compileDirectlyToPTX(
 );
 ```
 
-`compileDirectlyToPTX` performs runtime CUDA header discovery internally. Code that drives NVRTC manually can call `cuda_interop::findRuntimeCompileEnvironment` and `cuda_interop::makeNVRTCIncludeOptions` directly.
+Native access is not wrapped away. Opt-in code uses CUDA Driver API and NVRTC types directly.
+
+Smoke examples:
+
+- `src/nbl/ext/CUDAInterop/smoke/public_boundary.cpp` checks that `Nabla::Nabla` headers stay SDK-free.
+- `src/nbl/ext/CUDAInterop/smoke/clean_opt_in.cpp` checks default package usage without native opt-in.
+- `src/nbl/ext/CUDAInterop/smoke/native_opt_in.cpp` checks native opt-in, runtime header discovery, `cuda_fp16.h`, NVRTC, and raw interop usage.
 
-Reference smoke:
+## ABI
 
-- CMake target setup: `src/nbl/ext/CUDAInterop/smoke/CMakeLists.txt`
-- SDK-free package boundary check: `src/nbl/ext/CUDAInterop/smoke/public_boundary.cpp`
-- Default Nabla package usage without native opt-in: `src/nbl/ext/CUDAInterop/smoke/clean_opt_in.cpp`
-- Native CUDA opt-in, runtime header discovery, `cuda_fp16.h`, NVRTC and raw interop usage: `src/nbl/ext/CUDAInterop/smoke/native_opt_in.cpp`
+- `CCUDAHandler`, `CCUDADevice`, `CCUDAExportableMemory`, `CCUDAImportedMemory`, and `CCUDAImportedSemaphore` are exported from `Nabla.dll` through the normal Nabla ABI.
+- Their public declarations do not expose CUDA SDK structs, CUDA SDK layouts, or `cuda.h` / `nvrtc.h` includes.
+- CUDA implementation state is owned by Nabla through private `SNativeState` members. Consumers cannot construct CUDA wrapper objects with arbitrary internal CUDA state.
+- `CUDAInteropNative.h` declares exported accessor functions whose definitions still live in `Nabla.dll`.
+- Native opt-in ABI uses CUDA Driver API handles/enums such as `CUdevice`, `CUcontext`, `CUdeviceptr`, `CUexternalMemory`, and `CUexternalSemaphore`, plus small Nabla-owned parameter structs.
+- A package built with one compatible CUDA SDK can be consumed by native interop code built with another compatible SDK without rebuilding Nabla. The loaded driver and NVRTC runtime are still validated at runtime.
 
 ## Runtime Header Discovery
 
-- `nbl_target_link_cuda_interop(<target> <scope>)` links `Nabla::ext::CUDAInterop` and configures runtime include discovery for that target.
-- The helper is defined once in `NablaCUDAInteropHelpers.cmake` and is available from the source tree and installed `NablaConfig.cmake`.
-- For each target it writes `nbl_cuda_interop_runtime.json` next to the executable during CMake generation.
-- `RUNTIME_JSON <path>` overrides the generated JSON location. Plain paths and `$<CONFIG>` are supported.
-- `cuda_interop::findRuntimeCompileEnvironment` can also receive explicit JSON paths at runtime.
-- `NBL_CUDA_INTEROP_RUNTIME_JSON` can point runtime discovery at custom JSON files without rebuilding the application.
-- The JSON is a build artifact. Nabla packages do not install JSON files with host-specific CUDA paths.
-- Package consumers generate their own JSON when they call `nbl_target_link_cuda_interop`.
-- Runtime lookup reads explicit JSON paths and `NBL_CUDA_INTEROP_RUNTIME_JSON` first, then checks executable-local `nbl_cuda_interop_runtime.json`, app-local include bundles, explicit include-dir environment variables, `CUDA_PATH` style toolkit roots, Python/conda package layouts, and common system install roots.
-- App-local and Python/conda package probing looks for directories that contain CUDA runtime headers. It does not hardcode a CUDA major version in the path.
-- `cuda_native::compileDirectlyToPTX` appends discovered include directories to the NVRTC option list and caches the default discovery result after first use.
-- Production machines do not need the full CUDA SDK just because Nabla was built with CUDA.
-- If an application compiles CUDA source with NVRTC and includes headers such as `cuda_fp16.h`, it must provide those runtime headers through the generated JSON path, an app-local bundle, a runtime/header package, or an installed toolkit.
-- `CUDA_PATH` is a developer fallback. It is not required for packaged applications.
-- Direct `target_link_libraries(app PRIVATE Nabla::ext::CUDAInterop)` remains possible, but it only adds compile/link usage requirements and does not create the runtime discovery JSON.
+NVRTC may need CUDA runtime headers when user kernels include files such as `cuda_fp16.h`, `vector_types.h`, or `cuda_runtime_api.h`.
 
-## Runtime Header Distribution
+- `nbl_target_link_cuda_interop` generates `nbl_cuda_interop_runtime.json` for the target that opted into native CUDA interop.
+- The JSON is a build artifact. Nabla packages do not install host-specific CUDA paths.
+- Package consumers generate their own JSON when they call `nbl_target_link_cuda_interop`.
+- `NBL_CUDA_INTEROP_RUNTIME_JSON` can point runtime discovery at custom JSON files without rebuilding the application.
+- Runtime lookup checks explicit JSON paths and `NBL_CUDA_INTEROP_RUNTIME_JSON` first, then executable-local `nbl_cuda_interop_runtime.json`, app-local header bundles, explicit include-dir environment variables, `CUDA_PATH` style toolkit roots, Python/conda package layouts, and common system install roots.
+- The probe looks for directories that contain CUDA runtime headers. It does not hardcode a CUDA major version in app-local paths.
+- `cuda_native::compileDirectlyToPTX` appends discovered include directories to NVRTC options and caches the default discovery result after first use.
 
-Nabla packages do not ship CUDA runtime headers. That is a packaging choice, not a hard legal requirement for applications that need NVRTC runtime compilation.
+Production machines do not need the full CUDA SDK just because Nabla was built with CUDA. Applications that use NVRTC with CUDA runtime headers can provide those headers through generated JSON, a custom JSON path, an app-local bundle, an official runtime/header package, or an installed toolkit.
 
-NVIDIA CUDA EULA limits CUDA redistribution to selected components. The distribution section says: "The portions of the SDK that are distributable under the Agreement are listed in Attachment A." Attachment A then lists the CUDA Toolkit files that may be redistributed with applications. See:
+Nabla does not ship CUDA runtime headers by default. The NVIDIA CUDA EULA allows redistribution only of selected components. The distribution section says: "The portions of the SDK that are distributable under the Agreement are listed in Attachment A." Attachment A says: "The following CUDA Toolkit files may be distributed with applications developed by you." See:
 
 - https://docs.nvidia.com/cuda/eula/#distribution
 - https://docs.nvidia.com/cuda/eula/#attachment-a
 
-Relevant Attachment A header entries include:
+Attachment A includes header groups relevant to NVRTC runtime compilation, including `nvrtc.h`, `cuda_fp16.h`, `cuda_bf16.h`, `cuda_fp8.h`, `cuda_fp6.h`, `cuda_fp4.h`, `cuda_runtime_api.h`, `cuda.h`, `vector_functions.h`, and `vector_types.h`.
 
-- `nvrtc.h` under `NVIDIA Runtime Compilation Library and Header`.
-- `cuda_occupancy.h` under `CUDA Occupancy Calculation Header Library`.
-- `cuda_fp16.h`, `cuda_fp16.hpp`, `cuda_bf16.h`, `cuda_bf16.hpp`, `cuda_fp8.h`, `cuda_fp8.hpp`, `cuda_fp6.h`, `cuda_fp6.hpp`, `cuda_fp4.h`, `cuda_fp4.hpp` under `CUDA Floating Point Type Headers`.
-- `crt/host_defines.h`, `cuComplex.h`, `cuda_awbarrier_helpers.h`, `cuda_awbarrier_primitives.h`, `cuda_awbarrier.h`, `cuda_pipeline_helpers.h`, `cuda_pipeline_primitives.h`, `cuda_pipeline.h`, `cuda_runtime_api.h`, `cuda.h`, `cuda/std/tuple`, `cuda/std/type_traits`, `cuda/std/utility`, `device_types.h`, `vector_functions.h`, `vector_types.h` under `CUDA Headers for Runtime Compilation`.
-
-CuPy documents the same runtime-compile problem. Their install docs say: "On CUDA 12.2 or later, CUDA Runtime header files are required to compile kernels in CuPy." They also show the common `vector_types.h` failure and recommend `nvidia-cuda-runtime-cu12` for PyPI installs or `cuda-cudart-dev` from system packages:
+CuPy documents the same NVRTC issue for CUDA 12.2+. Their install docs say: "On CUDA 12.2 or later, CUDA Runtime header files are required to compile kernels in CuPy." They show the common `vector_types.h` failure and recommend CUDA runtime header packages for PyPI/system package installs:
 
 - https://docs.cupy.dev/en/v13.5.0/install.html#cupy-always-raises-nvrtc-error-compilation-6
 - https://github.com/cupy/cupy/issues/8466
 
-For Nabla consumers this means:
-
-- The default Nabla package stays SDK-free for consumers that only link `Nabla::Nabla`.
-- Native interop consumers can install CUDA runtime headers through an official package, point `NBL_CUDA_INTEROP_RUNTIME_JSON` at their own JSON, pass `INCLUDE_DIRS` to `nbl_target_link_cuda_interop`, or ship an app-local header bundle if their distribution model allows it.
-- Shipping such headers is a consumer packaging decision. Nabla runtime discovery supports it, but Nabla does not install host-specific CUDA header paths or redistribute CUDA headers by default.
-
-## Properties
-
-- Consumers that only link `Nabla::Nabla` do not need CUDA SDK headers to parse Nabla headers.
-- Consumers that need raw CUDA include `CUDAInteropNative.h` and link `Nabla::ext::CUDAInterop`.
-- Raw CUDA access is not wrapped away in the native opt-in path. Native code uses CUDA Driver API and NVRTC types directly.
-- CUDA SDK structs with version-sensitive layout are kept out of exported Nabla ABI.
-- The exported native ABI uses stable CUDA Driver API handles/enums and small Nabla-owned parameter structs.
-- Native state is PIMPL-owned by Nabla. Consumers cannot construct CUDA wrapper objects with arbitrary internal state.
-- A package built with one compatible CUDA SDK can be consumed by native interop code built with another compatible SDK without rebuilding Nabla.
-- `CCUDAHandler::create` validates the loaded CUDA driver and NVRTC runtime. It returns `nullptr` when the runtime is missing or below the required CUDA 13.0 / NVRTC 13.x floor.
-- Runtime CUDA header discovery is independent from the CUDA SDK used to build Nabla.
-- Native consumers can use a newer compatible CUDA SDK or a runtime/header package without rebuilding Nabla.
-- Toggling Nabla CUDA support does not change SDK-free public header parse requirements for consumers.
-- The Nabla source list is stable. CUDA interop `.cpp` files stay visible in IDE projects for CUDA ON and CUDA OFF builds.
-- CUDA OFF implementations are local stubs in the same `.cpp` files. SDK-free API entry points stay linkable and factory/import/export paths return `nullptr` for unavailable CUDA features instead of producing unresolved symbols.
+## CUDA ON/OFF Builds
+
+- SDK-free public headers stay stable for CUDA ON and CUDA OFF Nabla builds.
 - CUDA implementation headers and SDK includes stay behind `_NBL_COMPILE_WITH_CUDA_`.
+- CUDA OFF implementations are local stubs in the same `.cpp` files. Factory/import/export paths return `nullptr` for unavailable CUDA features instead of producing unresolved symbols.
+- The Nabla source list stays stable, so CUDA interop `.cpp` files remain visible in IDE projects for both CUDA ON and CUDA OFF builds.
 
 ## Related Designs
 
-This split follows the same public-boundary pattern used by mature GPU projects: SDK-free default headers, native access through an explicit opt-in path, and SDK-dependent implementation details outside the default public API.
+The split follows the same boundary pattern used by mature GPU projects: default headers avoid vendor SDK requirements, native access is explicit, and implementation details stay outside the default public API.
 
-- OpenCV keeps common CUDA-facing headers independent from CUDA Runtime API and exposes raw `cudaStream_t` / `cudaEvent_t` through a separate accessor header: [`cuda_stream_accessor.hpp`](https://github.com/opencv/opencv/blob/808d2d596c475d95fedb6025c9ed425d62bba04c/modules/core/include/opencv2/core/cuda_stream_accessor.hpp#L50-L79).
-- OpenCV keeps CUDA implementation headers private and includes `cuda.h`, `cuda_runtime.h`, and NPP there: [`private.cuda.hpp`](https://github.com/opencv/opencv/blob/808d2d596c475d95fedb6025c9ed425d62bba04c/modules/core/include/opencv2/core/private.cuda.hpp#L47-L61).
-- Blender/Cycles exposes a CUDA device boundary without CUDA SDK headers in the boundary header: [`device.h`](https://github.com/blender/blender/blob/794c527e8595a9f448e0143a217d0ceb648c5e7e/intern/cycles/device/cuda/device.h#L7-L27).
-- Blender/Cycles keeps `CUdevice`, `CUcontext`, `cuda.h`, and `cuew.h` in the CUDA implementation header/source: [`device_impl.h`](https://github.com/blender/blender/blob/794c527e8595a9f448e0143a217d0ceb648c5e7e/intern/cycles/device/cuda/device_impl.h#L12-L30), [`device.cpp`](https://github.com/blender/blender/blob/794c527e8595a9f448e0143a217d0ceb648c5e7e/intern/cycles/device/cuda/device.cpp#L10-L48).
-- ONNX Runtime keeps accelerator dependencies behind execution providers and supports provider shared libraries loaded only when requested: [`Build with Execution Providers`](https://onnxruntime.ai/docs/build/eps.html#execution-provider-shared-libraries).
-- ggml/llama.cpp keeps the generic backend API separate from CUDA and builds CUDA as an explicit backend target with CUDA libraries linked to that backend: [`ggml-backend.h`](https://github.com/ggml-org/llama.cpp/blob/master/ggml/include/ggml-backend.h#L1488-L1499), [`ggml-cuda CMakeLists.txt`](https://github.com/ggml-org/llama.cpp/blob/master/ggml/src/ggml-cuda/CMakeLists.txt#L982-L1072).
-- TensorFlow PluggableDevice uses separate device plugin packages so accelerator toolchains and dependencies do not become core TensorFlow requirements: [`PluggableDevice`](https://blog.tensorflow.org/2021/06/pluggabledevice-device-plugins-for-TensorFlow.html).
+- OpenCV keeps common CUDA-facing headers independent from CUDA Runtime API and exposes raw `cudaStream_t` / `cudaEvent_t` through a separate accessor header: https://github.com/opencv/opencv/blob/808d2d596c475d95fedb6025c9ed425d62bba04c/modules/core/include/opencv2/core/cuda_stream_accessor.hpp#L50-L79
+- OpenCV keeps CUDA implementation headers private and includes `cuda.h`, `cuda_runtime.h`, and NPP there: https://github.com/opencv/opencv/blob/808d2d596c475d95fedb6025c9ed425d62bba04c/modules/core/include/opencv2/core/private.cuda.hpp#L47-L61
+- Blender/Cycles exposes a CUDA device boundary without CUDA SDK headers in the boundary header: https://github.com/blender/blender/blob/794c527e8595a9f448e0143a217d0ceb648c5e7e/intern/cycles/device/cuda/device.h#L7-L27
+- Blender/Cycles keeps `CUdevice`, `CUcontext`, `cuda.h`, and `cuew.h` in the CUDA implementation header/source: https://github.com/blender/blender/blob/794c527e8595a9f448e0143a217d0ceb648c5e7e/intern/cycles/device/cuda/device_impl.h#L12-L30
diff --git a/src/nbl/ext/CUDAInterop/smoke/clean_opt_in.cpp b/src/nbl/ext/CUDAInterop/smoke/clean_opt_in.cpp
index e36fe65701..31bf461804 100644
--- a/src/nbl/ext/CUDAInterop/smoke/clean_opt_in.cpp
+++ b/src/nbl/ext/CUDAInterop/smoke/clean_opt_in.cpp
@@ -1,4 +1,4 @@
-#include "nbl/ext/CUDAInterop/CUDAInterop.h"
+#include "nbl/video/CUDAInterop.h"
 #include "nbl/system/IApplicationFramework.h"
 
 #include <type_traits>
diff --git a/src/nbl/ext/CUDAInterop/smoke/public_boundary.cpp b/src/nbl/ext/CUDAInterop/smoke/public_boundary.cpp
index eb7061f0ee..dc1c247806 100644
--- a/src/nbl/ext/CUDAInterop/smoke/public_boundary.cpp
+++ b/src/nbl/ext/CUDAInterop/smoke/public_boundary.cpp
@@ -18,7 +18,7 @@
 #error "Nabla consumers must not include CUDA SDK headers."
 #endif
 
-#include "nbl/ext/CUDAInterop/CUDAInterop.h"
+#include "nbl/video/CUDAInterop.h"
 
 #ifdef _NBL_COMPILE_WITH_CUDA_
 #error "Nabla consumers must not get the CUDA opt-in define."
diff --git a/src/nbl/ext/CUDAInterop/CCUDADevice.cpp b/src/nbl/video/CCUDADevice.cpp
similarity index 99%
rename from src/nbl/ext/CUDAInterop/CCUDADevice.cpp
rename to src/nbl/video/CCUDADevice.cpp
index 8e696d0827..fcafc8bc48 100644
--- a/src/nbl/ext/CUDAInterop/CCUDADevice.cpp
+++ b/src/nbl/video/CCUDADevice.cpp
@@ -1,7 +1,7 @@
 // Copyright (C) 2018-2020 - DevSH Graphics Programming Sp. z O.O.
 // This file is part of the "Nabla Engine".
 // For conditions of distribution and use, see copyright notice in nabla.h
-#include "nbl/ext/CUDAInterop/CUDAInterop.h"
+#include "nbl/video/CUDAInterop.h"
 
 #ifdef _NBL_COMPILE_WITH_CUDA_
 #include "CUDAInteropNativeState.hpp"
diff --git a/src/nbl/ext/CUDAInterop/CCUDAExportableMemory.cpp b/src/nbl/video/CCUDAExportableMemory.cpp
similarity index 98%
rename from src/nbl/ext/CUDAInterop/CCUDAExportableMemory.cpp
rename to src/nbl/video/CCUDAExportableMemory.cpp
index 7d5483af04..4eb37b720a 100644
--- a/src/nbl/ext/CUDAInterop/CCUDAExportableMemory.cpp
+++ b/src/nbl/video/CCUDAExportableMemory.cpp
@@ -2,7 +2,7 @@
 // This file is part of the "Nabla Engine".
 // For conditions of distribution and use, see copyright notice in nabla.h
 
-#include "nbl/ext/CUDAInterop/CUDAInterop.h"
+#include "nbl/video/CUDAInterop.h"
 
 #ifdef _NBL_COMPILE_WITH_CUDA_
 #include "CUDAInteropNativeState.hpp"
diff --git a/src/nbl/ext/CUDAInterop/CCUDAHandler.cpp b/src/nbl/video/CCUDAHandler.cpp
similarity index 99%
rename from src/nbl/ext/CUDAInterop/CCUDAHandler.cpp
rename to src/nbl/video/CCUDAHandler.cpp
index de7f14b58f..ced76b9713 100644
--- a/src/nbl/ext/CUDAInterop/CCUDAHandler.cpp
+++ b/src/nbl/video/CCUDAHandler.cpp
@@ -2,7 +2,7 @@
 // This file is part of the "Nabla Engine".
 // For conditions of distribution and use, see copyright notice in nabla.h
 
-#include "nbl/ext/CUDAInterop/CUDAInterop.h"
+#include "nbl/video/CUDAInterop.h"
 #include "nbl/system/ModuleLookupUtils.h"
 
 #include "nlohmann/json.hpp"
diff --git a/src/nbl/ext/CUDAInterop/CCUDAImportedMemory.cpp b/src/nbl/video/CCUDAImportedMemory.cpp
similarity index 97%
rename from src/nbl/ext/CUDAInterop/CCUDAImportedMemory.cpp
rename to src/nbl/video/CCUDAImportedMemory.cpp
index 3a8ed56371..9e58fbac10 100644
--- a/src/nbl/ext/CUDAInterop/CCUDAImportedMemory.cpp
+++ b/src/nbl/video/CCUDAImportedMemory.cpp
@@ -2,7 +2,7 @@
 // This file is part of the "Nabla Engine".
 // For conditions of distribution and use, see copyright notice in nabla.h
 
-#include "nbl/ext/CUDAInterop/CUDAInterop.h"
+#include "nbl/video/CUDAInterop.h"
 
 #ifdef _NBL_COMPILE_WITH_CUDA_
 #include "CUDAInteropNativeState.hpp"
diff --git a/src/nbl/ext/CUDAInterop/CCUDAImportedSemaphore.cpp b/src/nbl/video/CCUDAImportedSemaphore.cpp
similarity index 97%
rename from src/nbl/ext/CUDAInterop/CCUDAImportedSemaphore.cpp
rename to src/nbl/video/CCUDAImportedSemaphore.cpp
index 6d980ed126..bc1db625d1 100644
--- a/src/nbl/ext/CUDAInterop/CCUDAImportedSemaphore.cpp
+++ b/src/nbl/video/CCUDAImportedSemaphore.cpp
@@ -2,7 +2,7 @@
 // This file is part of the "Nabla Engine".
 // For conditions of distribution and use, see copyright notice in nabla.h
 
-#include "nbl/ext/CUDAInterop/CUDAInterop.h"
+#include "nbl/video/CUDAInterop.h"
 
 #ifdef _NBL_COMPILE_WITH_CUDA_
 #include "CUDAInteropNativeState.hpp"
diff --git a/src/nbl/ext/CUDAInterop/CUDAInteropNativeState.hpp b/src/nbl/video/CUDAInteropNativeState.hpp
similarity index 96%
rename from src/nbl/ext/CUDAInterop/CUDAInteropNativeState.hpp
rename to src/nbl/video/CUDAInteropNativeState.hpp
index 74cb7823d5..79139d015d 100644
--- a/src/nbl/ext/CUDAInterop/CUDAInteropNativeState.hpp
+++ b/src/nbl/video/CUDAInteropNativeState.hpp
@@ -1,5 +1,5 @@
-#ifndef _NBL_EXT_CUDA_INTEROP_NATIVE_STATE_H_INCLUDED_
-#define _NBL_EXT_CUDA_INTEROP_NATIVE_STATE_H_INCLUDED_
+#ifndef _NBL_VIDEO_CUDA_INTEROP_NATIVE_STATE_H_INCLUDED_
+#define _NBL_VIDEO_CUDA_INTEROP_NATIVE_STATE_H_INCLUDED_
 
 #include "nbl/ext/CUDAInterop/CUDAInteropNative.h"
 

From d559a2caeafa9aef0c308b7716c77d4be076fc28 Mon Sep 17 00:00:00 2001
From: Arkadiusz Lachowicz <areklachowicz@gmail.com>
Date: Thu, 7 May 2026 15:02:03 +0200
Subject: [PATCH 21/27] Move smart pointer helpers into core

---
 include/nbl/core/decl/smart_refctd_ptr.h      | 38 ++++++++
 .../nbl/ext/CUDAInterop/CUDAInteropNative.h   | 90 ++++++-------------
 2 files changed, 66 insertions(+), 62 deletions(-)

diff --git a/include/nbl/core/decl/smart_refctd_ptr.h b/include/nbl/core/decl/smart_refctd_ptr.h
index 7c231fea4b..78609fa34c 100644
--- a/include/nbl/core/decl/smart_refctd_ptr.h
+++ b/include/nbl/core/decl/smart_refctd_ptr.h
@@ -7,6 +7,10 @@
 
 #include "nbl/core/IReferenceCounted.h"
 
+#include <concepts>
+#include <type_traits>
+#include <utility>
+
 namespace nbl::core
 {
 
@@ -118,6 +122,40 @@ class smart_refctd_ptr
 };
 static_assert(sizeof(smart_refctd_ptr<IReferenceCounted>) == sizeof(IReferenceCounted*), "smart_refctd_ptr has a memory overhead!");
 
+template<typename>
+struct is_smart_refctd_ptr : std::false_type {};
+
+template<typename T>
+struct is_smart_refctd_ptr<smart_refctd_ptr<T>> : std::true_type {};
+
+template<typename T>
+inline constexpr bool is_smart_refctd_ptr_v = is_smart_refctd_ptr<std::remove_cvref_t<T>>::value;
+
+template<typename T>
+inline constexpr bool is_raw_pointer_or_smart_refctd_ptr_v = std::is_pointer_v<std::remove_cvref_t<T>> || is_smart_refctd_ptr_v<T>;
+
+template<typename Object>
+decltype(auto) dereference(Object&& object)
+{
+	using object_t = std::remove_cvref_t<Object>;
+	if constexpr (std::is_pointer_v<object_t>)
+		return *object;
+	else if constexpr (is_smart_refctd_ptr_v<Object>)
+		return *object;
+	else
+		return std::forward<Object>(object);
+}
+
+template<typename Object, typename Target>
+concept dereferenceable_to = is_raw_pointer_or_smart_refctd_ptr_v<Object> && requires(Object&& object) {
+	{ dereference(std::forward<Object>(object)) } -> std::convertible_to<Target&>;
+};
+
+template<typename Object, typename Target>
+concept const_dereferenceable_to = is_raw_pointer_or_smart_refctd_ptr_v<Object> && requires(Object&& object) {
+	{ dereference(std::forward<Object>(object)) } -> std::convertible_to<const Target&>;
+};
+
 
 template< class T, class... Args >
 smart_refctd_ptr<T> make_smart_refctd_ptr(Args&& ... args);
diff --git a/include/nbl/ext/CUDAInterop/CUDAInteropNative.h b/include/nbl/ext/CUDAInterop/CUDAInteropNative.h
index 9d23fcb4ef..fe5fb5875e 100644
--- a/include/nbl/ext/CUDAInterop/CUDAInteropNative.h
+++ b/include/nbl/ext/CUDAInterop/CUDAInteropNative.h
@@ -161,40 +161,6 @@ struct SExportableMemoryCreationParams
 namespace detail
 {
 
-template<typename>
-struct is_smart_refctd_ptr : std::false_type {};
-
-template<typename T>
-struct is_smart_refctd_ptr<core::smart_refctd_ptr<T>> : std::true_type {};
-
-template<typename T>
-inline constexpr bool is_smart_refctd_ptr_v = is_smart_refctd_ptr<std::remove_cvref_t<T>>::value;
-
-template<typename T>
-inline constexpr bool is_indirect_object_v = std::is_pointer_v<std::remove_cvref_t<T>> || is_smart_refctd_ptr_v<T>;
-
-template<typename Object>
-decltype(auto) as_ref(Object&& object)
-{
-	using object_t = std::remove_cvref_t<Object>;
-	if constexpr (std::is_pointer_v<object_t>)
-		return *object;
-	else if constexpr (is_smart_refctd_ptr_v<Object>)
-		return *object;
-	else
-		return std::forward<Object>(object);
-}
-
-template<typename Object, typename Target>
-concept object_like = is_indirect_object_v<Object> && requires(Object&& object) {
-	{ as_ref(std::forward<Object>(object)) } -> std::convertible_to<Target&>;
-};
-
-template<typename Object, typename Target>
-concept const_object_like = is_indirect_object_v<Object> && requires(Object&& object) {
-	{ as_ref(std::forward<Object>(object)) } -> std::convertible_to<const Target&>;
-};
-
 template<typename Source>
 concept program_text_source = std::same_as<std::remove_cvref_t<Source>, std::string> ||
 	std::convertible_to<Source, const char*>;
@@ -205,17 +171,17 @@ NBL_API2 const CUDA& getCUDAFunctionTable(const CCUDAHandler& handler);
 NBL_API2 const NVRTC& getNVRTCFunctionTable(const CCUDAHandler& handler);
 
 template<typename Handler>
-requires detail::const_object_like<Handler, CCUDAHandler>
+requires core::const_dereferenceable_to<Handler, CCUDAHandler>
 inline const CUDA& getCUDAFunctionTable(Handler&& handler)
 {
-	return getCUDAFunctionTable(detail::as_ref(std::forward<Handler>(handler)));
+	return getCUDAFunctionTable(core::dereference(std::forward<Handler>(handler)));
 }
 
 template<typename Handler>
-requires detail::const_object_like<Handler, CCUDAHandler>
+requires core::const_dereferenceable_to<Handler, CCUDAHandler>
 inline const NVRTC& getNVRTCFunctionTable(Handler&& handler)
 {
-	return getNVRTCFunctionTable(detail::as_ref(std::forward<Handler>(handler)));
+	return getNVRTCFunctionTable(core::dereference(std::forward<Handler>(handler)));
 }
 
 NBL_API2 bool defaultHandleResult(CUresult result, const system::logger_opt_ptr& logger);
@@ -228,10 +194,10 @@ T* cast_CUDA_ptr(CUdeviceptr ptr) { return reinterpret_cast<T*>(ptr); }
 NBL_API2 const core::vector<SCUDADeviceInfo>& getAvailableDevices(const CCUDAHandler& handler);
 
 template<typename Handler>
-requires detail::const_object_like<Handler, CCUDAHandler>
+requires core::const_dereferenceable_to<Handler, CCUDAHandler>
 inline const core::vector<SCUDADeviceInfo>& getAvailableDevices(Handler&& handler)
 {
-	return getAvailableDevices(detail::as_ref(std::forward<Handler>(handler)));
+	return getAvailableDevices(core::dereference(std::forward<Handler>(handler)));
 }
 
 NBL_API2 nvrtcResult createProgram(CCUDAHandler& handler, nvrtcProgram* prog, std::string&& source, const char* name, const int headerCount=0, const char* const* headerContents=nullptr, const char* const* includeNames=nullptr);
@@ -242,10 +208,10 @@ inline nvrtcResult createProgram(CCUDAHandler& handler, nvrtcProgram* prog, cons
 NBL_API2 nvrtcResult createProgram(CCUDAHandler& handler, nvrtcProgram* prog, system::IFile* file, const int headerCount=0, const char* const* headerContents=nullptr, const char* const* includeNames=nullptr);
 
 template<typename Handler, typename Source>
-requires detail::object_like<Handler, CCUDAHandler> && detail::program_text_source<Source>
+requires core::dereferenceable_to<Handler, CCUDAHandler> && detail::program_text_source<Source>
 inline nvrtcResult createProgram(Handler&& handler, nvrtcProgram* prog, Source&& source, const char* name, const int headerCount=0, const char* const* headerContents=nullptr, const char* const* includeNames=nullptr)
 {
-	auto& handlerRef = detail::as_ref(std::forward<Handler>(handler));
+	auto& handlerRef = core::dereference(std::forward<Handler>(handler));
 	if constexpr (std::same_as<std::remove_cvref_t<Source>, std::string>)
 		return createProgram(handlerRef,prog,std::string(std::forward<Source>(source)),name,headerCount,headerContents,includeNames);
 	else
@@ -256,10 +222,10 @@ inline nvrtcResult createProgram(Handler&& handler, nvrtcProgram* prog, Source&&
 }
 
 template<typename Handler, typename File>
-requires detail::object_like<Handler, CCUDAHandler> && std::convertible_to<File, system::IFile*>
+requires core::dereferenceable_to<Handler, CCUDAHandler> && std::convertible_to<File, system::IFile*>
 inline nvrtcResult createProgram(Handler&& handler, nvrtcProgram* prog, File file, const int headerCount=0, const char* const* headerContents=nullptr, const char* const* includeNames=nullptr)
 {
-	return createProgram(detail::as_ref(std::forward<Handler>(handler)),prog,static_cast<system::IFile*>(file),headerCount,headerContents,includeNames);
+	return createProgram(core::dereference(std::forward<Handler>(handler)),prog,static_cast<system::IFile*>(file),headerCount,headerContents,includeNames);
 }
 NBL_API2 nvrtcResult compileProgram(const CCUDAHandler& handler, nvrtcProgram prog, core::SRange<const char* const> options);
 NBL_API2 nvrtcResult getProgramLog(const CCUDAHandler& handler, nvrtcProgram prog, std::string& log);
@@ -291,14 +257,14 @@ NBL_API2 ptx_and_nvrtcResult_t compileDirectlyToPTX(
 );
 
 template<typename Handler, typename Source>
-requires detail::object_like<Handler, CCUDAHandler> && detail::program_text_source<Source>
+requires core::dereferenceable_to<Handler, CCUDAHandler> && detail::program_text_source<Source>
 inline ptx_and_nvrtcResult_t compileDirectlyToPTX(
 	Handler&& handler, Source&& source, const char* filename, core::SRange<const char* const> nvrtcOptions,
 	const int headerCount=0, const char* const* headerContents=nullptr, const char* const* includeNames=nullptr,
 	std::string* log=nullptr
 )
 {
-	auto& handlerRef = detail::as_ref(std::forward<Handler>(handler));
+	auto& handlerRef = core::dereference(std::forward<Handler>(handler));
 	if constexpr (std::same_as<std::remove_cvref_t<Source>, std::string>)
 		return compileDirectlyToPTX(handlerRef,std::string(std::forward<Source>(source)),filename,nvrtcOptions,headerCount,headerContents,includeNames,log);
 	else
@@ -309,14 +275,14 @@ inline ptx_and_nvrtcResult_t compileDirectlyToPTX(
 }
 
 template<typename Handler, typename File>
-requires detail::object_like<Handler, CCUDAHandler> && std::convertible_to<File, system::IFile*>
+requires core::dereferenceable_to<Handler, CCUDAHandler> && std::convertible_to<File, system::IFile*>
 inline ptx_and_nvrtcResult_t compileDirectlyToPTX(
 	Handler&& handler, File file, core::SRange<const char* const> nvrtcOptions,
 	const int headerCount=0, const char* const* headerContents=nullptr, const char* const* includeNames=nullptr,
 	std::string* log=nullptr
 )
 {
-	return compileDirectlyToPTX(detail::as_ref(std::forward<Handler>(handler)),static_cast<system::IFile*>(file),nvrtcOptions,headerCount,headerContents,includeNames,log);
+	return compileDirectlyToPTX(core::dereference(std::forward<Handler>(handler)),static_cast<system::IFile*>(file),nvrtcOptions,headerCount,headerContents,includeNames,log);
 }
 
 NBL_API2 CUdevice getInternalObject(const CCUDADevice& device);
@@ -330,48 +296,48 @@ NBL_API2 CUexternalSemaphore getInternalObject(const CCUDAImportedSemaphore& sem
 
 template<typename Object>
 requires (
-	detail::const_object_like<Object, CCUDADevice> ||
-	detail::const_object_like<Object, CCUDAImportedMemory> ||
-	detail::const_object_like<Object, CCUDAImportedSemaphore>
+	core::const_dereferenceable_to<Object, CCUDADevice> ||
+	core::const_dereferenceable_to<Object, CCUDAImportedMemory> ||
+	core::const_dereferenceable_to<Object, CCUDAImportedSemaphore>
 )
 inline auto getInternalObject(Object&& object)
 {
-	return getInternalObject(detail::as_ref(std::forward<Object>(object)));
+	return getInternalObject(core::dereference(std::forward<Object>(object)));
 }
 
 template<typename Device>
-requires detail::const_object_like<Device, CCUDADevice>
+requires core::const_dereferenceable_to<Device, CCUDADevice>
 inline CUcontext getContext(Device&& device)
 {
-	return getContext(detail::as_ref(std::forward<Device>(device)));
+	return getContext(core::dereference(std::forward<Device>(device)));
 }
 
 template<typename Device>
-requires detail::const_object_like<Device, CCUDADevice>
+requires core::const_dereferenceable_to<Device, CCUDADevice>
 inline size_t roundToGranularity(Device&& device, CUmemLocationType location, size_t size)
 {
-	return roundToGranularity(detail::as_ref(std::forward<Device>(device)),location,size);
+	return roundToGranularity(core::dereference(std::forward<Device>(device)),location,size);
 }
 
 template<typename Device>
-requires detail::object_like<Device, CCUDADevice>
+requires core::dereferenceable_to<Device, CCUDADevice>
 inline core::smart_refctd_ptr<CCUDAExportableMemory> createExportableMemory(Device&& device, SExportableMemoryCreationParams&& params)
 {
-	return createExportableMemory(detail::as_ref(std::forward<Device>(device)),std::move(params));
+	return createExportableMemory(core::dereference(std::forward<Device>(device)),std::move(params));
 }
 
 template<typename Memory>
-requires detail::const_object_like<Memory, CCUDAExportableMemory>
+requires core::const_dereferenceable_to<Memory, CCUDAExportableMemory>
 inline CUdeviceptr getDeviceptr(Memory&& memory)
 {
-	return getDeviceptr(detail::as_ref(std::forward<Memory>(memory)));
+	return getDeviceptr(core::dereference(std::forward<Memory>(memory)));
 }
 
 template<typename Memory>
-requires detail::const_object_like<Memory, CCUDAImportedMemory>
+requires core::const_dereferenceable_to<Memory, CCUDAImportedMemory>
 inline CUresult getMappedBuffer(Memory&& memory, CUdeviceptr* mappedBuffer)
 {
-	return getMappedBuffer(detail::as_ref(std::forward<Memory>(memory)),mappedBuffer);
+	return getMappedBuffer(core::dereference(std::forward<Memory>(memory)),mappedBuffer);
 }
 
 }

From 38705b93794e820417a2b3f223d258e07aeebb8f Mon Sep 17 00:00:00 2001
From: Arkadiusz Lachowicz <areklachowicz@gmail.com>
Date: Thu, 7 May 2026 16:06:26 +0200
Subject: [PATCH 22/27] Use CUDA interop accessors

---
 examples_tests                                |   2 +-
 include/nbl/core/decl/smart_refctd_ptr.h      |  39 ----
 .../nbl/ext/CUDAInterop/CUDAInteropNative.h   | 219 ++++--------------
 src/nbl/ext/CUDAInterop/README.md             |  22 +-
 .../ext/CUDAInterop/smoke/native_opt_in.cpp   |  18 +-
 src/nbl/video/CCUDADevice.cpp                 |  51 ++--
 src/nbl/video/CCUDAExportableMemory.cpp       |  10 +-
 src/nbl/video/CCUDAHandler.cpp                |  61 ++---
 src/nbl/video/CCUDAImportedMemory.cpp         |  11 +-
 src/nbl/video/CCUDAImportedSemaphore.cpp      |   7 +-
 src/nbl/video/CUDAInteropNativeState.hpp      |  17 +-
 11 files changed, 135 insertions(+), 322 deletions(-)

diff --git a/examples_tests b/examples_tests
index b2c639c8b7..1dc7f6a075 160000
--- a/examples_tests
+++ b/examples_tests
@@ -1 +1 @@
-Subproject commit b2c639c8b71c3b860418dc4b3e46ad147ba5f256
+Subproject commit 1dc7f6a075c8c457b80388e59ef3da846bad03e4
diff --git a/include/nbl/core/decl/smart_refctd_ptr.h b/include/nbl/core/decl/smart_refctd_ptr.h
index 78609fa34c..814c807a84 100644
--- a/include/nbl/core/decl/smart_refctd_ptr.h
+++ b/include/nbl/core/decl/smart_refctd_ptr.h
@@ -7,10 +7,6 @@
 
 #include "nbl/core/IReferenceCounted.h"
 
-#include <concepts>
-#include <type_traits>
-#include <utility>
-
 namespace nbl::core
 {
 
@@ -122,41 +118,6 @@ class smart_refctd_ptr
 };
 static_assert(sizeof(smart_refctd_ptr<IReferenceCounted>) == sizeof(IReferenceCounted*), "smart_refctd_ptr has a memory overhead!");
 
-template<typename>
-struct is_smart_refctd_ptr : std::false_type {};
-
-template<typename T>
-struct is_smart_refctd_ptr<smart_refctd_ptr<T>> : std::true_type {};
-
-template<typename T>
-inline constexpr bool is_smart_refctd_ptr_v = is_smart_refctd_ptr<std::remove_cvref_t<T>>::value;
-
-template<typename T>
-inline constexpr bool is_raw_pointer_or_smart_refctd_ptr_v = std::is_pointer_v<std::remove_cvref_t<T>> || is_smart_refctd_ptr_v<T>;
-
-template<typename Object>
-decltype(auto) dereference(Object&& object)
-{
-	using object_t = std::remove_cvref_t<Object>;
-	if constexpr (std::is_pointer_v<object_t>)
-		return *object;
-	else if constexpr (is_smart_refctd_ptr_v<Object>)
-		return *object;
-	else
-		return std::forward<Object>(object);
-}
-
-template<typename Object, typename Target>
-concept dereferenceable_to = is_raw_pointer_or_smart_refctd_ptr_v<Object> && requires(Object&& object) {
-	{ dereference(std::forward<Object>(object)) } -> std::convertible_to<Target&>;
-};
-
-template<typename Object, typename Target>
-concept const_dereferenceable_to = is_raw_pointer_or_smart_refctd_ptr_v<Object> && requires(Object&& object) {
-	{ dereference(std::forward<Object>(object)) } -> std::convertible_to<const Target&>;
-};
-
-
 template< class T, class... Args >
 smart_refctd_ptr<T> make_smart_refctd_ptr(Args&& ... args);
 
diff --git a/include/nbl/ext/CUDAInterop/CUDAInteropNative.h b/include/nbl/ext/CUDAInterop/CUDAInteropNative.h
index fe5fb5875e..57669f591a 100644
--- a/include/nbl/ext/CUDAInterop/CUDAInteropNative.h
+++ b/include/nbl/ext/CUDAInterop/CUDAInteropNative.h
@@ -9,10 +9,7 @@
 #include "nbl/asset/ICPUBuffer.h"
 #include "nbl/system/DynamicFunctionCaller.h"
 
-#include <concepts>
 #include <string>
-#include <type_traits>
-#include <utility>
 
 #include "cuda.h"
 #include "nvrtc.h"
@@ -158,196 +155,62 @@ struct SExportableMemoryCreationParams
 	CUmemLocationType location;
 };
 
-namespace detail
-{
-
-template<typename Source>
-concept program_text_source = std::same_as<std::remove_cvref_t<Source>, std::string> ||
-	std::convertible_to<Source, const char*>;
-
-}
-
-NBL_API2 const CUDA& getCUDAFunctionTable(const CCUDAHandler& handler);
-NBL_API2 const NVRTC& getNVRTCFunctionTable(const CCUDAHandler& handler);
-
-template<typename Handler>
-requires core::const_dereferenceable_to<Handler, CCUDAHandler>
-inline const CUDA& getCUDAFunctionTable(Handler&& handler)
-{
-	return getCUDAFunctionTable(core::dereference(std::forward<Handler>(handler)));
-}
-
-template<typename Handler>
-requires core::const_dereferenceable_to<Handler, CCUDAHandler>
-inline const NVRTC& getNVRTCFunctionTable(Handler&& handler)
-{
-	return getNVRTCFunctionTable(core::dereference(std::forward<Handler>(handler)));
-}
-
-NBL_API2 bool defaultHandleResult(CUresult result, const system::logger_opt_ptr& logger);
-NBL_API2 bool defaultHandleResult(const CCUDAHandler& handler, CUresult result);
-NBL_API2 bool defaultHandleResult(const CCUDAHandler& handler, nvrtcResult result);
-
-template<typename T>
-T* cast_CUDA_ptr(CUdeviceptr ptr) { return reinterpret_cast<T*>(ptr); }
-
-NBL_API2 const core::vector<SCUDADeviceInfo>& getAvailableDevices(const CCUDAHandler& handler);
-
-template<typename Handler>
-requires core::const_dereferenceable_to<Handler, CCUDAHandler>
-inline const core::vector<SCUDADeviceInfo>& getAvailableDevices(Handler&& handler)
-{
-	return getAvailableDevices(core::dereference(std::forward<Handler>(handler)));
-}
-
-NBL_API2 nvrtcResult createProgram(CCUDAHandler& handler, nvrtcProgram* prog, std::string&& source, const char* name, const int headerCount=0, const char* const* headerContents=nullptr, const char* const* includeNames=nullptr);
-inline nvrtcResult createProgram(CCUDAHandler& handler, nvrtcProgram* prog, const char* source, const char* name, const int headerCount=0, const char* const* headerContents=nullptr, const char* const* includeNames=nullptr)
-{
-	return createProgram(handler,prog,std::string(source),name,headerCount,headerContents,includeNames);
-}
-NBL_API2 nvrtcResult createProgram(CCUDAHandler& handler, nvrtcProgram* prog, system::IFile* file, const int headerCount=0, const char* const* headerContents=nullptr, const char* const* includeNames=nullptr);
-
-template<typename Handler, typename Source>
-requires core::dereferenceable_to<Handler, CCUDAHandler> && detail::program_text_source<Source>
-inline nvrtcResult createProgram(Handler&& handler, nvrtcProgram* prog, Source&& source, const char* name, const int headerCount=0, const char* const* headerContents=nullptr, const char* const* includeNames=nullptr)
-{
-	auto& handlerRef = core::dereference(std::forward<Handler>(handler));
-	if constexpr (std::same_as<std::remove_cvref_t<Source>, std::string>)
-		return createProgram(handlerRef,prog,std::string(std::forward<Source>(source)),name,headerCount,headerContents,includeNames);
-	else
-	{
-		const char* sourceText = source;
-		return createProgram(handlerRef,prog,sourceText,name,headerCount,headerContents,includeNames);
-	}
-}
-
-template<typename Handler, typename File>
-requires core::dereferenceable_to<Handler, CCUDAHandler> && std::convertible_to<File, system::IFile*>
-inline nvrtcResult createProgram(Handler&& handler, nvrtcProgram* prog, File file, const int headerCount=0, const char* const* headerContents=nullptr, const char* const* includeNames=nullptr)
-{
-	return createProgram(core::dereference(std::forward<Handler>(handler)),prog,static_cast<system::IFile*>(file),headerCount,headerContents,includeNames);
-}
-NBL_API2 nvrtcResult compileProgram(const CCUDAHandler& handler, nvrtcProgram prog, core::SRange<const char* const> options);
-NBL_API2 nvrtcResult getProgramLog(const CCUDAHandler& handler, nvrtcProgram prog, std::string& log);
-
 struct ptx_and_nvrtcResult_t
 {
 	core::smart_refctd_ptr<asset::ICPUBuffer> ptx;
 	nvrtcResult result;
 };
 
-NBL_API2 ptx_and_nvrtcResult_t getPTX(const CCUDAHandler& handler, nvrtcProgram prog);
-NBL_API2 ptx_and_nvrtcResult_t compileDirectlyToPTX(
-	CCUDAHandler& handler, std::string&& source, const char* filename, core::SRange<const char* const> nvrtcOptions,
-	const int headerCount=0, const char* const* headerContents=nullptr, const char* const* includeNames=nullptr,
-	std::string* log=nullptr
-);
-inline ptx_and_nvrtcResult_t compileDirectlyToPTX(
-	CCUDAHandler& handler, const char* source, const char* filename, core::SRange<const char* const> nvrtcOptions,
-	const int headerCount=0, const char* const* headerContents=nullptr, const char* const* includeNames=nullptr,
-	std::string* log=nullptr
-)
-{
-	return compileDirectlyToPTX(handler,std::string(source),filename,nvrtcOptions,headerCount,headerContents,includeNames,log);
-}
-NBL_API2 ptx_and_nvrtcResult_t compileDirectlyToPTX(
-	CCUDAHandler& handler, system::IFile* file, core::SRange<const char* const> nvrtcOptions,
-	const int headerCount=0, const char* const* headerContents=nullptr, const char* const* includeNames=nullptr,
-	std::string* log=nullptr
-);
-
-template<typename Handler, typename Source>
-requires core::dereferenceable_to<Handler, CCUDAHandler> && detail::program_text_source<Source>
-inline ptx_and_nvrtcResult_t compileDirectlyToPTX(
-	Handler&& handler, Source&& source, const char* filename, core::SRange<const char* const> nvrtcOptions,
-	const int headerCount=0, const char* const* headerContents=nullptr, const char* const* includeNames=nullptr,
-	std::string* log=nullptr
-)
-{
-	auto& handlerRef = core::dereference(std::forward<Handler>(handler));
-	if constexpr (std::same_as<std::remove_cvref_t<Source>, std::string>)
-		return compileDirectlyToPTX(handlerRef,std::string(std::forward<Source>(source)),filename,nvrtcOptions,headerCount,headerContents,includeNames,log);
-	else
-	{
-		const char* sourceText = source;
-		return compileDirectlyToPTX(handlerRef,sourceText,filename,nvrtcOptions,headerCount,headerContents,includeNames,log);
-	}
-}
-
-template<typename Handler, typename File>
-requires core::dereferenceable_to<Handler, CCUDAHandler> && std::convertible_to<File, system::IFile*>
-inline ptx_and_nvrtcResult_t compileDirectlyToPTX(
-	Handler&& handler, File file, core::SRange<const char* const> nvrtcOptions,
-	const int headerCount=0, const char* const* headerContents=nullptr, const char* const* includeNames=nullptr,
-	std::string* log=nullptr
-)
-{
-	return compileDirectlyToPTX(core::dereference(std::forward<Handler>(handler)),static_cast<system::IFile*>(file),nvrtcOptions,headerCount,headerContents,includeNames,log);
-}
-
-NBL_API2 CUdevice getInternalObject(const CCUDADevice& device);
-NBL_API2 CUcontext getContext(const CCUDADevice& device);
-NBL_API2 size_t roundToGranularity(const CCUDADevice& device, CUmemLocationType location, size_t size);
-NBL_API2 core::smart_refctd_ptr<CCUDAExportableMemory> createExportableMemory(CCUDADevice& device, SExportableMemoryCreationParams&& params);
-NBL_API2 CUdeviceptr getDeviceptr(const CCUDAExportableMemory& memory);
-NBL_API2 CUexternalMemory getInternalObject(const CCUDAImportedMemory& memory);
-NBL_API2 CUresult getMappedBuffer(const CCUDAImportedMemory& memory, CUdeviceptr* mappedBuffer);
-NBL_API2 CUexternalSemaphore getInternalObject(const CCUDAImportedSemaphore& semaphore);
-
-template<typename Object>
-requires (
-	core::const_dereferenceable_to<Object, CCUDADevice> ||
-	core::const_dereferenceable_to<Object, CCUDAImportedMemory> ||
-	core::const_dereferenceable_to<Object, CCUDAImportedSemaphore>
-)
-inline auto getInternalObject(Object&& object)
-{
-	return getInternalObject(core::dereference(std::forward<Object>(object)));
-}
-
-template<typename Device>
-requires core::const_dereferenceable_to<Device, CCUDADevice>
-inline CUcontext getContext(Device&& device)
-{
-	return getContext(core::dereference(std::forward<Device>(device)));
-}
+// These are opt-in CUDA-native declarations for symbols implemented and exported by Nabla.
+// Only consumers that include this header and link Nabla::ext::CUDAInterop see CUDA SDK types.
+class NBL_API2 CCUDAHandlerAccessor
+{
+	public:
+		static const CUDA& getCUDAFunctionTable(const CCUDAHandler& handler);
+		static const NVRTC& getNVRTCFunctionTable(const CCUDAHandler& handler);
+		static bool defaultHandleResult(CUresult result, const system::logger_opt_ptr& logger);
+		static bool defaultHandleResult(const CCUDAHandler& handler, CUresult result);
+		static bool defaultHandleResult(const CCUDAHandler& handler, nvrtcResult result);
+		static const core::vector<SCUDADeviceInfo>& getAvailableDevices(const CCUDAHandler& handler);
+		static nvrtcResult createProgram(CCUDAHandler& handler, nvrtcProgram* prog, std::string&& source, const char* name, const int headerCount=0, const char* const* headerContents=nullptr, const char* const* includeNames=nullptr);
+		static nvrtcResult compileProgram(const CCUDAHandler& handler, nvrtcProgram prog, core::SRange<const char* const> options);
+		static nvrtcResult getProgramLog(const CCUDAHandler& handler, nvrtcProgram prog, std::string& log);
+		static ptx_and_nvrtcResult_t getPTX(const CCUDAHandler& handler, nvrtcProgram prog);
+		static ptx_and_nvrtcResult_t compileDirectlyToPTX(
+			CCUDAHandler& handler, std::string&& source, const char* filename, core::SRange<const char* const> nvrtcOptions,
+			const int headerCount=0, const char* const* headerContents=nullptr, const char* const* includeNames=nullptr,
+			std::string* log=nullptr
+		);
+};
 
-template<typename Device>
-requires core::const_dereferenceable_to<Device, CCUDADevice>
-inline size_t roundToGranularity(Device&& device, CUmemLocationType location, size_t size)
+class NBL_API2 CCUDADeviceAccessor
 {
-	return roundToGranularity(core::dereference(std::forward<Device>(device)),location,size);
-}
+	public:
+		static CUdevice getInternalObject(const CCUDADevice& device);
+		static CUcontext getContext(const CCUDADevice& device);
+		static size_t roundToGranularity(const CCUDADevice& device, CUmemLocationType location, size_t size);
+		static core::smart_refctd_ptr<CCUDAExportableMemory> createExportableMemory(CCUDADevice& device, SExportableMemoryCreationParams&& params);
+};
 
-template<typename Device>
-requires core::dereferenceable_to<Device, CCUDADevice>
-inline core::smart_refctd_ptr<CCUDAExportableMemory> createExportableMemory(Device&& device, SExportableMemoryCreationParams&& params)
+class NBL_API2 CCUDAExportableMemoryAccessor
 {
-	return createExportableMemory(core::dereference(std::forward<Device>(device)),std::move(params));
-}
+	public:
+		static CUdeviceptr getDeviceptr(const CCUDAExportableMemory& memory);
+};
 
-template<typename Memory>
-requires core::const_dereferenceable_to<Memory, CCUDAExportableMemory>
-inline CUdeviceptr getDeviceptr(Memory&& memory)
+class NBL_API2 CCUDAImportedMemoryAccessor
 {
-	return getDeviceptr(core::dereference(std::forward<Memory>(memory)));
-}
+	public:
+		static CUexternalMemory getInternalObject(const CCUDAImportedMemory& memory);
+		static CUresult getMappedBuffer(const CCUDAImportedMemory& memory, CUdeviceptr* mappedBuffer);
+};
 
-template<typename Memory>
-requires core::const_dereferenceable_to<Memory, CCUDAImportedMemory>
-inline CUresult getMappedBuffer(Memory&& memory, CUdeviceptr* mappedBuffer)
+class NBL_API2 CCUDAImportedSemaphoreAccessor
 {
-	return getMappedBuffer(core::dereference(std::forward<Memory>(memory)),mappedBuffer);
-}
+	public:
+		static CUexternalSemaphore getInternalObject(const CCUDAImportedSemaphore& semaphore);
+};
 
 }
 
-#define ASSERT_CUDA_SUCCESS(expr, handler) \
-	do { \
-		const auto cudaResult = (expr); \
-		if (!nbl::video::cuda_native::defaultHandleResult(*(handler), cudaResult)) { \
-			assert(false); \
-		} \
-	} while(0)
-
 #endif
diff --git a/src/nbl/ext/CUDAInterop/README.md b/src/nbl/ext/CUDAInterop/README.md
index e99edd82c0..ea92dcec7d 100644
--- a/src/nbl/ext/CUDAInterop/README.md
+++ b/src/nbl/ext/CUDAInterop/README.md
@@ -5,7 +5,7 @@
 - `Nabla::Nabla` owns the SDK-free CUDA interop API in `nbl/video/CCUDA*.h` and its implementation in `src/nbl/video/CCUDA*.cpp`.
 - Those headers do not include CUDA SDK headers. Consumers that only link `Nabla::Nabla` do not need `cuda.h`, `nvrtc.h`, or a CUDA SDK install just to parse Nabla headers.
 - `Nabla::ext::CUDAInterop` is an `INTERFACE` target for native CUDA opt-in. It builds no library. It only adds `CUDAInteropNative.h`, `CUDA::toolkit`, and runtime-header discovery setup to targets that ask for raw CUDA interop.
-- `CUDAInteropNative.h` is the only public opt-in header that includes CUDA SDK headers and exposes `cuda_native::*` accessors for CUDA Driver API and NVRTC types.
+- `CUDAInteropNative.h` is the only public opt-in header that includes CUDA SDK headers and exposes `cuda_native::*Accessor` classes for CUDA Driver API and NVRTC types.
 
 ## CMake Usage
 
@@ -52,16 +52,17 @@ cmake -S . -B build -DNabla_CUDA_TOOLKIT_ROOT=<cuda-root>
 auto handler = nbl::video::CCUDAHandler::create(system, std::move(logger));
 auto cudaDevice = handler->createDevice(std::move(vulkanConnection), physicalDevice);
 
-auto memory = nbl::video::cuda_native::createExportableMemory(cudaDevice, {
+auto memory = nbl::video::cuda_native::CCUDADeviceAccessor::createExportableMemory(*cudaDevice, {
     .size = size,
     .alignment = alignment,
     .location = CU_MEM_LOCATION_TYPE_DEVICE,
 });
 
 std::string log;
-auto [ptx, result] = nbl::video::cuda_native::compileDirectlyToPTX(
-    handler,
-    cudaSource,
+std::string cudaSource = loadKernelText();
+auto [ptx, result] = nbl::video::cuda_native::CCUDAHandlerAccessor::compileDirectlyToPTX(
+    *handler,
+    std::move(cudaSource),
     "kernel.cu",
     cudaDevice->geDefaultCompileOptions(),
     0,
@@ -71,7 +72,12 @@ auto [ptx, result] = nbl::video::cuda_native::compileDirectlyToPTX(
 );
 ```
 
-Native access is not wrapped away. Opt-in code uses CUDA Driver API and NVRTC types directly.
+Native access is not wrapped away. Opt-in code uses CUDA Driver API and NVRTC types directly through accessor classes:
+
+- `CCUDAHandlerAccessor` exposes CUDA/NVRTC function tables, NVRTC program helpers, PTX compilation, native device enumeration, and default error handling.
+- `CCUDADeviceAccessor` exposes `CUdevice`, `CUcontext`, memory granularity, and CUDA allocation creation.
+- `CCUDAExportableMemoryAccessor`, `CCUDAImportedMemoryAccessor`, and `CCUDAImportedSemaphoreAccessor` expose the raw CUDA handles needed for interop.
+- Accessor methods take explicit Nabla references. Callers dereference `smart_refctd_ptr` at the call site instead of going through pointer/smart-pointer convenience overloads.
 
 Smoke examples:
 
@@ -84,7 +90,7 @@ Smoke examples:
 - `CCUDAHandler`, `CCUDADevice`, `CCUDAExportableMemory`, `CCUDAImportedMemory`, and `CCUDAImportedSemaphore` are exported from `Nabla.dll` through the normal Nabla ABI.
 - Their public declarations do not expose CUDA SDK structs, CUDA SDK layouts, or `cuda.h` / `nvrtc.h` includes.
 - CUDA implementation state is owned by Nabla through private `SNativeState` members. Consumers cannot construct CUDA wrapper objects with arbitrary internal CUDA state.
-- `CUDAInteropNative.h` declares exported accessor functions whose definitions still live in `Nabla.dll`.
+- `CUDAInteropNative.h` declares exported accessor classes whose definitions still live in `Nabla.dll`. The opt-in header owns only the CUDA SDK surface. Nabla owns the implementation and ABI.
 - Native opt-in ABI uses CUDA Driver API handles/enums such as `CUdevice`, `CUcontext`, `CUdeviceptr`, `CUexternalMemory`, and `CUexternalSemaphore`, plus small Nabla-owned parameter structs.
 - A package built with one compatible CUDA SDK can be consumed by native interop code built with another compatible SDK without rebuilding Nabla. The loaded driver and NVRTC runtime are still validated at runtime.
 
@@ -98,7 +104,7 @@ NVRTC may need CUDA runtime headers when user kernels include files such as `cud
 - `NBL_CUDA_INTEROP_RUNTIME_JSON` can point runtime discovery at custom JSON files without rebuilding the application.
 - Runtime lookup checks explicit JSON paths first, then executable-local JSON, app-local header bundles, explicit include-dir environment variables, `CUDA_PATH` style toolkit roots, Python/conda package layouts, and common system install roots.
 - The probe looks for directories that contain CUDA runtime headers. It does not hardcode a CUDA major version in app-local paths.
-- `cuda_native::compileDirectlyToPTX` appends discovered include directories to NVRTC options and caches the default discovery result after first use.
+- `cuda_native::CCUDAHandlerAccessor::compileDirectlyToPTX` appends discovered include directories to NVRTC options and caches the default discovery result after first use.
 
 Production machines do not need the full CUDA SDK just because Nabla was built with CUDA. Applications that use NVRTC with CUDA runtime headers can provide those headers through generated JSON, a custom JSON path, an app-local bundle, an official runtime/header package, or an installed toolkit.
 
diff --git a/src/nbl/ext/CUDAInterop/smoke/native_opt_in.cpp b/src/nbl/ext/CUDAInterop/smoke/native_opt_in.cpp
index 3b799a56cf..0b07bfa137 100644
--- a/src/nbl/ext/CUDAInterop/smoke/native_opt_in.cpp
+++ b/src/nbl/ext/CUDAInterop/smoke/native_opt_in.cpp
@@ -23,7 +23,7 @@ using namespace nbl::video;
 	core::smart_refctd_ptr<IDeviceMemoryAllocation> vulkanMemory,
 	core::smart_refctd_ptr<ISemaphore> vulkanSemaphore)
 {
-	auto cudaMemory = cuda_native::createExportableMemory(cudaDevice, {
+	auto cudaMemory = cuda_native::CCUDADeviceAccessor::createExportableMemory(cudaDevice, {
 		.size = 4096,
 		.alignment = 4096,
 		.location = CU_MEM_LOCATION_TYPE_DEVICE,
@@ -37,16 +37,16 @@ using namespace nbl::video;
 
 	CUdeviceptr mappedVulkanMemory = 0;
 	if (importedFromVulkan)
-		cuda_native::getMappedBuffer(importedFromVulkan,&mappedVulkanMemory);
+		cuda_native::CCUDAImportedMemoryAccessor::getMappedBuffer(*importedFromVulkan,&mappedVulkanMemory);
 
-	const CUdeviceptr cudaDevicePtr = cuda_native::getDeviceptr(cudaMemory);
-	const CUexternalSemaphore cudaSemaphore = importedSemaphore ? cuda_native::getInternalObject(importedSemaphore):nullptr;
+	const CUdeviceptr cudaDevicePtr = cuda_native::CCUDAExportableMemoryAccessor::getDeviceptr(*cudaMemory);
+	const CUexternalSemaphore cudaSemaphore = importedSemaphore ? cuda_native::CCUDAImportedSemaphoreAccessor::getInternalObject(*importedSemaphore):nullptr;
 	return exportedToVulkan.get() && mappedVulkanMemory && cudaDevicePtr && cudaSemaphore;
 }
 
 bool cudaDriverRoundtrip(CCUDAHandler& handler, CUdevice device)
 {
-	auto& cuda = cuda_native::getCUDAFunctionTable(handler);
+	auto& cuda = cuda_native::CCUDAHandlerAccessor::getCUDAFunctionTable(handler);
 
 	CUcontext context = nullptr;
 	if (cuda.pcuDevicePrimaryCtxRetain(&context, device)!=CUDA_SUCCESS)
@@ -95,9 +95,9 @@ bool cudaFp16HeaderCompileProbe(CCUDAHandler& handler)
 	)cuda";
 
 	std::string log;
-	auto [ptx, result] = cuda_native::compileDirectlyToPTX(
+	auto [ptx, result] = cuda_native::CCUDAHandlerAccessor::compileDirectlyToPTX(
 		handler,
-		Source,
+		std::string(Source),
 		"cuda_fp16_discovery_probe.cu",
 		{nullptr,nullptr},
 		0,
@@ -121,7 +121,7 @@ class CUDAInteropNativeOptInSmoke final : public nbl::system::IApplicationFramew
 		if (!isAPILoaded())
 			return false;
 
-		static_assert(std::is_same_v<decltype(nbl::video::cuda_native::getInternalObject(std::declval<const nbl::video::CCUDADevice&>())), CUdevice>);
+		static_assert(std::is_same_v<decltype(nbl::video::cuda_native::CCUDADeviceAccessor::getInternalObject(std::declval<const nbl::video::CCUDADevice&>())), CUdevice>);
 
 		#ifdef NBL_CUDA_INTEROP_SMOKE_RUNTIME_JSON
 		const auto runtimeEnvironment = nbl::video::cuda_interop::findRuntimeCompileEnvironment({}, {NBL_CUDA_INTEROP_SMOKE_RUNTIME_JSON});
@@ -144,7 +144,7 @@ class CUDAInteropNativeOptInSmoke final : public nbl::system::IApplicationFramew
 		if (!cudaFp16HeaderCompileProbe(*handler))
 			return false;
 
-		const auto& devices = nbl::video::cuda_native::getAvailableDevices(handler);
+		const auto& devices = nbl::video::cuda_native::CCUDAHandlerAccessor::getAvailableDevices(*handler);
 		if (devices.empty())
 			return true;
 
diff --git a/src/nbl/video/CCUDADevice.cpp b/src/nbl/video/CCUDADevice.cpp
index fcafc8bc48..359cd093a1 100644
--- a/src/nbl/video/CCUDADevice.cpp
+++ b/src/nbl/video/CCUDADevice.cpp
@@ -34,10 +34,12 @@ CCUDADevice::CCUDADevice(
 	m_defaultCompileOptions.push_back("-dc");
 	m_defaultCompileOptions.push_back("-use_fast_math");
 
-  const auto& cu = cuda_native::getCUDAFunctionTable(*m_handler);
+  const auto& cu = cuda_native::CCUDAHandlerAccessor::getCUDAFunctionTable(*m_handler);
 	
-	ASSERT_CUDA_SUCCESS(cu.pcuCtxCreate_v4(&m_native->context, nullptr, 0, m_native->handle), m_handler);
-	ASSERT_CUDA_SUCCESS(cu.pcuCtxSetCurrent(m_native->context), m_handler);
+	if (!cuda_native::CCUDAHandlerAccessor::defaultHandleResult(*m_handler, cu.pcuCtxCreate_v4(&m_native->context, nullptr, 0, m_native->handle)))
+		assert(false);
+	if (!cuda_native::CCUDAHandlerAccessor::defaultHandleResult(*m_handler, cu.pcuCtxSetCurrent(m_native->context)))
+		assert(false);
 
 	for (uint32_t locationType = 0; locationType < m_native->allocationGranularity.size(); ++locationType)
 	{
@@ -50,30 +52,31 @@ CCUDADevice::CCUDADevice(
 
 	  const auto prop = CUmemAllocationProp{
       .type = CU_MEM_ALLOCATION_TYPE_PINNED,
-      .requestedHandleTypes = cuda_native::getAllocationHandleType(),
+      .requestedHandleTypes = cuda_native::SAccess::allocationHandleType(),
       .location = { .type = static_cast<CUmemLocationType>(locationType), .id = m_native->handle },
   #ifdef _WIN32
       .win32HandleMetaData = &metadata,
   #endif
     };
-		ASSERT_CUDA_SUCCESS(cu.pcuMemGetAllocationGranularity(&m_native->allocationGranularity[locationType], &prop, CU_MEM_ALLOC_GRANULARITY_MINIMUM), m_handler);
+		if (!cuda_native::CCUDAHandlerAccessor::defaultHandleResult(*m_handler, cu.pcuMemGetAllocationGranularity(&m_native->allocationGranularity[locationType], &prop, CU_MEM_ALLOC_GRANULARITY_MINIMUM)))
+			assert(false);
 	}
 }
 
 namespace cuda_native
 {
 
-CUdevice getInternalObject(const CCUDADevice& device)
+CUdevice CCUDADeviceAccessor::getInternalObject(const CCUDADevice& device)
 {
 	return SAccess::native(device).handle;
 }
 
-CUcontext getContext(const CCUDADevice& device)
+CUcontext CCUDADeviceAccessor::getContext(const CCUDADevice& device)
 {
 	return SAccess::native(device).context;
 }
 
-size_t roundToGranularity(const CCUDADevice& device, CUmemLocationType location, size_t size)
+size_t CCUDADeviceAccessor::roundToGranularity(const CCUDADevice& device, CUmemLocationType location, size_t size)
 {
 	const auto& granularity = SAccess::native(device).allocationGranularity[location];
 	return ((size - 1) / granularity + 1) * granularity;
@@ -90,7 +93,7 @@ static CUresult reserveAddressAndMapMemory(const CCUDADevice& device, CUdevicept
 {
 	const auto handler = device.getHandler();
 	const auto& native = cuda_native::SAccess::native(device);
-	const auto& cu = cuda_native::getCUDAFunctionTable(*handler);
+	const auto& cu = cuda_native::CCUDAHandlerAccessor::getCUDAFunctionTable(*handler);
 	
 	CUdeviceptr ptr = 0;
 	if (const auto err = cu.pcuMemAddressReserve(&ptr, size, alignment, 0, 0); CUDA_SUCCESS != err)
@@ -98,7 +101,8 @@ static CUresult reserveAddressAndMapMemory(const CCUDADevice& device, CUdevicept
 
 	if (const auto err = cu.pcuMemMap(ptr, size, 0, memory, 0); CUDA_SUCCESS != err)
 	{
-		ASSERT_CUDA_SUCCESS(cu.pcuMemAddressFree(ptr, size), handler);
+		if (!cuda_native::CCUDAHandlerAccessor::defaultHandleResult(*handler, cu.pcuMemAddressFree(ptr, size)))
+			assert(false);
 		return err;
 	}
 	
@@ -109,8 +113,10 @@ static CUresult reserveAddressAndMapMemory(const CCUDADevice& device, CUdevicept
 
 	if (auto err = cu.pcuMemSetAccess(ptr, size, &accessDesc, 1); CUDA_SUCCESS != err)
 	{
-		ASSERT_CUDA_SUCCESS(cu.pcuMemUnmap(ptr, size), handler);
-		ASSERT_CUDA_SUCCESS(cu.pcuMemAddressFree(ptr, size), handler);
+		if (!cuda_native::CCUDAHandlerAccessor::defaultHandleResult(*handler, cu.pcuMemUnmap(ptr, size)))
+			assert(false);
+		if (!cuda_native::CCUDAHandlerAccessor::defaultHandleResult(*handler, cu.pcuMemAddressFree(ptr, size)))
+			assert(false);
 		return err;
 	}
 
@@ -122,7 +128,7 @@ static CUresult reserveAddressAndMapMemory(const CCUDADevice& device, CUdevicept
 namespace cuda_native
 {
 
-core::smart_refctd_ptr<CCUDAExportableMemory> createExportableMemory(CCUDADevice& device, SExportableMemoryCreationParams&& inParams)
+core::smart_refctd_ptr<CCUDAExportableMemory> CCUDADeviceAccessor::createExportableMemory(CCUDADevice& device, SExportableMemoryCreationParams&& inParams)
 {
 	const auto handler = device.getHandler();
 	auto& native = SAccess::native(device);
@@ -131,11 +137,11 @@ core::smart_refctd_ptr<CCUDAExportableMemory> createExportableMemory(CCUDADevice
 	CCUDAExportableMemory::SCachedCreationParams params = {
 		.size = inParams.size,
 		.alignment = inParams.alignment,
-		.granularSize = roundToGranularity(device, inParams.location, inParams.size),
+		.granularSize = CCUDADeviceAccessor::roundToGranularity(device, inParams.location, inParams.size),
 		.deviceLocal = isDeviceLocal(inParams.location)
 	};
 
-	auto& cu = getCUDAFunctionTable(*handler);
+	auto& cu = CCUDAHandlerAccessor::getCUDAFunctionTable(*handler);
 	
 #ifdef _WIN32
 	OBJECT_ATTRIBUTES metadata = {
@@ -145,7 +151,7 @@ core::smart_refctd_ptr<CCUDAExportableMemory> createExportableMemory(CCUDADevice
 
 	 const auto prop = CUmemAllocationProp{
 		.type = CU_MEM_ALLOCATION_TYPE_PINNED,
-		.requestedHandleTypes = getAllocationHandleType(),
+		.requestedHandleTypes = SAccess::allocationHandleType(),
 		.location = { .type = inParams.location, .id = native.handle },
 #ifdef _WIN32
 		.win32HandleMetaData = &metadata,
@@ -164,7 +170,8 @@ core::smart_refctd_ptr<CCUDAExportableMemory> createExportableMemory(CCUDADevice
 	if (auto err = cu.pcuMemExportToShareableHandle(&params.externalHandle, mem, prop.requestedHandleTypes, 0); CUDA_SUCCESS != err)
 	{
 		logger.log("Fail to create externalHandle!", system::ILogger::ELL_ERROR);
-		ASSERT_CUDA_SUCCESS(cu.pcuMemRelease(mem), handler);
+		if (!CCUDAHandlerAccessor::defaultHandleResult(*handler, cu.pcuMemRelease(mem)))
+			assert(false);
 		return nullptr;
 	}
 
@@ -172,7 +179,8 @@ core::smart_refctd_ptr<CCUDAExportableMemory> createExportableMemory(CCUDADevice
 	{
 		logger.log("Fail to reserve address and map memory!", system::ILogger::ELL_ERROR);
 
-		ASSERT_CUDA_SUCCESS(cu.pcuMemRelease(mem), handler);
+		if (!CCUDAHandlerAccessor::defaultHandleResult(*handler, cu.pcuMemRelease(mem)))
+			assert(false);
 
 		bool closeSucceed = CloseExternalHandle(params.externalHandle);
 		assert(closeSucceed);
@@ -194,7 +202,7 @@ core::smart_refctd_ptr<CCUDAExportableMemory> createExportableMemory(CCUDADevice
 
 core::smart_refctd_ptr<CCUDAImportedMemory> CCUDADevice::importExternalMemory(core::smart_refctd_ptr<IDeviceMemoryAllocation>&& mem)
 {
-	const auto& cu = cuda_native::getCUDAFunctionTable(*m_handler);
+	const auto& cu = cuda_native::CCUDAHandlerAccessor::getCUDAFunctionTable(*m_handler);
 	const auto handleType = mem->getCreationParams().externalHandleType;
 
 	if (!handleType) return nullptr;
@@ -225,7 +233,7 @@ core::smart_refctd_ptr<CCUDAImportedMemory> CCUDADevice::importExternalMemory(co
 
 core::smart_refctd_ptr<CCUDAImportedSemaphore> CCUDADevice::importExternalSemaphore(core::smart_refctd_ptr<ISemaphore>&& sema)
 {
-	auto& cu = cuda_native::getCUDAFunctionTable(*m_handler);
+	auto& cu = cuda_native::CCUDAHandlerAccessor::getCUDAFunctionTable(*m_handler);
 	auto handleType = sema->getCreationParams().externalHandleTypes.value;
 
 	if (!handleType)
@@ -258,7 +266,8 @@ core::smart_refctd_ptr<CCUDAImportedSemaphore> CCUDADevice::importExternalSemaph
 
 CCUDADevice::~CCUDADevice()
 {
-	ASSERT_CUDA_SUCCESS(cuda_native::getCUDAFunctionTable(*m_handler).pcuCtxDestroy_v2(m_native->context), m_handler);
+	if (!cuda_native::CCUDAHandlerAccessor::defaultHandleResult(*m_handler, cuda_native::CCUDAHandlerAccessor::getCUDAFunctionTable(*m_handler).pcuCtxDestroy_v2(m_native->context)))
+		assert(false);
 }
 
 }
diff --git a/src/nbl/video/CCUDAExportableMemory.cpp b/src/nbl/video/CCUDAExportableMemory.cpp
index 4eb37b720a..f84169e38f 100644
--- a/src/nbl/video/CCUDAExportableMemory.cpp
+++ b/src/nbl/video/CCUDAExportableMemory.cpp
@@ -52,11 +52,13 @@ core::smart_refctd_ptr<IDeviceMemoryAllocation> CCUDAExportableMemory::exportAsM
 
 CCUDAExportableMemory::~CCUDAExportableMemory()
 {
-	const auto& cu = cuda_native::getCUDAFunctionTable(*m_device->getHandler());
+	const auto& cu = cuda_native::CCUDAHandlerAccessor::getCUDAFunctionTable(*m_device->getHandler());
 
-  ASSERT_CUDA_SUCCESS(cu.pcuMemUnmap(m_native->ptr, m_params.granularSize), m_device->getHandler());
+	if (!cuda_native::CCUDAHandlerAccessor::defaultHandleResult(*m_device->getHandler(), cu.pcuMemUnmap(m_native->ptr, m_params.granularSize)))
+		assert(false);
 
-	ASSERT_CUDA_SUCCESS(cu.pcuMemAddressFree(m_native->ptr, m_params.granularSize), m_device->getHandler());
+	if (!cuda_native::CCUDAHandlerAccessor::defaultHandleResult(*m_device->getHandler(), cu.pcuMemAddressFree(m_native->ptr, m_params.granularSize)))
+		assert(false);
 
   bool closeSucceed = CloseExternalHandle(m_params.externalHandle);
 	assert(closeSucceed);
@@ -66,7 +68,7 @@ CCUDAExportableMemory::~CCUDAExportableMemory()
 namespace cuda_native
 {
 
-CUdeviceptr getDeviceptr(const CCUDAExportableMemory& memory)
+CUdeviceptr CCUDAExportableMemoryAccessor::getDeviceptr(const CCUDAExportableMemory& memory)
 {
 	return SAccess::native(memory).ptr;
 }
diff --git a/src/nbl/video/CCUDAHandler.cpp b/src/nbl/video/CCUDAHandler.cpp
index ced76b9713..0064a191a6 100644
--- a/src/nbl/video/CCUDAHandler.cpp
+++ b/src/nbl/video/CCUDAHandler.cpp
@@ -355,7 +355,7 @@ CCUDAHandler::~CCUDAHandler() = default;
 namespace cuda_native
 {
 
-bool defaultHandleResult(CUresult result, const system::logger_opt_ptr& logger)
+bool CCUDAHandlerAccessor::defaultHandleResult(CUresult result, const system::logger_opt_ptr& logger)
 {
 	switch (result)
 	{
@@ -721,12 +721,12 @@ bool defaultHandleResult(CUresult result, const system::logger_opt_ptr& logger)
 	return false;
 }
 
-bool defaultHandleResult(const CCUDAHandler& handler, CUresult result)
+bool CCUDAHandlerAccessor::defaultHandleResult(const CCUDAHandler& handler, CUresult result)
 {
-	return defaultHandleResult(result,SAccess::logger(handler));
+	return CCUDAHandlerAccessor::defaultHandleResult(result,SAccess::logger(handler));
 }
 
-bool defaultHandleResult(const CCUDAHandler& handler, nvrtcResult result)
+bool CCUDAHandlerAccessor::defaultHandleResult(const CCUDAHandler& handler, nvrtcResult result)
 {
 	switch (result)
 	{
@@ -874,22 +874,22 @@ core::smart_refctd_ptr<CCUDAHandler> CCUDAHandler::create(system::ISystem* syste
 namespace cuda_native
 {
 
-const CUDA& getCUDAFunctionTable(const CCUDAHandler& handler)
+const CUDA& CCUDAHandlerAccessor::getCUDAFunctionTable(const CCUDAHandler& handler)
 {
 	return SAccess::native(handler).cuda;
 }
 
-const NVRTC& getNVRTCFunctionTable(const CCUDAHandler& handler)
+const NVRTC& CCUDAHandlerAccessor::getNVRTCFunctionTable(const CCUDAHandler& handler)
 {
 	return SAccess::native(handler).nvrtc;
 }
 
-const core::vector<SCUDADeviceInfo>& getAvailableDevices(const CCUDAHandler& handler)
+const core::vector<SCUDADeviceInfo>& CCUDAHandlerAccessor::getAvailableDevices(const CCUDAHandler& handler)
 {
 	return SAccess::native(handler).availableDevices;
 }
 
-nvrtcResult createProgram(CCUDAHandler& handler, nvrtcProgram* prog, std::string&& source, const char* name, const int headerCount, const char* const* headerContents, const char* const* includeNames)
+nvrtcResult CCUDAHandlerAccessor::createProgram(CCUDAHandler& handler, nvrtcProgram* prog, std::string&& source, const char* name, const int headerCount, const char* const* headerContents, const char* const* includeNames)
 {
 #if defined(_NBL_WINDOWS_API_)
 	source.insert(0ull,"#ifndef _WIN64\n#define _WIN64\n#endif\n");
@@ -901,24 +901,12 @@ nvrtcResult createProgram(CCUDAHandler& handler, nvrtcProgram* prog, std::string
 	return SAccess::native(handler).nvrtc.pnvrtcCreateProgram(prog,source.c_str(),name,headerCount,headerContents,includeNames);
 }
 
-nvrtcResult createProgram(CCUDAHandler& handler, nvrtcProgram* prog, system::IFile* file, const int headerCount, const char* const* headerContents, const char* const* includeNames)
-{
-	const auto filesize = file->getSize();
-	std::string source(filesize+1u,'0');
-
-	system::IFile::success_t bytesRead;
-	file->read(bytesRead,source.data(),0u,file->getSize());
-	source.resize(bytesRead.getBytesProcessed());
-
-	return createProgram(handler,prog,std::move(source),file->getFileName().string().c_str(),headerCount,headerContents,includeNames);
-}
-
-nvrtcResult compileProgram(const CCUDAHandler& handler, nvrtcProgram prog, core::SRange<const char* const> options)
+nvrtcResult CCUDAHandlerAccessor::compileProgram(const CCUDAHandler& handler, nvrtcProgram prog, core::SRange<const char* const> options)
 {
 	return SAccess::native(handler).nvrtc.pnvrtcCompileProgram(prog,options.size(),options.begin());
 }
 
-nvrtcResult getProgramLog(const CCUDAHandler& handler, nvrtcProgram prog, std::string& log)
+nvrtcResult CCUDAHandlerAccessor::getProgramLog(const CCUDAHandler& handler, nvrtcProgram prog, std::string& log)
 {
 	size_t _size = 0ull;
 	nvrtcResult sizeRes = SAccess::native(handler).nvrtc.pnvrtcGetProgramLogSize(prog, &_size);
@@ -931,7 +919,7 @@ nvrtcResult getProgramLog(const CCUDAHandler& handler, nvrtcProgram prog, std::s
 	return SAccess::native(handler).nvrtc.pnvrtcGetProgramLog(prog,log.data());
 }
 
-ptx_and_nvrtcResult_t getPTX(const CCUDAHandler& handler, nvrtcProgram prog)
+ptx_and_nvrtcResult_t CCUDAHandlerAccessor::getPTX(const CCUDAHandler& handler, nvrtcProgram prog)
 {
 	size_t _size = 0ull;
 	nvrtcResult sizeRes = SAccess::native(handler).nvrtc.pnvrtcGetPTXSize(prog,&_size);
@@ -968,16 +956,16 @@ static ptx_and_nvrtcResult_t compileDirectlyToPTX_impl(CCUDAHandler& handler, nv
 
 	const auto* optionsBegin = options.empty() ? nullptr:options.data();
 	const auto* optionsEnd = options.empty() ? nullptr:optionsBegin+options.size();
-	result = compileProgram(handler,program,{optionsBegin,optionsEnd});
+	result = CCUDAHandlerAccessor::compileProgram(handler,program,{optionsBegin,optionsEnd});
 	if (log)
-		getProgramLog(handler,program,*log);
+		CCUDAHandlerAccessor::getProgramLog(handler,program,*log);
 	if (result!=NVRTC_SUCCESS)
 		return {nullptr,result};
 
-	return getPTX(handler,program);
+	return CCUDAHandlerAccessor::getPTX(handler,program);
 }
 
-ptx_and_nvrtcResult_t compileDirectlyToPTX(
+ptx_and_nvrtcResult_t CCUDAHandlerAccessor::compileDirectlyToPTX(
 	CCUDAHandler& handler, std::string&& source, const char* filename, core::SRange<const char* const> nvrtcOptions,
 	const int headerCount, const char* const* headerContents, const char* const* includeNames,
 	std::string* log)
@@ -990,24 +978,7 @@ ptx_and_nvrtcResult_t compileDirectlyToPTX(
 			SAccess::native(handler).nvrtc.pnvrtcDestroyProgram(&program);
 	});
 
-	result = createProgram(handler,&program,std::move(source),filename,headerCount,headerContents,includeNames);
-	return compileDirectlyToPTX_impl(handler,result,program,nvrtcOptions,log);
-}
-
-ptx_and_nvrtcResult_t compileDirectlyToPTX(
-	CCUDAHandler& handler, system::IFile* file, core::SRange<const char* const> nvrtcOptions,
-	const int headerCount, const char* const* headerContents, const char* const* includeNames,
-	std::string* log)
-{
-	nvrtcProgram program = nullptr;
-	nvrtcResult result = NVRTC_ERROR_PROGRAM_CREATION_FAILURE;
-	auto cleanup = core::makeRAIIExiter([&]() -> void
-	{
-		if (result!=NVRTC_SUCCESS && program)
-			SAccess::native(handler).nvrtc.pnvrtcDestroyProgram(&program);
-	});
-
-	result = createProgram(handler,&program,file,headerCount,headerContents,includeNames);
+	result = CCUDAHandlerAccessor::createProgram(handler,&program,std::move(source),filename,headerCount,headerContents,includeNames);
 	return compileDirectlyToPTX_impl(handler,result,program,nvrtcOptions,log);
 }
 
diff --git a/src/nbl/video/CCUDAImportedMemory.cpp b/src/nbl/video/CCUDAImportedMemory.cpp
index 9e58fbac10..9145fe18ac 100644
--- a/src/nbl/video/CCUDAImportedMemory.cpp
+++ b/src/nbl/video/CCUDAImportedMemory.cpp
@@ -21,18 +21,18 @@ CCUDAImportedMemory::CCUDAImportedMemory(core::smart_refctd_ptr<CCUDADevice> dev
 namespace cuda_native
 {
 
-CUexternalMemory getInternalObject(const CCUDAImportedMemory& memory)
+CUexternalMemory CCUDAImportedMemoryAccessor::getInternalObject(const CCUDAImportedMemory& memory)
 {
   return SAccess::native(memory).handle;
 }
 
-CUresult getMappedBuffer(const CCUDAImportedMemory& memory, CUdeviceptr* mappedBuffer)
+CUresult CCUDAImportedMemoryAccessor::getMappedBuffer(const CCUDAImportedMemory& memory, CUdeviceptr* mappedBuffer)
 {
   CUDA_EXTERNAL_MEMORY_BUFFER_DESC bufferDesc = {};
   bufferDesc.offset = 0;
   bufferDesc.size = SAccess::source(memory)->getAllocationSize();
 
-  const auto& cu = getCUDAFunctionTable(*SAccess::device(memory)->getHandler());
+  const auto& cu = CCUDAHandlerAccessor::getCUDAFunctionTable(*SAccess::device(memory)->getHandler());
   return cu.pcuExternalMemoryGetMappedBuffer(mappedBuffer, SAccess::native(memory).handle, &bufferDesc);
   
 }
@@ -41,8 +41,9 @@ CUresult getMappedBuffer(const CCUDAImportedMemory& memory, CUdeviceptr* mappedB
 
 CCUDAImportedMemory::~CCUDAImportedMemory()
 {
-  auto& cu = cuda_native::getCUDAFunctionTable(*m_device->getHandler());
-  ASSERT_CUDA_SUCCESS(cu.pcuDestroyExternalMemory(m_native->handle), m_device->getHandler());
+  auto& cu = cuda_native::CCUDAHandlerAccessor::getCUDAFunctionTable(*m_device->getHandler());
+	if (!cuda_native::CCUDAHandlerAccessor::defaultHandleResult(*m_device->getHandler(), cu.pcuDestroyExternalMemory(m_native->handle)))
+		assert(false);
 }
 
 }
diff --git a/src/nbl/video/CCUDAImportedSemaphore.cpp b/src/nbl/video/CCUDAImportedSemaphore.cpp
index bc1db625d1..5d7d3e07ae 100644
--- a/src/nbl/video/CCUDAImportedSemaphore.cpp
+++ b/src/nbl/video/CCUDAImportedSemaphore.cpp
@@ -20,7 +20,7 @@ CCUDAImportedSemaphore::CCUDAImportedSemaphore(core::smart_refctd_ptr<CCUDADevic
 namespace cuda_native
 {
 
-CUexternalSemaphore getInternalObject(const CCUDAImportedSemaphore& semaphore)
+CUexternalSemaphore CCUDAImportedSemaphoreAccessor::getInternalObject(const CCUDAImportedSemaphore& semaphore)
 {
 	return SAccess::native(semaphore).handle;
 }
@@ -29,8 +29,9 @@ CUexternalSemaphore getInternalObject(const CCUDAImportedSemaphore& semaphore)
 
 CCUDAImportedSemaphore::~CCUDAImportedSemaphore()
 {
-	auto& cu = cuda_native::getCUDAFunctionTable(*m_device->getHandler());
-	ASSERT_CUDA_SUCCESS(cu.pcuDestroyExternalSemaphore(m_native->handle), m_device->getHandler());
+	auto& cu = cuda_native::CCUDAHandlerAccessor::getCUDAFunctionTable(*m_device->getHandler());
+	if (!cuda_native::CCUDAHandlerAccessor::defaultHandleResult(*m_device->getHandler(), cu.pcuDestroyExternalSemaphore(m_native->handle)))
+		assert(false);
 }
 }
 
diff --git a/src/nbl/video/CUDAInteropNativeState.hpp b/src/nbl/video/CUDAInteropNativeState.hpp
index 79139d015d..7e602bb0f3 100644
--- a/src/nbl/video/CUDAInteropNativeState.hpp
+++ b/src/nbl/video/CUDAInteropNativeState.hpp
@@ -57,15 +57,6 @@ struct CCUDAImportedSemaphore::SNativeState
 namespace cuda_native
 {
 
-inline CUmemAllocationHandleType getAllocationHandleType()
-{
-#ifdef _WIN32
-	return CU_MEM_HANDLE_TYPE_WIN32;
-#else
-	return CU_MEM_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR;
-#endif
-}
-
 struct SAccess
 {
 	static CCUDAHandler::SNativeState& native(CCUDAHandler& handler) { return *handler.m_native; }
@@ -96,6 +87,14 @@ struct SAccess
 	static system::logger_opt_ptr logger(const CCUDADevice& device) { return device.m_logger; }
 	static const CCUDADevice* device(const CCUDAImportedMemory& memory) { return memory.m_device.get(); }
 	static IDeviceMemoryAllocation* source(const CCUDAImportedMemory& memory) { return memory.m_src.get(); }
+	static CUmemAllocationHandleType allocationHandleType()
+	{
+	#ifdef _WIN32
+		return CU_MEM_HANDLE_TYPE_WIN32;
+	#else
+		return CU_MEM_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR;
+	#endif
+	}
 };
 
 }

From 23e6ef5235ebf2b6f86694652f437b37b0479c53 Mon Sep 17 00:00:00 2001
From: Arkadiusz Lachowicz <areklachowicz@gmail.com>
Date: Thu, 7 May 2026 16:36:31 +0200
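Subject: [PATCH 23/27] Use explicit CUDA compile log

CCUDAHandlerAccessor::compileDirectlyToPTX now takes the NVRTC log as a
required `std::string& log` placed directly after `nvrtcOptions`, in
place of the trailing optional `std::string* log = nullptr`. Callers
that passed the header arguments positionally must move `log` ahead of
them; the README snippet and the native opt-in smoke test are updated
accordingly.

Other changes in this commit:

* `ptx_and_nvrtcResult_t` is renamed to `SPTXResult`; the members stay
  `ptx` and `result`.
* compileDirectlyToPTX_impl clears `log` on entry and always fetches the
  program log after compilation, since the output is no longer optional.
* The cleanup in compileDirectlyToPTX destroys the NVRTC program whenever
  one was created, not only when program creation failed.
* The examples_tests submodule pointer is bumped.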

---
 examples_tests                                  |  2 +-
 include/nbl/ext/CUDAInterop/CUDAInteropNative.h |  9 ++++-----
 src/nbl/ext/CUDAInterop/README.md               |  7 ++++---
 src/nbl/ext/CUDAInterop/smoke/native_opt_in.cpp |  8 ++++----
 src/nbl/video/CCUDAHandler.cpp                  | 15 +++++++--------
 5 files changed, 20 insertions(+), 21 deletions(-)

diff --git a/examples_tests b/examples_tests
index 1dc7f6a075..3c57a88af9 160000
--- a/examples_tests
+++ b/examples_tests
@@ -1 +1 @@
-Subproject commit 1dc7f6a075c8c457b80388e59ef3da846bad03e4
+Subproject commit 3c57a88af9eba722fcc6b5b5ba3d136ab3e166ca
diff --git a/include/nbl/ext/CUDAInterop/CUDAInteropNative.h b/include/nbl/ext/CUDAInterop/CUDAInteropNative.h
index 57669f591a..d409c774e1 100644
--- a/include/nbl/ext/CUDAInterop/CUDAInteropNative.h
+++ b/include/nbl/ext/CUDAInterop/CUDAInteropNative.h
@@ -155,7 +155,7 @@ struct SExportableMemoryCreationParams
 	CUmemLocationType location;
 };
 
-struct ptx_and_nvrtcResult_t
+struct SPTXResult
 {
 	core::smart_refctd_ptr<asset::ICPUBuffer> ptx;
 	nvrtcResult result;
@@ -175,11 +175,10 @@ class NBL_API2 CCUDAHandlerAccessor
 		static nvrtcResult createProgram(CCUDAHandler& handler, nvrtcProgram* prog, std::string&& source, const char* name, const int headerCount=0, const char* const* headerContents=nullptr, const char* const* includeNames=nullptr);
 		static nvrtcResult compileProgram(const CCUDAHandler& handler, nvrtcProgram prog, core::SRange<const char* const> options);
 		static nvrtcResult getProgramLog(const CCUDAHandler& handler, nvrtcProgram prog, std::string& log);
-		static ptx_and_nvrtcResult_t getPTX(const CCUDAHandler& handler, nvrtcProgram prog);
-		static ptx_and_nvrtcResult_t compileDirectlyToPTX(
+		static SPTXResult getPTX(const CCUDAHandler& handler, nvrtcProgram prog);
+		static SPTXResult compileDirectlyToPTX(
 			CCUDAHandler& handler, std::string&& source, const char* filename, core::SRange<const char* const> nvrtcOptions,
-			const int headerCount=0, const char* const* headerContents=nullptr, const char* const* includeNames=nullptr,
-			std::string* log=nullptr
+			std::string& log, const int headerCount=0, const char* const* headerContents=nullptr, const char* const* includeNames=nullptr
 		);
 };
 
diff --git a/src/nbl/ext/CUDAInterop/README.md b/src/nbl/ext/CUDAInterop/README.md
index ea92dcec7d..7d350da379 100644
--- a/src/nbl/ext/CUDAInterop/README.md
+++ b/src/nbl/ext/CUDAInterop/README.md
@@ -60,15 +60,15 @@ auto memory = nbl::video::cuda_native::CCUDADeviceAccessor::createExportableMemo
 
 std::string log;
 std::string cudaSource = loadKernelText();
-auto [ptx, result] = nbl::video::cuda_native::CCUDAHandlerAccessor::compileDirectlyToPTX(
+auto compile = nbl::video::cuda_native::CCUDAHandlerAccessor::compileDirectlyToPTX(
     *handler,
     std::move(cudaSource),
     "kernel.cu",
     cudaDevice->geDefaultCompileOptions(),
+    log,
     0,
     nullptr,
-    nullptr,
-    &log
+    nullptr
 );
 ```
 
@@ -78,6 +78,7 @@ Native access is not wrapped away. Opt-in code uses CUDA Driver API and NVRTC ty
 - `CCUDADeviceAccessor` exposes `CUdevice`, `CUcontext`, memory granularity, and CUDA allocation creation.
 - `CCUDAExportableMemoryAccessor`, `CCUDAImportedMemoryAccessor`, and `CCUDAImportedSemaphoreAccessor` expose the raw CUDA handles needed for interop.
 - Accessor methods take explicit Nabla references. Callers dereference `smart_refctd_ptr` at the call site instead of going through pointer/smart-pointer convenience overloads.
+- `compileDirectlyToPTX` returns PTX/result and writes the NVRTC log to a required `std::string&`. There is no optional output pointer in the public API.
 
 Smoke examples:
 
diff --git a/src/nbl/ext/CUDAInterop/smoke/native_opt_in.cpp b/src/nbl/ext/CUDAInterop/smoke/native_opt_in.cpp
index 0b07bfa137..ace1059215 100644
--- a/src/nbl/ext/CUDAInterop/smoke/native_opt_in.cpp
+++ b/src/nbl/ext/CUDAInterop/smoke/native_opt_in.cpp
@@ -95,17 +95,17 @@ bool cudaFp16HeaderCompileProbe(CCUDAHandler& handler)
 	)cuda";
 
 	std::string log;
-	auto [ptx, result] = cuda_native::CCUDAHandlerAccessor::compileDirectlyToPTX(
+	auto compile = cuda_native::CCUDAHandlerAccessor::compileDirectlyToPTX(
 		handler,
 		std::string(Source),
 		"cuda_fp16_discovery_probe.cu",
 		{nullptr,nullptr},
+		log,
 		0,
 		nullptr,
-		nullptr,
-		&log
+		nullptr
 	);
-	return result==NVRTC_SUCCESS && ptx && ptx->getSize()>0u;
+	return compile.result==NVRTC_SUCCESS && compile.ptx && compile.ptx->getSize()>0u;
 }
 }
 
diff --git a/src/nbl/video/CCUDAHandler.cpp b/src/nbl/video/CCUDAHandler.cpp
index 0064a191a6..9db99e7642 100644
--- a/src/nbl/video/CCUDAHandler.cpp
+++ b/src/nbl/video/CCUDAHandler.cpp
@@ -919,7 +919,7 @@ nvrtcResult CCUDAHandlerAccessor::getProgramLog(const CCUDAHandler& handler, nvr
 	return SAccess::native(handler).nvrtc.pnvrtcGetProgramLog(prog,log.data());
 }
 
-ptx_and_nvrtcResult_t CCUDAHandlerAccessor::getPTX(const CCUDAHandler& handler, nvrtcProgram prog)
+SPTXResult CCUDAHandlerAccessor::getPTX(const CCUDAHandler& handler, nvrtcProgram prog)
 {
 	size_t _size = 0ull;
 	nvrtcResult sizeRes = SAccess::native(handler).nvrtc.pnvrtcGetPTXSize(prog,&_size);
@@ -941,8 +941,9 @@ static const core::vector<std::string>& getDefaultRuntimeIncludeOptions()
 	return RuntimeIncludeOptions;
 }
 
-static ptx_and_nvrtcResult_t compileDirectlyToPTX_impl(CCUDAHandler& handler, nvrtcResult result, nvrtcProgram program, core::SRange<const char* const> nvrtcOptions, std::string* log)
+static SPTXResult compileDirectlyToPTX_impl(CCUDAHandler& handler, nvrtcResult result, nvrtcProgram program, core::SRange<const char* const> nvrtcOptions, std::string& log)
 {
+	log.clear();
 	if (result!=NVRTC_SUCCESS)
 		return {nullptr,result};
 
@@ -957,24 +958,22 @@ static ptx_and_nvrtcResult_t compileDirectlyToPTX_impl(CCUDAHandler& handler, nv
 	const auto* optionsBegin = options.empty() ? nullptr:options.data();
 	const auto* optionsEnd = options.empty() ? nullptr:optionsBegin+options.size();
 	result = CCUDAHandlerAccessor::compileProgram(handler,program,{optionsBegin,optionsEnd});
-	if (log)
-		CCUDAHandlerAccessor::getProgramLog(handler,program,*log);
+	CCUDAHandlerAccessor::getProgramLog(handler,program,log);
 	if (result!=NVRTC_SUCCESS)
 		return {nullptr,result};
 
 	return CCUDAHandlerAccessor::getPTX(handler,program);
 }
 
-ptx_and_nvrtcResult_t CCUDAHandlerAccessor::compileDirectlyToPTX(
+SPTXResult CCUDAHandlerAccessor::compileDirectlyToPTX(
 	CCUDAHandler& handler, std::string&& source, const char* filename, core::SRange<const char* const> nvrtcOptions,
-	const int headerCount, const char* const* headerContents, const char* const* includeNames,
-	std::string* log)
+	std::string& log, const int headerCount, const char* const* headerContents, const char* const* includeNames)
 {
 	nvrtcProgram program = nullptr;
 	nvrtcResult result = NVRTC_ERROR_PROGRAM_CREATION_FAILURE;
 	auto cleanup = core::makeRAIIExiter([&]() -> void
 	{
-		if (result!=NVRTC_SUCCESS && program)
+		if (program)
 			SAccess::native(handler).nvrtc.pnvrtcDestroyProgram(&program);
 	});
 

From a640183dbc6229f3b9b60c1d22bb1c50c7b8e5fe Mon Sep 17 00:00:00 2001
From: Arkadiusz Lachowicz <areklachowicz@gmail.com>
Date: Thu, 7 May 2026 17:05:26 +0200
Subject: [PATCH 24/27] Trim CUDA interop API surface

---
 cmake/common.cmake                            | 19 +++-------------
 include/nbl/core/decl/smart_refctd_ptr.h      |  1 +
 .../nbl/ext/CUDAInterop/CUDAInteropNative.h   |  4 ++--
 include/nbl/video/CCUDADevice.h               |  2 --
 include/nbl/video/CCUDAHandler.h              | 12 ++++++----
 src/nbl/ext/CUDAInterop/README.md             | 11 +++++++---
 .../ext/CUDAInterop/smoke/native_opt_in.cpp   |  5 ++++-
 src/nbl/video/CCUDAHandler.cpp                | 22 ++++---------------
 8 files changed, 30 insertions(+), 46 deletions(-)

diff --git a/cmake/common.cmake b/cmake/common.cmake
index ae2264fda4..c50e1f6fb2 100755
--- a/cmake/common.cmake
+++ b/cmake/common.cmake
@@ -284,22 +284,9 @@ function(nbl_install_dir _DIR)
 endfunction()
 
 function(nbl_install_lib_spec _TARGETS _RELATIVE_DESTINATION)
-	cmake_parse_arguments(_NBL_INSTALL_LIB "" "EXPORT" "" ${ARGN})
-	if(_NBL_INSTALL_LIB_UNPARSED_ARGUMENTS)
-		message(FATAL_ERROR "Unexpected arguments for nbl_install_lib_spec: ${_NBL_INSTALL_LIB_UNPARSED_ARGUMENTS}")
-	endif()
-
-	if(_NBL_INSTALL_LIB_EXPORT)
-		install(TARGETS ${_TARGETS}
-			EXPORT ${_NBL_INSTALL_LIB_EXPORT}
-			ARCHIVE DESTINATION ${_NBL_CPACK_PACKAGE_RELATIVE_ENTRY_}/lib/${_RELATIVE_DESTINATION}
-			COMPONENT Libraries
-		)
-	else()
-		install(TARGETS ${_TARGETS} ARCHIVE DESTINATION lib/${_RELATIVE_DESTINATION} CONFIGURATIONS Release COMPONENT Libraries)
-		install(TARGETS ${_TARGETS} ARCHIVE DESTINATION debug/lib/${_RELATIVE_DESTINATION} CONFIGURATIONS Debug COMPONENT Libraries)
-		install(TARGETS ${_TARGETS} ARCHIVE DESTINATION relwithdebinfo/lib/${_RELATIVE_DESTINATION} CONFIGURATIONS RelWithDebInfo COMPONENT Libraries)
-	endif()
+	install(TARGETS ${_TARGETS} ARCHIVE DESTINATION lib/${_RELATIVE_DESTINATION} CONFIGURATIONS Release COMPONENT Libraries)
+	install(TARGETS ${_TARGETS} ARCHIVE DESTINATION debug/lib/${_RELATIVE_DESTINATION} CONFIGURATIONS Debug COMPONENT Libraries)
+	install(TARGETS ${_TARGETS} ARCHIVE DESTINATION relwithdebinfo/lib/${_RELATIVE_DESTINATION} CONFIGURATIONS RelWithDebInfo COMPONENT Libraries)
 endfunction()
 
 function(nbl_install_lib _TARGETS)
diff --git a/include/nbl/core/decl/smart_refctd_ptr.h b/include/nbl/core/decl/smart_refctd_ptr.h
index 814c807a84..7c231fea4b 100644
--- a/include/nbl/core/decl/smart_refctd_ptr.h
+++ b/include/nbl/core/decl/smart_refctd_ptr.h
@@ -118,6 +118,7 @@ class smart_refctd_ptr
 };
 static_assert(sizeof(smart_refctd_ptr<IReferenceCounted>) == sizeof(IReferenceCounted*), "smart_refctd_ptr has a memory overhead!");
 
+
 template< class T, class... Args >
 smart_refctd_ptr<T> make_smart_refctd_ptr(Args&& ... args);
 
diff --git a/include/nbl/ext/CUDAInterop/CUDAInteropNative.h b/include/nbl/ext/CUDAInterop/CUDAInteropNative.h
index d409c774e1..daf3dcb4d1 100644
--- a/include/nbl/ext/CUDAInterop/CUDAInteropNative.h
+++ b/include/nbl/ext/CUDAInterop/CUDAInteropNative.h
@@ -161,8 +161,8 @@ struct SPTXResult
 	nvrtcResult result;
 };
 
-// These are opt-in CUDA-native declarations for symbols implemented and exported by Nabla.
-// Only consumers that include this header and link Nabla::ext::CUDAInterop see CUDA SDK types.
+// Opt-in native CUDA API. The declarations below are implemented by the Nabla library.
+// This header is intentionally the only public path that includes CUDA SDK types.
 class NBL_API2 CCUDAHandlerAccessor
 {
 	public:
diff --git a/include/nbl/video/CCUDADevice.h b/include/nbl/video/CCUDADevice.h
index bc1931e363..7c1d1f272b 100644
--- a/include/nbl/video/CCUDADevice.h
+++ b/include/nbl/video/CCUDADevice.h
@@ -89,8 +89,6 @@ class NBL_API2 CCUDADevice : public core::IReferenceCounted
 		struct SNativeState;
 		CCUDADevice(core::smart_refctd_ptr<CVulkanConnection>&& vulkanConnection, IPhysicalDevice* const vulkanDevice, const E_VIRTUAL_ARCHITECTURE virtualArchitecture, std::unique_ptr<SNativeState>&& nativeState, core::smart_refctd_ptr<CCUDAHandler>&& handler);
 
-		static constexpr auto CudaMemoryLocationCount = 5;
-
 		const system::logger_opt_ptr m_logger;
 		std::vector<const char*> m_defaultCompileOptions;
 		core::smart_refctd_ptr<CVulkanConnection> m_vulkanConnection;
diff --git a/include/nbl/video/CCUDAHandler.h b/include/nbl/video/CCUDAHandler.h
index f6b5d578a8..bb2d12c637 100644
--- a/include/nbl/video/CCUDAHandler.h
+++ b/include/nbl/video/CCUDAHandler.h
@@ -33,12 +33,17 @@ inline constexpr const char* RuntimePathsFileName = "nbl_cuda_interop_runtime.js
 struct SRuntimeCompileEnvironment
 {
 	core::vector<system::path> includeDirs;
-	core::vector<system::path> runtimePathFiles;
 };
 
 NBL_API2 SRuntimeCompileEnvironment findRuntimeCompileEnvironment(core::vector<system::path> explicitIncludeDirs = {});
 NBL_API2 SRuntimeCompileEnvironment findRuntimeCompileEnvironment(core::vector<system::path> explicitIncludeDirs, core::vector<system::path> runtimePathFiles);
-NBL_API2 core::vector<std::string> makeNVRTCIncludeOptions(const SRuntimeCompileEnvironment& environment);
+inline core::vector<std::string> makeNVRTCIncludeOptions(const SRuntimeCompileEnvironment& environment)
+{
+	core::vector<std::string> options;
+	for (const auto& includeDir : environment.includeDirs)
+		options.push_back("-I" + includeDir.generic_string());
+	return options;
+}
 }
 
 class NBL_API2 CCUDAHandler : public core::IReferenceCounted
@@ -73,7 +78,7 @@ class NBL_API2 CCUDAHandler : public core::IReferenceCounted
 		friend struct cuda_native::SAccess;
 
 		struct SNativeState;
-		CCUDAHandler(std::unique_ptr<SNativeState>&& nativeState, core::vector<core::smart_refctd_ptr<system::IFile>>&& _headers, core::smart_refctd_ptr<system::ILogger>&& _logger, int _version);
+		CCUDAHandler(std::unique_ptr<SNativeState>&& nativeState, core::vector<core::smart_refctd_ptr<system::IFile>>&& _headers, core::smart_refctd_ptr<system::ILogger>&& _logger);
 
 		std::unique_ptr<SNativeState> m_native;
 		core::vector<SCUDADeviceInfo> m_availableDevices;
@@ -82,7 +87,6 @@ class NBL_API2 CCUDAHandler : public core::IReferenceCounted
 		core::vector<std::string> m_headerNamesStorage;
 		core::vector<const char*> m_headerNames;
 		system::logger_opt_smart_ptr m_logger;
-		int m_version;
 };
 
 }
diff --git a/src/nbl/ext/CUDAInterop/README.md b/src/nbl/ext/CUDAInterop/README.md
index 7d350da379..fb9896e30e 100644
--- a/src/nbl/ext/CUDAInterop/README.md
+++ b/src/nbl/ext/CUDAInterop/README.md
@@ -93,11 +93,12 @@ Smoke examples:
 - CUDA implementation state is owned by Nabla through private `SNativeState` members. Consumers cannot construct CUDA wrapper objects with arbitrary internal CUDA state.
 - `CUDAInteropNative.h` declares exported accessor classes whose definitions still live in `Nabla.dll`. The opt-in header owns only the CUDA SDK surface. Nabla owns the implementation and ABI.
 - Native opt-in ABI uses CUDA Driver API handles/enums such as `CUdevice`, `CUcontext`, `CUdeviceptr`, `CUexternalMemory`, and `CUexternalSemaphore`, plus small Nabla-owned parameter structs.
+- Runtime include-option construction is header-only and is not part of the exported ABI.
 - A package built with one compatible CUDA SDK can be consumed by native interop code built with another compatible SDK without rebuilding Nabla. The loaded driver and NVRTC runtime are still validated at runtime.
 
 ## Runtime Header Discovery
 
-NVRTC may need CUDA runtime headers when user kernels include files such as `cuda_fp16.h`, `vector_types.h`, or `cuda_runtime_api.h`.
+NVRTC may need CUDA runtime headers when user kernels include files such as `cuda_fp16.h`, `vector_types.h`, or `cuda_runtime_api.h`. This is a runtime concern of applications that compile CUDA source with NVRTC, not a default `Nabla::Nabla` package requirement.
 
 - `nbl_target_link_cuda_interop` generates `nbl_cuda_interop_runtime.json` for the target that opted into native CUDA interop.
 - The JSON is a build artifact. Nabla packages do not install host-specific CUDA paths.
@@ -105,7 +106,7 @@ NVRTC may need CUDA runtime headers when user kernels include files such as `cud
 - `NBL_CUDA_INTEROP_RUNTIME_JSON` can point runtime discovery at custom JSON files without rebuilding the application.
 - Runtime lookup checks explicit JSON paths first, then executable-local JSON, app-local header bundles, explicit include-dir environment variables, `CUDA_PATH` style toolkit roots, Python/conda package layouts, and common system install roots.
 - The probe looks for directories that contain CUDA runtime headers. It does not hardcode a CUDA major version in app-local paths.
-- `cuda_native::CCUDAHandlerAccessor::compileDirectlyToPTX` appends discovered include directories to NVRTC options and caches the default discovery result after first use.
+- `cuda_native::CCUDAHandlerAccessor::compileDirectlyToPTX` appends discovered include directories to NVRTC options. Default discovery is cached after the first call.
 
 Production machines do not need the full CUDA SDK just because Nabla was built with CUDA. Applications that use NVRTC with CUDA runtime headers can provide those headers through generated JSON, a custom JSON path, an app-local bundle, an official runtime/header package, or an installed toolkit.
 
@@ -114,7 +115,11 @@ Nabla does not ship CUDA runtime headers by default. NVIDIA CUDA EULA allows red
 - https://docs.nvidia.com/cuda/eula/#distribution
 - https://docs.nvidia.com/cuda/eula/#attachment-a
 
-Attachment A includes header groups relevant to NVRTC runtime compilation, including `nvrtc.h`, `cuda_fp16.h`, `cuda_bf16.h`, `cuda_fp8.h`, `cuda_fp6.h`, `cuda_fp4.h`, `cuda_runtime_api.h`, `cuda.h`, `vector_functions.h`, and `vector_types.h`.
+Attachment A lists header groups relevant to NVRTC runtime compilation:
+
+- NVIDIA Runtime Compilation Library and Header: `nvrtc.h`
+- CUDA Floating Point Type Headers: `cuda_fp16.h`, `cuda_fp16.hpp`, `cuda_bf16.h`, `cuda_bf16.hpp`, `cuda_fp8.h`, `cuda_fp8.hpp`, `cuda_fp6.h`, `cuda_fp6.hpp`, `cuda_fp4.h`, `cuda_fp4.hpp`
+- CUDA Headers for Runtime Compilation: `crt/host_defines.h`, `cuComplex.h`, `cuda_awbarrier_helpers.h`, `cuda_awbarrier_primitives.h`, `cuda_awbarrier.h`, `cuda_pipeline_helpers.h`, `cuda_pipeline_primitives.h`, `cuda_pipeline.h`, `cuda_runtime_api.h`, `cuda.h`, `cuda/std/tuple`, `cuda/std/type_traits`, `cuda/std/utility`, `device_types.h`, `vector_functions.h`, and `vector_types.h`
 
 CuPy documents the same NVRTC issue for CUDA 12.2+. Their install docs say: "On CUDA 12.2 or later, CUDA Runtime header files are required to compile kernels in CuPy." They show the common `vector_types.h` failure and recommend CUDA runtime header packages for PyPI/system package installs:
 
diff --git a/src/nbl/ext/CUDAInterop/smoke/native_opt_in.cpp b/src/nbl/ext/CUDAInterop/smoke/native_opt_in.cpp
index ace1059215..5d35ec8bed 100644
--- a/src/nbl/ext/CUDAInterop/smoke/native_opt_in.cpp
+++ b/src/nbl/ext/CUDAInterop/smoke/native_opt_in.cpp
@@ -53,11 +53,13 @@ bool cudaDriverRoundtrip(CCUDAHandler& handler, CUdevice device)
 		return false;
 
 	CUcontext poppedContext = nullptr;
+	bool contextPushed = false;
 	auto releaseContext = [&]()
 	{
 		if (context)
 		{
-			cuda.pcuCtxPopCurrent_v2(&poppedContext);
+			if (contextPushed)
+				cuda.pcuCtxPopCurrent_v2(&poppedContext);
 			cuda.pcuDevicePrimaryCtxRelease_v2(device);
 		}
 	};
@@ -67,6 +69,7 @@ bool cudaDriverRoundtrip(CCUDAHandler& handler, CUdevice device)
 		releaseContext();
 		return false;
 	}
+	contextPushed = true;
 
 	constexpr std::array<uint32_t, 4> input = {0x12345678u, 0x90abcdefu, 0xfedcba09u, 0x87654321u};
 	std::array<uint32_t, input.size()> output = {};
diff --git a/src/nbl/video/CCUDAHandler.cpp b/src/nbl/video/CCUDAHandler.cpp
index 9db99e7642..22ed5d0eb3 100644
--- a/src/nbl/video/CCUDAHandler.cpp
+++ b/src/nbl/video/CCUDAHandler.cpp
@@ -3,7 +3,6 @@
 // For conditions of distribution and use, see copyright notice in nabla.h
 
 #include "nbl/video/CUDAInterop.h"
-#include "nbl/system/ModuleLookupUtils.h"
 
 #include "nlohmann/json.hpp"
 
@@ -253,11 +252,10 @@ void appendSystemIncludeDirs(core::vector<system::path>& includeDirs)
 SRuntimeCompileEnvironment findRuntimeCompileEnvironment(core::vector<system::path> explicitIncludeDirs, core::vector<system::path> runtimePathFiles)
 {
 	SRuntimeCompileEnvironment environment;
-	environment.runtimePathFiles = std::move(runtimePathFiles);
 	for (auto& includeDir : explicitIncludeDirs)
 		appendIncludeDir(environment.includeDirs,std::move(includeDir));
 
-	appendRuntimePathsConfigs(environment.includeDirs,environment.runtimePathFiles);
+	appendRuntimePathsConfigs(environment.includeDirs,runtimePathFiles);
 	appendAppLocalIncludeDirs(environment.includeDirs);
 	appendEnvironmentIncludeDirs(environment.includeDirs);
 	appendSystemIncludeDirs(environment.includeDirs);
@@ -270,14 +268,6 @@ SRuntimeCompileEnvironment findRuntimeCompileEnvironment(core::vector<system::pa
 	return findRuntimeCompileEnvironment(std::move(explicitIncludeDirs),{});
 }
 
-core::vector<std::string> makeNVRTCIncludeOptions(const SRuntimeCompileEnvironment& environment)
-{
-	core::vector<std::string> options;
-	for (const auto& includeDir : environment.includeDirs)
-		options.push_back("-I" + includeDir.generic_string());
-	return options;
-}
-
 }
 
 #ifdef _NBL_COMPILE_WITH_CUDA_
@@ -307,12 +297,10 @@ int cudaVersionMinor(int version)
 CCUDAHandler::CCUDAHandler(
 	std::unique_ptr<SNativeState>&& nativeState,
 	core::vector<core::smart_refctd_ptr<system::IFile>>&& _headers, 
-	core::smart_refctd_ptr<system::ILogger>&& _logger,
-	int _version)
+	core::smart_refctd_ptr<system::ILogger>&& _logger)
 	: m_native(std::move(nativeState))
 	, m_headers(std::move(_headers))
 	, m_logger(std::move(_logger))
-	, m_version(_version)
 {
 	assert(m_native);
 
@@ -866,7 +854,7 @@ core::smart_refctd_ptr<CCUDAHandler> CCUDAHandler::create(system::ISystem* syste
 	}
 
 	return core::smart_refctd_ptr<CCUDAHandler>(
-		new CCUDAHandler(std::make_unique<SNativeState>(std::move(cuda),std::move(nvrtc)),std::move(headers),std::move(_logger),cudaVersion),
+		new CCUDAHandler(std::make_unique<SNativeState>(std::move(cuda),std::move(nvrtc)),std::move(headers),std::move(_logger)),
 		core::dont_grab
 	);
 }
@@ -1097,12 +1085,10 @@ struct CCUDAHandler::SNativeState {};
 CCUDAHandler::CCUDAHandler(
 	std::unique_ptr<SNativeState>&& nativeState,
 	core::vector<core::smart_refctd_ptr<system::IFile>>&& _headers,
-	core::smart_refctd_ptr<system::ILogger>&& _logger,
-	int _version)
+	core::smart_refctd_ptr<system::ILogger>&& _logger)
 	: m_native(std::move(nativeState))
 	, m_headers(std::move(_headers))
 	, m_logger(std::move(_logger))
-	, m_version(_version)
 {
 	assert(m_native);
 }

From 5bf0e2d9c70280851f6779ce6a25b853f1730829 Mon Sep 17 00:00:00 2001
From: Arkadiusz Lachowicz <areklachowicz@gmail.com>
Date: Thu, 7 May 2026 17:24:10 +0200
Subject: [PATCH 25/27] Keep CUDA SDK layouts private

---
 .../nbl/ext/CUDAInterop/CUDAInteropNative.h    |  1 -
 src/nbl/ext/CUDAInterop/README.md              |  7 +++++--
 src/nbl/video/CCUDAHandler.cpp                 | 18 +++++++++---------
 src/nbl/video/CUDAInteropNativeState.hpp       |  7 +++++++
 4 files changed, 21 insertions(+), 12 deletions(-)

diff --git a/include/nbl/ext/CUDAInterop/CUDAInteropNative.h b/include/nbl/ext/CUDAInterop/CUDAInteropNative.h
index daf3dcb4d1..6d142c6b3f 100644
--- a/include/nbl/ext/CUDAInterop/CUDAInteropNative.h
+++ b/include/nbl/ext/CUDAInterop/CUDAInteropNative.h
@@ -145,7 +145,6 @@ struct SCUDADeviceInfo
 {
 	CUdevice handle = {};
 	CUuuid uuid = {};
-	int attributes[CU_DEVICE_ATTRIBUTE_MAX] = {};
 };
 
 struct SExportableMemoryCreationParams
diff --git a/src/nbl/ext/CUDAInterop/README.md b/src/nbl/ext/CUDAInterop/README.md
index fb9896e30e..d60b15639a 100644
--- a/src/nbl/ext/CUDAInterop/README.md
+++ b/src/nbl/ext/CUDAInterop/README.md
@@ -44,6 +44,8 @@ Consumers can also choose the SDK used for native compilation with:
 cmake -S . -B build -DNabla_CUDA_TOOLKIT_ROOT=<cuda-root>
 ```
 
+This affects native opt-in compilation and generated runtime header discovery only. It does not rebuild Nabla and does not change the `Nabla.dll` ABI.
+
 ## Native Usage
 
 ```cpp
@@ -92,9 +94,10 @@ Smoke examples:
 - Their public declarations do not expose CUDA SDK structs, CUDA SDK layouts, or `cuda.h` / `nvrtc.h` includes.
 - CUDA implementation state is owned by Nabla through private `SNativeState` members. Consumers cannot construct CUDA wrapper objects with arbitrary internal CUDA state.
 - `CUDAInteropNative.h` declares exported accessor classes whose definitions still live in `Nabla.dll`. The opt-in header owns only the CUDA SDK surface. Nabla owns the implementation and ABI.
-- Native opt-in ABI uses CUDA Driver API handles/enums such as `CUdevice`, `CUcontext`, `CUdeviceptr`, `CUexternalMemory`, and `CUexternalSemaphore`, plus small Nabla-owned parameter structs.
+- Native opt-in ABI uses CUDA Driver API handles/enums such as `CUdevice`, `CUcontext`, `CUdeviceptr`, `CUexternalMemory`, and `CUexternalSemaphore`, plus small fixed-layout parameter/result structs.
+- SDK-sized arrays and other layouts derived from CUDA SDK constants stay private to Nabla. A consumer can build native opt-in code with its own compatible SDK independently from the SDK used to build Nabla.
 - Runtime include-option construction is header-only and is not part of the exported ABI.
-- A package built with one compatible CUDA SDK can be consumed by native interop code built with another compatible SDK without rebuilding Nabla. The loaded driver and NVRTC runtime are still validated at runtime.
+- The loaded CUDA driver and NVRTC runtime are validated at runtime.
 
 ## Runtime Header Discovery
 
diff --git a/src/nbl/video/CCUDAHandler.cpp b/src/nbl/video/CCUDAHandler.cpp
index 22ed5d0eb3..78434d9bd5 100644
--- a/src/nbl/video/CCUDAHandler.cpp
+++ b/src/nbl/video/CCUDAHandler.cpp
@@ -325,15 +325,15 @@ CCUDAHandler::CCUDAHandler(
 		if (m_native->cuda.pcuDeviceGetUuid_v2(&uuid, handle) != CUDA_SUCCESS)
 			continue;
 
-		auto& nativeDevice = m_native->availableDevices.emplace_back();
-		nativeDevice.handle = handle;
-		nativeDevice.uuid = uuid;
+		auto& nativeDevice = m_native->deviceStates.emplace_back();
+		nativeDevice.info.handle = handle;
+		nativeDevice.info.uuid = uuid;
+		m_native->availableDevices.push_back(nativeDevice.info);
 		auto& cleanDevice = m_availableDevices.emplace_back();
 		memcpy(cleanDevice.uuid.data(),&uuid,cleanDevice.uuid.size());
 
-		int* attributes = nativeDevice.attributes;
-		for (int i = 0; i < CU_DEVICE_ATTRIBUTE_MAX; i++)
-			m_native->cuda.pcuDeviceGetAttribute(attributes + i, static_cast<CUdevice_attribute>(i), handle);
+		for (size_t i = 0; i < nativeDevice.attributes.size(); i++)
+			m_native->cuda.pcuDeviceGetAttribute(&nativeDevice.attributes[i], static_cast<CUdevice_attribute>(i), handle);
 
 	}
 }
@@ -979,9 +979,9 @@ core::smart_refctd_ptr<CCUDADevice> CCUDAHandler::createDevice(core::smart_refct
 	if (std::find(devices.begin(),devices.end(),physicalDevice)==devices.end())
 		return nullptr;
 
-	for (const auto& device : m_native->availableDevices)
+	for (const auto& device : m_native->deviceStates)
 	{
-		if (!memcmp(&device.uuid,&physicalDevice->getProperties().deviceUUID,VK_UUID_SIZE))
+		if (!memcmp(&device.info.uuid,&physicalDevice->getProperties().deviceUUID,VK_UUID_SIZE))
 		{
 			CCUDADevice::E_VIRTUAL_ARCHITECTURE arch = CCUDADevice::EVA_COUNT;
 			const int& archMajor = device.attributes[CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR];
@@ -1064,7 +1064,7 @@ core::smart_refctd_ptr<CCUDADevice> CCUDAHandler::createDevice(core::smart_refct
 				continue;
 
 			return core::smart_refctd_ptr<CCUDADevice>(
-				new CCUDADevice(std::move(vulkanConnection),physicalDevice,arch,std::make_unique<CCUDADevice::SNativeState>(device.handle),core::smart_refctd_ptr<CCUDAHandler>(this)),
+				new CCUDADevice(std::move(vulkanConnection),physicalDevice,arch,std::make_unique<CCUDADevice::SNativeState>(device.info.handle),core::smart_refctd_ptr<CCUDAHandler>(this)),
 				core::dont_grab
 			);
 		}
diff --git a/src/nbl/video/CUDAInteropNativeState.hpp b/src/nbl/video/CUDAInteropNativeState.hpp
index 7e602bb0f3..4be8178aa2 100644
--- a/src/nbl/video/CUDAInteropNativeState.hpp
+++ b/src/nbl/video/CUDAInteropNativeState.hpp
@@ -10,9 +10,16 @@ namespace nbl::video
 
 struct CCUDAHandler::SNativeState
 {
+	struct SDeviceState
+	{
+		cuda_native::SCUDADeviceInfo info = {};
+		std::array<int,CU_DEVICE_ATTRIBUTE_MAX> attributes = {};
+	};
+
 	cuda_native::CUDA cuda;
 	cuda_native::NVRTC nvrtc;
 	core::vector<cuda_native::SCUDADeviceInfo> availableDevices;
+	core::vector<SDeviceState> deviceStates;
 
 	SNativeState(cuda_native::CUDA&& _cuda, cuda_native::NVRTC&& _nvrtc)
 		: cuda(std::move(_cuda))

From d745421cc25114adf5664fb778e873db8e8f5c7a Mon Sep 17 00:00:00 2001
From: Arkadiusz Lachowicz <areklachowicz@gmail.com>
Date: Thu, 7 May 2026 17:51:35 +0200
Subject: [PATCH 26/27] Simplify CUDA interop helper

---
 cmake/NablaCUDAInteropHelpers.cmake | 190 +++-------------------------
 src/nbl/ext/CUDAInterop/README.md   |   9 +-
 2 files changed, 26 insertions(+), 173 deletions(-)

diff --git a/cmake/NablaCUDAInteropHelpers.cmake b/cmake/NablaCUDAInteropHelpers.cmake
index 9c1ac657d4..e84b2d1a8e 100644
--- a/cmake/NablaCUDAInteropHelpers.cmake
+++ b/cmake/NablaCUDAInteropHelpers.cmake
@@ -1,182 +1,38 @@
-function(_nbl_cuda_interop_collect_runtime_include_dirs _OUT_INCLUDE_DIRS)
-	set(_include_dirs ${ARGN})
-
-	if(DEFINED CUDAToolkit_INCLUDE_DIRS AND NOT "${CUDAToolkit_INCLUDE_DIRS}" STREQUAL "")
-		list(APPEND _include_dirs ${CUDAToolkit_INCLUDE_DIRS})
+# Links Nabla::ext::CUDAInterop into TARGET_NAME and generates the runtime include-dir JSON.
+#
+#   nbl_target_link_cuda_interop(<target> [PRIVATE|PUBLIC|INTERFACE]
+#                                [RUNTIME_JSON <path>] [INCLUDE_DIRS <dir>...])
+#
+# The link scope is optional and defaults to PRIVATE. It is taken from the argument list only
+# when the first extra argument is a scope keyword, so that RUNTIME_JSON / INCLUDE_DIRS passed
+# directly after the target still reach cmake_parse_arguments instead of being discarded.
+function(nbl_target_link_cuda_interop TARGET_NAME)
+	set(SCOPE PRIVATE)
+	if(ARGN MATCHES "^(PRIVATE|PUBLIC|INTERFACE)(;|$)")
+		list(GET ARGN 0 SCOPE)
+		list(REMOVE_AT ARGN 0)
 	endif()
-
-	if(TARGET CUDA::toolkit)
-		get_target_property(_cuda_toolkit_include_dirs CUDA::toolkit INTERFACE_INCLUDE_DIRECTORIES)
-		if(_cuda_toolkit_include_dirs AND NOT _cuda_toolkit_include_dirs STREQUAL "NOTFOUND")
-			list(APPEND _include_dirs ${_cuda_toolkit_include_dirs})
-		endif()
-	endif()
-
-	if(_include_dirs)
-		list(REMOVE_DUPLICATES _include_dirs)
-	endif()
-
-	set(${_OUT_INCLUDE_DIRS} ${_include_dirs} PARENT_SCOPE)
-endfunction()
-
-function(_nbl_cuda_interop_make_runtime_paths_json _OUT_CONTENT)
-	set(_include_dirs ${ARGN})
-	set(_cuda_runtime_include_dir_entries "")
-
-	foreach(_include_dir IN LISTS _include_dirs)
-		if("${_include_dir}" STREQUAL "")
-			continue()
+	cmake_parse_arguments(_NBL_CUDA_INTEROP "" "RUNTIME_JSON" "INCLUDE_DIRS" ${ARGN})
+	target_link_libraries("${TARGET_NAME}" ${SCOPE} Nabla::ext::CUDAInterop)
+	set(_include_dir_entries "")
+	foreach(_include_dir IN LISTS _NBL_CUDA_INTEROP_INCLUDE_DIRS CUDAToolkit_INCLUDE_DIRS)
+		if(_include_dir)
+			file(TO_CMAKE_PATH "${_include_dir}" _include_dir)
+			list(APPEND _include_dir_entries "    \"${_include_dir}\"")
 		endif()
-
-		file(TO_CMAKE_PATH "${_include_dir}" _include_dir_json)
-		string(REPLACE "\"" "\\\"" _include_dir_json "${_include_dir_json}")
-
-		list(APPEND _cuda_runtime_include_dir_entries "    \"${_include_dir_json}\"")
 	endforeach()
-
-	set(_json_entry_separator [=[
-,
-]=])
-	list(JOIN _cuda_runtime_include_dir_entries "${_json_entry_separator}" _cuda_runtime_include_dirs)
-
-	set(_json [=[
+	list(JOIN _include_dir_entries "," _include_dirs_json)
+	set(_runtime_json [=[
 {
   "cudaRuntimeIncludeDirs": [
-@_cuda_runtime_include_dirs@
+@_include_dirs_json@
   ]
 }
 ]=])
-	string(CONFIGURE "${_json}" _json @ONLY)
-	set(${_OUT_CONTENT} "${_json}" PARENT_SCOPE)
-endfunction()
-
-function(_nbl_cuda_interop_collect_configs _OUT_CONFIGS)
-	if(CMAKE_CONFIGURATION_TYPES)
-		set(_configs ${CMAKE_CONFIGURATION_TYPES})
-	elseif(CMAKE_BUILD_TYPE)
-		set(_configs "${CMAKE_BUILD_TYPE}")
-	else()
-		set(_configs Debug)
-	endif()
-
-	list(REMOVE_DUPLICATES _configs)
-	set(${_OUT_CONFIGS} ${_configs} PARENT_SCOPE)
-endfunction()
-
-function(_nbl_cuda_interop_collect_target_runtime_jsons TARGET_NAME _OUT_FILES _OVERRIDE_OUTPUT)
-	_nbl_cuda_interop_collect_configs(_configs)
-	set(_runtime_jsons "")
-
-	if(NOT "${_OVERRIDE_OUTPUT}" STREQUAL "")
-		foreach(_config IN LISTS _configs)
-			set(_runtime_paths_json "${_OVERRIDE_OUTPUT}")
-			string(REPLACE "$<CONFIG>" "${_config}" _runtime_paths_json "${_runtime_paths_json}")
-			if(_runtime_paths_json MATCHES "\\$<")
-				message(FATAL_ERROR "Nabla: CUDA interop runtime JSON path supports only plain paths or $<CONFIG>.")
-			endif()
-			cmake_path(IS_ABSOLUTE _runtime_paths_json _is_abs)
-			if(NOT _is_abs)
-				cmake_path(ABSOLUTE_PATH _runtime_paths_json BASE_DIRECTORY "${CMAKE_CURRENT_BINARY_DIR}" OUTPUT_VARIABLE _runtime_paths_json)
-			endif()
-			cmake_path(NORMAL_PATH _runtime_paths_json OUTPUT_VARIABLE _runtime_paths_json)
-			list(APPEND _runtime_jsons "${_runtime_paths_json}")
-		endforeach()
-		list(REMOVE_DUPLICATES _runtime_jsons)
-		set(${_OUT_FILES} ${_runtime_jsons} PARENT_SCOPE)
-		return()
-	endif()
-
-	foreach(_config IN LISTS _configs)
-		string(TOUPPER "${_config}" _config_upper)
-		get_target_property(_runtime_output_dir "${TARGET_NAME}" "RUNTIME_OUTPUT_DIRECTORY_${_config_upper}")
-
-		if(NOT _runtime_output_dir OR _runtime_output_dir STREQUAL "NOTFOUND")
-			get_target_property(_runtime_output_dir "${TARGET_NAME}" RUNTIME_OUTPUT_DIRECTORY)
-		endif()
-		if((NOT _runtime_output_dir OR _runtime_output_dir STREQUAL "NOTFOUND") AND DEFINED CMAKE_RUNTIME_OUTPUT_DIRECTORY_${_config_upper})
-			set(_runtime_output_dir "${CMAKE_RUNTIME_OUTPUT_DIRECTORY_${_config_upper}}")
-		endif()
-		if((NOT _runtime_output_dir OR _runtime_output_dir STREQUAL "NOTFOUND") AND DEFINED CMAKE_RUNTIME_OUTPUT_DIRECTORY)
-			set(_runtime_output_dir "${CMAKE_RUNTIME_OUTPUT_DIRECTORY}")
-		endif()
-		if(NOT _runtime_output_dir OR _runtime_output_dir STREQUAL "NOTFOUND")
-			if(CMAKE_CONFIGURATION_TYPES)
-				set(_runtime_output_dir "${CMAKE_CURRENT_BINARY_DIR}/${_config}")
-			else()
-				set(_runtime_output_dir "${CMAKE_CURRENT_BINARY_DIR}")
-			endif()
-		endif()
-
-		string(REPLACE "$<CONFIG>" "${_config}" _runtime_output_dir "${_runtime_output_dir}")
-		if(_runtime_output_dir MATCHES "\\$<")
-			message(FATAL_ERROR "Nabla: nbl_configure_cuda_interop_runtime supports only plain runtime output directories or $<CONFIG>.")
-		endif()
-
-		cmake_path(IS_ABSOLUTE _runtime_output_dir _is_abs)
-		if(NOT _is_abs)
-			cmake_path(ABSOLUTE_PATH _runtime_output_dir BASE_DIRECTORY "${CMAKE_CURRENT_BINARY_DIR}" OUTPUT_VARIABLE _runtime_output_dir)
-		endif()
-		cmake_path(NORMAL_PATH _runtime_output_dir OUTPUT_VARIABLE _runtime_output_dir)
-
-		list(APPEND _runtime_jsons "${_runtime_output_dir}/nbl_cuda_interop_runtime.json")
-	endforeach()
-
-	list(REMOVE_DUPLICATES _runtime_jsons)
-	set(${_OUT_FILES} ${_runtime_jsons} PARENT_SCOPE)
-endfunction()
-
-function(nbl_configure_cuda_interop_runtime TARGET_NAME)
-	cmake_parse_arguments(_NBL_CUDA_INTEROP "" "RUNTIME_JSON" "INCLUDE_DIRS" ${ARGN})
-
-	if(_NBL_CUDA_INTEROP_UNPARSED_ARGUMENTS)
-		message(FATAL_ERROR "Nabla: unexpected arguments for nbl_configure_cuda_interop_runtime: ${_NBL_CUDA_INTEROP_UNPARSED_ARGUMENTS}")
-	endif()
-
-	if(NOT TARGET "${TARGET_NAME}")
-		message(FATAL_ERROR "Nabla: target \"${TARGET_NAME}\" does not exist")
-	endif()
-
-	_nbl_cuda_interop_collect_runtime_include_dirs(_include_dirs ${_NBL_CUDA_INTEROP_INCLUDE_DIRS})
-
-	_nbl_cuda_interop_make_runtime_paths_json(_runtime_paths_json_content ${_include_dirs})
-	_nbl_cuda_interop_collect_target_runtime_jsons("${TARGET_NAME}" _runtime_paths_jsons "${_NBL_CUDA_INTEROP_RUNTIME_JSON}")
-
-	foreach(_runtime_paths_json IN LISTS _runtime_paths_jsons)
-		file(GENERATE OUTPUT "${_runtime_paths_json}" CONTENT "${_runtime_paths_json_content}" TARGET "${TARGET_NAME}")
-	endforeach()
-
-	set_source_files_properties(${_runtime_paths_jsons} PROPERTIES GENERATED TRUE HEADER_FILE_ONLY TRUE)
-	target_sources("${TARGET_NAME}" PRIVATE ${_runtime_paths_jsons})
-endfunction()
-
-function(nbl_target_link_cuda_interop TARGET_NAME)
-	set(_args ${ARGN})
-	set(_scope PRIVATE)
-
-	if(_args)
-		list(GET _args 0 _first_arg)
-		if(_first_arg MATCHES "^(PRIVATE|PUBLIC|INTERFACE)$")
-			set(_scope "${_first_arg}")
-			list(REMOVE_AT _args 0)
-		endif()
-	endif()
-
-	cmake_parse_arguments(_NBL_CUDA_INTEROP "" "RUNTIME_JSON" "INCLUDE_DIRS" ${_args})
-
-	if(_NBL_CUDA_INTEROP_UNPARSED_ARGUMENTS)
-		message(FATAL_ERROR "Nabla: unexpected arguments for nbl_target_link_cuda_interop: ${_NBL_CUDA_INTEROP_UNPARSED_ARGUMENTS}")
-	endif()
-
-	if(NOT TARGET "${TARGET_NAME}")
-		message(FATAL_ERROR "Nabla: target \"${TARGET_NAME}\" does not exist")
-	endif()
-	if(NOT TARGET Nabla::ext::CUDAInterop)
-		message(FATAL_ERROR "Nabla: Nabla::ext::CUDAInterop is not available. Request the CUDAInterop package component or enable NBL_COMPILE_WITH_CUDA.")
+	string(CONFIGURE "${_runtime_json}" _runtime_json @ONLY)
+	set(_runtime_json_path "$<TARGET_FILE_DIR:${TARGET_NAME}>/nbl_cuda_interop_runtime.json")
+	if(_NBL_CUDA_INTEROP_RUNTIME_JSON)
+		set(_runtime_json_path "${_NBL_CUDA_INTEROP_RUNTIME_JSON}")
 	endif()
-
-	target_link_libraries("${TARGET_NAME}" ${_scope} Nabla::ext::CUDAInterop)
-	nbl_configure_cuda_interop_runtime("${TARGET_NAME}"
-		RUNTIME_JSON "${_NBL_CUDA_INTEROP_RUNTIME_JSON}"
-		INCLUDE_DIRS ${_NBL_CUDA_INTEROP_INCLUDE_DIRS}
-	)
+	file(GENERATE OUTPUT "${_runtime_json_path}" CONTENT "${_runtime_json}" TARGET "${TARGET_NAME}")
 endfunction()
diff --git a/src/nbl/ext/CUDAInterop/README.md b/src/nbl/ext/CUDAInterop/README.md
index d60b15639a..2ce46cbc93 100644
--- a/src/nbl/ext/CUDAInterop/README.md
+++ b/src/nbl/ext/CUDAInterop/README.md
@@ -113,11 +113,15 @@ NVRTC may need CUDA runtime headers when user kernels include files such as `cud
 
 Production machines do not need the full CUDA SDK just because Nabla was built with CUDA. Applications that use NVRTC with CUDA runtime headers can provide those headers through generated JSON, a custom JSON path, an app-local bundle, an official runtime/header package, or an installed toolkit.
 
-Nabla does not ship CUDA runtime headers by default. NVIDIA CUDA EULA allows redistribution only for selected components. The distribution section says: "The portions of the SDK that are distributable under the Agreement are listed in Attachment A." Attachment A says: "The following CUDA Toolkit files may be distributed with applications developed by you." See:
+Nabla could ship an app-local bundle of selected CUDA runtime headers and make it available to runtime discovery. That model is allowed by the NVIDIA CUDA EULA for the components listed in Attachment A. Nabla intentionally does not bundle these headers. Because of that, end users should prefer an official CUDA runtime/header package for production machines. An installed toolkit also works, but the full toolkit is mainly for developers compiling Nabla or native CUDA code.
+
+NVIDIA CUDA EULA allows redistribution only for selected components. The distribution section says: "The portions of the SDK that are distributable under the Agreement are listed in Attachment A." Attachment A says: "The following CUDA Toolkit files may be distributed with applications developed by you." See:
 
 - https://docs.nvidia.com/cuda/eula/#distribution
 - https://docs.nvidia.com/cuda/eula/#attachment-a
 
+This means the Attachment A header groups below can be redistributed with applications under the EULA terms. It does not mean the full CUDA SDK can be redistributed. Applications that need NVRTC runtime compilation can decide whether to ship the allowed headers, depend on an official runtime/header package, or point discovery at an installed toolkit/header package.
+
 Attachment A lists header groups relevant to NVRTC runtime compilation:
 
 - NVIDIA Runtime Compilation Library and Header: `nvrtc.h`
@@ -144,3 +148,6 @@ The split follows the same boundary pattern used by mature GPU projects: default
 - OpenCV keeps CUDA implementation headers private and includes `cuda.h`, `cuda_runtime.h`, and NPP there: https://github.com/opencv/opencv/blob/808d2d596c475d95fedb6025c9ed425d62bba04c/modules/core/include/opencv2/core/private.cuda.hpp#L47-L61
 - Blender/Cycles exposes a CUDA device boundary without CUDA SDK headers in the boundary header: https://github.com/blender/blender/blob/794c527e8595a9f448e0143a217d0ceb648c5e7e/intern/cycles/device/cuda/device.h#L7-L27
 - Blender/Cycles keeps `CUdevice`, `CUcontext`, `cuda.h`, and `cuew.h` in the CUDA implementation header/source: https://github.com/blender/blender/blob/794c527e8595a9f448e0143a217d0ceb648c5e7e/intern/cycles/device/cuda/device_impl.h#L12-L30
+- OpenMM keeps the CUDA platform boundary on OpenMM types/properties in `CudaPlatform.h`, while `CudaContext.h` is the CUDA-specific low-level header that includes CUDA SDK headers and exposes `CUmodule` / `CUfunction`: https://github.com/openmm/openmm/blob/master/platforms/cuda/include/CudaPlatform.h#L48-L120 and https://github.com/openmm/openmm/blob/master/platforms/cuda/include/CudaContext.h#L32-L52
+- GROMACS gates CUDA source handling behind `GMX_GPU_CUDA` in the library build and keeps CUDA runtime types in internal GPU utility headers: https://gitlab.com/gromacs/gromacs/-/blob/main/src/gromacs/CMakeLists.txt#L339-L367 and https://gitlab.com/gromacs/gromacs/-/blob/main/src/gromacs/gpu_utils/gputraits.cuh#L44-L58
+- ONNX Runtime keeps the public C API provider-neutral and routes CUDA through provider-specific bridge/factory code: https://github.com/microsoft/onnxruntime/blob/main/include/onnxruntime/core/session/onnxruntime_c_api.h#L1-L80 and https://github.com/microsoft/onnxruntime/blob/main/onnxruntime/core/session/provider_bridge_ort.cc#L110-L150

From ffba3d48d4ac5fd7f26ed324c310f338328572af Mon Sep 17 00:00:00 2001
From: Arkadiusz Lachowicz <areklachowicz@gmail.com>
Date: Thu, 7 May 2026 18:12:36 +0200
Subject: [PATCH 27/27] Update CUDA interop examples pointer

---
 examples_tests | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/examples_tests b/examples_tests
index 3c57a88af9..7b5817a6d4 160000
--- a/examples_tests
+++ b/examples_tests
@@ -1 +1 @@
-Subproject commit 3c57a88af9eba722fcc6b5b5ba3d136ab3e166ca
+Subproject commit 7b5817a6d45c62a70fbe617022b6026a83939ff5