diff --git a/CMakeLists.txt b/CMakeLists.txt index fa74e167f0..9251a3ee68 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -70,15 +70,15 @@ else() message(STATUS "Vulkan SDK is not found") endif() -option(NBL_COMPILE_WITH_CUDA "Compile with CUDA interop?" OFF) +option(NBL_COMPILE_WITH_CUDA "Build the CUDA interop extension?" OFF) +set(NBL_CUDA_TOOLKIT_ROOT "" CACHE PATH "Optional CUDA Toolkit root used when NBL_COMPILE_WITH_CUDA is ON") if(NBL_COMPILE_WITH_CUDA) - find_package(CUDAToolkit REQUIRED) - if(${CUDAToolkit_VERSION} VERSION_GREATER_EQUAL "13.0") - message(STATUS "CUDA version ${CUDAToolkit_VERSION} found!") - else() - message(FATAL_ERROR "CUDA version 13.0+ needed for C++14 support!") + if(NBL_CUDA_TOOLKIT_ROOT) + set(CUDAToolkit_ROOT "${NBL_CUDA_TOOLKIT_ROOT}") endif() + find_package(CUDAToolkit 13.0 REQUIRED) + message(STATUS "CUDA version ${CUDAToolkit_VERSION} found!") endif() get_filename_component(NBL_ROOT_PATH "${CMAKE_CURRENT_SOURCE_DIR}" ABSOLUTE) @@ -183,13 +183,12 @@ option(NBL_BUILD_IMGUI "Enable nbl::ext::ImGui?" ON) option(NBL_BUILD_DEBUG_DRAW "Enable Nabla Debug Draw extension?" ON) option(NBL_BUILD_OPTIX "Enable nbl::ext::OptiX?" OFF) -if(NBL_COMPILE_WITH_CUDA) - find_package(OPTIX REQUIRED) - message(STATUS "CUDA enabled and OptiX found!") -else() - if(NBL_BUILD_OPTIX) +if(NBL_BUILD_OPTIX) + if(NOT NBL_COMPILE_WITH_CUDA) message(FATAL_ERROR "You cannot build Optix without enabled CUDA! NBL_COMPILE_WITH_CUDA must be ON!") endif() + find_package(OPTIX REQUIRED) + message(STATUS "CUDA enabled and OptiX found!") endif() option(NBL_BUILD_BULLET "Enable Bullet Physics building and integration?" 
OFF) @@ -313,6 +312,7 @@ if(NBL_ENABLE_CONFIG_INSTALL) set(_NBL_NABLA_CONFIG_FILES "${CMAKE_CURRENT_BINARY_DIR}/NablaConfig.cmake" "${CMAKE_CURRENT_BINARY_DIR}/NablaConfigVersion.cmake" + "${CMAKE_CURRENT_LIST_DIR}/cmake/NablaCUDAInteropHelpers.cmake" ) install(EXPORT NablaExportTargets diff --git a/cmake/FindZLIB.cmake b/cmake/FindZLIB.cmake index f855c396b9..42aa789bee 100644 --- a/cmake/FindZLIB.cmake +++ b/cmake/FindZLIB.cmake @@ -4,4 +4,6 @@ endif() set(ZLIB_FOUND TRUE) set(ZLIB_LIBRARY ZLIB::ZLIB) -set(ZLIB_INCLUDE_DIR "${THIRD_PARTY_SOURCE_DIR}/zlib;${THIRD_PARTY_BINARY_DIR}/zlib") \ No newline at end of file +set(ZLIB_LIBRARIES ZLIB::ZLIB) +set(ZLIB_INCLUDE_DIR "${THIRD_PARTY_SOURCE_DIR}/zlib;${THIRD_PARTY_BINARY_DIR}/zlib") +set(ZLIB_INCLUDE_DIRS "${ZLIB_INCLUDE_DIR}") diff --git a/cmake/NablaCUDAInteropHelpers.cmake b/cmake/NablaCUDAInteropHelpers.cmake new file mode 100644 index 0000000000..e84b2d1a8e --- /dev/null +++ b/cmake/NablaCUDAInteropHelpers.cmake @@ -0,0 +1,28 @@ +function(nbl_target_link_cuda_interop TARGET_NAME SCOPE) + if(NOT SCOPE MATCHES "^(PRIVATE|PUBLIC|INTERFACE)$") + set(SCOPE PRIVATE) + endif() + cmake_parse_arguments(_NBL_CUDA_INTEROP "" "RUNTIME_JSON" "INCLUDE_DIRS" ${ARGN}) + target_link_libraries("${TARGET_NAME}" ${SCOPE} Nabla::ext::CUDAInterop) + set(_include_dir_entries "") + foreach(_include_dir IN LISTS _NBL_CUDA_INTEROP_INCLUDE_DIRS CUDAToolkit_INCLUDE_DIRS) + if(_include_dir) + file(TO_CMAKE_PATH "${_include_dir}" _include_dir) + list(APPEND _include_dir_entries " \"${_include_dir}\"") + endif() + endforeach() + list(JOIN _include_dir_entries "," _include_dirs_json) + set(_runtime_json [=[ +{ + "cudaRuntimeIncludeDirs": [ +@_include_dirs_json@ + ] +} +]=]) + string(CONFIGURE "${_runtime_json}" _runtime_json @ONLY) + set(_runtime_json_path "$/nbl_cuda_interop_runtime.json") + if(_NBL_CUDA_INTEROP_RUNTIME_JSON) + set(_runtime_json_path "${_NBL_CUDA_INTEROP_RUNTIME_JSON}") + endif() + file(GENERATE OUTPUT 
"${_runtime_json_path}" CONTENT "${_runtime_json}" TARGET "${TARGET_NAME}") +endfunction() diff --git a/cmake/NablaConfig.cmake.in b/cmake/NablaConfig.cmake.in index b22b3ad0d7..0464340ce3 100644 --- a/cmake/NablaConfig.cmake.in +++ b/cmake/NablaConfig.cmake.in @@ -6,6 +6,7 @@ set(Nabla_DXC_GIT_INFO_JSON_FILE "${PACKAGE_PREFIX_DIR}/include/dxc_git_info.jso set(_NBL_NABLA_LOAD_CORE OFF) set(_NBL_NABLA_LOAD_NSC OFF) +set(_NBL_NABLA_LOAD_CUDA_INTEROP OFF) set(_NBL_NABLA_COMPONENTS ${Nabla_FIND_COMPONENTS}) set(_NBL_NABLA_HAS_CORE_EXPORTS OFF) set(_NBL_NABLA_HAS_NSC_EXPORTS OFF) @@ -25,6 +26,10 @@ if(_NBL_NABLA_COMPONENTS) elseif(_NBL_NABLA_COMPONENT STREQUAL "Core") set(_NBL_NABLA_LOAD_CORE ON) set(Nabla_Core_FOUND TRUE) + elseif(_NBL_NABLA_COMPONENT STREQUAL "CUDAInterop") + set(_NBL_NABLA_LOAD_CORE ON) + set(_NBL_NABLA_LOAD_CUDA_INTEROP ON) + set(Nabla_CUDAInterop_FOUND TRUE) else() set("Nabla_${_NBL_NABLA_COMPONENT}_FOUND" FALSE) endif() @@ -80,6 +85,23 @@ if(_NBL_NABLA_LOAD_NSC) endif() endif() +if(_NBL_NABLA_LOAD_CUDA_INTEROP) + include(CMakeFindDependencyMacro) + + if(DEFINED Nabla_CUDA_TOOLKIT_ROOT AND NOT "${Nabla_CUDA_TOOLKIT_ROOT}" STREQUAL "") + set(CUDAToolkit_ROOT "${Nabla_CUDA_TOOLKIT_ROOT}") + endif() + + find_dependency(CUDAToolkit 13.0 REQUIRED) + _nbl_try_include_component("CUDAInterop" "NablaCUDAInteropExportTargets.cmake" _NBL_NABLA_CUDA_INTEROP_FOUND) + if(_NBL_NABLA_CUDA_INTEROP_FOUND AND TARGET Nabla::ext::CUDAInterop) + target_link_libraries(Nabla::ext::CUDAInterop INTERFACE CUDA::toolkit) + if(EXISTS "${CMAKE_CURRENT_LIST_DIR}/NablaCUDAInteropHelpers.cmake") + include("${CMAKE_CURRENT_LIST_DIR}/NablaCUDAInteropHelpers.cmake") + endif() + endif() +endif() + check_required_components(Nabla) # diff --git a/examples_tests b/examples_tests index 93ca5efe58..7b5817a6d4 160000 --- a/examples_tests +++ b/examples_tests @@ -1 +1 @@ -Subproject commit 93ca5efe588ca85c1eaf81a486b611df98403580 +Subproject commit 7b5817a6d45c62a70fbe617022b6026a83939ff5 
diff --git a/include/nbl/ext/CUDAInterop/CUDAInteropNative.h b/include/nbl/ext/CUDAInterop/CUDAInteropNative.h new file mode 100644 index 0000000000..6d142c6b3f --- /dev/null +++ b/include/nbl/ext/CUDAInterop/CUDAInteropNative.h @@ -0,0 +1,214 @@ +// Copyright (C) 2018-2020 - DevSH Graphics Programming Sp. z O.O. +// This file is part of the "Nabla Engine". +// For conditions of distribution and use, see copyright notice in nabla.h +#ifndef _NBL_EXT_CUDA_INTEROP_NATIVE_H_INCLUDED_ +#define _NBL_EXT_CUDA_INTEROP_NATIVE_H_INCLUDED_ + +#include "nbl/video/CUDAInterop.h" + +#include "nbl/asset/ICPUBuffer.h" +#include "nbl/system/DynamicFunctionCaller.h" + +#include + +#include "cuda.h" +#include "nvrtc.h" +#if CUDA_VERSION < 13000 + #error "Need CUDA 13.0 SDK or higher." +#endif + +namespace nbl::video::cuda_native +{ + +inline constexpr int MinimumCUDADriverVersion = 13000; +inline constexpr int MinimumNVRTCMajorVersion = MinimumCUDADriverVersion/1000; + +using LibLoader = system::DefaultFuncPtrLoader; + +NBL_SYSTEM_DECLARE_DYNAMIC_FUNCTION_CALLER_CLASS(CUDA,LibLoader + ,cuCtxCreate_v4 + ,cuDevicePrimaryCtxRetain + ,cuDevicePrimaryCtxRelease + ,cuDevicePrimaryCtxSetFlags + ,cuDevicePrimaryCtxGetState + ,cuCtxDestroy_v2 + ,cuCtxEnablePeerAccess + ,cuCtxGetApiVersion + ,cuCtxGetCurrent + ,cuCtxGetDevice + ,cuCtxGetSharedMemConfig + ,cuCtxPopCurrent_v2 + ,cuCtxPushCurrent_v2 + ,cuCtxSetCacheConfig + ,cuCtxSetCurrent + ,cuCtxSetSharedMemConfig + ,cuCtxSynchronize + ,cuDeviceComputeCapability + ,cuDeviceCanAccessPeer + ,cuDeviceGetCount + ,cuDeviceGet + ,cuDeviceGetAttribute + ,cuDeviceGetLuid + ,cuDeviceGetUuid_v2 + ,cuDeviceTotalMem_v2 + ,cuDeviceGetName + ,cuDriverGetVersion + ,cuEventCreate + ,cuEventDestroy_v2 + ,cuEventElapsedTime + ,cuEventQuery + ,cuEventRecord + ,cuEventSynchronize + ,cuFuncGetAttribute + ,cuFuncSetCacheConfig + ,cuGetErrorName + ,cuGetErrorString + ,cuGraphicsMapResources + ,cuGraphicsResourceGetMappedPointer_v2 + 
,cuGraphicsResourceGetMappedMipmappedArray + ,cuGraphicsSubResourceGetMappedArray + ,cuGraphicsUnmapResources + ,cuGraphicsUnregisterResource + ,cuInit + ,cuLaunchKernel + ,cuMemAlloc_v2 + ,cuMemcpyDtoD_v2 + ,cuMemcpyDtoH_v2 + ,cuMemcpyHtoD_v2 + ,cuMemcpyDtoDAsync_v2 + ,cuMemcpyDtoHAsync_v2 + ,cuMemcpyHtoDAsync_v2 + ,cuMemGetAddressRange_v2 + ,cuMemFree_v2 + ,cuMemFreeHost + ,cuMemGetInfo_v2 + ,cuMemHostAlloc + ,cuMemHostRegister_v2 + ,cuMemHostUnregister + ,cuMemsetD32_v2 + ,cuMemsetD32Async + ,cuMemsetD8_v2 + ,cuMemsetD8Async + ,cuModuleGetFunction + ,cuModuleGetGlobal_v2 + ,cuModuleLoadDataEx + ,cuModuleLoadFatBinary + ,cuModuleUnload + ,cuOccupancyMaxActiveBlocksPerMultiprocessor + ,cuPointerGetAttribute + ,cuStreamAddCallback + ,cuStreamCreate + ,cuStreamDestroy_v2 + ,cuStreamQuery + ,cuStreamSynchronize + ,cuStreamWaitEvent + ,cuSurfObjectCreate + ,cuSurfObjectDestroy + ,cuTexObjectCreate + ,cuTexObjectDestroy + ,cuImportExternalMemory + ,cuDestroyExternalMemory + ,cuExternalMemoryGetMappedBuffer + ,cuMemUnmap + ,cuMemAddressFree + ,cuMemGetAllocationGranularity + ,cuMemAddressReserve + ,cuMemCreate + ,cuMemExportToShareableHandle + ,cuMemMap + ,cuMemRelease + ,cuMemSetAccess + ,cuMemImportFromShareableHandle + ,cuLaunchHostFunc + ,cuDestroyExternalSemaphore + ,cuImportExternalSemaphore + ,cuSignalExternalSemaphoresAsync + ,cuWaitExternalSemaphoresAsync + ,cuLogsRegisterCallback +); + +NBL_SYSTEM_DECLARE_DYNAMIC_FUNCTION_CALLER_CLASS(NVRTC,LibLoader, + nvrtcGetErrorString, + nvrtcVersion, + nvrtcAddNameExpression, + nvrtcCompileProgram, + nvrtcCreateProgram, + nvrtcDestroyProgram, + nvrtcGetLoweredName, + nvrtcGetPTX, + nvrtcGetPTXSize, + nvrtcGetProgramLog, + nvrtcGetProgramLogSize +); + +struct SCUDADeviceInfo +{ + CUdevice handle = {}; + CUuuid uuid = {}; +}; + +struct SExportableMemoryCreationParams +{ + size_t size; + uint32_t alignment; + CUmemLocationType location; +}; + +struct SPTXResult +{ + core::smart_refctd_ptr ptx; + nvrtcResult result; +}; + 
+// Opt-in native CUDA API. The declarations below are implemented by the Nabla library. +// This header is intentionally the only public path that includes CUDA SDK types. +class NBL_API2 CCUDAHandlerAccessor +{ + public: + static const CUDA& getCUDAFunctionTable(const CCUDAHandler& handler); + static const NVRTC& getNVRTCFunctionTable(const CCUDAHandler& handler); + static bool defaultHandleResult(CUresult result, const system::logger_opt_ptr& logger); + static bool defaultHandleResult(const CCUDAHandler& handler, CUresult result); + static bool defaultHandleResult(const CCUDAHandler& handler, nvrtcResult result); + static const core::vector& getAvailableDevices(const CCUDAHandler& handler); + static nvrtcResult createProgram(CCUDAHandler& handler, nvrtcProgram* prog, std::string&& source, const char* name, const int headerCount=0, const char* const* headerContents=nullptr, const char* const* includeNames=nullptr); + static nvrtcResult compileProgram(const CCUDAHandler& handler, nvrtcProgram prog, core::SRange options); + static nvrtcResult getProgramLog(const CCUDAHandler& handler, nvrtcProgram prog, std::string& log); + static SPTXResult getPTX(const CCUDAHandler& handler, nvrtcProgram prog); + static SPTXResult compileDirectlyToPTX( + CCUDAHandler& handler, std::string&& source, const char* filename, core::SRange nvrtcOptions, + std::string& log, const int headerCount=0, const char* const* headerContents=nullptr, const char* const* includeNames=nullptr + ); +}; + +class NBL_API2 CCUDADeviceAccessor +{ + public: + static CUdevice getInternalObject(const CCUDADevice& device); + static CUcontext getContext(const CCUDADevice& device); + static size_t roundToGranularity(const CCUDADevice& device, CUmemLocationType location, size_t size); + static core::smart_refctd_ptr createExportableMemory(CCUDADevice& device, SExportableMemoryCreationParams&& params); +}; + +class NBL_API2 CCUDAExportableMemoryAccessor +{ + public: + static CUdeviceptr getDeviceptr(const 
CCUDAExportableMemory& memory); +}; + +class NBL_API2 CCUDAImportedMemoryAccessor +{ + public: + static CUexternalMemory getInternalObject(const CCUDAImportedMemory& memory); + static CUresult getMappedBuffer(const CCUDAImportedMemory& memory, CUdeviceptr* mappedBuffer); +}; + +class NBL_API2 CCUDAImportedSemaphoreAccessor +{ + public: + static CUexternalSemaphore getInternalObject(const CCUDAImportedSemaphore& semaphore); +}; + +} + +#endif diff --git a/include/nbl/ext/OptiX/IDenoiser.h b/include/nbl/ext/OptiX/IDenoiser.h index 7820aa1222..bb0677657d 100644 --- a/include/nbl/ext/OptiX/IDenoiser.h +++ b/include/nbl/ext/OptiX/IDenoiser.h @@ -5,7 +5,7 @@ #ifndef __NBL_EXT_OPTIX_DENOISER_H_INCLUDED__ #define __NBL_EXT_OPTIX_DENOISER_H_INCLUDED__ -#include "../../../../src/nbl/video/CCUDAHandler.h" +#include "nbl/video/CCUDAHandler.h" #include #include @@ -122,4 +122,4 @@ class IDenoiser final : public core::IReferenceCounted } } -#endif \ No newline at end of file +#endif diff --git a/include/nbl/system/DefaultFuncPtrLoader.h b/include/nbl/system/DefaultFuncPtrLoader.h index 56142448c8..10fab3a454 100644 --- a/include/nbl/system/DefaultFuncPtrLoader.h +++ b/include/nbl/system/DefaultFuncPtrLoader.h @@ -11,18 +11,18 @@ namespace nbl::system { -class DefaultFuncPtrLoader final : FuncPtrLoader +class NBL_API2 DefaultFuncPtrLoader final : FuncPtrLoader { void* lib; public: inline DefaultFuncPtrLoader() : lib(nullptr) {} - NBL_API2 DefaultFuncPtrLoader(const char* name); + DefaultFuncPtrLoader(const char* name); inline DefaultFuncPtrLoader(DefaultFuncPtrLoader&& other) : DefaultFuncPtrLoader() { operator=(std::move(other)); } - NBL_API2 ~DefaultFuncPtrLoader(); + ~DefaultFuncPtrLoader(); inline DefaultFuncPtrLoader& operator=(DefaultFuncPtrLoader&& other) { @@ -40,4 +40,4 @@ class DefaultFuncPtrLoader final : FuncPtrLoader } -#endif \ No newline at end of file +#endif diff --git a/include/nbl/video/CCUDADevice.h b/include/nbl/video/CCUDADevice.h index 
02f85fdac8..7c1d1f272b 100644 --- a/include/nbl/video/CCUDADevice.h +++ b/include/nbl/video/CCUDADevice.h @@ -4,38 +4,31 @@ #ifndef _NBL_VIDEO_C_CUDA_DEVICE_H_ #define _NBL_VIDEO_C_CUDA_DEVICE_H_ - -#include "nbl/video/IPhysicalDevice.h" +#include "nbl/video/declarations.h" #include "nbl/video/CCUDAExportableMemory.h" #include "nbl/video/CCUDAImportedMemory.h" #include "nbl/video/CCUDAImportedSemaphore.h" - -#ifdef _NBL_COMPILE_WITH_CUDA_ - -#include "cuda.h" -#include "nvrtc.h" -#if CUDA_VERSION < 9000 - #error "Need CUDA 9.0 SDK or higher." -#endif - -// useful includes in the future -//#include "cudaEGL.h" -//#include "cudaVDPAU.h" +#include +#include +#include namespace nbl::video { class CCUDAHandler; +namespace cuda_native +{ +struct SAccess; +} + class NBL_API2 CCUDADevice : public core::IReferenceCounted { - public: + public: #ifdef _WIN32 static constexpr IDeviceMemoryAllocation::E_EXTERNAL_HANDLE_TYPE EXTERNAL_MEMORY_HANDLE_TYPE = IDeviceMemoryAllocation::EHT_OPAQUE_WIN32; - static constexpr CUmemAllocationHandleType ALLOCATION_HANDLE_TYPE = CU_MEM_HANDLE_TYPE_WIN32; #else static constexpr IDeviceMemoryAllocation::E_EXTERNAL_HANDLE_TYPE EXTERNAL_MEMORY_HANDLE_TYPE = IDeviceMemoryAllocation::EHT_OPAQUE_FD; - static constexpr CUmemAllocationHandleType ALLOCATION_HANDLE_TYPE = CU_MEM_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR; #endif enum E_VIRTUAL_ARCHITECTURE @@ -74,48 +67,38 @@ class NBL_API2 CCUDADevice : public core::IReferenceCounted }; inline E_VIRTUAL_ARCHITECTURE getVirtualArchitecture() {return m_virtualArchitecture;} - CCUDADevice(core::smart_refctd_ptr&& vulkanConnection, IPhysicalDevice* const vulkanDevice, const E_VIRTUAL_ARCHITECTURE virtualArchitecture, CUdevice device, core::smart_refctd_ptr&& handler); - - ~CCUDADevice(); + ~CCUDADevice() override; inline core::SRange geDefaultCompileOptions() const { return {m_defaultCompileOptions.data(),m_defaultCompileOptions.data()+m_defaultCompileOptions.size()}; } - CUdevice getInternalObject() const { return 
m_handle; } - const CCUDAHandler* getHandler() const { return m_handler.get(); } bool isMatchingDevice(const IPhysicalDevice* device) { return device && !memcmp(device->getProperties().deviceUUID, m_physicalDevice->getProperties().deviceUUID, 16); } - size_t roundToGranularity(CUmemLocationType location, size_t size) const; - - core::smart_refctd_ptr createExportableMemory(CCUDAExportableMemory::SCreationParams&& inParams); - core::smart_refctd_ptr importExternalMemory(core::smart_refctd_ptr&& mem); core::smart_refctd_ptr importExternalSemaphore(core::smart_refctd_ptr&& sem); private: - CUresult reserveAddressAndMapMemory(CUdeviceptr* outPtr, size_t size, size_t alignment, CUmemLocationType location, CUmemGenericAllocationHandle memory) const; + friend class CCUDAHandler; + friend struct cuda_native::SAccess; - static constexpr auto CudaMemoryLocationCount = 5; + struct SNativeState; + CCUDADevice(core::smart_refctd_ptr&& vulkanConnection, IPhysicalDevice* const vulkanDevice, const E_VIRTUAL_ARCHITECTURE virtualArchitecture, std::unique_ptr&& nativeState, core::smart_refctd_ptr&& handler); - const system::logger_opt_ptr m_logger; + const system::logger_opt_ptr m_logger; std::vector m_defaultCompileOptions; core::smart_refctd_ptr m_vulkanConnection; IPhysicalDevice* const m_physicalDevice; E_VIRTUAL_ARCHITECTURE m_virtualArchitecture; core::smart_refctd_ptr m_handler; - CUdevice m_handle; - CUcontext m_context; - std::array m_allocationGranularity; + std::unique_ptr m_native; }; } -#endif // _NBL_COMPILE_WITH_CUDA_ - #endif diff --git a/include/nbl/video/CCUDAExportableMemory.h b/include/nbl/video/CCUDAExportableMemory.h index 1c3d206906..6d29739408 100644 --- a/include/nbl/video/CCUDAExportableMemory.h +++ b/include/nbl/video/CCUDAExportableMemory.h @@ -4,62 +4,48 @@ #ifndef _NBL_VIDEO_C_CUDA_EXPORTABLE_MEMORY_H_ #define _NBL_VIDEO_C_CUDA_EXPORTABLE_MEMORY_H_ +#include "nbl/video/declarations.h" -#ifdef _NBL_COMPILE_WITH_CUDA_ - -#include "cuda.h" -#include 
"nvrtc.h" -#if CUDA_VERSION < 9000 - #error "Need CUDA 9.0 SDK or higher." -#endif - -// useful includes in the future -//#include "cudaEGL.h" -//#include "cudaVDPAU.h" +#include +#include namespace nbl::video { - class CCUDADevice; -class NBL_API2 CCUDAExportableMemory : public core::IReferenceCounted +namespace cuda_native { - public: - - struct SCreationParams - { - size_t size; - uint32_t alignment; - CUmemLocationType location; - }; - - struct SCachedCreationParams : SCreationParams - { - size_t granularSize; - CUdeviceptr ptr; - external_handle_t externalHandle; - }; - - CCUDAExportableMemory(core::smart_refctd_ptr device, SCachedCreationParams&& params) - : m_device(std::move(device)) - , m_params(std::move(params)) - {} - ~CCUDAExportableMemory() override; - - CUdeviceptr getDeviceptr() const { return m_params.ptr; } - - const SCreationParams& getCreationParams() const { return m_params; } - - core::smart_refctd_ptr exportAsMemory(ILogicalDevice* device, IDeviceMemoryBacked* dedication = nullptr) const; - - private: +struct SAccess; +} - core::smart_refctd_ptr m_device; - SCachedCreationParams m_params; +class NBL_API2 CCUDAExportableMemory : public core::IReferenceCounted +{ + public: + struct SCachedCreationParams + { + size_t size; + uint32_t alignment; + size_t granularSize; + external_handle_t externalHandle; + bool deviceLocal; + }; + + ~CCUDAExportableMemory() override; + + core::smart_refctd_ptr exportAsMemory(ILogicalDevice* device, IDeviceMemoryBacked* dedication = nullptr) const; + + private: + friend struct cuda_native::SAccess; + + struct SNativeState; + CCUDAExportableMemory(core::smart_refctd_ptr device, SCachedCreationParams&& params, std::unique_ptr&& nativeState); + static core::smart_refctd_ptr create(core::smart_refctd_ptr device, SCachedCreationParams&& params, std::unique_ptr&& nativeState); + + core::smart_refctd_ptr m_device; + SCachedCreationParams m_params; + std::unique_ptr m_native; }; } -#endif // _NBL_COMPILE_WITH_CUDA_ - 
-#endif \ No newline at end of file +#endif diff --git a/include/nbl/video/CCUDAHandler.h b/include/nbl/video/CCUDAHandler.h index 61e9522a66..bb2d12c637 100644 --- a/include/nbl/video/CCUDAHandler.h +++ b/include/nbl/video/CCUDAHandler.h @@ -8,159 +8,49 @@ #include "nbl/core/definitions.h" #include "nbl/system/declarations.h" +#include "nbl/system/path.h" -#include "nbl/video/CCUDADevice.h" +#include +#include +#include +#include - -#ifdef _NBL_COMPILE_WITH_CUDA_ namespace nbl::video { +class CCUDADevice; +class CVulkanConnection; +class IPhysicalDevice; - -class NBL_API2 CCUDAHandler : public core::IReferenceCounted +namespace cuda_native { - public: - static bool defaultHandleResult(CUresult result, const system::logger_opt_ptr& logger); +struct SAccess; +} - inline bool defaultHandleResult(CUresult result) const - { - core::smart_refctd_ptr logger = m_logger.get(); - return defaultHandleResult(result,logger.get()); - } +namespace cuda_interop +{ +inline constexpr const char* RuntimePathsFileName = "nbl_cuda_interop_runtime.json"; - // - bool defaultHandleResult(nvrtcResult result); +struct SRuntimeCompileEnvironment +{ + core::vector includeDirs; +}; - // - template - static T* cast_CUDA_ptr(CUdeviceptr ptr) { return reinterpret_cast(ptr); } +NBL_API2 SRuntimeCompileEnvironment findRuntimeCompileEnvironment(core::vector explicitIncludeDirs = {}); +NBL_API2 SRuntimeCompileEnvironment findRuntimeCompileEnvironment(core::vector explicitIncludeDirs, core::vector runtimePathFiles); +inline core::vector makeNVRTCIncludeOptions(const SRuntimeCompileEnvironment& environment) +{ + core::vector options; + for (const auto& includeDir : environment.includeDirs) + options.push_back("-I" + includeDir.generic_string()); + return options; +} +} - // +class NBL_API2 CCUDAHandler : public core::IReferenceCounted +{ + public: static core::smart_refctd_ptr create(system::ISystem* system, core::smart_refctd_ptr&& _logger); - // - using LibLoader = system::DefaultFuncPtrLoader; - 
NBL_SYSTEM_DECLARE_DYNAMIC_FUNCTION_CALLER_CLASS(CUDA,LibLoader - ,cuCtxCreate_v4 - ,cuDevicePrimaryCtxRetain - ,cuDevicePrimaryCtxRelease - ,cuDevicePrimaryCtxSetFlags - ,cuDevicePrimaryCtxGetState - ,cuCtxDestroy_v2 - ,cuCtxEnablePeerAccess - ,cuCtxGetApiVersion - ,cuCtxGetCurrent - ,cuCtxGetDevice - ,cuCtxGetSharedMemConfig - ,cuCtxPopCurrent_v2 - ,cuCtxPushCurrent_v2 - ,cuCtxSetCacheConfig - ,cuCtxSetCurrent - ,cuCtxSetSharedMemConfig - ,cuCtxSynchronize - ,cuDeviceComputeCapability - ,cuDeviceCanAccessPeer - ,cuDeviceGetCount - ,cuDeviceGet - ,cuDeviceGetAttribute - ,cuDeviceGetLuid - ,cuDeviceGetUuid_v2 - ,cuDeviceTotalMem_v2 - ,cuDeviceGetName - ,cuDriverGetVersion - ,cuEventCreate - ,cuEventDestroy_v2 - ,cuEventElapsedTime - ,cuEventQuery - ,cuEventRecord - ,cuEventSynchronize - ,cuFuncGetAttribute - ,cuFuncSetCacheConfig - ,cuGetErrorName - ,cuGetErrorString - ,cuGraphicsMapResources - ,cuGraphicsResourceGetMappedPointer_v2 - ,cuGraphicsResourceGetMappedMipmappedArray - ,cuGraphicsSubResourceGetMappedArray - ,cuGraphicsUnmapResources - ,cuGraphicsUnregisterResource - ,cuInit - ,cuLaunchKernel - ,cuMemAlloc_v2 - ,cuMemcpyDtoD_v2 - ,cuMemcpyDtoH_v2 - ,cuMemcpyHtoD_v2 - ,cuMemcpyDtoDAsync_v2 - ,cuMemcpyDtoHAsync_v2 - ,cuMemcpyHtoDAsync_v2 - ,cuMemGetAddressRange_v2 - ,cuMemFree_v2 - ,cuMemFreeHost - ,cuMemGetInfo_v2 - ,cuMemHostAlloc - ,cuMemHostRegister_v2 - ,cuMemHostUnregister - ,cuMemsetD32_v2 - ,cuMemsetD32Async - ,cuMemsetD8_v2 - ,cuMemsetD8Async - ,cuModuleGetFunction - ,cuModuleGetGlobal_v2 - ,cuModuleLoadDataEx - ,cuModuleLoadFatBinary - ,cuModuleUnload - ,cuOccupancyMaxActiveBlocksPerMultiprocessor - ,cuPointerGetAttribute - ,cuStreamAddCallback - ,cuStreamCreate - ,cuStreamDestroy_v2 - ,cuStreamQuery - ,cuStreamSynchronize - ,cuStreamWaitEvent - ,cuSurfObjectCreate - ,cuSurfObjectDestroy - ,cuTexObjectCreate - ,cuTexObjectDestroy - ,cuImportExternalMemory - ,cuDestroyExternalMemory - ,cuExternalMemoryGetMappedBuffer - ,cuMemUnmap - 
,cuMemAddressFree - ,cuMemGetAllocationGranularity - ,cuMemAddressReserve - ,cuMemCreate - ,cuMemExportToShareableHandle - ,cuMemMap - ,cuMemRelease - ,cuMemSetAccess - ,cuMemImportFromShareableHandle - ,cuLaunchHostFunc - ,cuDestroyExternalSemaphore - ,cuImportExternalSemaphore - ,cuSignalExternalSemaphoresAsync - ,cuWaitExternalSemaphoresAsync - ,cuLogsRegisterCallback - ); - const CUDA& getCUDAFunctionTable() const {return m_cuda;} - - NBL_SYSTEM_DECLARE_DYNAMIC_FUNCTION_CALLER_CLASS(NVRTC,LibLoader, - nvrtcGetErrorString, - nvrtcVersion, - nvrtcAddNameExpression, - nvrtcCompileProgram, - nvrtcCreateProgram, - nvrtcDestroyProgram, - nvrtcGetLoweredName, - nvrtcGetPTX, - nvrtcGetPTXSize, - nvrtcGetProgramLog, - nvrtcGetProgramLogSize - ); - const NVRTC& getNVRTCFunctionTable() const {return m_nvrtc;} - - CCUDAHandler(CUDA&& _cuda, NVRTC&& _nvrtc, core::vector>&& _headers, core::smart_refctd_ptr&& _logger, int _version); - - // inline core::SRange getSTDHeaders() { auto begin = m_headers.empty() ? 
nullptr:(&m_headers[0].get()); @@ -169,29 +59,9 @@ class NBL_API2 CCUDAHandler : public core::IReferenceCounted inline const auto& getSTDHeaderContents() { return m_headerContents; } inline const auto& getSTDHeaderNames() { return m_headerNames; } - // - nvrtcResult createProgram(nvrtcProgram* prog, std::string&& source, const char* name, const int headerCount=0, const char* const* headerContents=nullptr, const char* const* includeNames=nullptr); - inline nvrtcResult createProgram(nvrtcProgram* prog, const char* source, const char* name, const int headerCount=0, const char* const* headerContents=nullptr, const char* const* includeNames=nullptr) - { - return createProgram(prog,std::string(source),name,headerCount,headerContents,includeNames); - } - inline nvrtcResult createProgram(nvrtcProgram* prog, system::IFile* file, const int headerCount=0, const char* const* headerContents=nullptr, const char* const* includeNames=nullptr) - { - const auto filesize = file->getSize(); - std::string source(filesize+1u,'0'); - - system::IFile::success_t bytesRead; - file->read(bytesRead,source.data(),0u,file->getSize()); - source.resize(bytesRead.getBytesProcessed()); - - return createProgram(prog,std::move(source),file->getFileName().string().c_str(),headerCount,headerContents,includeNames); - } - struct SCUDADeviceInfo { - CUdevice handle = {}; - CUuuid uuid = {}; - int attributes[CU_DEVICE_ATTRIBUTE_MAX] = {}; + std::array uuid = {}; }; inline core::vector const& getAvailableDevices() const @@ -199,112 +69,26 @@ class NBL_API2 CCUDAHandler : public core::IReferenceCounted return m_availableDevices; } - // - inline nvrtcResult compileProgram(nvrtcProgram prog, core::SRange options) - { - return m_nvrtc.pnvrtcCompileProgram(prog,options.size(),options.begin()); - } - - // - nvrtcResult getProgramLog(nvrtcProgram prog, std::string& log); - - // - struct ptx_and_nvrtcResult_t - { - core::smart_refctd_ptr ptx; - nvrtcResult result; - }; - ptx_and_nvrtcResult_t getPTX(nvrtcProgram 
prog); - - // - inline ptx_and_nvrtcResult_t compileDirectlyToPTX( - std::string&& source, const char* filename, core::SRange nvrtcOptions, - const int headerCount=0, const char* const* headerContents=nullptr, const char* const* includeNames=nullptr, - std::string* log=nullptr - ) - { - nvrtcProgram program = nullptr; - nvrtcResult result = NVRTC_ERROR_PROGRAM_CREATION_FAILURE; - auto cleanup = core::makeRAIIExiter([&]() -> void - { - if (result!=NVRTC_SUCCESS && program) - m_nvrtc.pnvrtcDestroyProgram(&program); // TODO: do we need to destroy the program if we successfully get PTX? - }); - - result = createProgram(&program,std::move(source),filename,headerCount,headerContents,includeNames); - return compileDirectlyToPTX_impl(result,program,nvrtcOptions,log); - } - inline ptx_and_nvrtcResult_t compileDirectlyToPTX( - const char* source, const char* filename, core::SRange nvrtcOptions, - const int headerCount=0, const char* const* headerContents=nullptr, const char* const* includeNames=nullptr, - std::string* log=nullptr - ) - { - return compileDirectlyToPTX(std::string(source),filename,nvrtcOptions,headerCount,headerContents,includeNames,log); - } - inline ptx_and_nvrtcResult_t compileDirectlyToPTX( - system::IFile* file, core::SRange nvrtcOptions, - const int headerCount=0, const char* const* headerContents=nullptr, const char* const* includeNames=nullptr, - std::string* log=nullptr - ) - { - nvrtcProgram program = nullptr; - nvrtcResult result = NVRTC_ERROR_PROGRAM_CREATION_FAILURE; - auto cleanup = core::makeRAIIExiter([&]() -> void - { - if (result!=NVRTC_SUCCESS && program) - m_nvrtc.pnvrtcDestroyProgram(&program); // TODO: do we need to destroy the program if we successfully get PTX? 
- }); - - result = createProgram(&program,file,headerCount,headerContents,includeNames); - return compileDirectlyToPTX_impl(result,program,nvrtcOptions,log); - } - core::smart_refctd_ptr createDevice(core::smart_refctd_ptr&& vulkanConnection, IPhysicalDevice* physicalDevice); protected: + ~CCUDAHandler() override; - ~CCUDAHandler() = default; - - // - inline ptx_and_nvrtcResult_t compileDirectlyToPTX_impl(nvrtcResult result, nvrtcProgram program, core::SRange nvrtcOptions, std::string* log) - { - if (result!=NVRTC_SUCCESS) - return {nullptr,result}; - - result = compileProgram(program,nvrtcOptions); - if (log) - getProgramLog(program,*log); - if (result!=NVRTC_SUCCESS) - return {nullptr,result}; - - return getPTX(program); - } + private: + friend struct cuda_native::SAccess; - // function tables - CUDA m_cuda; - NVRTC m_nvrtc; + struct SNativeState; + CCUDAHandler(std::unique_ptr&& nativeState, core::vector>&& _headers, core::smart_refctd_ptr&& _logger); - // + std::unique_ptr m_native; core::vector m_availableDevices; core::vector> m_headers; core::vector m_headerContents; core::vector m_headerNamesStorage; core::vector m_headerNames; system::logger_opt_smart_ptr m_logger; - int m_version; }; -#define ASSERT_CUDA_SUCCESS(expr, handler) \ - do { \ - const auto cudaResult = (expr); \ - if (!((handler)->defaultHandleResult(cudaResult))) { \ - assert(false); \ - } \ - } while(0) - } -#endif // _NBL_COMPILE_WITH_CUDA_ - #endif diff --git a/include/nbl/video/CCUDAImportedMemory.h b/include/nbl/video/CCUDAImportedMemory.h index 4e3bfcd085..ac41c110a2 100644 --- a/include/nbl/video/CCUDAImportedMemory.h +++ b/include/nbl/video/CCUDAImportedMemory.h @@ -1,42 +1,38 @@ -#ifndef _NBL_VIDEO_C_CUDA_IMPORTED_MEMORY_H -#define _NBL_VIDEO_C_CUDA_IMPORTED_MEMORY_H +#ifndef _NBL_VIDEO_C_CUDA_IMPORTED_MEMORY_H_ +#define _NBL_VIDEO_C_CUDA_IMPORTED_MEMORY_H_ -#ifdef _NBL_COMPILE_WITH_CUDA_ +#include "nbl/video/declarations.h" -#include "cuda.h" -#include "nvrtc.h" -#if CUDA_VERSION < 
9000 - #error "Need CUDA 9.0 SDK or higher." -#endif - -#endif // _NBL_COMPILE_WITH_CUDA +#include +#include namespace nbl::video { -class NBL_API2 CCUDAImportedMemory : public core::IReferenceCounted -{ - public: - - CCUDAImportedMemory(core::smart_refctd_ptr device, core::smart_refctd_ptr src, - CUexternalMemory cuExtMem) : - m_device(device), - m_src(src), - m_handle(cuExtMem) {} +class CCUDADevice; - ~CCUDAImportedMemory() override; +namespace cuda_native +{ +struct SAccess; +} - CUexternalMemory getInternalObject() const { return m_handle; } - CUresult getMappedBuffer(CUdeviceptr* mappedBuffer); +class NBL_API2 CCUDAImportedMemory : public core::IReferenceCounted +{ + public: + ~CCUDAImportedMemory() override; - private: + private: + friend class CCUDADevice; + friend struct cuda_native::SAccess; - core::smart_refctd_ptr m_device; - core::smart_refctd_ptr m_src; - CUexternalMemory m_handle; + struct SNativeState; + CCUDAImportedMemory(core::smart_refctd_ptr device, core::smart_refctd_ptr src, std::unique_ptr&& nativeState); + core::smart_refctd_ptr m_device; + core::smart_refctd_ptr m_src; + std::unique_ptr m_native; }; } -#endif \ No newline at end of file +#endif diff --git a/include/nbl/video/CCUDAImportedSemaphore.h b/include/nbl/video/CCUDAImportedSemaphore.h index 2e5010fa2d..c8bf77313e 100644 --- a/include/nbl/video/CCUDAImportedSemaphore.h +++ b/include/nbl/video/CCUDAImportedSemaphore.h @@ -4,43 +4,38 @@ #ifndef _NBL_VIDEO_C_CUDA_IMPORTED_SEMAPHORE_H_ #define _NBL_VIDEO_C_CUDA_IMPORTED_SEMAPHORE_H_ -#ifdef _NBL_COMPILE_WITH_CUDA_ +#include "nbl/video/declarations.h" -#include "cuda.h" -#include "nvrtc.h" -#if CUDA_VERSION < 9000 - #error "Need CUDA 9.0 SDK or higher." 
-#endif - -// useful includes in the future -//#include "cudaEGL.h" -//#include "cudaVDPAU.h" +#include +#include namespace nbl::video { +class CCUDADevice; + +namespace cuda_native +{ +struct SAccess; +} + class NBL_API2 CCUDAImportedSemaphore : public core::IReferenceCounted { - public: - - CUexternalSemaphore getInternalObject() const { return m_handle; } - CCUDAImportedSemaphore(core::smart_refctd_ptr device, - core::smart_refctd_ptr src, - CUexternalSemaphore semaphore) - : m_device(std::move(device)) - , m_src(std::move(src)) - , m_handle(semaphore) - {} - ~CCUDAImportedSemaphore() override; - - private: - core::smart_refctd_ptr m_device; - core::smart_refctd_ptr m_src; - CUexternalSemaphore m_handle; + public: + ~CCUDAImportedSemaphore() override; + + private: + friend class CCUDADevice; + friend struct cuda_native::SAccess; + + struct SNativeState; + CCUDAImportedSemaphore(core::smart_refctd_ptr device, core::smart_refctd_ptr src, std::unique_ptr&& nativeState); + + core::smart_refctd_ptr m_device; + core::smart_refctd_ptr m_src; + std::unique_ptr m_native; }; } -#endif // _NBL_COMPILE_WITH_CUDA_ - #endif diff --git a/include/nbl/video/CUDAInterop.h b/include/nbl/video/CUDAInterop.h new file mode 100644 index 0000000000..57e92ae647 --- /dev/null +++ b/include/nbl/video/CUDAInterop.h @@ -0,0 +1,13 @@ +// Copyright (C) 2018-2020 - DevSH Graphics Programming Sp. z O.O. +// This file is part of the "Nabla Engine". 
+// For conditions of distribution and use, see copyright notice in nabla.h +#ifndef _NBL_VIDEO_CUDA_INTEROP_H_INCLUDED_ +#define _NBL_VIDEO_CUDA_INTEROP_H_INCLUDED_ + +#include "nbl/video/CCUDADevice.h" +#include "nbl/video/CCUDAExportableMemory.h" +#include "nbl/video/CCUDAHandler.h" +#include "nbl/video/CCUDAImportedMemory.h" +#include "nbl/video/CCUDAImportedSemaphore.h" + +#endif diff --git a/include/nbl/video/EApiType.h b/include/nbl/video/EApiType.h index 7f99d40309..44a31ecf90 100644 --- a/include/nbl/video/EApiType.h +++ b/include/nbl/video/EApiType.h @@ -28,29 +28,8 @@ constexpr external_handle_t ExternalHandleNull = nullptr; constexpr external_handle_t ExternalHandleNull = -1; #endif -inline bool CloseExternalHandle(external_handle_t handle) -{ -#ifdef _WIN32 - return CloseHandle(handle); -#else - return (close(handle) == 0); -#endif -} - -inline external_handle_t DuplicateExternalHandle(external_handle_t handle) -{ -#ifdef _WIN32 - HANDLE re = ExternalHandleNull; - - const HANDLE cur = GetCurrentProcess(); - if (!DuplicateHandle(cur, handle, cur, &re, GENERIC_ALL, 0, DUPLICATE_SAME_ACCESS)) - return ExternalHandleNull; - - return re; -#else - return dup(handle); -#endif -} +NBL_API2 bool CloseExternalHandle(external_handle_t handle); +NBL_API2 external_handle_t DuplicateExternalHandle(external_handle_t handle); } diff --git a/include/nbl/video/declarations.h b/include/nbl/video/declarations.h index 37f2f864bf..4393af1768 100644 --- a/include/nbl/video/declarations.h +++ b/include/nbl/video/declarations.h @@ -24,9 +24,6 @@ #include "nbl/video/CVulkanImage.h" #include "nbl/video/surface/CSurfaceVulkan.h" -// CUDA -#include "nbl/video/CCUDAHandler.h" - // utilities #include "nbl/video/utilities/CDumbPresentationOracle.h" #include "nbl/video/utilities/ICommandPoolCache.h" @@ -44,4 +41,4 @@ //#include "nbl/video/IGPUVirtualTexture.h" -#endif \ No newline at end of file +#endif diff --git a/src/nbl/CMakeLists.txt b/src/nbl/CMakeLists.txt index 
692efec8bd..d56c223e34 100644 --- a/src/nbl/CMakeLists.txt +++ b/src/nbl/CMakeLists.txt @@ -95,12 +95,8 @@ configure_file("${NBL_ROOT_PATH}/include/nbl/config/BuildConfigOptions.h.in" "${ file(GENERATE OUTPUT "${CONFIG_OUTPUT}" INPUT "${CONFIG_DIRECOTORY}/.int/BuildConfigOptions.h.conf") nbl_install_file_spec("${CONFIG_OUTPUT}" nbl/config) -if (NBL_COMPILE_WITH_CUDA) - message(STATUS "Building with CUDA interop") - set(_NBL_COMPILE_WITH_CUDA_ ${NBL_COMPILE_WITH_CUDA}) - if (NBL_BUILD_OPTIX) - set(_NBL_BUILD_OPTIX_ ${NBL_BUILD_OPTIX}) - endif() +if (NBL_BUILD_OPTIX) + set(_NBL_BUILD_OPTIX_ ${NBL_BUILD_OPTIX}) endif() # => TODO: clean! @@ -128,6 +124,15 @@ set(NBL_CORE_SOURCES core/alloc/refctd_memory_resource.cpp core/hash/blake.cpp ) + +set(NBL_CUDA_INTEROP_SOURCES + video/CCUDADevice.cpp + video/CCUDAExportableMemory.cpp + video/CCUDAHandler.cpp + video/CCUDAImportedMemory.cpp + video/CCUDAImportedSemaphore.cpp +) + set(NBL_SYSTEM_SOURCES system/DefaultFuncPtrLoader.cpp system/IFileBase.cpp @@ -252,6 +257,7 @@ set(NBL_VIDEO_SOURCES video/IGPUAccelerationStructure.cpp video/IGPUCommandBuffer.cpp video/IQueue.cpp + video/EApiType.cpp video/IGPUDescriptorSet.cpp video/IDeviceMemoryAllocation.cpp video/IDeviceMemoryBacked.cpp @@ -291,12 +297,6 @@ set(NBL_VIDEO_SOURCES video/CVulkanEvent.cpp video/CSurfaceVulkan.cpp -# CUDA - video/CCUDAHandler.cpp - video/CCUDADevice.cpp - video/CCUDAImportedSemaphore.cpp - video/CCUDAExportableMemory.cpp - video/CCUDAImportedMemory.cpp ) set(NBL_SCENE_SOURCES @@ -315,6 +315,7 @@ set(NABLA_SRCS_COMMON ${NBL_VIDEO_SOURCES} ${NBL_SCENE_SOURCES} ${NBL_META_SOURCES} + ${NBL_CUDA_INTEROP_SOURCES} ) if(MSVC) @@ -426,7 +427,8 @@ if(NBL_CPACK_NO_BUILD_DIRECTORY_MODULES) endif() if(NBL_COMPILE_WITH_CUDA) - target_compile_definitions(Nabla PUBLIC _NBL_COMPILE_WITH_CUDA_) + target_compile_definitions(Nabla PRIVATE _NBL_COMPILE_WITH_CUDA_) + target_include_directories(Nabla PRIVATE ${CUDAToolkit_INCLUDE_DIRS}) endif() 
set(INTERFACE_BUILD_DEFINITIONS @@ -664,11 +666,6 @@ target_link_libraries(Nabla PRIVATE volk) # volk is part of public interface headers in Nabla target_compile_definitions(Nabla PUBLIC $<$:VK_USE_PLATFORM_WIN32_KHR>) -# CUDA -if (NBL_COMPILE_WITH_CUDA) - list(APPEND PUBLIC_BUILD_INCLUDE_DIRS "${CUDAToolkit_INCLUDE_DIRS}") -endif() - list(APPEND PUBLIC_BUILD_INCLUDE_DIRS # this should be PRIVATE, but things from /src (or /source) are sometimes included in things in /include and so examples have to put source dirs into theirs Include Path # -> TODO @@ -781,8 +778,11 @@ if(TARGET ngfx) ) endif() -# on MSVC it won't compile without this option! -target_compile_options(Nabla PUBLIC $<$:/bigobj>) +# on MSVC it won't compile without these options! +target_compile_options(Nabla PUBLIC + $<$:/bigobj> + $<$:/Zc:preprocessor> +) if(NBL_PCH) target_precompile_headers(Nabla @@ -793,11 +793,24 @@ if(NBL_PCH) ) endif() -# extensions start_tracking_variables_for_propagation_to_parent() add_subdirectory(ext EXCLUDE_FROM_ALL) propagate_changed_variables_to_parent_scope() +if(DEFINED NBL_EXT_CUDA_INTEROP_LIB AND TARGET ${NBL_EXT_CUDA_INTEROP_LIB}) + if(NBL_ENABLE_CONFIG_INSTALL AND NOT NBL_STATIC_BUILD) + install(TARGETS ${NBL_EXT_CUDA_INTEROP_LIB} + EXPORT NablaCUDAInteropExportTargets + COMPONENT Libraries + ) + install(EXPORT NablaCUDAInteropExportTargets + NAMESPACE Nabla:: + DESTINATION cmake + COMPONENT Libraries + ) + endif() +endif() + if(TARGET ${NBL_EXT_FULL_SCREEN_TRIANGLE_LIB}) set_target_properties(${NBL_EXT_FULL_SCREEN_TRIANGLE_LIB} PROPERTIES EXCLUDE_FROM_ALL OFF) nbl_install_lib_spec(${NBL_EXT_FULL_SCREEN_TRIANGLE_LIB} "nbl/ext/FULL_SCREEN_TRIANGLE") diff --git a/src/nbl/ext/CMakeLists.txt b/src/nbl/ext/CMakeLists.txt index f3b55531c2..264cfc7c2d 100644 --- a/src/nbl/ext/CMakeLists.txt +++ b/src/nbl/ext/CMakeLists.txt @@ -38,6 +38,14 @@ if (NBL_BUILD_OPTIX) ) endif() +add_subdirectory(CUDAInterop) +if (NBL_COMPILE_WITH_CUDA) + set(NBL_EXT_CUDA_INTEROP_LIB + 
${NBL_EXT_CUDA_INTEROP_LIB} + PARENT_SCOPE + ) +endif() + if (NBL_BUILD_IMGUI) add_subdirectory(ImGui) set(NBL_EXT_IMGUI_UI_INCLUDE_DIRS diff --git a/src/nbl/ext/CUDAInterop/CMakeLists.txt b/src/nbl/ext/CUDAInterop/CMakeLists.txt new file mode 100644 index 0000000000..a9e1663fa9 --- /dev/null +++ b/src/nbl/ext/CUDAInterop/CMakeLists.txt @@ -0,0 +1,26 @@ +include(common) +include(NablaCUDAInteropHelpers) + +if (NBL_COMPILE_WITH_CUDA) + set(NBL_EXT_CUDA_INTEROP_LIB "NblExtCUDA_INTEROP") + + file(GLOB NBL_EXT_CUDA_INTEROP_IDE_HEADERS CONFIGURE_DEPENDS "${NBL_ROOT_PATH}/include/nbl/ext/CUDAInterop/*.h") + set(NBL_EXT_CUDA_INTEROP_IDE_SOURCES + ${NBL_EXT_CUDA_INTEROP_IDE_HEADERS} + CMakeLists.txt + README.md + ) + set_source_files_properties(${NBL_EXT_CUDA_INTEROP_IDE_SOURCES} PROPERTIES HEADER_FILE_ONLY TRUE) + + # Header-only opt-in target. It builds no artifact and adds CUDA SDK usage requirements only for native interop consumers. + add_library(${NBL_EXT_CUDA_INTEROP_LIB} INTERFACE EXCLUDE_FROM_ALL ${NBL_EXT_CUDA_INTEROP_IDE_SOURCES}) + target_link_libraries(${NBL_EXT_CUDA_INTEROP_LIB} INTERFACE + Nabla + CUDA::toolkit + ) + set_target_properties(${NBL_EXT_CUDA_INTEROP_LIB} PROPERTIES EXPORT_NAME "ext::CUDAInterop") + add_library(Nabla::ext::CUDAInterop ALIAS ${NBL_EXT_CUDA_INTEROP_LIB}) + set(NBL_EXT_CUDA_INTEROP_LIB "${NBL_EXT_CUDA_INTEROP_LIB}" PARENT_SCOPE) +endif() + +add_subdirectory(smoke) diff --git a/src/nbl/ext/CUDAInterop/README.md b/src/nbl/ext/CUDAInterop/README.md new file mode 100644 index 0000000000..2ce46cbc93 --- /dev/null +++ b/src/nbl/ext/CUDAInterop/README.md @@ -0,0 +1,153 @@ +# CUDA Interop + +## Layout + +- `Nabla::Nabla` owns the SDK-free CUDA interop API in `nbl/video/CCUDA*.h` and its implementation in `src/nbl/video/CCUDA*.cpp`. +- Those headers do not include CUDA SDK headers. Consumers that only link `Nabla::Nabla` do not need `cuda.h`, `nvrtc.h`, or a CUDA SDK install just to parse Nabla headers. 
+- `Nabla::ext::CUDAInterop` is an `INTERFACE` target for native CUDA opt-in. It builds no library. It only adds `CUDAInteropNative.h`, `CUDA::toolkit`, and runtime-header discovery setup to targets that ask for raw CUDA interop. +- `CUDAInteropNative.h` is the only public opt-in header that includes CUDA SDK headers and exposes `cuda_native::*Accessor` classes for CUDA Driver API and NVRTC types. + +## CMake Usage + +Default Nabla usage stays SDK-free: + +```cmake +find_package(Nabla CONFIG REQUIRED) +target_link_libraries(app PRIVATE Nabla::Nabla) +``` + +Native CUDA interop is explicit: + +```cmake +find_package(Nabla CONFIG REQUIRED COMPONENTS CUDAInterop) +nbl_target_link_cuda_interop(native_app PRIVATE) +``` + +`nbl_target_link_cuda_interop` links `Nabla::ext::CUDAInterop` and writes `nbl_cuda_interop_runtime.json` next to the target executable during CMake generation. + +Optional overrides: + +```cmake +find_package(Nabla CONFIG REQUIRED COMPONENTS CUDAInterop) +nbl_target_link_cuda_interop(native_app PRIVATE + INCLUDE_DIRS "${cuda_runtime_headers}" +) + +nbl_target_link_cuda_interop(native_app PRIVATE + RUNTIME_JSON "${CMAKE_CURRENT_BINARY_DIR}/$<CONFIG>/my_cuda_runtime.json" +) +``` + +Consumers can also choose the SDK used for native compilation with: + +```cmake +cmake -S . -B build -DNabla_CUDA_TOOLKIT_ROOT=<path-to-cuda-toolkit> +``` + +This affects native opt-in compilation and generated runtime header discovery only. It does not rebuild Nabla and does not change the `Nabla.dll` ABI.
+ +## Native Usage + +```cpp +#include "nbl/ext/CUDAInterop/CUDAInteropNative.h" + +auto handler = nbl::video::CCUDAHandler::create(system, std::move(logger)); +auto cudaDevice = handler->createDevice(std::move(vulkanConnection), physicalDevice); + +auto memory = nbl::video::cuda_native::CCUDADeviceAccessor::createExportableMemory(*cudaDevice, { + .size = size, + .alignment = alignment, + .location = CU_MEM_LOCATION_TYPE_DEVICE, +}); + +std::string log; +std::string cudaSource = loadKernelText(); +auto compile = nbl::video::cuda_native::CCUDAHandlerAccessor::compileDirectlyToPTX( + *handler, + std::move(cudaSource), + "kernel.cu", + cudaDevice->geDefaultCompileOptions(), + log, + 0, + nullptr, + nullptr +); +``` + +Native access is not wrapped away. Opt-in code uses CUDA Driver API and NVRTC types directly through accessor classes: + +- `CCUDAHandlerAccessor` exposes CUDA/NVRTC function tables, NVRTC program helpers, PTX compilation, native device enumeration, and default error handling. +- `CCUDADeviceAccessor` exposes `CUdevice`, `CUcontext`, memory granularity, and CUDA allocation creation. +- `CCUDAExportableMemoryAccessor`, `CCUDAImportedMemoryAccessor`, and `CCUDAImportedSemaphoreAccessor` expose the raw CUDA handles needed for interop. +- Accessor methods take explicit Nabla references. Callers dereference `smart_refctd_ptr` at the call site instead of going through pointer/smart-pointer convenience overloads. +- `compileDirectlyToPTX` returns PTX/result and writes the NVRTC log to a required `std::string&`. There is no optional output pointer in the public API. + +Smoke examples: + +- `src/nbl/ext/CUDAInterop/smoke/public_boundary.cpp` checks that `Nabla::Nabla` headers stay SDK-free. +- `src/nbl/ext/CUDAInterop/smoke/clean_opt_in.cpp` checks default package usage without native opt-in. +- `src/nbl/ext/CUDAInterop/smoke/native_opt_in.cpp` checks native opt-in, runtime header discovery, `cuda_fp16.h`, NVRTC, and raw interop usage. 
+ +## ABI + +- `CCUDAHandler`, `CCUDADevice`, `CCUDAExportableMemory`, `CCUDAImportedMemory`, and `CCUDAImportedSemaphore` are exported from `Nabla.dll` through the normal Nabla ABI. +- Their public declarations do not expose CUDA SDK structs, CUDA SDK layouts, or `cuda.h` / `nvrtc.h` includes. +- CUDA implementation state is owned by Nabla through private `SNativeState` members. Consumers cannot construct CUDA wrapper objects with arbitrary internal CUDA state. +- `CUDAInteropNative.h` declares exported accessor classes whose definitions still live in `Nabla.dll`. The opt-in header owns only the CUDA SDK surface. Nabla owns the implementation and ABI. +- Native opt-in ABI uses CUDA Driver API handles/enums such as `CUdevice`, `CUcontext`, `CUdeviceptr`, `CUexternalMemory`, and `CUexternalSemaphore`, plus small fixed-layout parameter/result structs. +- SDK-sized arrays and other layouts derived from CUDA SDK constants stay private to Nabla. A consumer can build native opt-in code with its own compatible SDK independently from the SDK used to build Nabla. +- Runtime include-option construction is header-only and is not part of the exported ABI. +- The loaded CUDA driver and NVRTC runtime are validated at runtime. + +## Runtime Header Discovery + +NVRTC may need CUDA runtime headers when user kernels include files such as `cuda_fp16.h`, `vector_types.h`, or `cuda_runtime_api.h`. This is a runtime concern of applications that compile CUDA source with NVRTC, not a default `Nabla::Nabla` package requirement. + +- `nbl_target_link_cuda_interop` generates `nbl_cuda_interop_runtime.json` for the target that opted into native CUDA interop. +- The JSON is a build artifact. Nabla packages do not install host-specific CUDA paths. +- Package consumers generate their own JSON when they call `nbl_target_link_cuda_interop`. +- `NBL_CUDA_INTEROP_RUNTIME_JSON` can point runtime discovery at custom JSON files without rebuilding the application. 
+- Runtime lookup checks explicit JSON paths first, then executable-local JSON, app-local header bundles, explicit include-dir environment variables, `CUDA_PATH` style toolkit roots, Python/conda package layouts, and common system install roots. +- The probe looks for directories that contain CUDA runtime headers. It does not hardcode a CUDA major version in app-local paths. +- `cuda_native::CCUDAHandlerAccessor::compileDirectlyToPTX` appends discovered include directories to NVRTC options. Default discovery is cached after the first call. + +Production machines do not need the full CUDA SDK just because Nabla was built with CUDA. Applications that use NVRTC with CUDA runtime headers can provide those headers through generated JSON, a custom JSON path, an app-local bundle, an official runtime/header package, or an installed toolkit. + +Nabla could ship an app-local bundle of selected CUDA runtime headers and make it available to runtime discovery. That model is allowed by the NVIDIA CUDA EULA for the components listed in Attachment A. Nabla intentionally does not bundle these headers. Because of that, end users should prefer an official CUDA runtime/header package for production machines. An installed toolkit also works, but the full toolkit is mainly for developers compiling Nabla or native CUDA code. + +NVIDIA CUDA EULA allows redistribution only for selected components. The distribution section says: "The portions of the SDK that are distributable under the Agreement are listed in Attachment A." Attachment A says: "The following CUDA Toolkit files may be distributed with applications developed by you." See: + +- https://docs.nvidia.com/cuda/eula/#distribution +- https://docs.nvidia.com/cuda/eula/#attachment-a + +This means the Attachment A header groups below can be redistributed with applications under the EULA terms. It does not mean the full CUDA SDK can be redistributed. 
Applications that need NVRTC runtime compilation can decide whether to ship the allowed headers, depend on an official runtime/header package, or point discovery at an installed toolkit/header package. + +Attachment A lists header groups relevant to NVRTC runtime compilation: + +- NVIDIA Runtime Compilation Library and Header: `nvrtc.h` +- CUDA Floating Point Type Headers: `cuda_fp16.h`, `cuda_fp16.hpp`, `cuda_bf16.h`, `cuda_bf16.hpp`, `cuda_fp8.h`, `cuda_fp8.hpp`, `cuda_fp6.h`, `cuda_fp6.hpp`, `cuda_fp4.h`, `cuda_fp4.hpp` +- CUDA Headers for Runtime Compilation: `crt/host_defines.h`, `cuComplex.h`, `cuda_awbarrier_helpers.h`, `cuda_awbarrier_primitives.h`, `cuda_awbarrier.h`, `cuda_pipeline_helpers.h`, `cuda_pipeline_primitives.h`, `cuda_pipeline.h`, `cuda_runtime_api.h`, `cuda.h`, `cuda/std/tuple`, `cuda/std/type_traits`, `cuda/std/utility`, `device_types.h`, `vector_functions.h`, and `vector_types.h` + +CuPy documents the same NVRTC issue for CUDA 12.2+. Their install docs say: "On CUDA 12.2 or later, CUDA Runtime header files are required to compile kernels in CuPy." They show the common `vector_types.h` failure and recommend CUDA runtime header packages for PyPI/system package installs: + +- https://docs.cupy.dev/en/v13.5.0/install.html#cupy-always-raises-nvrtc-error-compilation-6 +- https://github.com/cupy/cupy/issues/8466 + +## CUDA ON/OFF Builds + +- SDK-free public headers stay stable for CUDA ON and CUDA OFF Nabla builds. +- CUDA implementation headers and SDK includes stay behind `_NBL_COMPILE_WITH_CUDA_`. +- CUDA OFF implementations are local stubs in the same `.cpp` files. Factory/import/export paths return `nullptr` for unavailable CUDA features instead of producing unresolved symbols. +- The Nabla source list stays stable, so CUDA interop `.cpp` files remain visible in IDE projects for both CUDA ON and CUDA OFF builds. 
+ +## Related Designs + +The split follows the same boundary pattern used by mature GPU projects: default headers avoid vendor SDK requirements, native access is explicit, and implementation details stay outside the default public API. + +- OpenCV keeps common CUDA-facing headers independent from CUDA Runtime API and exposes raw `cudaStream_t` / `cudaEvent_t` through a separate accessor header: https://github.com/opencv/opencv/blob/808d2d596c475d95fedb6025c9ed425d62bba04c/modules/core/include/opencv2/core/cuda_stream_accessor.hpp#L50-L79 +- OpenCV keeps CUDA implementation headers private and includes `cuda.h`, `cuda_runtime.h`, and NPP there: https://github.com/opencv/opencv/blob/808d2d596c475d95fedb6025c9ed425d62bba04c/modules/core/include/opencv2/core/private.cuda.hpp#L47-L61 +- Blender/Cycles exposes a CUDA device boundary without CUDA SDK headers in the boundary header: https://github.com/blender/blender/blob/794c527e8595a9f448e0143a217d0ceb648c5e7e/intern/cycles/device/cuda/device.h#L7-L27 +- Blender/Cycles keeps `CUdevice`, `CUcontext`, `cuda.h`, and `cuew.h` in the CUDA implementation header/source: https://github.com/blender/blender/blob/794c527e8595a9f448e0143a217d0ceb648c5e7e/intern/cycles/device/cuda/device_impl.h#L12-L30 +- OpenMM keeps the CUDA platform boundary on OpenMM types/properties in `CudaPlatform.h`, while `CudaContext.h` is the CUDA-specific low-level header that includes CUDA SDK headers and exposes `CUmodule` / `CUfunction`: https://github.com/openmm/openmm/blob/master/platforms/cuda/include/CudaPlatform.h#L48-L120 and https://github.com/openmm/openmm/blob/master/platforms/cuda/include/CudaContext.h#L32-L52 +- GROMACS gates CUDA source handling behind `GMX_GPU_CUDA` in the library build and keeps CUDA runtime types in internal GPU utility headers: https://gitlab.com/gromacs/gromacs/-/blob/main/src/gromacs/CMakeLists.txt#L339-L367 and https://gitlab.com/gromacs/gromacs/-/blob/main/src/gromacs/gpu_utils/gputraits.cuh#L44-L58 +- ONNX Runtime 
keeps the public C API provider-neutral and routes CUDA through provider-specific bridge/factory code: https://github.com/microsoft/onnxruntime/blob/main/include/onnxruntime/core/session/onnxruntime_c_api.h#L1-L80 and https://github.com/microsoft/onnxruntime/blob/main/onnxruntime/core/session/provider_bridge_ort.cc#L110-L150 diff --git a/src/nbl/ext/CUDAInterop/smoke/CMakeLists.txt b/src/nbl/ext/CUDAInterop/smoke/CMakeLists.txt new file mode 100644 index 0000000000..7118eeff09 --- /dev/null +++ b/src/nbl/ext/CUDAInterop/smoke/CMakeLists.txt @@ -0,0 +1,38 @@ +cmake_minimum_required(VERSION 3.30) +project(NblExtCUDAInteropSmoke CXX) + +option(NBL_CUDA_INTEROP_SMOKE_WITH_NATIVE "Build the CUDA native opt-in smoke from an installed Nabla package." OFF) +set(NBL_CUDA_INTEROP_SMOKE_RUNTIME_JSON "" CACHE FILEPATH "Optional CUDA interop runtime JSON path used by the native smoke.") + +if(NOT TARGET Nabla::Nabla) + set(_NBL_CUDA_INTEROP_SMOKE_COMPONENTS Core) + if(NBL_CUDA_INTEROP_SMOKE_WITH_NATIVE) + list(APPEND _NBL_CUDA_INTEROP_SMOKE_COMPONENTS CUDAInterop) + endif() + find_package(Nabla REQUIRED CONFIG COMPONENTS ${_NBL_CUDA_INTEROP_SMOKE_COMPONENTS}) +endif() + +enable_testing() + +function(nbl_add_cuda_interop_smoke TARGET_NAME SOURCE_FILE) + add_executable(${TARGET_NAME} ${SOURCE_FILE}) + target_compile_features(${TARGET_NAME} PRIVATE cxx_std_20) + + add_test(NAME ${TARGET_NAME} COMMAND $<TARGET_FILE:${TARGET_NAME}>) +endfunction() + +nbl_add_cuda_interop_smoke(NblExtCUDAInteropPublicBoundarySmoke public_boundary.cpp) +target_link_libraries(NblExtCUDAInteropPublicBoundarySmoke PRIVATE Nabla::Nabla) + +nbl_add_cuda_interop_smoke(NblExtCUDAInteropCleanNablaSmoke clean_opt_in.cpp) +target_link_libraries(NblExtCUDAInteropCleanNablaSmoke PRIVATE Nabla::Nabla) + +if(TARGET Nabla::ext::CUDAInterop) + nbl_add_cuda_interop_smoke(NblExtCUDAInteropNativeOptInSmoke native_opt_in.cpp) + set(_nbl_cuda_interop_smoke_args PRIVATE) + if(NBL_CUDA_INTEROP_SMOKE_RUNTIME_JSON) + list(APPEND
_nbl_cuda_interop_smoke_args RUNTIME_JSON "${NBL_CUDA_INTEROP_SMOKE_RUNTIME_JSON}") + target_compile_definitions(NblExtCUDAInteropNativeOptInSmoke PRIVATE NBL_CUDA_INTEROP_SMOKE_RUNTIME_JSON="${NBL_CUDA_INTEROP_SMOKE_RUNTIME_JSON}") + endif() + nbl_target_link_cuda_interop(NblExtCUDAInteropNativeOptInSmoke ${_nbl_cuda_interop_smoke_args}) +endif() diff --git a/src/nbl/ext/CUDAInterop/smoke/clean_opt_in.cpp b/src/nbl/ext/CUDAInterop/smoke/clean_opt_in.cpp new file mode 100644 index 0000000000..31bf461804 --- /dev/null +++ b/src/nbl/ext/CUDAInterop/smoke/clean_opt_in.cpp @@ -0,0 +1,39 @@ +#include "nbl/video/CUDAInterop.h" +#include "nbl/system/IApplicationFramework.h" + +#include + +#ifdef _NBL_COMPILE_WITH_CUDA_ +#error "Nabla::Nabla must not propagate the CUDA build define." +#endif + +#ifdef CUDA_VERSION +#error "Nabla::Nabla must not require CUDA SDK headers." +#endif + +namespace +{ + +class CUDAInteropCleanOptInSmoke final : public nbl::system::IApplicationFramework +{ + using base_t = nbl::system::IApplicationFramework; + +public: + using base_t::base_t; + + bool onAppInitialized(nbl::core::smart_refctd_ptr&&) override + { + static_assert(std::is_class_v); + static_assert(std::is_class_v); + static_assert(std::is_class_v); + static_assert(std::is_class_v); + return isAPILoaded(); + } + + void workLoopBody() override {} + bool keepRunning() override { return false; } +}; + +} + +NBL_MAIN_FUNC(CUDAInteropCleanOptInSmoke) diff --git a/src/nbl/ext/CUDAInterop/smoke/native_opt_in.cpp b/src/nbl/ext/CUDAInterop/smoke/native_opt_in.cpp new file mode 100644 index 0000000000..5d35ec8bed --- /dev/null +++ b/src/nbl/ext/CUDAInterop/smoke/native_opt_in.cpp @@ -0,0 +1,161 @@ +#include "nbl/ext/CUDAInterop/CUDAInteropNative.h" +#include "nbl/system/IApplicationFramework.h" + +#include +#include +#include +#include +#include +#include + +#ifndef CUDA_VERSION +#error "Nabla::ext::CUDAInterop must expose CUDA SDK headers." 
+#endif + +namespace +{ +using namespace nbl; +using namespace nbl::video; + +[[maybe_unused]] bool compileVulkanCudaInteropRecipe( + CCUDADevice& cudaDevice, + ILogicalDevice* vulkanDevice, + core::smart_refctd_ptr vulkanMemory, + core::smart_refctd_ptr vulkanSemaphore) +{ + auto cudaMemory = cuda_native::CCUDADeviceAccessor::createExportableMemory(cudaDevice, { + .size = 4096, + .alignment = 4096, + .location = CU_MEM_LOCATION_TYPE_DEVICE, + }); + if (!cudaMemory) + return false; + + auto exportedToVulkan = cudaMemory->exportAsMemory(vulkanDevice); + auto importedFromVulkan = cudaDevice.importExternalMemory(std::move(vulkanMemory)); + auto importedSemaphore = cudaDevice.importExternalSemaphore(std::move(vulkanSemaphore)); + + CUdeviceptr mappedVulkanMemory = 0; + if (importedFromVulkan) + cuda_native::CCUDAImportedMemoryAccessor::getMappedBuffer(*importedFromVulkan,&mappedVulkanMemory); + + const CUdeviceptr cudaDevicePtr = cuda_native::CCUDAExportableMemoryAccessor::getDeviceptr(*cudaMemory); + const CUexternalSemaphore cudaSemaphore = importedSemaphore ? 
cuda_native::CCUDAImportedSemaphoreAccessor::getInternalObject(*importedSemaphore):nullptr; + return exportedToVulkan.get() && mappedVulkanMemory && cudaDevicePtr && cudaSemaphore; +} + +bool cudaDriverRoundtrip(CCUDAHandler& handler, CUdevice device) +{ + auto& cuda = cuda_native::CCUDAHandlerAccessor::getCUDAFunctionTable(handler); + + CUcontext context = nullptr; + if (cuda.pcuDevicePrimaryCtxRetain(&context, device)!=CUDA_SUCCESS) + return false; + + CUcontext poppedContext = nullptr; + bool contextPushed = false; + auto releaseContext = [&]() + { + if (context) + { + if (contextPushed) + cuda.pcuCtxPopCurrent_v2(&poppedContext); + cuda.pcuDevicePrimaryCtxRelease_v2(device); + } + }; + + if (cuda.pcuCtxPushCurrent_v2(context)!=CUDA_SUCCESS) + { + releaseContext(); + return false; + } + contextPushed = true; + + constexpr std::array input = {0x12345678u, 0x90abcdefu, 0xfedcba09u, 0x87654321u}; + std::array output = {}; + + CUdeviceptr deviceMemory = 0; + bool ok = cuda.pcuMemAlloc_v2(&deviceMemory, sizeof(input))==CUDA_SUCCESS; + if (ok) + ok = cuda.pcuMemcpyHtoD_v2(deviceMemory,input.data(),sizeof(input))==CUDA_SUCCESS; + if (ok) + ok = cuda.pcuMemcpyDtoH_v2(output.data(),deviceMemory,sizeof(output))==CUDA_SUCCESS; + if (deviceMemory) + ok = cuda.pcuMemFree_v2(deviceMemory)==CUDA_SUCCESS && ok; + + releaseContext(); + return ok && std::ranges::equal(input, output); +} + +bool cudaFp16HeaderCompileProbe(CCUDAHandler& handler) +{ + constexpr const char* Source = R"cuda( + #include + extern "C" __global__ void fp16_probe(unsigned short* out) + { + out[0] = sizeof(__half); + } + )cuda"; + + std::string log; + auto compile = cuda_native::CCUDAHandlerAccessor::compileDirectlyToPTX( + handler, + std::string(Source), + "cuda_fp16_discovery_probe.cu", + {nullptr,nullptr}, + log, + 0, + nullptr, + nullptr + ); + return compile.result==NVRTC_SUCCESS && compile.ptx && compile.ptx->getSize()>0u; +} +} + +class CUDAInteropNativeOptInSmoke final : public 
nbl::system::IApplicationFramework +{ + using base_t = nbl::system::IApplicationFramework; + +public: + using base_t::base_t; + + bool onAppInitialized(nbl::core::smart_refctd_ptr&&) override + { + if (!isAPILoaded()) + return false; + + static_assert(std::is_same_v())), CUdevice>); + + #ifdef NBL_CUDA_INTEROP_SMOKE_RUNTIME_JSON + const auto runtimeEnvironment = nbl::video::cuda_interop::findRuntimeCompileEnvironment({}, {NBL_CUDA_INTEROP_SMOKE_RUNTIME_JSON}); + if (!std::filesystem::exists(NBL_CUDA_INTEROP_SMOKE_RUNTIME_JSON)) + return false; + #else + const auto runtimeEnvironment = nbl::video::cuda_interop::findRuntimeCompileEnvironment(); + #endif + const auto includeOptions = nbl::video::cuda_interop::makeNVRTCIncludeOptions(runtimeEnvironment); + const auto hasRuntimeHeaders = std::find_if(runtimeEnvironment.includeDirs.begin(),runtimeEnvironment.includeDirs.end(),[](const auto& includeDir) { + return std::filesystem::exists(includeDir/"cuda_fp16.h") || std::filesystem::exists(includeDir/"cuda_runtime_api.h"); + })!=runtimeEnvironment.includeDirs.end(); + if (includeOptions.empty() || !hasRuntimeHeaders) + return false; + + auto handler = nbl::video::CCUDAHandler::create(nullptr, nullptr); + if (!handler) + return true; + + if (!cudaFp16HeaderCompileProbe(*handler)) + return false; + + const auto& devices = nbl::video::cuda_native::CCUDAHandlerAccessor::getAvailableDevices(*handler); + if (devices.empty()) + return true; + + return cudaDriverRoundtrip(*handler, devices.front().handle); + } + + void workLoopBody() override {} + bool keepRunning() override { return false; } +}; + +NBL_MAIN_FUNC(CUDAInteropNativeOptInSmoke) diff --git a/src/nbl/ext/CUDAInterop/smoke/public_boundary.cpp b/src/nbl/ext/CUDAInterop/smoke/public_boundary.cpp new file mode 100644 index 0000000000..dc1c247806 --- /dev/null +++ b/src/nbl/ext/CUDAInterop/smoke/public_boundary.cpp @@ -0,0 +1,52 @@ +#include "nabla.h" + +#ifdef _NBL_COMPILE_WITH_CUDA_ +#error "Nabla consumers must not get 
the CUDA opt-in define." +#endif + +#ifdef CUDA_VERSION +#error "Nabla consumers must not include CUDA SDK headers." +#endif + +#include "nbl/system/IApplicationFramework.h" + +#ifdef _NBL_COMPILE_WITH_CUDA_ +#error "Nabla consumers must not get the CUDA opt-in define." +#endif + +#ifdef CUDA_VERSION +#error "Nabla consumers must not include CUDA SDK headers." +#endif + +#include "nbl/video/CUDAInterop.h" + +#ifdef _NBL_COMPILE_WITH_CUDA_ +#error "Nabla consumers must not get the CUDA opt-in define." +#endif + +#ifdef CUDA_VERSION +#error "Nabla consumers must not include CUDA SDK headers." +#endif + +namespace +{ + +class CUDAInteropPublicBoundarySmoke final : public nbl::system::IApplicationFramework +{ + using base_t = nbl::system::IApplicationFramework; + +public: + using base_t::base_t; + + bool onAppInitialized(nbl::core::smart_refctd_ptr&&) override + { + return isAPILoaded(); + } + + void workLoopBody() override {} + bool keepRunning() override { return false; } +}; + +} + +NBL_MAIN_FUNC(CUDAInteropPublicBoundarySmoke) diff --git a/src/nbl/video/CCUDADevice.cpp b/src/nbl/video/CCUDADevice.cpp index 27f8f6f906..359cd093a1 100644 --- a/src/nbl/video/CCUDADevice.cpp +++ b/src/nbl/video/CCUDADevice.cpp @@ -1,15 +1,15 @@ // Copyright (C) 2018-2020 - DevSH Graphics Programming Sp. z O.O. // This file is part of the "Nabla Engine". 
// For conditions of distribution and use, see copyright notice in nabla.h -#include "nbl/video/CCUDADevice.h" +#include "nbl/video/CUDAInterop.h" + +#ifdef _NBL_COMPILE_WITH_CUDA_ +#include "CUDAInteropNativeState.hpp" #ifdef _WIN32 #include #endif -#include "nbl/video/CCUDAImportedMemory.h" - -#ifdef _NBL_COMPILE_WITH_CUDA_ namespace nbl::video { @@ -17,28 +17,31 @@ CCUDADevice::CCUDADevice( core::smart_refctd_ptr&& vulkanConnection, IPhysicalDevice* const vulkanDevice, const E_VIRTUAL_ARCHITECTURE virtualArchitecture, - CUdevice device, + std::unique_ptr&& nativeState, core::smart_refctd_ptr&& handler) : m_logger(vulkanDevice->getDebugCallback()->getLogger()), m_defaultCompileOptions(), m_vulkanConnection(std::move(vulkanConnection)), m_physicalDevice(vulkanDevice), m_virtualArchitecture(virtualArchitecture), - m_handle(device), m_handler(std::move(handler)), - m_allocationGranularity{} + m_native(std::move(nativeState)) { + assert(m_native); + m_defaultCompileOptions.push_back("--std=c++14"); m_defaultCompileOptions.push_back(virtualArchCompileOption[m_virtualArchitecture]); m_defaultCompileOptions.push_back("-dc"); m_defaultCompileOptions.push_back("-use_fast_math"); - const auto& cu = m_handler->getCUDAFunctionTable(); + const auto& cu = cuda_native::CCUDAHandlerAccessor::getCUDAFunctionTable(*m_handler); - ASSERT_CUDA_SUCCESS(cu.pcuCtxCreate_v4(&m_context, nullptr, 0, m_handle), m_handler); - ASSERT_CUDA_SUCCESS(cu.pcuCtxSetCurrent(m_context), m_handler); + if (!cuda_native::CCUDAHandlerAccessor::defaultHandleResult(*m_handler, cu.pcuCtxCreate_v4(&m_native->context, nullptr, 0, m_native->handle))) + assert(false); + if (!cuda_native::CCUDAHandlerAccessor::defaultHandleResult(*m_handler, cu.pcuCtxSetCurrent(m_native->context))) + assert(false); - for (uint32_t locationType = 0; locationType < m_allocationGranularity.size(); ++locationType) + for (uint32_t locationType = 0; locationType < m_native->allocationGranularity.size(); ++locationType) { #ifdef _WIN32 
@@ -49,24 +52,48 @@ CCUDADevice::CCUDADevice( const auto prop = CUmemAllocationProp{ .type = CU_MEM_ALLOCATION_TYPE_PINNED, - .requestedHandleTypes = ALLOCATION_HANDLE_TYPE, - .location = { .type = static_cast(locationType), .id = m_handle }, + .requestedHandleTypes = cuda_native::SAccess::allocationHandleType(), + .location = { .type = static_cast(locationType), .id = m_native->handle }, #ifdef _WIN32 .win32HandleMetaData = &metadata, #endif }; - ASSERT_CUDA_SUCCESS(cu.pcuMemGetAllocationGranularity(&m_allocationGranularity[locationType], &prop, CU_MEM_ALLOC_GRANULARITY_MINIMUM), m_handler); + if (!cuda_native::CCUDAHandlerAccessor::defaultHandleResult(*m_handler, cu.pcuMemGetAllocationGranularity(&m_native->allocationGranularity[locationType], &prop, CU_MEM_ALLOC_GRANULARITY_MINIMUM))) + assert(false); } } -size_t CCUDADevice::roundToGranularity(CUmemLocationType location, size_t size) const +namespace cuda_native +{ + +CUdevice CCUDADeviceAccessor::getInternalObject(const CCUDADevice& device) +{ + return SAccess::native(device).handle; +} + +CUcontext CCUDADeviceAccessor::getContext(const CCUDADevice& device) +{ + return SAccess::native(device).context; +} + +size_t CCUDADeviceAccessor::roundToGranularity(const CCUDADevice& device, CUmemLocationType location, size_t size) { - return ((size - 1) / m_allocationGranularity[location] + 1) * m_allocationGranularity[location]; + const auto& granularity = SAccess::native(device).allocationGranularity[location]; + return ((size - 1) / granularity + 1) * granularity; +} + } -CUresult CCUDADevice::reserveAddressAndMapMemory(CUdeviceptr* outPtr, size_t size, size_t alignment, CUmemLocationType location, CUmemGenericAllocationHandle memory) const +static bool isDeviceLocal(CUmemLocationType location) { - const auto& cu = m_handler->getCUDAFunctionTable(); + return location==CU_MEM_LOCATION_TYPE_DEVICE; +} + +static CUresult reserveAddressAndMapMemory(const CCUDADevice& device, CUdeviceptr* outPtr, size_t size, size_t 
alignment, CUmemLocationType location, CUmemGenericAllocationHandle memory) +{ + const auto handler = device.getHandler(); + const auto& native = cuda_native::SAccess::native(device); + const auto& cu = cuda_native::CCUDAHandlerAccessor::getCUDAFunctionTable(*handler); CUdeviceptr ptr = 0; if (const auto err = cu.pcuMemAddressReserve(&ptr, size, alignment, 0, 0); CUDA_SUCCESS != err) @@ -74,19 +101,22 @@ CUresult CCUDADevice::reserveAddressAndMapMemory(CUdeviceptr* outPtr, size_t siz if (const auto err = cu.pcuMemMap(ptr, size, 0, memory, 0); CUDA_SUCCESS != err) { - ASSERT_CUDA_SUCCESS(cu.pcuMemAddressFree(ptr, size), m_handler); + if (!cuda_native::CCUDAHandlerAccessor::defaultHandleResult(*handler, cu.pcuMemAddressFree(ptr, size))) + assert(false); return err; } CUmemAccessDesc accessDesc = { - .location = { .type = location, .id = m_handle }, + .location = { .type = location, .id = native.handle }, .flags = CU_MEM_ACCESS_FLAGS_PROT_READWRITE, }; if (auto err = cu.pcuMemSetAccess(ptr, size, &accessDesc, 1); CUDA_SUCCESS != err) { - ASSERT_CUDA_SUCCESS(cu.pcuMemUnmap(ptr, size), m_handler); - ASSERT_CUDA_SUCCESS(cu.pcuMemAddressFree(ptr, size), m_handler); + if (!cuda_native::CCUDAHandlerAccessor::defaultHandleResult(*handler, cu.pcuMemUnmap(ptr, size))) + assert(false); + if (!cuda_native::CCUDAHandlerAccessor::defaultHandleResult(*handler, cu.pcuMemAddressFree(ptr, size))) + assert(false); return err; } @@ -95,11 +125,23 @@ CUresult CCUDADevice::reserveAddressAndMapMemory(CUdeviceptr* outPtr, size_t siz return CUDA_SUCCESS; } -core::smart_refctd_ptr CCUDADevice::createExportableMemory(CCUDAExportableMemory::SCreationParams&& inParams) +namespace cuda_native { - CCUDAExportableMemory::SCachedCreationParams params = { inParams }; - auto& cu = m_handler->getCUDAFunctionTable(); +core::smart_refctd_ptr CCUDADeviceAccessor::createExportableMemory(CCUDADevice& device, SExportableMemoryCreationParams&& inParams) +{ + const auto handler = device.getHandler(); + auto& 
native = SAccess::native(device); + auto logger = SAccess::logger(device); + + CCUDAExportableMemory::SCachedCreationParams params = { + .size = inParams.size, + .alignment = inParams.alignment, + .granularSize = CCUDADeviceAccessor::roundToGranularity(device, inParams.location, inParams.size), + .deviceLocal = isDeviceLocal(inParams.location) + }; + + auto& cu = CCUDAHandlerAccessor::getCUDAFunctionTable(*handler); #ifdef _WIN32 OBJECT_ATTRIBUTES metadata = { @@ -109,34 +151,36 @@ core::smart_refctd_ptr CCUDADevice::createExportableMemor const auto prop = CUmemAllocationProp{ .type = CU_MEM_ALLOCATION_TYPE_PINNED, - .requestedHandleTypes = ALLOCATION_HANDLE_TYPE, - .location = { .type = params.location, .id = m_handle }, + .requestedHandleTypes = SAccess::allocationHandleType(), + .location = { .type = inParams.location, .id = native.handle }, #ifdef _WIN32 .win32HandleMetaData = &metadata, #endif }; - params.granularSize = roundToGranularity(params.location, params.size); + auto nativeState = SAccess::makeExportableMemoryNativeState(); CUmemGenericAllocationHandle mem; if(auto err = cu.pcuMemCreate(&mem, params.granularSize, &prop, 0); CUDA_SUCCESS != err) { - m_logger.log("Fail to create memory handle!", system::ILogger::ELL_ERROR); + logger.log("Fail to create memory handle!", system::ILogger::ELL_ERROR); return nullptr; } if (auto err = cu.pcuMemExportToShareableHandle(¶ms.externalHandle, mem, prop.requestedHandleTypes, 0); CUDA_SUCCESS != err) { - m_logger.log("Fail to create externalHandle!", system::ILogger::ELL_ERROR); - ASSERT_CUDA_SUCCESS(cu.pcuMemRelease(mem), m_handler); + logger.log("Fail to create externalHandle!", system::ILogger::ELL_ERROR); + if (!CCUDAHandlerAccessor::defaultHandleResult(*handler, cu.pcuMemRelease(mem))) + assert(false); return nullptr; } - if (const auto err = reserveAddressAndMapMemory(¶ms.ptr, params.granularSize, params.alignment, params.location, mem); CUDA_SUCCESS != err) + if (const auto err = 
reserveAddressAndMapMemory(device,&SAccess::deviceptr(*nativeState), params.granularSize, params.alignment, inParams.location, mem); CUDA_SUCCESS != err) { - m_logger.log("Fail to reserve address and map memory!", system::ILogger::ELL_ERROR); + logger.log("Fail to reserve address and map memory!", system::ILogger::ELL_ERROR); - ASSERT_CUDA_SUCCESS(cu.pcuMemRelease(mem), m_handler); + if (!CCUDAHandlerAccessor::defaultHandleResult(*handler, cu.pcuMemRelease(mem))) + assert(false); bool closeSucceed = CloseExternalHandle(params.externalHandle); assert(closeSucceed); @@ -151,12 +195,14 @@ core::smart_refctd_ptr CCUDADevice::createExportableMemor return nullptr; } - return core::make_smart_refctd_ptr(core::smart_refctd_ptr(this), std::move(params)); + return SAccess::makeExportableMemory(core::smart_refctd_ptr(&device),std::move(params),std::move(nativeState)); +} + } core::smart_refctd_ptr CCUDADevice::importExternalMemory(core::smart_refctd_ptr&& mem) { - const auto& cu = m_handler->getCUDAFunctionTable(); + const auto& cu = cuda_native::CCUDAHandlerAccessor::getCUDAFunctionTable(*m_handler); const auto handleType = mem->getCreationParams().externalHandleType; if (!handleType) return nullptr; @@ -179,12 +225,15 @@ core::smart_refctd_ptr CCUDADevice::importExternalMemory(co m_logger.log("Fail to import external memory into CUDA!", system::ILogger::ELL_ERROR); return nullptr; } - return core::make_smart_refctd_ptr(core::smart_refctd_ptr(this), std::move(mem), cuExtMem); + return core::smart_refctd_ptr( + new CCUDAImportedMemory(core::smart_refctd_ptr(this),std::move(mem),std::make_unique(cuExtMem)), + core::dont_grab + ); } core::smart_refctd_ptr CCUDADevice::importExternalSemaphore(core::smart_refctd_ptr&& sema) { - auto& cu = m_handler->getCUDAFunctionTable(); + auto& cu = cuda_native::CCUDAHandlerAccessor::getCUDAFunctionTable(*m_handler); auto handleType = sema->getCreationParams().externalHandleTypes.value; if (!handleType) @@ -209,12 +258,54 @@ 
core::smart_refctd_ptr CCUDADevice::importExternalSemaph return nullptr; } - return core::make_smart_refctd_ptr(core::smart_refctd_ptr(this), std::move(sema), cusema); + return core::smart_refctd_ptr( + new CCUDAImportedSemaphore(core::smart_refctd_ptr(this),std::move(sema),std::make_unique(cusema)), + core::dont_grab + ); } CCUDADevice::~CCUDADevice() { - ASSERT_CUDA_SUCCESS(m_handler->getCUDAFunctionTable().pcuCtxDestroy_v2(m_context), m_handler); + if (!cuda_native::CCUDAHandlerAccessor::defaultHandleResult(*m_handler, cuda_native::CCUDAHandlerAccessor::getCUDAFunctionTable(*m_handler).pcuCtxDestroy_v2(m_native->context))) + assert(false); +} + +} + +#else + +namespace nbl::video +{ + +// CUDA OFF stub keeps the clean public API linkable and reports feature absence with nullptr instead of unresolved symbols. +struct CCUDADevice::SNativeState {}; + +CCUDADevice::CCUDADevice( + core::smart_refctd_ptr&& vulkanConnection, + IPhysicalDevice* const vulkanDevice, + const E_VIRTUAL_ARCHITECTURE virtualArchitecture, + std::unique_ptr&& nativeState, + core::smart_refctd_ptr&& handler) + : m_logger(nullptr) + , m_vulkanConnection(std::move(vulkanConnection)) + , m_physicalDevice(vulkanDevice) + , m_virtualArchitecture(virtualArchitecture) + , m_handler(std::move(handler)) + , m_native(std::move(nativeState)) +{ + assert(m_native); +} + +CCUDADevice::~CCUDADevice() = default; + +core::smart_refctd_ptr CCUDADevice::importExternalMemory(core::smart_refctd_ptr&&) +{ + return nullptr; +} + +core::smart_refctd_ptr CCUDADevice::importExternalSemaphore(core::smart_refctd_ptr&&) +{ + return nullptr; } } diff --git a/src/nbl/video/CCUDAExportableMemory.cpp b/src/nbl/video/CCUDAExportableMemory.cpp index 66cbbdcf4f..f84169e38f 100644 --- a/src/nbl/video/CCUDAExportableMemory.cpp +++ b/src/nbl/video/CCUDAExportableMemory.cpp @@ -2,27 +2,40 @@ // This file is part of the "Nabla Engine". 
// For conditions of distribution and use, see copyright notice in nabla.h -#include "nbl/video/CCUDAExportableMemory.h" -#include "nbl/video/CCUDADevice.h" +#include "nbl/video/CUDAInterop.h" #ifdef _NBL_COMPILE_WITH_CUDA_ +#include "CUDAInteropNativeState.hpp" + namespace nbl::video { +CCUDAExportableMemory::CCUDAExportableMemory(core::smart_refctd_ptr device, SCachedCreationParams&& params, std::unique_ptr&& nativeState) + : m_device(std::move(device)) + , m_params(std::move(params)) + , m_native(std::move(nativeState)) +{ + assert(m_native); +} + +core::smart_refctd_ptr CCUDAExportableMemory::create(core::smart_refctd_ptr device, SCachedCreationParams&& params, std::unique_ptr&& nativeState) +{ + return core::smart_refctd_ptr( + new CCUDAExportableMemory(std::move(device),std::move(params),std::move(nativeState)), + core::dont_grab + ); +} + core::smart_refctd_ptr CCUDAExportableMemory::exportAsMemory(ILogicalDevice* device, IDeviceMemoryBacked* dedication) const { auto pd = device->getPhysicalDevice(); uint32_t memoryTypeBits = (1 << pd->getMemoryProperties().memoryTypeCount) - 1; uint32_t vram = pd->getDeviceLocalMemoryTypeBits(); - switch (m_params.location) - { - case CU_MEM_LOCATION_TYPE_DEVICE: memoryTypeBits &= vram; break; - case CU_MEM_LOCATION_TYPE_HOST_NUMA: - case CU_MEM_LOCATION_TYPE_HOST_NUMA_CURRENT: - case CU_MEM_LOCATION_TYPE_HOST: memoryTypeBits &= ~vram; break; - default: break; - } + if (m_params.deviceLocal) + memoryTypeBits &= vram; + else + memoryTypeBits &= ~vram; IDeviceMemoryBacked::SDeviceMemoryRequirements req = {}; req.size = m_params.granularSize; @@ -39,16 +52,61 @@ core::smart_refctd_ptr CCUDAExportableMemory::exportAsM CCUDAExportableMemory::~CCUDAExportableMemory() { - const auto& cu = m_device->getHandler()->getCUDAFunctionTable(); + const auto& cu = cuda_native::CCUDAHandlerAccessor::getCUDAFunctionTable(*m_device->getHandler()); - ASSERT_CUDA_SUCCESS(cu.pcuMemUnmap(m_params.ptr, m_params.granularSize), 
m_device->getHandler()); + if (!cuda_native::CCUDAHandlerAccessor::defaultHandleResult(*m_device->getHandler(), cu.pcuMemUnmap(m_native->ptr, m_params.granularSize))) + assert(false); - ASSERT_CUDA_SUCCESS(cu.pcuMemAddressFree(m_params.ptr, m_params.granularSize), m_device->getHandler()); + if (!cuda_native::CCUDAHandlerAccessor::defaultHandleResult(*m_device->getHandler(), cu.pcuMemAddressFree(m_native->ptr, m_params.granularSize))) + assert(false); bool closeSucceed = CloseExternalHandle(m_params.externalHandle); assert(closeSucceed); } + +namespace cuda_native +{ + +CUdeviceptr CCUDAExportableMemoryAccessor::getDeviceptr(const CCUDAExportableMemory& memory) +{ + return SAccess::native(memory).ptr; +} + +} +} + +#else + +namespace nbl::video +{ + +// CUDA OFF stub keeps the clean public API linkable and reports feature absence with nullptr instead of unresolved symbols. +struct CCUDAExportableMemory::SNativeState {}; + +CCUDAExportableMemory::CCUDAExportableMemory(core::smart_refctd_ptr device, SCachedCreationParams&& params, std::unique_ptr&& nativeState) + : m_device(std::move(device)) + , m_params(std::move(params)) + , m_native(std::move(nativeState)) +{ + assert(m_native); +} + +core::smart_refctd_ptr CCUDAExportableMemory::create(core::smart_refctd_ptr device, SCachedCreationParams&& params, std::unique_ptr&& nativeState) +{ + return core::smart_refctd_ptr( + new CCUDAExportableMemory(std::move(device),std::move(params),std::move(nativeState)), + core::dont_grab + ); +} + +CCUDAExportableMemory::~CCUDAExportableMemory() = default; + +core::smart_refctd_ptr CCUDAExportableMemory::exportAsMemory(ILogicalDevice*, IDeviceMemoryBacked*) const +{ + return nullptr; +} + } -#endif // _NBL_COMPILE_WITH_CUDA_ \ No newline at end of file +#endif // _NBL_COMPILE_WITH_CUDA_ diff --git a/src/nbl/video/CCUDAHandler.cpp b/src/nbl/video/CCUDAHandler.cpp index 060afe6631..78434d9bd5 100644 --- a/src/nbl/video/CCUDAHandler.cpp +++ b/src/nbl/video/CCUDAHandler.cpp @@ -2,28 
+2,308 @@ // This file is part of the "Nabla Engine". // For conditions of distribution and use, see copyright notice in nabla.h -#include "nbl/video/CCUDAHandler.h" -#include "nbl/system/CFileView.h" +#include "nbl/video/CUDAInterop.h" + +#include "nlohmann/json.hpp" + +#include +#include +#include +#include + +namespace nbl::video::cuda_interop +{ +namespace +{ + +#if defined(_NBL_PLATFORM_WINDOWS_) +inline constexpr char EnvironmentPathListSeparator = ';'; +#else +inline constexpr char EnvironmentPathListSeparator = ':'; +#endif + +std::string readEnvironmentVariable(const char* name) +{ + if (const char* value = std::getenv(name)) + return value; + return {}; +} + +bool isDirectory(const system::path& path) +{ + std::error_code error; + return std::filesystem::exists(path,error) && std::filesystem::is_directory(path,error); +} + +bool isRegularFile(const system::path& path) +{ + std::error_code error; + return std::filesystem::exists(path,error) && std::filesystem::is_regular_file(path,error); +} + +system::path normalizedAbsolute(system::path path) +{ + std::error_code error; + auto absolute = std::filesystem::absolute(path,error); + if (error) + absolute = std::move(path); + return absolute.lexically_normal(); +} + +bool looksLikeCUDAIncludeDir(const system::path& path) +{ + if (!isDirectory(path)) + return false; + + return isRegularFile(path/"cuda_fp16.h") || + isRegularFile(path/"cuda_runtime_api.h") || + isRegularFile(path/"vector_types.h") || + isRegularFile(path/"cuda.h") || + isRegularFile(path/"nv"/"target"); +} + +void appendIncludeDir(core::vector& includeDirs, system::path path) +{ + if (path.empty() || !looksLikeCUDAIncludeDir(path)) + return; + + path = normalizedAbsolute(std::move(path)); + const auto pathString = path.generic_string(); + const auto alreadyAdded = std::find_if(includeDirs.begin(),includeDirs.end(),[&](const system::path& existing) { + return existing.generic_string()==pathString; + }); + if (alreadyAdded==includeDirs.end()) + 
includeDirs.push_back(std::move(path)); +} + +void appendCUDAIncludeDirsBelow(core::vector& includeDirs, const system::path& root, uint32_t maxDepth) +{ + if (!isDirectory(root)) + return; + + if (looksLikeCUDAIncludeDir(root)) + { + appendIncludeDir(includeDirs,root); + return; + } + if (maxDepth==0u) + return; + + core::vector candidates; + std::error_code error; + for (const auto& entry : std::filesystem::directory_iterator(root,error)) + { + if (error) + break; + + std::error_code entryError; + if (!entry.is_directory(entryError)) + continue; + candidates.push_back(entry.path()); + } + + std::sort(candidates.begin(),candidates.end(),[](const system::path& lhs, const system::path& rhs) { + return lhs.generic_string()>rhs.generic_string(); + }); + for (const auto& candidate : candidates) + appendCUDAIncludeDirsBelow(includeDirs,candidate,maxDepth-1u); +} + +void appendCUDAIncludeRoot(core::vector& includeDirs, const system::path& root) +{ + if (root.empty()) + return; + + appendIncludeDir(includeDirs,root); + appendIncludeDir(includeDirs,root/"include"); +} + +void appendRuntimePathsConfig(core::vector& includeDirs, const system::path& configFile) +{ + if (!isRegularFile(configFile)) + return; + + std::ifstream input(configFile); + if (!input) + return; + + const auto json = nlohmann::json::parse(input,nullptr,false); + if (json.is_discarded()) + return; + + const auto paths = json.find("cudaRuntimeIncludeDirs"); + if (paths==json.end() || !paths->is_array()) + return; + + for (const auto& path : *paths) + if (path.is_string()) + appendIncludeDir(includeDirs,system::path(path.get())); +} + +template +void appendPathListEnv(const char* name, Append append) +{ + const auto value = readEnvironmentVariable(name); + if (value.empty()) + return; + + size_t begin = 0; + while (begin& includeDirs, const core::vector& explicitRuntimePathFiles) +{ + for (const auto& runtimePathFile : explicitRuntimePathFiles) + appendRuntimePathsConfig(includeDirs,runtimePathFile); + + 
const auto appendConfig = [&](const system::path& path) { appendRuntimePathsConfig(includeDirs,path); }; + appendPathListEnv("NBL_CUDA_INTEROP_RUNTIME_JSON",appendConfig); + appendPathListEnv("Nabla_CUDA_INTEROP_RUNTIME_JSON",appendConfig); + + const auto exeDir = system::executableDirectory(); + if (!exeDir.empty()) + appendRuntimePathsConfig(includeDirs,exeDir/RuntimePathsFileName); +} + +void appendAppLocalIncludeDirs(core::vector& includeDirs) +{ + const auto exeDir = system::executableDirectory(); + if (exeDir.empty()) + return; + + appendIncludeDir(includeDirs,exeDir/"cuda"/"include"); + appendCUDAIncludeDirsBelow(includeDirs,exeDir/"nvidia",4u); + appendIncludeDir(includeDirs,exeDir/"Libraries"/"cuda"/"include"); + appendIncludeDir(includeDirs,exeDir.parent_path()/"cuda"/"include"); + appendCUDAIncludeDirsBelow(includeDirs,exeDir.parent_path()/"nvidia",4u); +} + +void appendPythonPackageIncludeDirs(core::vector& includeDirs, const system::path& root) +{ + if (root.empty()) + return; + + appendCUDAIncludeDirsBelow(includeDirs,root/"Lib"/"site-packages"/"nvidia",4u); + appendCUDAIncludeDirsBelow(includeDirs,root/"lib"/"site-packages"/"nvidia",4u); + appendIncludeDir(includeDirs,root/"Library"/"include"); + appendIncludeDir(includeDirs,root/"include"); +} + +void appendEnvironmentIncludeDirs(core::vector& includeDirs) +{ + const auto appendInclude = [&](const system::path& path) { appendIncludeDir(includeDirs,path); }; + appendPathListEnv("NBL_CUDA_RUNTIME_INCLUDE_DIRS",appendInclude); + appendPathListEnv("Nabla_CUDA_RUNTIME_INCLUDE_DIRS",appendInclude); + + appendCUDAIncludeRoot(includeDirs,readEnvironmentVariable("CUDA_PATH")); + appendCUDAIncludeRoot(includeDirs,readEnvironmentVariable("CUDA_HOME")); + appendCUDAIncludeRoot(includeDirs,readEnvironmentVariable("CUDA_ROOT")); + appendCUDAIncludeRoot(includeDirs,readEnvironmentVariable("CUDAToolkit_ROOT")); + + appendPythonPackageIncludeDirs(includeDirs,readEnvironmentVariable("VIRTUAL_ENV")); + 
appendPythonPackageIncludeDirs(includeDirs,readEnvironmentVariable("CONDA_PREFIX")); +} + +void appendCUDAInstallRoots(core::vector& includeDirs, const system::path& root) +{ + if (!isDirectory(root)) + return; + + core::vector candidates; + std::error_code error; + for (const auto& entry : std::filesystem::directory_iterator(root,error)) + { + if (error) + break; + if (!entry.is_directory(error)) + continue; + candidates.push_back(entry.path()/"include"); + } + + std::sort(candidates.begin(),candidates.end(),[](const system::path& lhs, const system::path& rhs) { + return lhs.generic_string()>rhs.generic_string(); + }); + for (const auto& candidate : candidates) + appendIncludeDir(includeDirs,candidate); +} + +void appendSystemIncludeDirs(core::vector& includeDirs) +{ + #if defined(_NBL_PLATFORM_WINDOWS_) + appendCUDAInstallRoots(includeDirs,"C:/Program Files/NVIDIA GPU Computing Toolkit/CUDA"); + #else + appendIncludeDir(includeDirs,"/usr/local/cuda/include"); + appendCUDAInstallRoots(includeDirs,"/usr/local"); + appendIncludeDir(includeDirs,"/usr/include"); + #endif +} + +} + +SRuntimeCompileEnvironment findRuntimeCompileEnvironment(core::vector explicitIncludeDirs, core::vector runtimePathFiles) +{ + SRuntimeCompileEnvironment environment; + for (auto& includeDir : explicitIncludeDirs) + appendIncludeDir(environment.includeDirs,std::move(includeDir)); + + appendRuntimePathsConfigs(environment.includeDirs,runtimePathFiles); + appendAppLocalIncludeDirs(environment.includeDirs); + appendEnvironmentIncludeDirs(environment.includeDirs); + appendSystemIncludeDirs(environment.includeDirs); + + return environment; +} + +SRuntimeCompileEnvironment findRuntimeCompileEnvironment(core::vector explicitIncludeDirs) +{ + return findRuntimeCompileEnvironment(std::move(explicitIncludeDirs),{}); +} + +} #ifdef _NBL_COMPILE_WITH_CUDA_ +#include "CUDAInteropNativeState.hpp" +#include "nbl/system/CFileView.h" #include "jitify/jitify.hpp" namespace nbl::video { + +namespace +{ + +int 
cudaVersionMajor(int version) +{ + return version/1000; +} + +int cudaVersionMinor(int version) +{ + return (version%1000)/10; +} + +} CCUDAHandler::CCUDAHandler( - CUDA&& _cuda, - NVRTC&& _nvrtc, + std::unique_ptr&& nativeState, core::vector>&& _headers, - core::smart_refctd_ptr&& _logger, - int _version) - : m_cuda(std::move(_cuda)) - , m_nvrtc(std::move(_nvrtc)) + core::smart_refctd_ptr&& _logger) + : m_native(std::move(nativeState)) , m_headers(std::move(_headers)) , m_logger(std::move(_logger)) - , m_version(_version) { + assert(m_native); + for (auto& header : m_headers) { m_headerContents.push_back(reinterpret_cast(header->getMappedPointer())); @@ -32,29 +312,38 @@ CCUDAHandler::CCUDAHandler( } int deviceCount = 0; - if (m_cuda.pcuDeviceGetCount(&deviceCount) != CUDA_SUCCESS || deviceCount <= 0) + if (m_native->cuda.pcuDeviceGetCount(&deviceCount) != CUDA_SUCCESS || deviceCount <= 0) return; for (int device_i = 0; device_i < deviceCount; device_i++) { CUdevice handle = -1; - if (m_cuda.pcuDeviceGet(&handle, device_i) != CUDA_SUCCESS || handle < 0) + if (m_native->cuda.pcuDeviceGet(&handle, device_i) != CUDA_SUCCESS || handle < 0) continue; CUuuid uuid = {}; - if (m_cuda.pcuDeviceGetUuid_v2(&uuid, handle) != CUDA_SUCCESS) + if (m_native->cuda.pcuDeviceGetUuid_v2(&uuid, handle) != CUDA_SUCCESS) continue; - m_availableDevices.emplace_back(handle, uuid); + auto& nativeDevice = m_native->deviceStates.emplace_back(); + nativeDevice.info.handle = handle; + nativeDevice.info.uuid = uuid; + m_native->availableDevices.push_back(nativeDevice.info); + auto& cleanDevice = m_availableDevices.emplace_back(); + memcpy(cleanDevice.uuid.data(),&uuid,cleanDevice.uuid.size()); - int* attributes = m_availableDevices.back().attributes; - for (int i = 0; i < CU_DEVICE_ATTRIBUTE_MAX; i++) - m_cuda.pcuDeviceGetAttribute(attributes + i, static_cast(i), handle); + for (size_t i = 0; i < nativeDevice.attributes.size(); i++) + 
m_native->cuda.pcuDeviceGetAttribute(&nativeDevice.attributes[i], static_cast(i), handle); } } -bool CCUDAHandler::defaultHandleResult(CUresult result, const system::logger_opt_ptr& logger) +CCUDAHandler::~CCUDAHandler() = default; + +namespace cuda_native +{ + +bool CCUDAHandlerAccessor::defaultHandleResult(CUresult result, const system::logger_opt_ptr& logger) { switch (result) { @@ -420,7 +709,12 @@ bool CCUDAHandler::defaultHandleResult(CUresult result, const system::logger_opt return false; } -bool CCUDAHandler::defaultHandleResult(nvrtcResult result) +bool CCUDAHandlerAccessor::defaultHandleResult(const CCUDAHandler& handler, CUresult result) +{ + return CCUDAHandlerAccessor::defaultHandleResult(result,SAccess::logger(handler)); +} + +bool CCUDAHandlerAccessor::defaultHandleResult(const CCUDAHandler& handler, nvrtcResult result) { switch (result) { @@ -428,19 +722,23 @@ bool CCUDAHandler::defaultHandleResult(nvrtcResult result) return true; break; default: - if (m_nvrtc.pnvrtcGetErrorString) - m_logger.log("%s\n",system::ILogger::ELL_ERROR,m_nvrtc.pnvrtcGetErrorString(result)); + if (SAccess::native(handler).nvrtc.pnvrtcGetErrorString) + SAccess::logger(handler).log("%s\n",system::ILogger::ELL_ERROR,SAccess::native(handler).nvrtc.pnvrtcGetErrorString(result)); else - m_logger.log(R"===(CudaHandler: `pnvrtcGetErrorString` is nullptr, the nvrtc library probably not found on the system.\n)===",system::ILogger::ELL_ERROR); + SAccess::logger(handler).log(R"===(CudaHandler: `pnvrtcGetErrorString` is nullptr, the nvrtc library probably not found on the system.\n)===",system::ILogger::ELL_ERROR); break; } _NBL_DEBUG_BREAK_IF(true); return false; } +} + core::smart_refctd_ptr CCUDAHandler::create(system::ISystem* system, core::smart_refctd_ptr&& _logger) { - CUDA cuda = CUDA( + const system::logger_opt_ptr logger(_logger.get()); + + cuda_native::CUDA cuda = cuda_native::CUDA( #if defined(_NBL_WINDOWS_API_) "nvcuda" #elif defined(_NBL_POSIX_API_) @@ -450,7 +748,7 @@ 
core::smart_refctd_ptr CCUDAHandler::create(system::ISystem* syste #endif ); - NVRTC nvrtc = {}; + cuda_native::NVRTC nvrtc = {}; #if defined(_NBL_WINDOWS_API_) // Perpetual TODO: any new CUDA releases we need to account for? // Version List: https://developer.nvidia.com/cuda-toolkit-archive @@ -468,7 +766,7 @@ core::smart_refctd_ptr CCUDAHandler::create(system::ISystem* syste { std::string path(*verpath); path += *suffix; - nvrtc = NVRTC(path.c_str()); + nvrtc = cuda_native::NVRTC(path.c_str()); if (nvrtc.pnvrtcVersion) break; } @@ -476,7 +774,7 @@ core::smart_refctd_ptr CCUDAHandler::create(system::ISystem* syste break; } #elif defined(_NBL_POSIX_API_) - nvrtc = NVRTC("nvrtc"); + nvrtc = cuda_native::NVRTC("nvrtc"); //nvrtc_builtins = NVRTC("nvrtc-builtins"); #else #error "Unsuported Platform" @@ -487,18 +785,32 @@ core::smart_refctd_ptr CCUDAHandler::create(system::ISystem* syste #define SAFE_CUDA_CALL(FUNC,...) \ {\ if (!cuda.p ## FUNC)\ + {\ + logger.log("CCUDAHandler: CUDA Driver API function %s was not found. Need CUDA driver runtime %d.%d or newer.",system::ILogger::ELL_ERROR,#FUNC,cudaVersionMajor(cuda_native::MinimumCUDADriverVersion),cudaVersionMinor(cuda_native::MinimumCUDADriverVersion));\ return nullptr;\ - auto result = cuda.p ## FUNC ## (__VA_ARGS__);\ + }\ + auto result = cuda.p ## FUNC(__VA_ARGS__);\ if (result!=CUDA_SUCCESS)\ + {\ + logger.log("CCUDAHandler: %s failed with CUDA error code %d.",system::ILogger::ELL_ERROR,#FUNC,static_cast(result));\ return nullptr;\ + }\ } SAFE_CUDA_CALL(cuInit,0) int cudaVersion = 0; SAFE_CUDA_CALL(cuDriverGetVersion,&cudaVersion) - if (cudaVersion<13000) + if (cudaVersion CCUDAHandler::create(system::ISystem* syste // check nvrtc existence and compatibility if (!nvrtc.pnvrtcVersion) + { + logger.log("CCUDAHandler: NVRTC runtime was not found. 
Need NVRTC %d.x or newer.",system::ILogger::ELL_ERROR,cuda_native::MinimumNVRTCMajorVersion); return nullptr; + } int nvrtcVersion[2] = { -1,-1 }; - nvrtc.pnvrtcVersion(nvrtcVersion+0,nvrtcVersion+1); - if (nvrtcVersion[0]<9) + const auto nvrtcVersionResult = nvrtc.pnvrtcVersion(nvrtcVersion+0,nvrtcVersion+1); + if (nvrtcVersionResult!=NVRTC_SUCCESS) + { + logger.log("CCUDAHandler: nvrtcVersion failed with NVRTC error code %d.",system::ILogger::ELL_ERROR,static_cast(nvrtcVersionResult)); + return nullptr; + } + if (nvrtcVersion[0]> headers; @@ -526,10 +853,31 @@ core::smart_refctd_ptr CCUDAHandler::create(system::ISystem* syste )); } - return core::make_smart_refctd_ptr(std::move(cuda),std::move(nvrtc), std::move(headers), std::move(_logger), cudaVersion); + return core::smart_refctd_ptr( + new CCUDAHandler(std::make_unique(std::move(cuda),std::move(nvrtc)),std::move(headers),std::move(_logger)), + core::dont_grab + ); +} + +namespace cuda_native +{ + +const CUDA& CCUDAHandlerAccessor::getCUDAFunctionTable(const CCUDAHandler& handler) +{ + return SAccess::native(handler).cuda; } -nvrtcResult CCUDAHandler::createProgram(nvrtcProgram* prog, std::string&& source, const char* name, const int headerCount, const char* const* headerContents, const char* const* includeNames) +const NVRTC& CCUDAHandlerAccessor::getNVRTCFunctionTable(const CCUDAHandler& handler) +{ + return SAccess::native(handler).nvrtc; +} + +const core::vector& CCUDAHandlerAccessor::getAvailableDevices(const CCUDAHandler& handler) +{ + return SAccess::native(handler).availableDevices; +} + +nvrtcResult CCUDAHandlerAccessor::createProgram(CCUDAHandler& handler, nvrtcProgram* prog, std::string&& source, const char* name, const int headerCount, const char* const* headerContents, const char* const* includeNames) { #if defined(_NBL_WINDOWS_API_) source.insert(0ull,"#ifndef _WIN64\n#define _WIN64\n#endif\n"); @@ -538,26 +886,31 @@ nvrtcResult CCUDAHandler::createProgram(nvrtcProgram* prog, std::string&& source 
#else #error "Unsuported Platform" #endif - return m_nvrtc.pnvrtcCreateProgram(prog,source.c_str(),name,headerCount,headerContents,includeNames); + return SAccess::native(handler).nvrtc.pnvrtcCreateProgram(prog,source.c_str(),name,headerCount,headerContents,includeNames); +} + +nvrtcResult CCUDAHandlerAccessor::compileProgram(const CCUDAHandler& handler, nvrtcProgram prog, core::SRange options) +{ + return SAccess::native(handler).nvrtc.pnvrtcCompileProgram(prog,options.size(),options.begin()); } -nvrtcResult CCUDAHandler::getProgramLog(nvrtcProgram prog, std::string& log) +nvrtcResult CCUDAHandlerAccessor::getProgramLog(const CCUDAHandler& handler, nvrtcProgram prog, std::string& log) { size_t _size = 0ull; - nvrtcResult sizeRes = m_nvrtc.pnvrtcGetProgramLogSize(prog, &_size); + nvrtcResult sizeRes = SAccess::native(handler).nvrtc.pnvrtcGetProgramLogSize(prog, &_size); if (sizeRes != NVRTC_SUCCESS) return sizeRes; if (_size == 0ull) return NVRTC_ERROR_INVALID_INPUT; log.resize(_size); - return m_nvrtc.pnvrtcGetProgramLog(prog,log.data()); + return SAccess::native(handler).nvrtc.pnvrtcGetProgramLog(prog,log.data()); } -CCUDAHandler::ptx_and_nvrtcResult_t CCUDAHandler::getPTX(nvrtcProgram prog) +SPTXResult CCUDAHandlerAccessor::getPTX(const CCUDAHandler& handler, nvrtcProgram prog) { size_t _size = 0ull; - nvrtcResult sizeRes = m_nvrtc.pnvrtcGetPTXSize(prog,&_size); + nvrtcResult sizeRes = SAccess::native(handler).nvrtc.pnvrtcGetPTXSize(prog,&_size); if (sizeRes!=NVRTC_SUCCESS) return {nullptr,sizeRes}; if (_size==0ull) @@ -567,7 +920,55 @@ CCUDAHandler::ptx_and_nvrtcResult_t CCUDAHandler::getPTX(nvrtcProgram prog) ptxParams.size = _size; auto ptx = asset::ICPUBuffer::create(std::move(ptxParams)); auto ptxPtr = static_cast(ptx->getPointer()); - return {std::move(ptx),m_nvrtc.pnvrtcGetPTX(prog,ptxPtr)}; + return {std::move(ptx),SAccess::native(handler).nvrtc.pnvrtcGetPTX(prog,ptxPtr)}; +} + +static const core::vector& getDefaultRuntimeIncludeOptions() +{ + static 
const auto RuntimeIncludeOptions = cuda_interop::makeNVRTCIncludeOptions(cuda_interop::findRuntimeCompileEnvironment()); + return RuntimeIncludeOptions; +} + +static SPTXResult compileDirectlyToPTX_impl(CCUDAHandler& handler, nvrtcResult result, nvrtcProgram program, core::SRange nvrtcOptions, std::string& log) +{ + log.clear(); + if (result!=NVRTC_SUCCESS) + return {nullptr,result}; + + const auto& runtimeIncludeOptions = getDefaultRuntimeIncludeOptions(); + core::vector options; + options.reserve(nvrtcOptions.size()+runtimeIncludeOptions.size()); + for (const auto option : nvrtcOptions) + options.push_back(option); + for (const auto& option : runtimeIncludeOptions) + options.push_back(option.c_str()); + + const auto* optionsBegin = options.empty() ? nullptr:options.data(); + const auto* optionsEnd = options.empty() ? nullptr:optionsBegin+options.size(); + result = CCUDAHandlerAccessor::compileProgram(handler,program,{optionsBegin,optionsEnd}); + CCUDAHandlerAccessor::getProgramLog(handler,program,log); + if (result!=NVRTC_SUCCESS) + return {nullptr,result}; + + return CCUDAHandlerAccessor::getPTX(handler,program); +} + +SPTXResult CCUDAHandlerAccessor::compileDirectlyToPTX( + CCUDAHandler& handler, std::string&& source, const char* filename, core::SRange nvrtcOptions, + std::string& log, const int headerCount, const char* const* headerContents, const char* const* includeNames) +{ + nvrtcProgram program = nullptr; + nvrtcResult result = NVRTC_ERROR_PROGRAM_CREATION_FAILURE; + auto cleanup = core::makeRAIIExiter([&]() -> void + { + if (program) + SAccess::native(handler).nvrtc.pnvrtcDestroyProgram(&program); + }); + + result = CCUDAHandlerAccessor::createProgram(handler,&program,std::move(source),filename,headerCount,headerContents,includeNames); + return compileDirectlyToPTX_impl(handler,result,program,nvrtcOptions,log); +} + } core::smart_refctd_ptr CCUDAHandler::createDevice(core::smart_refctd_ptr&& vulkanConnection, IPhysicalDevice* physicalDevice) @@ -578,9 
+979,9 @@ core::smart_refctd_ptr CCUDAHandler::createDevice(core::smart_refct if (std::find(devices.begin(),devices.end(),physicalDevice)==devices.end()) return nullptr; - for (const auto& device : m_availableDevices) + for (const auto& device : m_native->deviceStates) { - if (!memcmp(&device.uuid,&physicalDevice->getProperties().deviceUUID,VK_UUID_SIZE)) + if (!memcmp(&device.info.uuid,&physicalDevice->getProperties().deviceUUID,VK_UUID_SIZE)) { CCUDADevice::E_VIRTUAL_ARCHITECTURE arch = CCUDADevice::EVA_COUNT; const int& archMajor = device.attributes[CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR]; @@ -662,7 +1063,10 @@ core::smart_refctd_ptr CCUDAHandler::createDevice(core::smart_refct if (arch==CCUDADevice::EVA_COUNT) continue; - return core::make_smart_refctd_ptr(std::move(vulkanConnection), physicalDevice, arch, device.handle, core::smart_refctd_ptr(this)); + return core::smart_refctd_ptr( + new CCUDADevice(std::move(vulkanConnection),physicalDevice,arch,std::make_unique(device.info.handle),core::smart_refctd_ptr(this)), + core::dont_grab + ); } } return nullptr; @@ -670,4 +1074,37 @@ core::smart_refctd_ptr CCUDAHandler::createDevice(core::smart_refct } +#else + +namespace nbl::video +{ + +// CUDA OFF stub keeps the clean public API linkable and reports feature absence with nullptr instead of unresolved symbols. 
+struct CCUDAHandler::SNativeState {}; + +CCUDAHandler::CCUDAHandler( + std::unique_ptr&& nativeState, + core::vector>&& _headers, + core::smart_refctd_ptr&& _logger) + : m_native(std::move(nativeState)) + , m_headers(std::move(_headers)) + , m_logger(std::move(_logger)) +{ + assert(m_native); +} + +CCUDAHandler::~CCUDAHandler() = default; + +core::smart_refctd_ptr CCUDAHandler::create(system::ISystem*, core::smart_refctd_ptr&&) +{ + return nullptr; +} + +core::smart_refctd_ptr CCUDAHandler::createDevice(core::smart_refctd_ptr&&, IPhysicalDevice*) +{ + return nullptr; +} + +} + #endif // _NBL_COMPILE_WITH_CUDA_ diff --git a/src/nbl/video/CCUDAImportedMemory.cpp b/src/nbl/video/CCUDAImportedMemory.cpp index 7e21b05ef1..9145fe18ac 100644 --- a/src/nbl/video/CCUDAImportedMemory.cpp +++ b/src/nbl/video/CCUDAImportedMemory.cpp @@ -2,31 +2,70 @@ // This file is part of the "Nabla Engine". // For conditions of distribution and use, see copyright notice in nabla.h -#include "nbl/video/CCUDAImportedMemory.h" -#include "nbl/video/CCUDADevice.h" +#include "nbl/video/CUDAInterop.h" #ifdef _NBL_COMPILE_WITH_CUDA_ +#include "CUDAInteropNativeState.hpp" namespace nbl::video { -CUresult CCUDAImportedMemory::getMappedBuffer(CUdeviceptr* mappedBuffer) +CCUDAImportedMemory::CCUDAImportedMemory(core::smart_refctd_ptr device, core::smart_refctd_ptr src, std::unique_ptr&& nativeState) + : m_device(std::move(device)) + , m_src(std::move(src)) + , m_native(std::move(nativeState)) +{ + assert(m_native); +} + +namespace cuda_native +{ + +CUexternalMemory CCUDAImportedMemoryAccessor::getInternalObject(const CCUDAImportedMemory& memory) +{ + return SAccess::native(memory).handle; +} + +CUresult CCUDAImportedMemoryAccessor::getMappedBuffer(const CCUDAImportedMemory& memory, CUdeviceptr* mappedBuffer) { CUDA_EXTERNAL_MEMORY_BUFFER_DESC bufferDesc = {}; bufferDesc.offset = 0; - bufferDesc.size = m_src->getAllocationSize(); + bufferDesc.size = SAccess::source(memory)->getAllocationSize(); - auto& 
cu = m_device->getHandler()->getCUDAFunctionTable(); - return cu.pcuExternalMemoryGetMappedBuffer(mappedBuffer, m_handle, &bufferDesc); + const auto& cu = CCUDAHandlerAccessor::getCUDAFunctionTable(*SAccess::device(memory)->getHandler()); + return cu.pcuExternalMemoryGetMappedBuffer(mappedBuffer, SAccess::native(memory).handle, &bufferDesc); } +} + CCUDAImportedMemory::~CCUDAImportedMemory() { - auto& cu = m_device->getHandler()->getCUDAFunctionTable(); - ASSERT_CUDA_SUCCESS(cu.pcuDestroyExternalMemory(m_handle), m_device->getHandler()); + auto& cu = cuda_native::CCUDAHandlerAccessor::getCUDAFunctionTable(*m_device->getHandler()); + if (!cuda_native::CCUDAHandlerAccessor::defaultHandleResult(*m_device->getHandler(), cu.pcuDestroyExternalMemory(m_native->handle))) + assert(false); } } -#endif \ No newline at end of file +#else + +namespace nbl::video +{ + +// CUDA OFF stub keeps the clean public API linkable and reports feature absence with nullptr instead of unresolved symbols. +struct CCUDAImportedMemory::SNativeState {}; + +CCUDAImportedMemory::CCUDAImportedMemory(core::smart_refctd_ptr device, core::smart_refctd_ptr src, std::unique_ptr&& nativeState) + : m_device(std::move(device)) + , m_src(std::move(src)) + , m_native(std::move(nativeState)) +{ + assert(m_native); +} + +CCUDAImportedMemory::~CCUDAImportedMemory() = default; + +} + +#endif diff --git a/src/nbl/video/CCUDAImportedSemaphore.cpp b/src/nbl/video/CCUDAImportedSemaphore.cpp index 0dc750a4a9..5d7d3e07ae 100644 --- a/src/nbl/video/CCUDAImportedSemaphore.cpp +++ b/src/nbl/video/CCUDAImportedSemaphore.cpp @@ -2,17 +2,57 @@ // This file is part of the "Nabla Engine". 
// For conditions of distribution and use, see copyright notice in nabla.h -#include "nbl/video/CCUDAImportedSemaphore.h" -#include "nbl/video/CCUDADevice.h" +#include "nbl/video/CUDAInterop.h" #ifdef _NBL_COMPILE_WITH_CUDA_ +#include "CUDAInteropNativeState.hpp" + namespace nbl::video { +CCUDAImportedSemaphore::CCUDAImportedSemaphore(core::smart_refctd_ptr device, core::smart_refctd_ptr src, std::unique_ptr&& nativeState) + : m_device(std::move(device)) + , m_src(std::move(src)) + , m_native(std::move(nativeState)) +{ + assert(m_native); +} + +namespace cuda_native +{ + +CUexternalSemaphore CCUDAImportedSemaphoreAccessor::getInternalObject(const CCUDAImportedSemaphore& semaphore) +{ + return SAccess::native(semaphore).handle; +} + +} + CCUDAImportedSemaphore::~CCUDAImportedSemaphore() { - auto& cu = m_device->getHandler()->getCUDAFunctionTable(); - ASSERT_CUDA_SUCCESS(cu.pcuDestroyExternalSemaphore(m_handle), m_device->getHandler()); + auto& cu = cuda_native::CCUDAHandlerAccessor::getCUDAFunctionTable(*m_device->getHandler()); + if (!cuda_native::CCUDAHandlerAccessor::defaultHandleResult(*m_device->getHandler(), cu.pcuDestroyExternalSemaphore(m_native->handle))) + assert(false); } } -#endif // _NBL_COMPILE_WITH_CUDA_ \ No newline at end of file +#else + +namespace nbl::video +{ + +// CUDA OFF stub keeps the clean public API linkable and reports feature absence with nullptr instead of unresolved symbols. 
+struct CCUDAImportedSemaphore::SNativeState {}; + +CCUDAImportedSemaphore::CCUDAImportedSemaphore(core::smart_refctd_ptr device, core::smart_refctd_ptr src, std::unique_ptr&& nativeState) + : m_device(std::move(device)) + , m_src(std::move(src)) + , m_native(std::move(nativeState)) +{ + assert(m_native); +} + +CCUDAImportedSemaphore::~CCUDAImportedSemaphore() = default; + +} + +#endif // _NBL_COMPILE_WITH_CUDA_ diff --git a/src/nbl/video/CUDAInteropNativeState.hpp b/src/nbl/video/CUDAInteropNativeState.hpp new file mode 100644 index 0000000000..4be8178aa2 --- /dev/null +++ b/src/nbl/video/CUDAInteropNativeState.hpp @@ -0,0 +1,111 @@ +#ifndef _NBL_VIDEO_CUDA_INTEROP_NATIVE_STATE_H_INCLUDED_ +#define _NBL_VIDEO_CUDA_INTEROP_NATIVE_STATE_H_INCLUDED_ + +#include "nbl/ext/CUDAInterop/CUDAInteropNative.h" + +#include + +namespace nbl::video +{ + +struct CCUDAHandler::SNativeState +{ + struct SDeviceState + { + cuda_native::SCUDADeviceInfo info = {}; + std::array attributes = {}; + }; + + cuda_native::CUDA cuda; + cuda_native::NVRTC nvrtc; + core::vector availableDevices; + core::vector deviceStates; + + SNativeState(cuda_native::CUDA&& _cuda, cuda_native::NVRTC&& _nvrtc) + : cuda(std::move(_cuda)) + , nvrtc(std::move(_nvrtc)) + {} +}; + +struct CCUDADevice::SNativeState +{ + CUdevice handle = {}; + CUcontext context = nullptr; + std::array allocationGranularity = {}; + + explicit SNativeState(CUdevice _handle) + : handle(_handle) + {} +}; + +struct CCUDAExportableMemory::SNativeState +{ + CUdeviceptr ptr = 0; +}; + +struct CCUDAImportedMemory::SNativeState +{ + CUexternalMemory handle = nullptr; + + explicit SNativeState(CUexternalMemory _handle) + : handle(_handle) + {} +}; + +struct CCUDAImportedSemaphore::SNativeState +{ + CUexternalSemaphore handle = nullptr; + + explicit SNativeState(CUexternalSemaphore _handle) + : handle(_handle) + {} +}; + +namespace cuda_native +{ + +struct SAccess +{ + static CCUDAHandler::SNativeState& native(CCUDAHandler& handler) { return 
*handler.m_native; } + static const CCUDAHandler::SNativeState& native(const CCUDAHandler& handler) { return *handler.m_native; } + + static CCUDADevice::SNativeState& native(CCUDADevice& device) { return *device.m_native; } + static const CCUDADevice::SNativeState& native(const CCUDADevice& device) { return *device.m_native; } + + static CCUDAExportableMemory::SNativeState& native(CCUDAExportableMemory& memory) { return *memory.m_native; } + static const CCUDAExportableMemory::SNativeState& native(const CCUDAExportableMemory& memory) { return *memory.m_native; } + static std::unique_ptr makeExportableMemoryNativeState() + { + return std::unique_ptr(new CCUDAExportableMemory::SNativeState()); + } + static CUdeviceptr& deviceptr(CCUDAExportableMemory::SNativeState& nativeState) { return nativeState.ptr; } + static core::smart_refctd_ptr makeExportableMemory(core::smart_refctd_ptr device, CCUDAExportableMemory::SCachedCreationParams&& params, std::unique_ptr&& nativeState) + { + return CCUDAExportableMemory::create(std::move(device),std::move(params),std::move(nativeState)); + } + + static CCUDAImportedMemory::SNativeState& native(CCUDAImportedMemory& memory) { return *memory.m_native; } + static const CCUDAImportedMemory::SNativeState& native(const CCUDAImportedMemory& memory) { return *memory.m_native; } + + static CCUDAImportedSemaphore::SNativeState& native(CCUDAImportedSemaphore& semaphore) { return *semaphore.m_native; } + static const CCUDAImportedSemaphore::SNativeState& native(const CCUDAImportedSemaphore& semaphore) { return *semaphore.m_native; } + + static system::logger_opt_ptr logger(const CCUDAHandler& handler) { return handler.m_logger.get().get(); } + static system::logger_opt_ptr logger(const CCUDADevice& device) { return device.m_logger; } + static const CCUDADevice* device(const CCUDAImportedMemory& memory) { return memory.m_device.get(); } + static IDeviceMemoryAllocation* source(const CCUDAImportedMemory& memory) { return memory.m_src.get(); } + 
static CUmemAllocationHandleType allocationHandleType() + { + #ifdef _WIN32 + return CU_MEM_HANDLE_TYPE_WIN32; + #else + return CU_MEM_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR; + #endif + } +}; + +} + +} + +#endif diff --git a/src/nbl/video/EApiType.cpp b/src/nbl/video/EApiType.cpp new file mode 100644 index 0000000000..d7eadd8b08 --- /dev/null +++ b/src/nbl/video/EApiType.cpp @@ -0,0 +1,37 @@ +#include "nbl/video/EApiType.h" + +#ifdef _WIN32 +#define WIN32_LEAN_AND_MEAN +#include +#else +#include +#endif + +namespace nbl::video +{ + +bool CloseExternalHandle(external_handle_t handle) +{ +#ifdef _WIN32 + return CloseHandle(handle); +#else + return close(handle)==0; +#endif +} + +external_handle_t DuplicateExternalHandle(external_handle_t handle) +{ +#ifdef _WIN32 + HANDLE duplicated = ExternalHandleNull; + + const HANDLE process = GetCurrentProcess(); + if (!DuplicateHandle(process,handle,process,&duplicated,GENERIC_ALL,0,DUPLICATE_SAME_ACCESS)) + return ExternalHandleNull; + + return duplicated; +#else + return dup(handle); +#endif +} + +}