Skip to content

Move CUDA interop behind native opt-in#1067

Open
AnastaZIuk wants to merge 27 commits into vk_cuda_interop from
cuInteropBS
Open

Move CUDA interop behind native opt-in#1067
AnastaZIuk wants to merge 27 commits into vk_cuda_interop from
cuInteropBS

Conversation

@AnastaZIuk
Copy link
Copy Markdown
Member

Moves CUDA interop behind SDK-free Nabla headers with explicit Nabla::ext::CUDAInterop native opt-in. Keeps raw CUDA/NVRTC access available for consumers that ask for native opt-in while avoiding default public SDK requirements.

Comment on lines +163 to +210
// Opt-in native CUDA API. The declarations below are implemented by the Nabla library.
// This header is intentionally the only public path that includes CUDA SDK types.

//! Static accessor exposing the raw CUDA driver / NVRTC state owned by a
//! `CCUDAHandler` to consumers that explicitly opted into the native API.
//! Every member is static; the handler instance is always passed explicitly.
class NBL_API2 CCUDAHandlerAccessor
{
	public:
		//! Function table of the loaded CUDA driver-API entry points.
		static const CUDA& getCUDAFunctionTable(const CCUDAHandler& handler);
		//! Function table of the loaded NVRTC entry points.
		static const NVRTC& getNVRTCFunctionTable(const CCUDAHandler& handler);
		//! Validate a CUresult; diagnostics go through `logger`.
		// NOTE(review): returning true on success is inferred from the name — confirm against the implementation.
		static bool defaultHandleResult(CUresult result, const system::logger_opt_ptr& logger);
		//! Same CUresult check, using `handler` for error reporting.
		static bool defaultHandleResult(const CCUDAHandler& handler, CUresult result);
		//! NVRTC counterpart of the result check above.
		static bool defaultHandleResult(const CCUDAHandler& handler, nvrtcResult result);
		//! Devices enumerated by `handler`.
		static const core::vector<SCUDADeviceInfo>& getAvailableDevices(const CCUDAHandler& handler);
		//! Create an NVRTC program from `source` (consumed) under `name`, with optional
		//! in-memory headers (`headerContents`/`includeNames` are parallel arrays of length `headerCount`).
		static nvrtcResult createProgram(CCUDAHandler& handler, nvrtcProgram* prog, std::string&& source, const char* name, const int headerCount=0, const char* const* headerContents=nullptr, const char* const* includeNames=nullptr);
		//! Compile `prog` with the given NVRTC command-line options.
		static nvrtcResult compileProgram(const CCUDAHandler& handler, nvrtcProgram prog, core::SRange<const char* const> options);
		//! Retrieve the compile log of `prog` into `log`.
		static nvrtcResult getProgramLog(const CCUDAHandler& handler, nvrtcProgram prog, std::string& log);
		//! Extract the PTX produced by compiling `prog`.
		static SPTXResult getPTX(const CCUDAHandler& handler, nvrtcProgram prog);
		//! Convenience wrapper: create, compile and fetch PTX in one call, writing the
		//! compile log to `log`.
		static SPTXResult compileDirectlyToPTX(
			CCUDAHandler& handler, std::string&& source, const char* filename, core::SRange<const char* const> nvrtcOptions,
			std::string& log, const int headerCount=0, const char* const* headerContents=nullptr, const char* const* includeNames=nullptr
		);
};

//! Static accessor for the native CUDA objects behind a `CCUDADevice`.
class NBL_API2 CCUDADeviceAccessor
{
	public:
		//! Underlying CUdevice of `device`.
		static CUdevice getInternalObject(const CCUDADevice& device);
		//! CUDA context associated with `device`.
		static CUcontext getContext(const CCUDADevice& device);
		//! Round `size` to the device's allocation granularity for `location`.
		// NOTE(review): rounding direction (up) is inferred from the name — confirm against the implementation.
		static size_t roundToGranularity(const CCUDADevice& device, CUmemLocationType location, size_t size);
		//! Allocate memory on `device` that can be exported to other APIs; consumes `params`.
		static core::smart_refctd_ptr<CCUDAExportableMemory> createExportableMemory(CCUDADevice& device, SExportableMemoryCreationParams&& params);
};

//! Static accessor for the native object behind a `CCUDAExportableMemory`.
class NBL_API2 CCUDAExportableMemoryAccessor
{
	public:
		//! Raw CUdeviceptr backing `memory`.
		static CUdeviceptr getDeviceptr(const CCUDAExportableMemory& memory);
};

//! Static accessor for the native objects behind a `CCUDAImportedMemory`
//! (memory imported into CUDA from an external allocation).
class NBL_API2 CCUDAImportedMemoryAccessor
{
	public:
		//! Underlying CUexternalMemory handle of `memory`.
		static CUexternalMemory getInternalObject(const CCUDAImportedMemory& memory);
		//! Fetch the device pointer the imported memory is mapped at; CUDA-style
		//! status return, the pointer is written to `mappedBuffer`.
		static CUresult getMappedBuffer(const CCUDAImportedMemory& memory, CUdeviceptr* mappedBuffer);
};

//! Static accessor for the native object behind a `CCUDAImportedSemaphore`.
class NBL_API2 CCUDAImportedSemaphoreAccessor
{
	public:
		//! Underlying CUexternalSemaphore handle of `semaphore`.
		static CUexternalSemaphore getInternalObject(const CCUDAImportedSemaphore& semaphore);
};
Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

accessors make no sense — just move all the nbl/video/CCUDA*.h to the extension

Comment on lines -298 to -305
// Evaluate `expr` exactly once and, if the handler's defaultHandleResult()
// rejects the CUresult, trip an assert. Debug-only enforcement: with NDEBUG
// the assert compiles away, though `expr` and the handler check still run.
// Wrapped in do { } while(0) so the macro expands to a single statement
// (safe in unbraced if/else).
#define ASSERT_CUDA_SUCCESS(expr, handler) \
	do { \
		const auto cudaResult = (expr); \
		if (!((handler)->defaultHandleResult(cudaResult))) { \
			assert(false); \
		} \
	} while(0)

Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

that macro was useful just needs a rename

Comment on lines -31 to -52
//! Close an OS handle used for external-memory/semaphore interop.
//! Kept inline on purpose: these are plain OS calls, so they work even
//! before the Nabla DLL is (delay-)loaded.
//! @return true when the underlying OS call succeeded.
inline bool CloseExternalHandle(external_handle_t handle)
{
#ifdef _WIN32
	// Win32 CloseHandle returns a BOOL: nonzero on success.
	const bool succeeded = (CloseHandle(handle) != 0);
#else
	// POSIX close() returns 0 on success.
	const bool succeeded = (close(handle) == 0);
#endif
	return succeeded;
}

inline external_handle_t DuplicateExternalHandle(external_handle_t handle)
{
#ifdef _WIN32
HANDLE re = ExternalHandleNull;

const HANDLE cur = GetCurrentProcess();
if (!DuplicateHandle(cur, handle, cur, &re, GENERIC_ALL, 0, DUPLICATE_SAME_ACCESS))
return ExternalHandleNull;

return re;
#else
return dup(handle);
#endif
Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

you may want to keep that inline, these are OS calls, and when they're inline they'll work BEFORE Nabla.dll is delay loaded, which is useful

Comment on lines +1 to +29
#include "nbl/video/CUDAInterop.h"
#include "nbl/system/IApplicationFramework.h"

#include <type_traits>

// Build-hygiene guards: this translation unit must compile without any CUDA
// SDK involvement, so fail the build loudly if Nabla's public headers leaked
// either the CUDA build define or an SDK header.
#ifdef _NBL_COMPILE_WITH_CUDA_
#error "Nabla::Nabla must not propagate the CUDA build define."
#endif

// CUDA_VERSION is presumably defined by the SDK's cuda.h — its presence here
// would mean a public header pulled in SDK types. TODO(review): confirm.
#ifdef CUDA_VERSION
#error "Nabla::Nabla must not require CUDA SDK headers."
#endif

namespace
{

class CUDAInteropCleanOptInSmoke final : public nbl::system::IApplicationFramework
{
using base_t = nbl::system::IApplicationFramework;

public:
using base_t::base_t;

bool onAppInitialized(nbl::core::smart_refctd_ptr<nbl::system::ISystem>&&) override
{
static_assert(std::is_class_v<nbl::video::CCUDADevice>);
static_assert(std::is_class_v<nbl::video::CCUDAExportableMemory>);
static_assert(std::is_class_v<nbl::video::CCUDAImportedMemory>);
static_assert(std::is_class_v<nbl::video::CCUDAImportedSemaphore>);
Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

it would make more sense to not have anything CUDA related in Nabla itself

const auto& granularity = SAccess::native(device).allocationGranularity[location];
return ((size - 1) / granularity + 1) * granularity;
}

Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I mentioned in the original PR, this should be inline

Comment on lines +104 to +119
if (!cuda_native::CCUDAHandlerAccessor::defaultHandleResult(*handler, cu.pcuMemAddressFree(ptr, size)))
assert(false);
return err;
}

CUmemAccessDesc accessDesc = {
.location = { .type = location, .id = m_handle },
.location = { .type = location, .id = native.handle },
.flags = CU_MEM_ACCESS_FLAGS_PROT_READWRITE,
};

if (auto err = cu.pcuMemSetAccess(ptr, size, &accessDesc, 1); CUDA_SUCCESS != err)
{
ASSERT_CUDA_SUCCESS(cu.pcuMemUnmap(ptr, size), m_handler);
ASSERT_CUDA_SUCCESS(cu.pcuMemAddressFree(ptr, size), m_handler);
if (!cuda_native::CCUDAHandlerAccessor::defaultHandleResult(*handler, cu.pcuMemUnmap(ptr, size)))
assert(false);
if (!cuda_native::CCUDAHandlerAccessor::defaultHandleResult(*handler, cu.pcuMemAddressFree(ptr, size)))
assert(false);
Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@kevyuu thinking of it, we shouldn't crash an entire program because of a failure here :s

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment

Labels

None yet

Projects

None yet

Development

Successfully merging this pull request may close these issues.

2 participants