From f22f11ca4f05a4c4efa3ba9e10ba118e3a1ae3df Mon Sep 17 00:00:00 2001 From: kevyuu Date: Tue, 3 Mar 2026 14:26:18 +0700 Subject: [PATCH 1/9] Initial implementation of CUDA interop unit test --- 76_CudaInterop/CMakeLists.txt | 24 + .../app_resources/vectorAdd_kernel.cu | 42 ++ 76_CudaInterop/main.cpp | 543 ++++++++++++++++++ CMakeLists.txt | 1 + 4 files changed, 610 insertions(+) create mode 100644 76_CudaInterop/CMakeLists.txt create mode 100644 76_CudaInterop/app_resources/vectorAdd_kernel.cu create mode 100644 76_CudaInterop/main.cpp diff --git a/76_CudaInterop/CMakeLists.txt b/76_CudaInterop/CMakeLists.txt new file mode 100644 index 000000000..bc1624875 --- /dev/null +++ b/76_CudaInterop/CMakeLists.txt @@ -0,0 +1,24 @@ +include(common RESULT_VARIABLE RES) +if(NOT RES) + message(FATAL_ERROR "common.cmake not found. Should be in {repo_root}/cmake directory") +endif() + +nbl_create_executable_project("" "" "" "" "${NBL_EXECUTABLE_PROJECT_CREATION_PCH_TARGET}") + +if(NBL_EMBED_BUILTIN_RESOURCES) + set(_BR_TARGET_ ${EXECUTABLE_NAME}_builtinResourceData) + set(RESOURCE_DIR "app_resources") + + get_filename_component(_SEARCH_DIRECTORIES_ "${CMAKE_CURRENT_SOURCE_DIR}" ABSOLUTE) + get_filename_component(_OUTPUT_DIRECTORY_SOURCE_ "${CMAKE_CURRENT_BINARY_DIR}/src" ABSOLUTE) + get_filename_component(_OUTPUT_DIRECTORY_HEADER_ "${CMAKE_CURRENT_BINARY_DIR}/include" ABSOLUTE) + + file(GLOB_RECURSE BUILTIN_RESOURCE_FILES RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}/${RESOURCE_DIR}" "${CMAKE_CURRENT_SOURCE_DIR}/${RESOURCE_DIR}/*") + foreach(RES_FILE ${BUILTIN_RESOURCE_FILES}) + LIST_BUILTIN_RESOURCE(RESOURCES_TO_EMBED "${RES_FILE}") + endforeach() + + ADD_CUSTOM_BUILTIN_RESOURCES(${_BR_TARGET_} RESOURCES_TO_EMBED "${_SEARCH_DIRECTORIES_}" "${RESOURCE_DIR}" "nbl::this_example::builtin" "${_OUTPUT_DIRECTORY_HEADER_}" "${_OUTPUT_DIRECTORY_SOURCE_}") + + LINK_BUILTIN_RESOURCES_TO_TARGET(${EXECUTABLE_NAME} ${_BR_TARGET_}) +endif() \ No newline at end of file diff --git 
a/76_CudaInterop/app_resources/vectorAdd_kernel.cu b/76_CudaInterop/app_resources/vectorAdd_kernel.cu new file mode 100644 index 000000000..3baef0123 --- /dev/null +++ b/76_CudaInterop/app_resources/vectorAdd_kernel.cu @@ -0,0 +1,42 @@ +/* Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of NVIDIA CORPORATION nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY + * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR + * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY + * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +/** + * CUDA Kernel Device code + * + * Computes the vector addition of A and B into C. The 3 vectors have the same + * number of elements numElements. 
+ */ + +extern "C" __global__ void vectorAdd(const float *A, const float *B, float *C, + int numElements) { + int i = blockDim.x * blockIdx.x + threadIdx.x; + + if (i < numElements) { + C[i] = A[i] + B[i]; + } +} \ No newline at end of file diff --git a/76_CudaInterop/main.cpp b/76_CudaInterop/main.cpp new file mode 100644 index 000000000..85d10ad13 --- /dev/null +++ b/76_CudaInterop/main.cpp @@ -0,0 +1,543 @@ +#include "nbl/video/CCUDAHandler.h" +// #include "nbl/video/CCUDASharedMemory.h" +// #include "nbl/video/CCUDASharedSemaphore.h" + +#include "nbl/application_templates/MonoDeviceApplication.hpp" +#include "nbl/examples/common/BuiltinResourcesApplication.hpp" + +using namespace nbl; +using namespace core; +using namespace system; +using namespace asset; +using namespace video; + +/* +The start of the main function starts like in most other example. We ask the +user for the desired renderer and start it up. +*/ + +bool check_cuda_err(cudaError_enum err, auto& cu, auto& logger, auto file, auto line) +{ + if (auto re = err; CUDA_SUCCESS != re) + { + const char* name = 0, * str = 0; + cu.pcuGetErrorName(re, &name); + cu.pcuGetErrorString(re, &str); + logger->log("%s:%d %s:\n\t%s\n", system::ILogger::ELL_ERROR, file, line, name, str); + return false; + } + return true; +} + +bool check_nv_err(auto err, auto& cudaHandler, auto& logger, auto file, auto line, std::string const& log) +{ + if (auto re = err; NVRTC_SUCCESS != re) + { + const char* str = cudaHandler->getNVRTCFunctionTable().pnvrtcGetErrorString(re); + logger->log("%s:%d %s\n%s\n", system::ILogger::ELL_ERROR, file, line, str, log.c_str()); + return false; + } + return true; +} + +#define ASSERT_SUCCESS(expr) { auto re = check_cuda_err((expr), cu, m_logger, __FILE__, __LINE__); assert(re); } +#define ASSERT_SUCCESS_NV(expr, log) { auto re = check_nv_err((expr), cudaHandler, m_logger, __FILE__, __LINE__, log); assert(re); } + + +using namespace nbl::core; +using namespace nbl::hlsl; +using namespace 
nbl::system; +using namespace nbl::asset; +using namespace nbl::video; +using namespace nbl::examples; +using namespace nbl::application_templates; + +class CUDA2VKApp : public virtual MonoDeviceApplication, BuiltinResourcesApplication +{ + using device_base_t = MonoDeviceApplication; + using asset_base_t = BuiltinResourcesApplication; + + static constexpr uint32_t gridDim[3] = { 4096,1,1 }; + static constexpr uint32_t blockDim[3] = { 1024,1,1 }; + static constexpr size_t numElements = gridDim[0] * blockDim[0]; + static constexpr size_t size = sizeof(float) * numElements; + +public: + // Yay thanks to multiple inheritance we cannot forward ctors anymore + CUDA2VKApp(const path& _localInputCWD, const path& _localOutputCWD, const path& _sharedInputCWD, const path& _sharedOutputCWD) : + system::IApplicationFramework(_localInputCWD, _localOutputCWD, _sharedInputCWD, _sharedOutputCWD) {} + + smart_refctd_ptr cudaHandler; + smart_refctd_ptr cudaDevice; + + IQueue* queue; + + // CPU memory which we fill with random numbers between [-1,1] that will be copied to corresponding cudaMemory + std::array, 2> cpuBufs; + // // CUDA resources that we input to the kernel 'vectorAdd_kernel.cu' + // // Kernel writes to cudaMemories[2] which we later use to export and read on nabla side + // std::array, 3> cudaMemories = {}; + // // A semaphore created in CUDA which will alias a Nabla semaphore to help sync between the CUDA kernel and Nabla device to host transfer + // smart_refctd_ptr cudaSemaphore; + + // our Buffer that is bound to cudaMemories[2] + smart_refctd_ptr importedBuf; + // our Image that is also bound to cudaMemories[2] + smart_refctd_ptr importedImg; + + // host visible buffers that we use to copy from the resources above after CUDA kernel is done writing + smart_refctd_ptr stagingBufs[2]; + + // Nabla semaphore for sync + smart_refctd_ptr semaphore; + + smart_refctd_ptr commandPool; + smart_refctd_ptr cmd[2]; + + // a device filter helps you create a set of physical 
devices that satisfy your requirements in terms of features, limits etc. + virtual void filterDevices(core::set& physicalDevices) const + { + device_base_t::filterDevices(physicalDevices); + auto& cuDevices = cudaHandler->getAvailableDevices(); + std::erase_if(physicalDevices, [&cuDevices](auto pdev) { + return cuDevices.end() == std::find_if(cuDevices.begin(), cuDevices.end(), [pdev](auto& cuDev) { return !memcmp(pdev->getProperties().deviceUUID, &cuDev.uuid, 16); }); + }); + } + + bool onAppInitialized(smart_refctd_ptr&& system) override + { + // Remember to call the base class initialization! + if (!asset_base_t::onAppInitialized(smart_refctd_ptr(system))) + return false; + + cudaHandler = CCUDAHandler::create(m_system.get(), smart_refctd_ptr(m_logger)); + if (!cudaHandler) + return logFail("Could not create a CUDA handler!"); + + if (!device_base_t::onAppInitialized(std::move(system))) + return false; + + cudaDevice = cudaHandler->createDevice(smart_refctd_ptr_dynamic_cast(m_api), m_physicalDevice); + if (!cudaDevice) + return logFail("Could not create a CUDA Device!"); + + + queue = device_base_t::getComputeQueue(); + + createResources(); + + smart_refctd_ptr ptx; + { + IAssetLoader::SAssetLoadParams lp = {}; + lp.logger = m_logger.get(); + lp.workingDirectory = ""; // virtual root + // this time we load a shader directly from a file + auto assetBundle = m_assetMgr->getAsset("app_resources/vectorAdd_kernel.cu", lp); + const auto assets = assetBundle.getContents(); + if (assets.empty()) + return logFail("Could not load kernel!"); + + smart_refctd_ptr source = IAsset::castDown(assets[0]); + std::string log; + auto [ptx_, res] = cudaHandler->compileDirectlyToPTX(std::string((const char*)source->getPointer(), source->getSize()), + "app_resources/vectorAdd_kernel.cu", cudaDevice->geDefaultCompileOptions(), 0, 0, 0, &log); + ASSERT_SUCCESS_NV(res, log); + + ptx = std::move(ptx_); + } + CUmodule module; + CUfunction kernel; + CUstream stream; + + auto& cu = 
cudaHandler->getCUDAFunctionTable(); + + ASSERT_SUCCESS(cu.pcuModuleLoadDataEx(&module, ptx->getPointer(), 0u, nullptr, nullptr)); + ASSERT_SUCCESS(cu.pcuModuleGetFunction(&kernel, module, "vectorAdd")); + ASSERT_SUCCESS(cu.pcuStreamCreate(&stream, CU_STREAM_NON_BLOCKING)); + + // launchKernel(kernel, stream); + + ASSERT_SUCCESS(cu.pcuStreamSynchronize(stream)); + ASSERT_SUCCESS(cu.pcuModuleUnload(module)); + ASSERT_SUCCESS(cu.pcuStreamDestroy_v2(stream)); + + m_device->waitIdle(); + + // testInterop(); + + return true; + } + + void createResources() + { + auto& cu = cudaHandler->getCUDAFunctionTable(); + + for (auto& buf : cpuBufs) + { + ICPUBuffer::SCreationParams params = {}; + params.size = size; + buf = ICPUBuffer::create(std::move(params)); + } + + for (auto j = 0; j < 2; j++) + for (auto i = 0; i < numElements; i++) + reinterpret_cast(cpuBufs[j]->getPointer())[i] = rand() / float(RAND_MAX); + + + // create and allocate CUmem with CUDA and slap it inside a simple IReferenceCounted wrapper + // ASSERT_SUCCESS(cudaDevice->createSharedMemory(&cudaMemories[0], { .size = size, .alignment = sizeof(float), .location = CU_MEM_LOCATION_TYPE_DEVICE })); + // ASSERT_SUCCESS(cudaDevice->createSharedMemory(&cudaMemories[1], { .size = size, .alignment = sizeof(float), .location = CU_MEM_LOCATION_TYPE_DEVICE })); + // ASSERT_SUCCESS(cudaDevice->createSharedMemory(&cudaMemories[2], { .size = size, .alignment = sizeof(float), .location = CU_MEM_LOCATION_TYPE_DEVICE })); + // + // semaphore = m_device->createSemaphore(0, { .externalHandleTypes = ISemaphore::EHT_OPAQUE_WIN32 }); + // ASSERT_SUCCESS(cudaDevice->importGPUSemaphore(&cudaSemaphore, semaphore.get())); + // { + // // export the CUmem we have just created into a refctd IDeviceMemoryAllocation + // auto devmemory = cudaMemories[2]->exportAsMemory(m_device.get()); + // if (!devmemory) + // logFail("Failed to export CUDA memory!"); + // + // + // // create an importing external buffer on Nabla side + // 
IGPUBuffer::SCreationParams params = {}; + // params.size = devmemory->getAllocationSize(); + // params.usage = asset::IBuffer::EUF_STORAGE_BUFFER_BIT | asset::IBuffer::EUF_TRANSFER_SRC_BIT; + // params.externalHandleTypes = CCUDADevice::EXTERNAL_MEMORY_HANDLE_TYPE; + // importedBuf = m_device->createBuffer(std::move(params)); + // if (!importedBuf) + // logFail("Failed to create an external buffer"); + // + // // bind that imported IDeviceMemoryAllocation to the external buffer we've just created + // ILogicalDevice::SBindBufferMemoryInfo bindInfo = { .buffer = importedBuf.get(), .binding = {.memory = devmemory.get() } }; + // bool re = m_device->bindBufferMemory(1, &bindInfo); + // if (!re) logFail("Failed to bind CUDA memory to buffer"); + // } + // + // { + // // same thing as above + // // we create an external image and bind the imported external memory to it + // // now we have 2 different resources that are bound to the same memory + // IImage::SCreationParams params = {}; + // params.type = IGPUImage::ET_2D; + // params.samples = IGPUImage::ESCF_1_BIT; + // params.format = EF_R32_SFLOAT; + // params.extent = { gridDim[0], blockDim[0], 1 }; + // params.mipLevels = 1; + // params.arrayLayers = 1; + // params.usage = IGPUImage::EUF_TRANSFER_SRC_BIT; + // importedImg = cudaMemories[2]->createAndBindImage(m_device.get(), std::move(params)); + // if (!importedImg) logFail("Failed to create an external image"); + // } + // + // commandPool = m_device->createCommandPool(queue->getFamilyIndex(), IGPUCommandPool::CREATE_FLAGS::RESET_COMMAND_BUFFER_BIT); + // bool re = commandPool->createCommandBuffers(IGPUCommandPool::BUFFER_LEVEL::PRIMARY, 2, cmd, smart_refctd_ptr(m_logger)); + // + // stagingBufs[0] = createStaging(); + // stagingBufs[1] = createStaging(); + } + + // smart_refctd_ptr createExternalBuffer(IDeviceMemoryAllocation* mem) + // { + // IGPUBuffer::SCreationParams params = {}; + // params.size = mem->getAllocationSize(); + // params.usage = 
asset::IBuffer::EUF_TRANSFER_SRC_BIT | asset::IBuffer::EUF_TRANSFER_DST_BIT; + // params.externalHandleTypes = mem->getCreationParams().externalHandleType; + // auto buf = m_device->createBuffer(std::move(params)); + // ILogicalDevice::SBindBufferMemoryInfo bindInfo = { .buffer = buf.get(), .binding = {.memory = mem } }; + // m_device->bindBufferMemory(1, &bindInfo); + // return buf; + // } + + // smart_refctd_ptr createStaging(size_t sz = size) + // { + // auto buf = m_device->createBuffer({ {.size = sz, .usage = asset::IBuffer::EUF_TRANSFER_SRC_BIT | asset::IBuffer::EUF_TRANSFER_DST_BIT} }); + // auto req = buf->getMemoryReqs(); + // req.memoryTypeBits &= m_device->getPhysicalDevice()->getDownStreamingMemoryTypeBits(); + // auto allocation = m_device->allocate(req, buf.get()); + // + // void* mapping = allocation.memory->map(IDeviceMemoryAllocation::MemoryRange(0, req.size), IDeviceMemoryAllocation::EMCAF_READ); + // if (!mapping) + // logFail("Failed to map an staging buffer"); + // memset(mapping, 0, req.size); + // return buf; + // }; + + // void launchKernel(CUfunction kernel, CUstream stream) + // { + // + // // First we record a release ownership transfer to let vulkan know that resources are going to be used in an external API + // { + // IGPUCommandBuffer::SBufferMemoryBarrier bufBarrier = { + // .barrier = { + // .ownershipOp = IGPUCommandBuffer::SOwnershipTransferBarrier::OWNERSHIP_OP::RELEASE, + // .otherQueueFamilyIndex = IQueue::FamilyExternal, + // }, + // .range = {.buffer = importedBuf, }, + // }; + // + // IGPUCommandBuffer::SImageMemoryBarrier imgBarrier = { + // .barrier = { + // .ownershipOp = IGPUCommandBuffer::SOwnershipTransferBarrier::OWNERSHIP_OP::RELEASE, + // .otherQueueFamilyIndex = IQueue::FamilyExternal, + // }, + // .image = importedImg.get(), + // .subresourceRange = { + // .aspectMask = IImage::EAF_COLOR_BIT, + // .levelCount = 1u, + // .layerCount = 1u, + // } + // }; + // // start recording + // bool re = true; + // re &= 
cmd[0]->begin(IGPUCommandBuffer::USAGE::ONE_TIME_SUBMIT_BIT); + // re &= cmd[0]->pipelineBarrier(EDF_NONE, { .bufBarriers = std::span{&bufBarrier,&bufBarrier + 1}, .imgBarriers = {&imgBarrier,&imgBarrier + 1} }); + // re &= cmd[0]->end(); + // + // IQueue::SSubmitInfo::SSemaphoreInfo signalInfo = { .semaphore = semaphore.get(), .value = 1 }; + // IQueue::SSubmitInfo::SCommandBufferInfo cmdInfo = { cmd[0].get()}; + // IQueue::SSubmitInfo submitInfo = { .commandBuffers = {&cmdInfo, &cmdInfo + 1}, .signalSemaphores = {&signalInfo,&signalInfo + 1} }; + // auto submitRe = queue->submit({ &submitInfo,&submitInfo + 1 }); + // re &= IQueue::RESULT::SUCCESS == submitRe; + // if (!re) + // logFail("Something went wrong readying resources for CUDA"); + // } + // + // auto& cu = cudaHandler->getCUDAFunctionTable(); + // // Launch kernel + // { + // CUdeviceptr ptrs[] = { + // cudaMemories[0]->getDeviceptr(), + // cudaMemories[1]->getDeviceptr(), + // cudaMemories[2]->getDeviceptr(), + // }; + // auto numEles = numElements; + // void* parameters[] = { &ptrs[0], &ptrs[1], &ptrs[2], &numEles }; + // ASSERT_SUCCESS(cu.pcuMemcpyHtoDAsync_v2(ptrs[0], cpuBufs[0]->getPointer(), size, stream)); + // ASSERT_SUCCESS(cu.pcuMemcpyHtoDAsync_v2(ptrs[1], cpuBufs[1]->getPointer(), size, stream)); + // + // auto semaphore = cudaSemaphore->getInternalObject(); + // CUDA_EXTERNAL_SEMAPHORE_WAIT_PARAMS waitParams = { .params = {.fence = {.value = 1 } } }; + // ASSERT_SUCCESS(cu.pcuWaitExternalSemaphoresAsync(&semaphore, &waitParams, 1, stream)); // Wait for release op from vulkan + // ASSERT_SUCCESS(cu.pcuLaunchKernel(kernel, gridDim[0], gridDim[1], gridDim[2], blockDim[0], blockDim[1], blockDim[2], 0, stream, parameters, nullptr)); + // CUDA_EXTERNAL_SEMAPHORE_SIGNAL_PARAMS signalParams = { .params = {.fence = {.value = 2 } } }; + // ASSERT_SUCCESS(cu.pcuSignalExternalSemaphoresAsync(&semaphore, &signalParams, 1, stream)); // Signal the imported semaphore + // } + // + // // After the cuda kernel 
has signalled our exported vk semaphore, we will download the results through the buffer imported from CUDA + // { + // IGPUCommandBuffer::SBufferMemoryBarrier bufBarrier = { + // .barrier = { + // .dep = { + // .dstStageMask = PIPELINE_STAGE_FLAGS::COPY_BIT, + // .dstAccessMask = ACCESS_FLAGS::TRANSFER_READ_BIT, + // }, + // .ownershipOp = IGPUCommandBuffer::SOwnershipTransferBarrier::OWNERSHIP_OP::ACQUIRE, + // .otherQueueFamilyIndex = IQueue::FamilyExternal, + // }, + // .range = { .buffer = importedBuf, }, + // }; + // bool re = true; + // re &= cmd[1]->begin(IGPUCommandBuffer::USAGE::ONE_TIME_SUBMIT_BIT); + // + // re &= cmd[1]->pipelineBarrier(EDF_NONE, {.bufBarriers = std::span{&bufBarrier,&bufBarrier + 1}}); + // + // IGPUCommandBuffer::SBufferCopy region = { .size = size }; + // re &= cmd[1]->copyBuffer(importedBuf.get(), stagingBufs[0].get(), 1, ®ion); + // + // IGPUCommandBuffer::SImageMemoryBarrier imgBarrier = { + // .barrier = { + // .dep = { + // .dstStageMask = PIPELINE_STAGE_FLAGS::COPY_BIT, + // .dstAccessMask = ACCESS_FLAGS::MEMORY_READ_BITS, + // }, + // .ownershipOp = IGPUCommandBuffer::SOwnershipTransferBarrier::OWNERSHIP_OP::ACQUIRE, + // .otherQueueFamilyIndex = IQueue::FamilyExternal, + // }, + // .image = importedImg.get(), + // .subresourceRange = { + // .aspectMask = IImage::EAF_COLOR_BIT, + // .levelCount = 1u, + // .layerCount = 1u, + // }, + // .oldLayout = IImage::LAYOUT::PREINITIALIZED, + // .newLayout = IImage::LAYOUT::TRANSFER_SRC_OPTIMAL, + // }; + // + // re &= cmd[1]->pipelineBarrier(EDF_NONE, {.imgBarriers = {&imgBarrier,&imgBarrier + 1}}); + // + // IImage::SBufferCopy imgRegion = { + // .imageSubresource = { + // .aspectMask = imgBarrier.subresourceRange.aspectMask, + // .layerCount = imgBarrier.subresourceRange.layerCount, + // }, + // .imageExtent = importedImg->getCreationParameters().extent, + // }; + // + // re &= cmd[1]->copyImageToBuffer(importedImg.get(), imgBarrier.newLayout, stagingBufs[1].get(), 1, &imgRegion); + 
// re &= cmd[1]->end(); + // + // IQueue::SSubmitInfo::SSemaphoreInfo waitInfo= { .semaphore = semaphore.get(), .value = 2 }; + // IQueue::SSubmitInfo::SSemaphoreInfo signalInfo = { .semaphore = semaphore.get(), .value = 3 }; + // IQueue::SSubmitInfo::SCommandBufferInfo cmdInfo = { cmd[1].get() }; + // IQueue::SSubmitInfo submitInfo = { + // .waitSemaphores = {&waitInfo,&waitInfo + 1}, + // .commandBuffers = {&cmdInfo, &cmdInfo + 1}, + // .signalSemaphores = {&signalInfo,&signalInfo + 1} + // }; + // auto submitRe = queue->submit({ &submitInfo,&submitInfo + 1 }); + // re &= IQueue::RESULT::SUCCESS == submitRe; + // if (!re) + // logFail("Something went wrong copying results from CUDA"); + // } + // + // ASSERT_SUCCESS(cu.pcuLaunchHostFunc(stream, [](void* userData) { decltype(this)(userData)->kernelCallback(); }, this)); + // } + + // void kernelCallback() + // { + // // Make sure we are also done with the readback + // auto wait = std::array{ISemaphore::SWaitInfo{.semaphore = semaphore.get(), .value = 3}}; + // m_device->waitForSemaphores(wait, true, -1); + // + // float* A = reinterpret_cast(cpuBufs[0]->getPointer()); + // float* B = reinterpret_cast(cpuBufs[1]->getPointer()); + // + // float* CBuf = reinterpret_cast(stagingBufs[0]->getBoundMemory().memory->getMappedPointer()); + // float* CImg = reinterpret_cast(stagingBufs[1]->getBoundMemory().memory->getMappedPointer()); + // + // if(memcmp(CBuf, CImg, size)) + // logFail("Buffer and Image memories do not match!"); + // + // for (auto i = 0; i < numElements; i++) + // { + // bool re = (abs(CBuf[i] - A[i] - B[i]) < 0.01f) && (abs(CImg[i] - A[i] - B[i]) < 0.01f); + // if(!re) + // logFail("Element at index %d is incorrect!", i); + // } + // + // std::cout << "Success\n"; + // } + + + // void testInterop() + // { + // { + // IDeviceMemoryBacked::SDeviceMemoryRequirements reqs = { + // .size = size, + // .memoryTypeBits = m_physicalDevice->getDeviceLocalMemoryTypeBits(), + // .alignmentLog2 = 10, + // }; + // + // 
for (size_t i = 0; i < (1 << 8); ++i) + // { + // auto memory = m_device->allocate(reqs, 0, IDeviceMemoryAllocation::E_MEMORY_ALLOCATE_FLAGS::EMAF_NONE, CCUDADevice::EXTERNAL_MEMORY_HANDLE_TYPE).memory; + // assert(memory); + // auto tmpBuf = createExternalBuffer(memory.get()); + // } + // } + // + // smart_refctd_ptr escaped; + // { + // IDeviceMemoryBacked::SDeviceMemoryRequirements reqs = { + // .size = size, + // .memoryTypeBits = m_physicalDevice->getDeviceLocalMemoryTypeBits(), + // .alignmentLog2 = 10, + // }; + // + // auto memory = m_device->allocate(reqs, 0, IDeviceMemoryAllocation::E_MEMORY_ALLOCATE_FLAGS::EMAF_NONE, CCUDADevice::EXTERNAL_MEMORY_HANDLE_TYPE).memory; + // + // auto tmpBuf = createExternalBuffer(memory.get()); + // auto staging = createStaging(); + // + // auto ptr = (uint32_t*)staging->getBoundMemory().memory->getMappedPointer(); + // for (uint32_t i = 0; i < size / 4; ++i) + // ptr[i] = i; + // + // smart_refctd_ptr cmd; + // commandPool->createCommandBuffers(IGPUCommandPool::BUFFER_LEVEL::PRIMARY, 1, &cmd); + // cmd->begin(IGPUCommandBuffer::USAGE::ONE_TIME_SUBMIT_BIT); + // IGPUCommandBuffer::SBufferCopy region = { .size = size }; + // assert(cmd->copyBuffer(staging.get(), tmpBuf.get(), 1, ®ion)); + // cmd->end(); + // IQueue::SSubmitInfo::SCommandBufferInfo cmdInfo = { cmd.get() }; + // IQueue::SSubmitInfo submitInfo = { .commandBuffers = {&cmdInfo, &cmdInfo + 1} }; + // queue->submit({ &submitInfo,&submitInfo + 1 }); + // m_device->waitIdle(); + // escaped = m_device->allocate(reqs, 0, IDeviceMemoryAllocation::E_MEMORY_ALLOCATE_FLAGS::EMAF_NONE, CCUDADevice::EXTERNAL_MEMORY_HANDLE_TYPE, memory->getCreationParams().externalHandle).memory; + // } + // + // //{ + // // constexpr size_t M = 32; + // // auto staging = createStaging(size * M); + // + // // auto ptr = (uint32_t*)staging->getBoundMemory().memory->getMappedPointer(); + // // for (uint32_t i = 0; i < (M * size) / 4; ++i) + // // ptr[i] = rand(); + // + // // std::vector> cmd(1 
<< 10); + // // commandPool->createCommandBuffers(IGPUCommandPool::BUFFER_LEVEL::PRIMARY, 1 << 10, cmd.data()); + // + // // for (size_t i = 0; i < 1 << 10; ++i) + // // { + // // IDeviceMemoryBacked::SDeviceMemoryRequirements reqs = { + // // .size = size * M, + // // .memoryTypeBits = m_physicalDevice->getDeviceLocalMemoryTypeBits(), + // // .alignmentLog2 = 10, + // // }; + // // RE: + // // auto memory = m_device->allocate(reqs, 0, IDeviceMemoryAllocation::E_MEMORY_ALLOCATE_FLAGS::EMAF_NONE, CCUDADevice::EXTERNAL_MEMORY_HANDLE_TYPE).memory; + // + // // if (!memory) + // // { + // // m_device->waitIdle(); + // // for (size_t j = 0; j < i; ++j) + // // cmd[j] = 0; + // // goto END; + // // } + // // assert(memory); + // // auto tmpBuf = createExternalBuffer(memory.get()); + // + // // cmd[i]->begin(IGPUCommandBuffer::USAGE::ONE_TIME_SUBMIT_BIT); + // // IGPUCommandBuffer::SBufferCopy region = { .size = size * M }; + // // assert(cmd[i]->copyBuffer(staging.get(), tmpBuf.get(), 1, ®ion)); + // // cmd[i]->end(); + // // IQueue::SSubmitInfo::SCommandBufferInfo cmdInfo = { cmd[i].get() }; + // // IQueue::SSubmitInfo submitInfo = { .commandBuffers = {&cmdInfo, &cmdInfo + 1} }; + // // assert(IQueue::RESULT::SUCCESS == queue->submit({ &submitInfo,&submitInfo + 1 })); + // // } + // //END: + // // m_device->waitIdle(); + // //} + // + // { + // auto tmpBuf = createExternalBuffer(escaped.get()); + // auto staging = createStaging(); + // + // smart_refctd_ptr cmd; + // commandPool->createCommandBuffers(IGPUCommandPool::BUFFER_LEVEL::PRIMARY, 1, &cmd); + // cmd->begin(IGPUCommandBuffer::USAGE::ONE_TIME_SUBMIT_BIT); + // IGPUCommandBuffer::SBufferCopy region = { .size = size }; + // assert(cmd->copyBuffer(tmpBuf.get(), staging.get(), 1, ®ion)); + // cmd->end(); + // IQueue::SSubmitInfo::SCommandBufferInfo cmdInfo = { cmd.get() }; + // IQueue::SSubmitInfo submitInfo = { .commandBuffers = {&cmdInfo, &cmdInfo + 1} }; + // auto qre = queue->submit({ &submitInfo,&submitInfo + 1 
}); + // assert(IQueue::RESULT::SUCCESS == qre); + // m_device->waitIdle(); + // + // auto& ptr = *(std::array*)staging->getBoundMemory().memory->getMappedPointer(); + // for (uint32_t i = 0; i < size / 4; ++i) + // assert(ptr[i] == i); + // } + // + // } + + + // Whether to keep invoking the above. In this example because its headless GPU compute, we do all the work in the app initialization. + bool keepRunning() override { return false; } + + // Platforms like WASM expect the main entry point to periodically return control, hence if you want a crossplatform app, you have to let the framework deal with your "game loop" + void workLoopBody() override {} +}; + +NBL_MAIN_FUNC(CUDA2VKApp) \ No newline at end of file diff --git a/CMakeLists.txt b/CMakeLists.txt index d945c547a..7c7990c06 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -111,6 +111,7 @@ if(NBL_BUILD_EXAMPLES) endif() add_subdirectory(74_QuantizedSequenceTests) + add_subdirectory(76_CudaInterop) # add new examples *before* NBL_GET_ALL_TARGETS invocation, it gathers recursively all targets created so far in this subdirectory NBL_GET_ALL_TARGETS(TARGETS) From b8abd200a1a83ce4592f7ad3290d07ae02b4f538 Mon Sep 17 00:00:00 2001 From: kevyuu Date: Mon, 23 Mar 2026 17:00:19 +0700 Subject: [PATCH 2/9] Dummy --- 71_RayTracingPipeline/main.cpp | 2 +- 76_CudaInterop/main.cpp | 706 +++++++++++++++++---------------- 2 files changed, 359 insertions(+), 349 deletions(-) diff --git a/71_RayTracingPipeline/main.cpp b/71_RayTracingPipeline/main.cpp index f6b64c5ca..70ab21994 100644 --- a/71_RayTracingPipeline/main.cpp +++ b/71_RayTracingPipeline/main.cpp @@ -1245,7 +1245,7 @@ class RaytracingPipelineApp final : public SimpleWindowedApplication, public Bui auto retval = device->allocate(info); // map what is mappable by default so ReBAR checks succeed if (retval.isValid() && retval.memory->isMappable()) - retval.memory->map({ .offset = 0,.length = info.size }); + retval.memory->map({ .offset = 0,.length = 
info.allocationSize }); return retval; } diff --git a/76_CudaInterop/main.cpp b/76_CudaInterop/main.cpp index 85d10ad13..c4b4fd5fe 100644 --- a/76_CudaInterop/main.cpp +++ b/76_CudaInterop/main.cpp @@ -76,9 +76,9 @@ class CUDA2VKApp : public virtual MonoDeviceApplication, BuiltinResourcesApplica std::array, 2> cpuBufs; // // CUDA resources that we input to the kernel 'vectorAdd_kernel.cu' // // Kernel writes to cudaMemories[2] which we later use to export and read on nabla side - // std::array, 3> cudaMemories = {}; + std::array, 3> cudaMemories = {}; // // A semaphore created in CUDA which will alias a Nabla semaphore to help sync between the CUDA kernel and Nabla device to host transfer - // smart_refctd_ptr cudaSemaphore; + smart_refctd_ptr cudaSemaphore; // our Buffer that is bound to cudaMemories[2] smart_refctd_ptr importedBuf; @@ -155,7 +155,7 @@ class CUDA2VKApp : public virtual MonoDeviceApplication, BuiltinResourcesApplica ASSERT_SUCCESS(cu.pcuModuleGetFunction(&kernel, module, "vectorAdd")); ASSERT_SUCCESS(cu.pcuStreamCreate(&stream, CU_STREAM_NON_BLOCKING)); - // launchKernel(kernel, stream); + launchKernel(kernel, stream); ASSERT_SUCCESS(cu.pcuStreamSynchronize(stream)); ASSERT_SUCCESS(cu.pcuModuleUnload(module)); @@ -163,7 +163,7 @@ class CUDA2VKApp : public virtual MonoDeviceApplication, BuiltinResourcesApplica m_device->waitIdle(); - // testInterop(); + testInterop(); return true; } @@ -185,352 +185,362 @@ class CUDA2VKApp : public virtual MonoDeviceApplication, BuiltinResourcesApplica // create and allocate CUmem with CUDA and slap it inside a simple IReferenceCounted wrapper - // ASSERT_SUCCESS(cudaDevice->createSharedMemory(&cudaMemories[0], { .size = size, .alignment = sizeof(float), .location = CU_MEM_LOCATION_TYPE_DEVICE })); - // ASSERT_SUCCESS(cudaDevice->createSharedMemory(&cudaMemories[1], { .size = size, .alignment = sizeof(float), .location = CU_MEM_LOCATION_TYPE_DEVICE })); - // 
ASSERT_SUCCESS(cudaDevice->createSharedMemory(&cudaMemories[2], { .size = size, .alignment = sizeof(float), .location = CU_MEM_LOCATION_TYPE_DEVICE })); - // - // semaphore = m_device->createSemaphore(0, { .externalHandleTypes = ISemaphore::EHT_OPAQUE_WIN32 }); - // ASSERT_SUCCESS(cudaDevice->importGPUSemaphore(&cudaSemaphore, semaphore.get())); - // { - // // export the CUmem we have just created into a refctd IDeviceMemoryAllocation - // auto devmemory = cudaMemories[2]->exportAsMemory(m_device.get()); - // if (!devmemory) - // logFail("Failed to export CUDA memory!"); - // - // - // // create an importing external buffer on Nabla side - // IGPUBuffer::SCreationParams params = {}; - // params.size = devmemory->getAllocationSize(); - // params.usage = asset::IBuffer::EUF_STORAGE_BUFFER_BIT | asset::IBuffer::EUF_TRANSFER_SRC_BIT; - // params.externalHandleTypes = CCUDADevice::EXTERNAL_MEMORY_HANDLE_TYPE; - // importedBuf = m_device->createBuffer(std::move(params)); - // if (!importedBuf) - // logFail("Failed to create an external buffer"); - // - // // bind that imported IDeviceMemoryAllocation to the external buffer we've just created - // ILogicalDevice::SBindBufferMemoryInfo bindInfo = { .buffer = importedBuf.get(), .binding = {.memory = devmemory.get() } }; - // bool re = m_device->bindBufferMemory(1, &bindInfo); - // if (!re) logFail("Failed to bind CUDA memory to buffer"); - // } - // - // { - // // same thing as above - // // we create an external image and bind the imported external memory to it - // // now we have 2 different resources that are bound to the same memory - // IImage::SCreationParams params = {}; - // params.type = IGPUImage::ET_2D; - // params.samples = IGPUImage::ESCF_1_BIT; - // params.format = EF_R32_SFLOAT; - // params.extent = { gridDim[0], blockDim[0], 1 }; - // params.mipLevels = 1; - // params.arrayLayers = 1; - // params.usage = IGPUImage::EUF_TRANSFER_SRC_BIT; - // importedImg = cudaMemories[2]->createAndBindImage(m_device.get(), 
std::move(params)); - // if (!importedImg) logFail("Failed to create an external image"); - // } - // - // commandPool = m_device->createCommandPool(queue->getFamilyIndex(), IGPUCommandPool::CREATE_FLAGS::RESET_COMMAND_BUFFER_BIT); - // bool re = commandPool->createCommandBuffers(IGPUCommandPool::BUFFER_LEVEL::PRIMARY, 2, cmd, smart_refctd_ptr(m_logger)); - // - // stagingBufs[0] = createStaging(); - // stagingBufs[1] = createStaging(); + ASSERT_SUCCESS(cudaDevice->createSharedMemory(&cudaMemories[0], { .size = size, .alignment = sizeof(float), .location = CU_MEM_LOCATION_TYPE_DEVICE })); + ASSERT_SUCCESS(cudaDevice->createSharedMemory(&cudaMemories[1], { .size = size, .alignment = sizeof(float), .location = CU_MEM_LOCATION_TYPE_DEVICE })); + ASSERT_SUCCESS(cudaDevice->createSharedMemory(&cudaMemories[2], { .size = size, .alignment = sizeof(float), .location = CU_MEM_LOCATION_TYPE_DEVICE })); + + semaphore = m_device->createSemaphore(0, { .externalHandleTypes = ISemaphore::EHT_OPAQUE_WIN32 }); + ASSERT_SUCCESS(cudaDevice->importGPUSemaphore(&cudaSemaphore, semaphore.get())); + { + // export the CUmem we have just created into a refctd IDeviceMemoryAllocation + auto devmemory = cudaMemories[2]->exportAsMemory(m_device.get()); + if (!devmemory) + logFail("Failed to export CUDA memory!"); + + + // create an importing external buffer on Nabla side + IGPUBuffer::SCreationParams params = {}; + params.size = devmemory->getAllocationSize(); + params.usage = asset::IBuffer::EUF_STORAGE_BUFFER_BIT | asset::IBuffer::EUF_TRANSFER_SRC_BIT; + params.externalHandleTypes = CCUDADevice::EXTERNAL_MEMORY_HANDLE_TYPE; + importedBuf = m_device->createBuffer(std::move(params)); + if (!importedBuf) + logFail("Failed to create an external buffer"); + + // bind that imported IDeviceMemoryAllocation to the external buffer we've just created + ILogicalDevice::SBindBufferMemoryInfo bindInfo = { .buffer = importedBuf.get(), .binding = {.memory = devmemory.get() } }; + bool re = 
m_device->bindBufferMemory(1, &bindInfo); + if (!re) logFail("Failed to bind CUDA memory to buffer"); + } + + { + // same thing as above + // we create an external image and bind the imported external memory to it + // now we have 2 different resources that are bound to the same memory + + auto devmemory = cudaMemories[2]->exportAsMemory(m_device.get()); + if (!devmemory) + logFail("Failed to export CUDA memory!"); + + IGPUImage::SCreationParams params = {}; + params.type = IGPUImage::ET_2D; + params.samples = IGPUImage::ESCF_1_BIT; + params.format = EF_R32_SFLOAT; + params.extent = { gridDim[0], blockDim[0], 1 }; + params.mipLevels = 1; + params.arrayLayers = 1; + params.usage = IGPUImage::EUF_TRANSFER_SRC_BIT; + params.externalHandleTypes = CCUDADevice::EXTERNAL_MEMORY_HANDLE_TYPE; + importedImg = m_device->createImage(std::move(params)); + if (!importedImg) logFail("Failed to create an external image"); + // bind that imported IDeviceMemoryAllocation to the external buffer we've just created + ILogicalDevice::SBindImageMemoryInfo bindInfo = { .image = importedImg.get(), .binding = {.memory = devmemory.get() } }; + bool re = m_device->bindImageMemory(1, &bindInfo); + if (!re) logFail("Failed to bind CUDA memory to buffer"); + } + + commandPool = m_device->createCommandPool(queue->getFamilyIndex(), IGPUCommandPool::CREATE_FLAGS::RESET_COMMAND_BUFFER_BIT); + bool re = commandPool->createCommandBuffers(IGPUCommandPool::BUFFER_LEVEL::PRIMARY, 2, cmd, smart_refctd_ptr(m_logger)); + + stagingBufs[0] = createStaging(); + stagingBufs[1] = createStaging(); + } + + smart_refctd_ptr createExternalBuffer(IDeviceMemoryAllocation* mem) + { + IGPUBuffer::SCreationParams params = {}; + params.size = mem->getAllocationSize(); + params.usage = asset::IBuffer::EUF_TRANSFER_SRC_BIT | asset::IBuffer::EUF_TRANSFER_DST_BIT; + params.externalHandleTypes = mem->getCreationParams().externalHandleType; + auto buf = m_device->createBuffer(std::move(params)); + 
ILogicalDevice::SBindBufferMemoryInfo bindInfo = { .buffer = buf.get(), .binding = {.memory = mem } }; + m_device->bindBufferMemory(1, &bindInfo); + return buf; + } + + smart_refctd_ptr createStaging(size_t sz = size) + { + auto buf = m_device->createBuffer({ {.size = sz, .usage = asset::IBuffer::EUF_TRANSFER_SRC_BIT | asset::IBuffer::EUF_TRANSFER_DST_BIT} }); + auto req = buf->getMemoryReqs(); + req.memoryTypeBits &= m_device->getPhysicalDevice()->getDownStreamingMemoryTypeBits(); + auto allocation = m_device->allocate(req, buf.get()); + + void* mapping = allocation.memory->map(IDeviceMemoryAllocation::MemoryRange(0, req.size), IDeviceMemoryAllocation::EMCAF_READ); + if (!mapping) + logFail("Failed to map an staging buffer"); + memset(mapping, 0, req.size); + return buf; + }; + + void launchKernel(CUfunction kernel, CUstream stream) + { + + // First we record a release ownership transfer to let vulkan know that resources are going to be used in an external API + { + IGPUCommandBuffer::SBufferMemoryBarrier bufBarrier = { + .barrier = { + .ownershipOp = IGPUCommandBuffer::SOwnershipTransferBarrier::OWNERSHIP_OP::RELEASE, + .otherQueueFamilyIndex = IQueue::FamilyExternal, + }, + .range = {.buffer = importedBuf, }, + }; + + IGPUCommandBuffer::SImageMemoryBarrier imgBarrier = { + .barrier = { + .ownershipOp = IGPUCommandBuffer::SOwnershipTransferBarrier::OWNERSHIP_OP::RELEASE, + .otherQueueFamilyIndex = IQueue::FamilyExternal, + }, + .image = importedImg.get(), + .subresourceRange = { + .aspectMask = IImage::EAF_COLOR_BIT, + .levelCount = 1u, + .layerCount = 1u, + } + }; + // start recording + bool re = true; + re &= cmd[0]->begin(IGPUCommandBuffer::USAGE::ONE_TIME_SUBMIT_BIT); + re &= cmd[0]->pipelineBarrier(EDF_NONE, { .bufBarriers = std::span{&bufBarrier,&bufBarrier + 1}, .imgBarriers = {&imgBarrier,&imgBarrier + 1} }); + re &= cmd[0]->end(); + + IQueue::SSubmitInfo::SSemaphoreInfo signalInfo = { .semaphore = semaphore.get(), .value = 1 }; + 
IQueue::SSubmitInfo::SCommandBufferInfo cmdInfo = { cmd[0].get()}; + IQueue::SSubmitInfo submitInfo = { .commandBuffers = {&cmdInfo, &cmdInfo + 1}, .signalSemaphores = {&signalInfo,&signalInfo + 1} }; + auto submitRe = queue->submit({ &submitInfo,&submitInfo + 1 }); + re &= IQueue::RESULT::SUCCESS == submitRe; + if (!re) + logFail("Something went wrong readying resources for CUDA"); + } + + auto& cu = cudaHandler->getCUDAFunctionTable(); + // Launch kernel + { + CUdeviceptr ptrs[] = { + cudaMemories[0]->getDeviceptr(), + cudaMemories[1]->getDeviceptr(), + cudaMemories[2]->getDeviceptr(), + }; + auto numEles = numElements; + void* parameters[] = { &ptrs[0], &ptrs[1], &ptrs[2], &numEles }; + ASSERT_SUCCESS(cu.pcuMemcpyHtoDAsync_v2(ptrs[0], cpuBufs[0]->getPointer(), size, stream)); + ASSERT_SUCCESS(cu.pcuMemcpyHtoDAsync_v2(ptrs[1], cpuBufs[1]->getPointer(), size, stream)); + + auto semaphore = cudaSemaphore->getInternalObject(); + CUDA_EXTERNAL_SEMAPHORE_WAIT_PARAMS waitParams = { .params = {.fence = {.value = 1 } } }; + ASSERT_SUCCESS(cu.pcuWaitExternalSemaphoresAsync(&semaphore, &waitParams, 1, stream)); // Wait for release op from vulkan + ASSERT_SUCCESS(cu.pcuLaunchKernel(kernel, gridDim[0], gridDim[1], gridDim[2], blockDim[0], blockDim[1], blockDim[2], 0, stream, parameters, nullptr)); + CUDA_EXTERNAL_SEMAPHORE_SIGNAL_PARAMS signalParams = { .params = {.fence = {.value = 2 } } }; + // ASSERT_SUCCESS(cu.pcuSignalExternalSemaphoresAsync(&semaphore, &signalParams, 1, stream)); // Signal the imported semaphore + } + + // After the cuda kernel has signalled our exported vk semaphore, we will download the results through the buffer imported from CUDA + { + IGPUCommandBuffer::SBufferMemoryBarrier bufBarrier = { + .barrier = { + .dep = { + .dstStageMask = PIPELINE_STAGE_FLAGS::COPY_BIT, + .dstAccessMask = ACCESS_FLAGS::TRANSFER_READ_BIT, + }, + .ownershipOp = IGPUCommandBuffer::SOwnershipTransferBarrier::OWNERSHIP_OP::ACQUIRE, + .otherQueueFamilyIndex = 
IQueue::FamilyExternal, + }, + .range = { .buffer = importedBuf, }, + }; + bool re = true; + re &= cmd[1]->begin(IGPUCommandBuffer::USAGE::ONE_TIME_SUBMIT_BIT); + + re &= cmd[1]->pipelineBarrier(EDF_NONE, {.bufBarriers = std::span{&bufBarrier,&bufBarrier + 1}}); + + IGPUCommandBuffer::SBufferCopy region = { .size = size }; + re &= cmd[1]->copyBuffer(importedBuf.get(), stagingBufs[0].get(), 1, ®ion); + + IGPUCommandBuffer::SImageMemoryBarrier imgBarrier = { + .barrier = { + .dep = { + .dstStageMask = PIPELINE_STAGE_FLAGS::COPY_BIT, + .dstAccessMask = ACCESS_FLAGS::MEMORY_READ_BITS, + }, + .ownershipOp = IGPUCommandBuffer::SOwnershipTransferBarrier::OWNERSHIP_OP::ACQUIRE, + .otherQueueFamilyIndex = IQueue::FamilyExternal, + }, + .image = importedImg.get(), + .subresourceRange = { + .aspectMask = IImage::EAF_COLOR_BIT, + .levelCount = 1u, + .layerCount = 1u, + }, + .oldLayout = IImage::LAYOUT::PREINITIALIZED, + .newLayout = IImage::LAYOUT::TRANSFER_SRC_OPTIMAL, + }; + + re &= cmd[1]->pipelineBarrier(EDF_NONE, {.imgBarriers = {&imgBarrier,&imgBarrier + 1}}); + + IImage::SBufferCopy imgRegion = { + .imageSubresource = { + .aspectMask = imgBarrier.subresourceRange.aspectMask, + .layerCount = imgBarrier.subresourceRange.layerCount, + }, + .imageExtent = importedImg->getCreationParameters().extent, + }; + + re &= cmd[1]->copyImageToBuffer(importedImg.get(), imgBarrier.newLayout, stagingBufs[1].get(), 1, &imgRegion); + re &= cmd[1]->end(); + + IQueue::SSubmitInfo::SSemaphoreInfo waitInfo= { .semaphore = semaphore.get(), .value = 2 }; + IQueue::SSubmitInfo::SSemaphoreInfo signalInfo = { .semaphore = semaphore.get(), .value = 3 }; + IQueue::SSubmitInfo::SCommandBufferInfo cmdInfo = { cmd[1].get() }; + IQueue::SSubmitInfo submitInfo = { + .waitSemaphores = {&waitInfo,&waitInfo + 1}, + .commandBuffers = {&cmdInfo, &cmdInfo + 1}, + .signalSemaphores = {&signalInfo,&signalInfo + 1} + }; + auto submitRe = queue->submit({ &submitInfo,&submitInfo + 1 }); + re &= 
IQueue::RESULT::SUCCESS == submitRe; + if (!re) + logFail("Something went wrong copying results from CUDA"); + } + + ASSERT_SUCCESS(cu.pcuLaunchHostFunc(stream, [](void* userData) { decltype(this)(userData)->kernelCallback(); }, this)); + } + + void kernelCallback() + { + // Make sure we are also done with the readback + auto wait = std::array{ISemaphore::SWaitInfo{.semaphore = semaphore.get(), .value = 2}}; + m_device->waitForSemaphores(wait, true, -1); + + float* A = reinterpret_cast(cpuBufs[0]->getPointer()); + float* B = reinterpret_cast(cpuBufs[1]->getPointer()); + + float* CBuf = reinterpret_cast(stagingBufs[0]->getBoundMemory().memory->getMappedPointer()); + float* CImg = reinterpret_cast(stagingBufs[1]->getBoundMemory().memory->getMappedPointer()); + + if(memcmp(CBuf, CImg, size)) + logFail("Buffer and Image memories do not match!"); + + for (auto i = 0; i < numElements; i++) + { + bool re = (abs(CBuf[i] - A[i] - B[i]) < 0.01f) && (abs(CImg[i] - A[i] - B[i]) < 0.01f); + if(!re) + logFail("Element at index %d is incorrect!", i); + } + + std::cout << "Success\n"; } - // smart_refctd_ptr createExternalBuffer(IDeviceMemoryAllocation* mem) - // { - // IGPUBuffer::SCreationParams params = {}; - // params.size = mem->getAllocationSize(); - // params.usage = asset::IBuffer::EUF_TRANSFER_SRC_BIT | asset::IBuffer::EUF_TRANSFER_DST_BIT; - // params.externalHandleTypes = mem->getCreationParams().externalHandleType; - // auto buf = m_device->createBuffer(std::move(params)); - // ILogicalDevice::SBindBufferMemoryInfo bindInfo = { .buffer = buf.get(), .binding = {.memory = mem } }; - // m_device->bindBufferMemory(1, &bindInfo); - // return buf; - // } - - // smart_refctd_ptr createStaging(size_t sz = size) - // { - // auto buf = m_device->createBuffer({ {.size = sz, .usage = asset::IBuffer::EUF_TRANSFER_SRC_BIT | asset::IBuffer::EUF_TRANSFER_DST_BIT} }); - // auto req = buf->getMemoryReqs(); - // req.memoryTypeBits &= 
m_device->getPhysicalDevice()->getDownStreamingMemoryTypeBits(); - // auto allocation = m_device->allocate(req, buf.get()); - // - // void* mapping = allocation.memory->map(IDeviceMemoryAllocation::MemoryRange(0, req.size), IDeviceMemoryAllocation::EMCAF_READ); - // if (!mapping) - // logFail("Failed to map an staging buffer"); - // memset(mapping, 0, req.size); - // return buf; - // }; - - // void launchKernel(CUfunction kernel, CUstream stream) - // { - // - // // First we record a release ownership transfer to let vulkan know that resources are going to be used in an external API - // { - // IGPUCommandBuffer::SBufferMemoryBarrier bufBarrier = { - // .barrier = { - // .ownershipOp = IGPUCommandBuffer::SOwnershipTransferBarrier::OWNERSHIP_OP::RELEASE, - // .otherQueueFamilyIndex = IQueue::FamilyExternal, - // }, - // .range = {.buffer = importedBuf, }, - // }; - // - // IGPUCommandBuffer::SImageMemoryBarrier imgBarrier = { - // .barrier = { - // .ownershipOp = IGPUCommandBuffer::SOwnershipTransferBarrier::OWNERSHIP_OP::RELEASE, - // .otherQueueFamilyIndex = IQueue::FamilyExternal, - // }, - // .image = importedImg.get(), - // .subresourceRange = { - // .aspectMask = IImage::EAF_COLOR_BIT, - // .levelCount = 1u, - // .layerCount = 1u, - // } - // }; - // // start recording - // bool re = true; - // re &= cmd[0]->begin(IGPUCommandBuffer::USAGE::ONE_TIME_SUBMIT_BIT); - // re &= cmd[0]->pipelineBarrier(EDF_NONE, { .bufBarriers = std::span{&bufBarrier,&bufBarrier + 1}, .imgBarriers = {&imgBarrier,&imgBarrier + 1} }); - // re &= cmd[0]->end(); - // - // IQueue::SSubmitInfo::SSemaphoreInfo signalInfo = { .semaphore = semaphore.get(), .value = 1 }; - // IQueue::SSubmitInfo::SCommandBufferInfo cmdInfo = { cmd[0].get()}; - // IQueue::SSubmitInfo submitInfo = { .commandBuffers = {&cmdInfo, &cmdInfo + 1}, .signalSemaphores = {&signalInfo,&signalInfo + 1} }; - // auto submitRe = queue->submit({ &submitInfo,&submitInfo + 1 }); - // re &= IQueue::RESULT::SUCCESS == submitRe; - 
// if (!re) - // logFail("Something went wrong readying resources for CUDA"); - // } - // - // auto& cu = cudaHandler->getCUDAFunctionTable(); - // // Launch kernel - // { - // CUdeviceptr ptrs[] = { - // cudaMemories[0]->getDeviceptr(), - // cudaMemories[1]->getDeviceptr(), - // cudaMemories[2]->getDeviceptr(), - // }; - // auto numEles = numElements; - // void* parameters[] = { &ptrs[0], &ptrs[1], &ptrs[2], &numEles }; - // ASSERT_SUCCESS(cu.pcuMemcpyHtoDAsync_v2(ptrs[0], cpuBufs[0]->getPointer(), size, stream)); - // ASSERT_SUCCESS(cu.pcuMemcpyHtoDAsync_v2(ptrs[1], cpuBufs[1]->getPointer(), size, stream)); - // - // auto semaphore = cudaSemaphore->getInternalObject(); - // CUDA_EXTERNAL_SEMAPHORE_WAIT_PARAMS waitParams = { .params = {.fence = {.value = 1 } } }; - // ASSERT_SUCCESS(cu.pcuWaitExternalSemaphoresAsync(&semaphore, &waitParams, 1, stream)); // Wait for release op from vulkan - // ASSERT_SUCCESS(cu.pcuLaunchKernel(kernel, gridDim[0], gridDim[1], gridDim[2], blockDim[0], blockDim[1], blockDim[2], 0, stream, parameters, nullptr)); - // CUDA_EXTERNAL_SEMAPHORE_SIGNAL_PARAMS signalParams = { .params = {.fence = {.value = 2 } } }; - // ASSERT_SUCCESS(cu.pcuSignalExternalSemaphoresAsync(&semaphore, &signalParams, 1, stream)); // Signal the imported semaphore - // } - // - // // After the cuda kernel has signalled our exported vk semaphore, we will download the results through the buffer imported from CUDA - // { - // IGPUCommandBuffer::SBufferMemoryBarrier bufBarrier = { - // .barrier = { - // .dep = { - // .dstStageMask = PIPELINE_STAGE_FLAGS::COPY_BIT, - // .dstAccessMask = ACCESS_FLAGS::TRANSFER_READ_BIT, - // }, - // .ownershipOp = IGPUCommandBuffer::SOwnershipTransferBarrier::OWNERSHIP_OP::ACQUIRE, - // .otherQueueFamilyIndex = IQueue::FamilyExternal, - // }, - // .range = { .buffer = importedBuf, }, - // }; - // bool re = true; - // re &= cmd[1]->begin(IGPUCommandBuffer::USAGE::ONE_TIME_SUBMIT_BIT); - // - // re &= cmd[1]->pipelineBarrier(EDF_NONE, 
{.bufBarriers = std::span{&bufBarrier,&bufBarrier + 1}}); - // - // IGPUCommandBuffer::SBufferCopy region = { .size = size }; - // re &= cmd[1]->copyBuffer(importedBuf.get(), stagingBufs[0].get(), 1, ®ion); - // - // IGPUCommandBuffer::SImageMemoryBarrier imgBarrier = { - // .barrier = { - // .dep = { - // .dstStageMask = PIPELINE_STAGE_FLAGS::COPY_BIT, - // .dstAccessMask = ACCESS_FLAGS::MEMORY_READ_BITS, - // }, - // .ownershipOp = IGPUCommandBuffer::SOwnershipTransferBarrier::OWNERSHIP_OP::ACQUIRE, - // .otherQueueFamilyIndex = IQueue::FamilyExternal, - // }, - // .image = importedImg.get(), - // .subresourceRange = { - // .aspectMask = IImage::EAF_COLOR_BIT, - // .levelCount = 1u, - // .layerCount = 1u, - // }, - // .oldLayout = IImage::LAYOUT::PREINITIALIZED, - // .newLayout = IImage::LAYOUT::TRANSFER_SRC_OPTIMAL, - // }; - // - // re &= cmd[1]->pipelineBarrier(EDF_NONE, {.imgBarriers = {&imgBarrier,&imgBarrier + 1}}); - // - // IImage::SBufferCopy imgRegion = { - // .imageSubresource = { - // .aspectMask = imgBarrier.subresourceRange.aspectMask, - // .layerCount = imgBarrier.subresourceRange.layerCount, - // }, - // .imageExtent = importedImg->getCreationParameters().extent, - // }; - // - // re &= cmd[1]->copyImageToBuffer(importedImg.get(), imgBarrier.newLayout, stagingBufs[1].get(), 1, &imgRegion); - // re &= cmd[1]->end(); - // - // IQueue::SSubmitInfo::SSemaphoreInfo waitInfo= { .semaphore = semaphore.get(), .value = 2 }; - // IQueue::SSubmitInfo::SSemaphoreInfo signalInfo = { .semaphore = semaphore.get(), .value = 3 }; - // IQueue::SSubmitInfo::SCommandBufferInfo cmdInfo = { cmd[1].get() }; - // IQueue::SSubmitInfo submitInfo = { - // .waitSemaphores = {&waitInfo,&waitInfo + 1}, - // .commandBuffers = {&cmdInfo, &cmdInfo + 1}, - // .signalSemaphores = {&signalInfo,&signalInfo + 1} - // }; - // auto submitRe = queue->submit({ &submitInfo,&submitInfo + 1 }); - // re &= IQueue::RESULT::SUCCESS == submitRe; - // if (!re) - // logFail("Something went wrong 
copying results from CUDA"); - // } - // - // ASSERT_SUCCESS(cu.pcuLaunchHostFunc(stream, [](void* userData) { decltype(this)(userData)->kernelCallback(); }, this)); - // } - - // void kernelCallback() - // { - // // Make sure we are also done with the readback - // auto wait = std::array{ISemaphore::SWaitInfo{.semaphore = semaphore.get(), .value = 3}}; - // m_device->waitForSemaphores(wait, true, -1); - // - // float* A = reinterpret_cast(cpuBufs[0]->getPointer()); - // float* B = reinterpret_cast(cpuBufs[1]->getPointer()); - // - // float* CBuf = reinterpret_cast(stagingBufs[0]->getBoundMemory().memory->getMappedPointer()); - // float* CImg = reinterpret_cast(stagingBufs[1]->getBoundMemory().memory->getMappedPointer()); - // - // if(memcmp(CBuf, CImg, size)) - // logFail("Buffer and Image memories do not match!"); - // - // for (auto i = 0; i < numElements; i++) - // { - // bool re = (abs(CBuf[i] - A[i] - B[i]) < 0.01f) && (abs(CImg[i] - A[i] - B[i]) < 0.01f); - // if(!re) - // logFail("Element at index %d is incorrect!", i); - // } - // - // std::cout << "Success\n"; - // } - - - // void testInterop() - // { - // { - // IDeviceMemoryBacked::SDeviceMemoryRequirements reqs = { - // .size = size, - // .memoryTypeBits = m_physicalDevice->getDeviceLocalMemoryTypeBits(), - // .alignmentLog2 = 10, - // }; - // - // for (size_t i = 0; i < (1 << 8); ++i) - // { - // auto memory = m_device->allocate(reqs, 0, IDeviceMemoryAllocation::E_MEMORY_ALLOCATE_FLAGS::EMAF_NONE, CCUDADevice::EXTERNAL_MEMORY_HANDLE_TYPE).memory; - // assert(memory); - // auto tmpBuf = createExternalBuffer(memory.get()); - // } - // } - // - // smart_refctd_ptr escaped; - // { - // IDeviceMemoryBacked::SDeviceMemoryRequirements reqs = { - // .size = size, - // .memoryTypeBits = m_physicalDevice->getDeviceLocalMemoryTypeBits(), - // .alignmentLog2 = 10, - // }; - // - // auto memory = m_device->allocate(reqs, 0, IDeviceMemoryAllocation::E_MEMORY_ALLOCATE_FLAGS::EMAF_NONE, 
CCUDADevice::EXTERNAL_MEMORY_HANDLE_TYPE).memory; - // - // auto tmpBuf = createExternalBuffer(memory.get()); - // auto staging = createStaging(); - // - // auto ptr = (uint32_t*)staging->getBoundMemory().memory->getMappedPointer(); - // for (uint32_t i = 0; i < size / 4; ++i) - // ptr[i] = i; - // - // smart_refctd_ptr cmd; - // commandPool->createCommandBuffers(IGPUCommandPool::BUFFER_LEVEL::PRIMARY, 1, &cmd); - // cmd->begin(IGPUCommandBuffer::USAGE::ONE_TIME_SUBMIT_BIT); - // IGPUCommandBuffer::SBufferCopy region = { .size = size }; - // assert(cmd->copyBuffer(staging.get(), tmpBuf.get(), 1, ®ion)); - // cmd->end(); - // IQueue::SSubmitInfo::SCommandBufferInfo cmdInfo = { cmd.get() }; - // IQueue::SSubmitInfo submitInfo = { .commandBuffers = {&cmdInfo, &cmdInfo + 1} }; - // queue->submit({ &submitInfo,&submitInfo + 1 }); - // m_device->waitIdle(); - // escaped = m_device->allocate(reqs, 0, IDeviceMemoryAllocation::E_MEMORY_ALLOCATE_FLAGS::EMAF_NONE, CCUDADevice::EXTERNAL_MEMORY_HANDLE_TYPE, memory->getCreationParams().externalHandle).memory; - // } - // - // //{ - // // constexpr size_t M = 32; - // // auto staging = createStaging(size * M); - // - // // auto ptr = (uint32_t*)staging->getBoundMemory().memory->getMappedPointer(); - // // for (uint32_t i = 0; i < (M * size) / 4; ++i) - // // ptr[i] = rand(); - // - // // std::vector> cmd(1 << 10); - // // commandPool->createCommandBuffers(IGPUCommandPool::BUFFER_LEVEL::PRIMARY, 1 << 10, cmd.data()); - // - // // for (size_t i = 0; i < 1 << 10; ++i) - // // { - // // IDeviceMemoryBacked::SDeviceMemoryRequirements reqs = { - // // .size = size * M, - // // .memoryTypeBits = m_physicalDevice->getDeviceLocalMemoryTypeBits(), - // // .alignmentLog2 = 10, - // // }; - // // RE: - // // auto memory = m_device->allocate(reqs, 0, IDeviceMemoryAllocation::E_MEMORY_ALLOCATE_FLAGS::EMAF_NONE, CCUDADevice::EXTERNAL_MEMORY_HANDLE_TYPE).memory; - // - // // if (!memory) - // // { - // // m_device->waitIdle(); - // // for 
(size_t j = 0; j < i; ++j) - // // cmd[j] = 0; - // // goto END; - // // } - // // assert(memory); - // // auto tmpBuf = createExternalBuffer(memory.get()); - // - // // cmd[i]->begin(IGPUCommandBuffer::USAGE::ONE_TIME_SUBMIT_BIT); - // // IGPUCommandBuffer::SBufferCopy region = { .size = size * M }; - // // assert(cmd[i]->copyBuffer(staging.get(), tmpBuf.get(), 1, ®ion)); - // // cmd[i]->end(); - // // IQueue::SSubmitInfo::SCommandBufferInfo cmdInfo = { cmd[i].get() }; - // // IQueue::SSubmitInfo submitInfo = { .commandBuffers = {&cmdInfo, &cmdInfo + 1} }; - // // assert(IQueue::RESULT::SUCCESS == queue->submit({ &submitInfo,&submitInfo + 1 })); - // // } - // //END: - // // m_device->waitIdle(); - // //} - // - // { - // auto tmpBuf = createExternalBuffer(escaped.get()); - // auto staging = createStaging(); - // - // smart_refctd_ptr cmd; - // commandPool->createCommandBuffers(IGPUCommandPool::BUFFER_LEVEL::PRIMARY, 1, &cmd); - // cmd->begin(IGPUCommandBuffer::USAGE::ONE_TIME_SUBMIT_BIT); - // IGPUCommandBuffer::SBufferCopy region = { .size = size }; - // assert(cmd->copyBuffer(tmpBuf.get(), staging.get(), 1, ®ion)); - // cmd->end(); - // IQueue::SSubmitInfo::SCommandBufferInfo cmdInfo = { cmd.get() }; - // IQueue::SSubmitInfo submitInfo = { .commandBuffers = {&cmdInfo, &cmdInfo + 1} }; - // auto qre = queue->submit({ &submitInfo,&submitInfo + 1 }); - // assert(IQueue::RESULT::SUCCESS == qre); - // m_device->waitIdle(); - // - // auto& ptr = *(std::array*)staging->getBoundMemory().memory->getMappedPointer(); - // for (uint32_t i = 0; i < size / 4; ++i) - // assert(ptr[i] == i); - // } - // - // } + + void testInterop() + { + { + IDeviceMemoryBacked::SDeviceMemoryRequirements reqs = { + .size = size, + .memoryTypeBits = m_physicalDevice->getDeviceLocalMemoryTypeBits(), + .alignmentLog2 = 10, + }; + + for (size_t i = 0; i < (1 << 8); ++i) + { + auto memory = m_device->allocate(reqs, 0, IDeviceMemoryAllocation::E_MEMORY_ALLOCATE_FLAGS::EMAF_NONE, 
CCUDADevice::EXTERNAL_MEMORY_HANDLE_TYPE).memory; + assert(memory); + auto tmpBuf = createExternalBuffer(memory.get()); + } + } + + smart_refctd_ptr escaped; + { + IDeviceMemoryBacked::SDeviceMemoryRequirements reqs = { + .size = size, + .memoryTypeBits = m_physicalDevice->getDeviceLocalMemoryTypeBits(), + .alignmentLog2 = 10, + }; + + auto memory = m_device->allocate(reqs, 0, IDeviceMemoryAllocation::E_MEMORY_ALLOCATE_FLAGS::EMAF_NONE, CCUDADevice::EXTERNAL_MEMORY_HANDLE_TYPE).memory; + + auto tmpBuf = createExternalBuffer(memory.get()); + auto staging = createStaging(); + + auto ptr = (uint32_t*)staging->getBoundMemory().memory->getMappedPointer(); + for (uint32_t i = 0; i < size / 4; ++i) + ptr[i] = i; + + smart_refctd_ptr cmd; + commandPool->createCommandBuffers(IGPUCommandPool::BUFFER_LEVEL::PRIMARY, 1, &cmd); + cmd->begin(IGPUCommandBuffer::USAGE::ONE_TIME_SUBMIT_BIT); + IGPUCommandBuffer::SBufferCopy region = { .size = size }; + assert(cmd->copyBuffer(staging.get(), tmpBuf.get(), 1, ®ion)); + cmd->end(); + IQueue::SSubmitInfo::SCommandBufferInfo cmdInfo = { cmd.get() }; + IQueue::SSubmitInfo submitInfo = { .commandBuffers = {&cmdInfo, &cmdInfo + 1} }; + queue->submit({ &submitInfo,&submitInfo + 1 }); + m_device->waitIdle(); + escaped = m_device->allocate(reqs, 0, IDeviceMemoryAllocation::E_MEMORY_ALLOCATE_FLAGS::EMAF_NONE, CCUDADevice::EXTERNAL_MEMORY_HANDLE_TYPE, memory->getCreationParams().externalHandle).memory; + } + + { + constexpr size_t M = 32; + auto staging = createStaging(size * M); + + auto ptr = (uint32_t*)staging->getBoundMemory().memory->getMappedPointer(); + for (uint32_t i = 0; i < (M * size) / 4; ++i) + ptr[i] = rand(); + + std::vector> cmd(1 << 10); + commandPool->createCommandBuffers(IGPUCommandPool::BUFFER_LEVEL::PRIMARY, 1 << 10, cmd.data()); + + for (size_t i = 0; i < 1 << 10; ++i) + { + IDeviceMemoryBacked::SDeviceMemoryRequirements reqs = { + .size = size * M, + .memoryTypeBits = m_physicalDevice->getDeviceLocalMemoryTypeBits(), + 
.alignmentLog2 = 10, + }; + RE: + auto memory = m_device->allocate(reqs, 0, IDeviceMemoryAllocation::E_MEMORY_ALLOCATE_FLAGS::EMAF_NONE, CCUDADevice::EXTERNAL_MEMORY_HANDLE_TYPE).memory; + + if (!memory) + { + m_device->waitIdle(); + for (size_t j = 0; j < i; ++j) + cmd[j] = 0; + goto END; + } + assert(memory); + auto tmpBuf = createExternalBuffer(memory.get()); + + cmd[i]->begin(IGPUCommandBuffer::USAGE::ONE_TIME_SUBMIT_BIT); + IGPUCommandBuffer::SBufferCopy region = { .size = size * M }; + assert(cmd[i]->copyBuffer(staging.get(), tmpBuf.get(), 1, ®ion)); + cmd[i]->end(); + IQueue::SSubmitInfo::SCommandBufferInfo cmdInfo = { cmd[i].get() }; + IQueue::SSubmitInfo submitInfo = { .commandBuffers = {&cmdInfo, &cmdInfo + 1} }; + assert(IQueue::RESULT::SUCCESS == queue->submit({ &submitInfo,&submitInfo + 1 })); + } + END: + m_device->waitIdle(); + } + + { + auto tmpBuf = createExternalBuffer(escaped.get()); + auto staging = createStaging(); + + smart_refctd_ptr cmd; + commandPool->createCommandBuffers(IGPUCommandPool::BUFFER_LEVEL::PRIMARY, 1, &cmd); + cmd->begin(IGPUCommandBuffer::USAGE::ONE_TIME_SUBMIT_BIT); + IGPUCommandBuffer::SBufferCopy region = { .size = size }; + assert(cmd->copyBuffer(tmpBuf.get(), staging.get(), 1, ®ion)); + cmd->end(); + IQueue::SSubmitInfo::SCommandBufferInfo cmdInfo = { cmd.get() }; + IQueue::SSubmitInfo submitInfo = { .commandBuffers = {&cmdInfo, &cmdInfo + 1} }; + auto qre = queue->submit({ &submitInfo,&submitInfo + 1 }); + assert(IQueue::RESULT::SUCCESS == qre); + m_device->waitIdle(); + + auto& ptr = *(std::array*)staging->getBoundMemory().memory->getMappedPointer(); + for (uint32_t i = 0; i < size / 4; ++i) + assert(ptr[i] == i); + } + + } // Whether to keep invoking the above. In this example because its headless GPU compute, we do all the work in the app initialization. 
From 93ca5efe588ca85c1eaf81a486b611df98403580 Mon Sep 17 00:00:00 2001 From: kevyuu Date: Thu, 23 Apr 2026 01:09:08 +0700 Subject: [PATCH 3/9] Refactor test into separate section --- .../app_resources/vectorAdd_kernel.cu | 6 +- 76_CudaInterop/main.cpp | 686 +++++++++--------- 2 files changed, 350 insertions(+), 342 deletions(-) diff --git a/76_CudaInterop/app_resources/vectorAdd_kernel.cu b/76_CudaInterop/app_resources/vectorAdd_kernel.cu index 3baef0123..35876a627 100644 --- a/76_CudaInterop/app_resources/vectorAdd_kernel.cu +++ b/76_CudaInterop/app_resources/vectorAdd_kernel.cu @@ -33,10 +33,8 @@ */ extern "C" __global__ void vectorAdd(const float *A, const float *B, float *C, - int numElements) { + size_t numElements) { int i = blockDim.x * blockIdx.x + threadIdx.x; - - if (i < numElements) { + if (i < numElements) C[i] = A[i] + B[i]; - } } \ No newline at end of file diff --git a/76_CudaInterop/main.cpp b/76_CudaInterop/main.cpp index c4b4fd5fe..2a64f9428 100644 --- a/76_CudaInterop/main.cpp +++ b/76_CudaInterop/main.cpp @@ -1,6 +1,6 @@ #include "nbl/video/CCUDAHandler.h" -// #include "nbl/video/CCUDASharedMemory.h" -// #include "nbl/video/CCUDASharedSemaphore.h" +// #include "nbl/video/CCUDAExportableMemory.h" +// #include "nbl/video/CCUDAImportedSemaphore.h" #include "nbl/application_templates/MonoDeviceApplication.hpp" #include "nbl/examples/common/BuiltinResourcesApplication.hpp" @@ -57,10 +57,6 @@ class CUDA2VKApp : public virtual MonoDeviceApplication, BuiltinResourcesApplica using device_base_t = MonoDeviceApplication; using asset_base_t = BuiltinResourcesApplication; - static constexpr uint32_t gridDim[3] = { 4096,1,1 }; - static constexpr uint32_t blockDim[3] = { 1024,1,1 }; - static constexpr size_t numElements = gridDim[0] * blockDim[0]; - static constexpr size_t size = sizeof(float) * numElements; public: // Yay thanks to multiple inheritance we cannot forward ctors anymore @@ -72,27 +68,6 @@ class CUDA2VKApp : public virtual MonoDeviceApplication, 
BuiltinResourcesApplica IQueue* queue; - // CPU memory which we fill with random numbers between [-1,1] that will be copied to corresponding cudaMemory - std::array, 2> cpuBufs; - // // CUDA resources that we input to the kernel 'vectorAdd_kernel.cu' - // // Kernel writes to cudaMemories[2] which we later use to export and read on nabla side - std::array, 3> cudaMemories = {}; - // // A semaphore created in CUDA which will alias a Nabla semaphore to help sync between the CUDA kernel and Nabla device to host transfer - smart_refctd_ptr cudaSemaphore; - - // our Buffer that is bound to cudaMemories[2] - smart_refctd_ptr importedBuf; - // our Image that is also bound to cudaMemories[2] - smart_refctd_ptr importedImg; - - // host visible buffers that we use to copy from the resources above after CUDA kernel is done writing - smart_refctd_ptr stagingBufs[2]; - - // Nabla semaphore for sync - smart_refctd_ptr semaphore; - - smart_refctd_ptr commandPool; - smart_refctd_ptr cmd[2]; // a device filter helps you create a set of physical devices that satisfy your requirements in terms of features, limits etc. 
virtual void filterDevices(core::set& physicalDevices) const @@ -121,10 +96,47 @@ class CUDA2VKApp : public virtual MonoDeviceApplication, BuiltinResourcesApplica if (!cudaDevice) return logFail("Could not create a CUDA Device!"); - - queue = device_base_t::getComputeQueue(); - - createResources(); + testSharedResource(); + testDestruction(); + testLargeAllocations(); + + return true; + } + + smart_refctd_ptr createExternalBuffer(IDeviceMemoryAllocation* mem) + { + IGPUBuffer::SCreationParams params = {}; + params.size = mem->getAllocationSize(); + params.usage = asset::IBuffer::EUF_TRANSFER_SRC_BIT | asset::IBuffer::EUF_TRANSFER_DST_BIT; + params.externalHandleTypes = mem->getCreationParams().externalHandleType; + auto buf = m_device->createBuffer(std::move(params)); + ILogicalDevice::SBindBufferMemoryInfo bindInfo = { .buffer = buf.get(), .binding = {.memory = mem } }; + m_device->bindBufferMemory(1, &bindInfo); + return buf; + } + + smart_refctd_ptr createStaging(size_t sz) + { + auto buf = m_device->createBuffer({ {.size = sz, .usage = asset::IBuffer::EUF_TRANSFER_SRC_BIT | asset::IBuffer::EUF_TRANSFER_DST_BIT} }); + auto req = buf->getMemoryReqs(); + req.memoryTypeBits &= m_device->getPhysicalDevice()->getDownStreamingMemoryTypeBits() + & m_device->getPhysicalDevice()->getHostVisibleMemoryTypeBits() + & m_device->getPhysicalDevice()->getMemoryTypeBitsFromMemoryTypeFlags(IDeviceMemoryAllocation::EMPF_HOST_COHERENT_BIT); + auto allocation = m_device->allocate(req, buf.get()); + + void* mapping = allocation.memory->map(IDeviceMemoryAllocation::MemoryRange(0, req.size), IDeviceMemoryAllocation::EMCAF_READ); + if (!mapping) + logFail("Failed to map an staging buffer"); + memset(mapping, 0, req.size); + return buf; + }; + + void testSharedResource() + { + static constexpr uint32_t GridDim[3] = { 4096,1,1 }; + static constexpr uint32_t BlockDim[3] = { 1024,1,1 }; + static constexpr size_t NumElements = GridDim[0] * BlockDim[0]; + static constexpr size_t BufferSize = 
sizeof(float) * NumElements; smart_refctd_ptr ptx; { @@ -135,7 +147,7 @@ class CUDA2VKApp : public virtual MonoDeviceApplication, BuiltinResourcesApplica auto assetBundle = m_assetMgr->getAsset("app_resources/vectorAdd_kernel.cu", lp); const auto assets = assetBundle.getContents(); if (assets.empty()) - return logFail("Could not load kernel!"); + logFail("Could not load kernel!"); smart_refctd_ptr source = IAsset::castDown(assets[0]); std::string log; @@ -145,197 +157,137 @@ class CUDA2VKApp : public virtual MonoDeviceApplication, BuiltinResourcesApplica ptx = std::move(ptx_); } + + auto& cu = cudaHandler->getCUDAFunctionTable(); + CUmodule module; CUfunction kernel; CUstream stream; - auto& cu = cudaHandler->getCUDAFunctionTable(); - ASSERT_SUCCESS(cu.pcuModuleLoadDataEx(&module, ptx->getPointer(), 0u, nullptr, nullptr)); ASSERT_SUCCESS(cu.pcuModuleGetFunction(&kernel, module, "vectorAdd")); ASSERT_SUCCESS(cu.pcuStreamCreate(&stream, CU_STREAM_NON_BLOCKING)); - launchKernel(kernel, stream); - - ASSERT_SUCCESS(cu.pcuStreamSynchronize(stream)); - ASSERT_SUCCESS(cu.pcuModuleUnload(module)); - ASSERT_SUCCESS(cu.pcuStreamDestroy_v2(stream)); - - m_device->waitIdle(); - - testInterop(); - - return true; - } - - void createResources() - { - auto& cu = cudaHandler->getCUDAFunctionTable(); + // CPU memory which we fill with random numbers between [-1,1] that will be copied to corresponding cudaMemory + std::array, 2> cpuBufs; for (auto& buf : cpuBufs) { - ICPUBuffer::SCreationParams params = {}; - params.size = size; - buf = ICPUBuffer::create(std::move(params)); + ICPUBuffer::SCreationParams params = {}; + params.size = BufferSize; + buf = ICPUBuffer::create(std::move(params)); } - for (auto j = 0; j < 2; j++) - for (auto i = 0; i < numElements; i++) - reinterpret_cast(cpuBufs[j]->getPointer())[i] = rand() / float(RAND_MAX); + for (auto buf_i = 0; buf_i < cpuBufs.size(); buf_i++) + for (auto elem_i = 0; elem_i < NumElements; elem_i++) + 
reinterpret_cast(cpuBufs[buf_i]->getPointer())[elem_i] = rand() / float(RAND_MAX); + constexpr auto InputCount = 2; + // // CUDA resources that we input to the kernel 'vectorAdd_kernel.cu' + // // Kernel writes to cudaInputMemories[2] which we later use to export and read on nabla side + std::array, InputCount> cudaInputMemories = {}; + std::array, InputCount> vulkanMemories = {}; + std::array, InputCount> vulkanInputBuffers = {}; + std::array, InputCount> inputStagingBuffers = {}; - // create and allocate CUmem with CUDA and slap it inside a simple IReferenceCounted wrapper - ASSERT_SUCCESS(cudaDevice->createSharedMemory(&cudaMemories[0], { .size = size, .alignment = sizeof(float), .location = CU_MEM_LOCATION_TYPE_DEVICE })); - ASSERT_SUCCESS(cudaDevice->createSharedMemory(&cudaMemories[1], { .size = size, .alignment = sizeof(float), .location = CU_MEM_LOCATION_TYPE_DEVICE })); - ASSERT_SUCCESS(cudaDevice->createSharedMemory(&cudaMemories[2], { .size = size, .alignment = sizeof(float), .location = CU_MEM_LOCATION_TYPE_DEVICE })); - - semaphore = m_device->createSemaphore(0, { .externalHandleTypes = ISemaphore::EHT_OPAQUE_WIN32 }); - ASSERT_SUCCESS(cudaDevice->importGPUSemaphore(&cudaSemaphore, semaphore.get())); + for (auto input_i = 0; input_i < InputCount; input_i++) { - // export the CUmem we have just created into a refctd IDeviceMemoryAllocation - auto devmemory = cudaMemories[2]->exportAsMemory(m_device.get()); - if (!devmemory) - logFail("Failed to export CUDA memory!"); - - - // create an importing external buffer on Nabla side - IGPUBuffer::SCreationParams params = {}; - params.size = devmemory->getAllocationSize(); - params.usage = asset::IBuffer::EUF_STORAGE_BUFFER_BIT | asset::IBuffer::EUF_TRANSFER_SRC_BIT; - params.externalHandleTypes = CCUDADevice::EXTERNAL_MEMORY_HANDLE_TYPE; - importedBuf = m_device->createBuffer(std::move(params)); - if (!importedBuf) - logFail("Failed to create an external buffer"); - - // bind that imported 
IDeviceMemoryAllocation to the external buffer we've just created - ILogicalDevice::SBindBufferMemoryInfo bindInfo = { .buffer = importedBuf.get(), .binding = {.memory = devmemory.get() } }; - bool re = m_device->bindBufferMemory(1, &bindInfo); - if (!re) logFail("Failed to bind CUDA memory to buffer"); + // create and allocate CUmem with CUDA and slap it inside a simple IReferenceCounted wrapper + ASSERT_SUCCESS(cudaDevice->createExportableMemory(&cudaInputMemories[input_i], { .size = BufferSize, .alignment = sizeof(float), .location = CU_MEM_LOCATION_TYPE_DEVICE })); + vulkanMemories[input_i] = cudaInputMemories[input_i]->exportAsMemory(m_device.get(), nullptr); + vulkanInputBuffers[input_i] = createExternalBuffer(vulkanMemories[input_i].get()); + inputStagingBuffers[input_i] = createStaging(BufferSize); } + + IGPUBuffer::SCreationParams outputBufferParams; + outputBufferParams.size = cudaDevice->roundToGranularity(CU_MEM_LOCATION_TYPE_DEVICE, BufferSize); + outputBufferParams.usage = asset::IBuffer::EUF_STORAGE_BUFFER_BIT | asset::IBuffer::EUF_TRANSFER_SRC_BIT; + outputBufferParams.externalHandleTypes = CCUDADevice::EXTERNAL_MEMORY_HANDLE_TYPE; + const auto outputBuf = m_device->createBuffer(std::move(outputBufferParams)); + auto outputMemReq = outputBuf->getMemoryReqs(); + auto allocation = m_device->allocate(outputMemReq, outputBuf.get(), IDeviceMemoryAllocation::EMAF_NONE, CCUDADevice::EXTERNAL_MEMORY_HANDLE_TYPE); + core::smart_refctd_ptr cudaOutputMemory; + ASSERT_SUCCESS(cudaDevice->importExternalMemory(&cudaOutputMemory, allocation.memory.get())); - { - // same thing as above - // we create an external image and bind the imported external memory to it - // now we have 2 different resources that are bound to the same memory - - auto devmemory = cudaMemories[2]->exportAsMemory(m_device.get()); - if (!devmemory) - logFail("Failed to export CUDA memory!"); - - IGPUImage::SCreationParams params = {}; - params.type = IGPUImage::ET_2D; - params.samples = 
IGPUImage::ESCF_1_BIT; - params.format = EF_R32_SFLOAT; - params.extent = { gridDim[0], blockDim[0], 1 }; - params.mipLevels = 1; - params.arrayLayers = 1; - params.usage = IGPUImage::EUF_TRANSFER_SRC_BIT; - params.externalHandleTypes = CCUDADevice::EXTERNAL_MEMORY_HANDLE_TYPE; - importedImg = m_device->createImage(std::move(params)); - if (!importedImg) logFail("Failed to create an external image"); - // bind that imported IDeviceMemoryAllocation to the external buffer we've just created - ILogicalDevice::SBindImageMemoryInfo bindInfo = { .image = importedImg.get(), .binding = {.memory = devmemory.get() } }; - bool re = m_device->bindImageMemory(1, &bindInfo); - if (!re) logFail("Failed to bind CUDA memory to buffer"); - } + ISemaphore::SCreationParams semParams; + semParams.externalHandleTypes = ISemaphore::EHT_OPAQUE_WIN32; + auto semaphore = m_device->createSemaphore(0, std::move(semParams)); + core::smart_refctd_ptr cudaSemaphore; + ASSERT_SUCCESS(cudaDevice->importExternalSemaphore(&cudaSemaphore, semaphore.get())); - commandPool = m_device->createCommandPool(queue->getFamilyIndex(), IGPUCommandPool::CREATE_FLAGS::RESET_COMMAND_BUFFER_BIT); - bool re = commandPool->createCommandBuffers(IGPUCommandPool::BUFFER_LEVEL::PRIMARY, 2, cmd, smart_refctd_ptr(m_logger)); + std::array, 2> cmd; + auto commandPool = m_device->createCommandPool(queue->getFamilyIndex(), IGPUCommandPool::CREATE_FLAGS::RESET_COMMAND_BUFFER_BIT); + bool re = commandPool->createCommandBuffers(IGPUCommandPool::BUFFER_LEVEL::PRIMARY, cmd.size(), cmd.data(), smart_refctd_ptr(m_logger)); - stagingBufs[0] = createStaging(); - stagingBufs[1] = createStaging(); - } - - smart_refctd_ptr createExternalBuffer(IDeviceMemoryAllocation* mem) - { - IGPUBuffer::SCreationParams params = {}; - params.size = mem->getAllocationSize(); - params.usage = asset::IBuffer::EUF_TRANSFER_SRC_BIT | asset::IBuffer::EUF_TRANSFER_DST_BIT; - params.externalHandleTypes = mem->getCreationParams().externalHandleType; - auto buf 
= m_device->createBuffer(std::move(params)); - ILogicalDevice::SBindBufferMemoryInfo bindInfo = { .buffer = buf.get(), .binding = {.memory = mem } }; - m_device->bindBufferMemory(1, &bindInfo); - return buf; - } - - smart_refctd_ptr createStaging(size_t sz = size) - { - auto buf = m_device->createBuffer({ {.size = sz, .usage = asset::IBuffer::EUF_TRANSFER_SRC_BIT | asset::IBuffer::EUF_TRANSFER_DST_BIT} }); - auto req = buf->getMemoryReqs(); - req.memoryTypeBits &= m_device->getPhysicalDevice()->getDownStreamingMemoryTypeBits(); - auto allocation = m_device->allocate(req, buf.get()); - - void* mapping = allocation.memory->map(IDeviceMemoryAllocation::MemoryRange(0, req.size), IDeviceMemoryAllocation::EMCAF_READ); - if (!mapping) - logFail("Failed to map an staging buffer"); - memset(mapping, 0, req.size); - return buf; - }; + const auto outputStagingBuffer = createStaging(BufferSize); - void launchKernel(CUfunction kernel, CUstream stream) - { - // First we record a release ownership transfer to let vulkan know that resources are going to be used in an external API { - IGPUCommandBuffer::SBufferMemoryBarrier bufBarrier = { - .barrier = { - .ownershipOp = IGPUCommandBuffer::SOwnershipTransferBarrier::OWNERSHIP_OP::RELEASE, - .otherQueueFamilyIndex = IQueue::FamilyExternal, - }, - .range = {.buffer = importedBuf, }, - }; - - IGPUCommandBuffer::SImageMemoryBarrier imgBarrier = { + const IGPUCommandBuffer::SBufferMemoryBarrier bufBarrier = { .barrier = { + .dep = { + .srcStageMask = PIPELINE_STAGE_FLAGS::ALL_COMMANDS_BITS, + .srcAccessMask = ACCESS_FLAGS::MEMORY_WRITE_BITS, + }, .ownershipOp = IGPUCommandBuffer::SOwnershipTransferBarrier::OWNERSHIP_OP::RELEASE, .otherQueueFamilyIndex = IQueue::FamilyExternal, }, - .image = importedImg.get(), - .subresourceRange = { - .aspectMask = IImage::EAF_COLOR_BIT, - .levelCount = 1u, - .layerCount = 1u, - } + .range = { + .offset = 0, + .size = outputBuf->getSize(), + .buffer = outputBuf, + }, }; + // start recording bool re = 
true; re &= cmd[0]->begin(IGPUCommandBuffer::USAGE::ONE_TIME_SUBMIT_BIT); - re &= cmd[0]->pipelineBarrier(EDF_NONE, { .bufBarriers = std::span{&bufBarrier,&bufBarrier + 1}, .imgBarriers = {&imgBarrier,&imgBarrier + 1} }); + re &= cmd[0]->pipelineBarrier(EDF_NONE, { + .bufBarriers = std::span{&bufBarrier,&bufBarrier + 1} + }); re &= cmd[0]->end(); - IQueue::SSubmitInfo::SSemaphoreInfo signalInfo = { .semaphore = semaphore.get(), .value = 1 }; - IQueue::SSubmitInfo::SCommandBufferInfo cmdInfo = { cmd[0].get()}; - IQueue::SSubmitInfo submitInfo = { .commandBuffers = {&cmdInfo, &cmdInfo + 1}, .signalSemaphores = {&signalInfo,&signalInfo + 1} }; - auto submitRe = queue->submit({ &submitInfo,&submitInfo + 1 }); + const IQueue::SSubmitInfo::SSemaphoreInfo signalInfo = { + .semaphore = semaphore.get(), + .value = 1, + .stageMask = PIPELINE_STAGE_FLAGS::ALL_COMMANDS_BITS, + }; + const IQueue::SSubmitInfo::SCommandBufferInfo cmdInfo = { cmd[0].get() }; + const IQueue::SSubmitInfo submitInfo = { + .commandBuffers = {&cmdInfo, &cmdInfo + 1}, + .signalSemaphores = {&signalInfo, &signalInfo + 1} + }; + const auto submitRe = queue->submit({ &submitInfo, &submitInfo + 1 }); re &= IQueue::RESULT::SUCCESS == submitRe; - if (!re) - logFail("Something went wrong readying resources for CUDA"); + if (!re) logFail("Something went wrong readying resources for CUDA"); } - auto& cu = cudaHandler->getCUDAFunctionTable(); // Launch kernel { + CUdeviceptr outputBufPtr; + cudaOutputMemory->getMappedBuffer(&outputBufPtr); CUdeviceptr ptrs[] = { - cudaMemories[0]->getDeviceptr(), - cudaMemories[1]->getDeviceptr(), - cudaMemories[2]->getDeviceptr(), + cudaInputMemories[0]->getDeviceptr(), + cudaInputMemories[1]->getDeviceptr(), + outputBufPtr }; - auto numEles = numElements; - void* parameters[] = { &ptrs[0], &ptrs[1], &ptrs[2], &numEles }; - ASSERT_SUCCESS(cu.pcuMemcpyHtoDAsync_v2(ptrs[0], cpuBufs[0]->getPointer(), size, stream)); - ASSERT_SUCCESS(cu.pcuMemcpyHtoDAsync_v2(ptrs[1], 
cpuBufs[1]->getPointer(), size, stream)); + auto numElements = &NumElements; + void* parameters[] = { &ptrs[0], &ptrs[1], &ptrs[2], &numElements }; + ASSERT_SUCCESS(cu.pcuMemcpyHtoDAsync_v2(ptrs[0], cpuBufs[0]->getPointer(), BufferSize, stream)); + ASSERT_SUCCESS(cu.pcuMemcpyHtoDAsync_v2(ptrs[1], cpuBufs[1]->getPointer(), BufferSize, stream)); auto semaphore = cudaSemaphore->getInternalObject(); - CUDA_EXTERNAL_SEMAPHORE_WAIT_PARAMS waitParams = { .params = {.fence = {.value = 1 } } }; + const CUDA_EXTERNAL_SEMAPHORE_WAIT_PARAMS waitParams = { .params = {.fence = {.value = 1 } } }; ASSERT_SUCCESS(cu.pcuWaitExternalSemaphoresAsync(&semaphore, &waitParams, 1, stream)); // Wait for release op from vulkan - ASSERT_SUCCESS(cu.pcuLaunchKernel(kernel, gridDim[0], gridDim[1], gridDim[2], blockDim[0], blockDim[1], blockDim[2], 0, stream, parameters, nullptr)); - CUDA_EXTERNAL_SEMAPHORE_SIGNAL_PARAMS signalParams = { .params = {.fence = {.value = 2 } } }; - // ASSERT_SUCCESS(cu.pcuSignalExternalSemaphoresAsync(&semaphore, &signalParams, 1, stream)); // Signal the imported semaphore + ASSERT_SUCCESS(cu.pcuLaunchKernel(kernel, GridDim[0], GridDim[1], GridDim[2], BlockDim[0], BlockDim[1], BlockDim[2], 0, stream, parameters, nullptr)); + const CUDA_EXTERNAL_SEMAPHORE_SIGNAL_PARAMS signalParams = { .params = {.fence = {.value = 2 } } }; + ASSERT_SUCCESS(cu.pcuSignalExternalSemaphoresAsync(&semaphore, &signalParams, 1, stream)); // Signal the imported semaphore } + ASSERT_SUCCESS(cu.pcuStreamSynchronize(stream)); // After the cuda kernel has signalled our exported vk semaphore, we will download the results through the buffer imported from CUDA { - IGPUCommandBuffer::SBufferMemoryBarrier bufBarrier = { + const IGPUCommandBuffer::SBufferMemoryBarrier bufBarrier = { .barrier = { .dep = { .dstStageMask = PIPELINE_STAGE_FLAGS::COPY_BIT, @@ -344,202 +296,260 @@ class CUDA2VKApp : public virtual MonoDeviceApplication, BuiltinResourcesApplica .ownershipOp = 
IGPUCommandBuffer::SOwnershipTransferBarrier::OWNERSHIP_OP::ACQUIRE, .otherQueueFamilyIndex = IQueue::FamilyExternal, }, - .range = { .buffer = importedBuf, }, + .range = { + .offset = 0, + .size = outputBuf->getSize(), + .buffer = outputBuf, + }, }; bool re = true; re &= cmd[1]->begin(IGPUCommandBuffer::USAGE::ONE_TIME_SUBMIT_BIT); - re &= cmd[1]->pipelineBarrier(EDF_NONE, {.bufBarriers = std::span{&bufBarrier,&bufBarrier + 1}}); - - IGPUCommandBuffer::SBufferCopy region = { .size = size }; - re &= cmd[1]->copyBuffer(importedBuf.get(), stagingBufs[0].get(), 1, ®ion); - - IGPUCommandBuffer::SImageMemoryBarrier imgBarrier = { - .barrier = { - .dep = { - .dstStageMask = PIPELINE_STAGE_FLAGS::COPY_BIT, - .dstAccessMask = ACCESS_FLAGS::MEMORY_READ_BITS, - }, - .ownershipOp = IGPUCommandBuffer::SOwnershipTransferBarrier::OWNERSHIP_OP::ACQUIRE, - .otherQueueFamilyIndex = IQueue::FamilyExternal, - }, - .image = importedImg.get(), - .subresourceRange = { - .aspectMask = IImage::EAF_COLOR_BIT, - .levelCount = 1u, - .layerCount = 1u, - }, - .oldLayout = IImage::LAYOUT::PREINITIALIZED, - .newLayout = IImage::LAYOUT::TRANSFER_SRC_OPTIMAL, + const auto region = IGPUCommandBuffer::SBufferCopy{ + .srcOffset = 0, + .dstOffset = 0, + .size = BufferSize }; - - re &= cmd[1]->pipelineBarrier(EDF_NONE, {.imgBarriers = {&imgBarrier,&imgBarrier + 1}}); - - IImage::SBufferCopy imgRegion = { - .imageSubresource = { - .aspectMask = imgBarrier.subresourceRange.aspectMask, - .layerCount = imgBarrier.subresourceRange.layerCount, - }, - .imageExtent = importedImg->getCreationParameters().extent, - }; - - re &= cmd[1]->copyImageToBuffer(importedImg.get(), imgBarrier.newLayout, stagingBufs[1].get(), 1, &imgRegion); - re &= cmd[1]->end(); + re &= cmd[1]->copyBuffer(outputBuf.get(), outputStagingBuffer.get(), 1, ®ion); + for (auto input_i = 0; input_i < InputCount; input_i++) + re &= cmd[1]->copyBuffer(vulkanInputBuffers[input_i].get(), inputStagingBuffers[input_i].get(), 1, ®ion); + cmd[1]->end(); 
- IQueue::SSubmitInfo::SSemaphoreInfo waitInfo= { .semaphore = semaphore.get(), .value = 2 }; - IQueue::SSubmitInfo::SSemaphoreInfo signalInfo = { .semaphore = semaphore.get(), .value = 3 }; - IQueue::SSubmitInfo::SCommandBufferInfo cmdInfo = { cmd[1].get() }; - IQueue::SSubmitInfo submitInfo = { - .waitSemaphores = {&waitInfo,&waitInfo + 1}, - .commandBuffers = {&cmdInfo, &cmdInfo + 1}, - .signalSemaphores = {&signalInfo,&signalInfo + 1} + const IQueue::SSubmitInfo::SSemaphoreInfo waitInfo= { + .semaphore = semaphore.get(), + .value = 2, + .stageMask = PIPELINE_STAGE_FLAGS::COPY_BIT, + }; + const IQueue::SSubmitInfo::SSemaphoreInfo signalInfo = { + .semaphore = semaphore.get(), + .value = 3, + .stageMask = PIPELINE_STAGE_FLAGS::ALL_COMMANDS_BITS, }; - auto submitRe = queue->submit({ &submitInfo,&submitInfo + 1 }); + const IQueue::SSubmitInfo::SCommandBufferInfo cmdInfo = { cmd[1].get() }; + const IQueue::SSubmitInfo submitInfo = { + .waitSemaphores = { &waitInfo, &waitInfo + 1 }, + .commandBuffers = { &cmdInfo, &cmdInfo + 1 }, + .signalSemaphores = { &signalInfo, &signalInfo + 1 } + }; + const auto submitRe = queue->submit({ &submitInfo, &submitInfo + 1 }); re &= IQueue::RESULT::SUCCESS == submitRe; if (!re) logFail("Something went wrong copying results from CUDA"); - } - - ASSERT_SUCCESS(cu.pcuLaunchHostFunc(stream, [](void* userData) { decltype(this)(userData)->kernelCallback(); }, this)); - } - void kernelCallback() - { - // Make sure we are also done with the readback - auto wait = std::array{ISemaphore::SWaitInfo{.semaphore = semaphore.get(), .value = 2}}; - m_device->waitForSemaphores(wait, true, -1); - - float* A = reinterpret_cast(cpuBufs[0]->getPointer()); - float* B = reinterpret_cast(cpuBufs[1]->getPointer()); - - float* CBuf = reinterpret_cast(stagingBufs[0]->getBoundMemory().memory->getMappedPointer()); - float* CImg = reinterpret_cast(stagingBufs[1]->getBoundMemory().memory->getMappedPointer()); - - if(memcmp(CBuf, CImg, size)) - logFail("Buffer and 
Image memories do not match!"); - - for (auto i = 0; i < numElements; i++) - { - bool re = (abs(CBuf[i] - A[i] - B[i]) < 0.01f) && (abs(CImg[i] - A[i] - B[i]) < 0.01f); - if(!re) - logFail("Element at index %d is incorrect!", i); } - - std::cout << "Success\n"; - } - - void testInterop() - { + struct CallbackContext { - IDeviceMemoryBacked::SDeviceMemoryRequirements reqs = { - .size = size, - .memoryTypeBits = m_physicalDevice->getDeviceLocalMemoryTypeBits(), - .alignmentLog2 = 10, + core::smart_refctd_ptr semaphore; + std::array, InputCount> cpuBuffers; + std::array, InputCount> inputStagingBuffers; + core::smart_refctd_ptr outputStagingBuffer; + core::smart_refctd_ptr device; + core::smart_refctd_ptr logger; + }; + + CallbackContext ctx; + ctx.semaphore = semaphore; + ctx.cpuBuffers = cpuBufs; + ctx.inputStagingBuffers = inputStagingBuffers; + ctx.outputStagingBuffer = outputStagingBuffer; + ctx.device = m_device; + ctx.logger = m_logger; + + auto cudaCallback = [](void* userData) + { + const auto* ctx = reinterpret_cast(userData); + + // Make sure we are also done with the readback + const auto wait = std::array{ + ISemaphore::SWaitInfo{ + .semaphore = ctx->semaphore.get(), + .value = 3, + } }; - - for (size_t i = 0; i < (1 << 8); ++i) + ctx->device->blockForSemaphores(wait, true); + + auto* stagingMem = ctx->outputStagingBuffer->getBoundMemory().memory; + if (!stagingMem->getMemoryPropertyFlags().hasFlags(IDeviceMemoryAllocation::EMPF_HOST_COHERENT_BIT)) { - auto memory = m_device->allocate(reqs, 0, IDeviceMemoryAllocation::E_MEMORY_ALLOCATE_FLAGS::EMAF_NONE, CCUDADevice::EXTERNAL_MEMORY_HANDLE_TYPE).memory; - assert(memory); - auto tmpBuf = createExternalBuffer(memory.get()); + ILogicalDevice::MappedMemoryRange range(stagingMem, 0, stagingMem->getAllocationSize()); + ctx->device->invalidateMappedMemoryRanges(1, &range); } - } - + + const auto* inputs1 = reinterpret_cast(ctx->cpuBuffers[0]->getPointer()); + const auto* inputs2 = 
reinterpret_cast(ctx->cpuBuffers[1]->getPointer()); + + const auto* outputs = reinterpret_cast(ctx->outputStagingBuffer->getBoundMemory().memory->getMappedPointer()); + const auto* inputsInStaging1 = reinterpret_cast(ctx->inputStagingBuffers[0]->getBoundMemory().memory->getMappedPointer()); + const auto* inputsInStaging2 = reinterpret_cast(ctx->inputStagingBuffers[1]->getBoundMemory().memory->getMappedPointer()); + + for (auto elem_i = 0; elem_i < NumElements; elem_i++) + { + const auto input1 = inputs1[elem_i]; + const auto input2 = inputs2[elem_i]; + const auto inputInStaging1 = inputsInStaging1[elem_i]; + const auto inputInStaging2 = inputsInStaging2[elem_i]; + if (inputInStaging1 != input1) + ctx->logger->log("Input1 in Staging %d is incorrect!", ILogger::ELL_ERROR, elem_i); + if (inputInStaging2 != input2) + ctx->logger->log("Input2 in Staging %d is incorrect!", ILogger::ELL_ERROR, elem_i); + + const auto output = outputs[elem_i]; + const auto expected = input1 + input2; + const auto diff = abs(output - expected); + bool re = diff < 0.01; + if (!re) + ctx->logger->log("TestSharedResources: Element at index %d is incorrect!", ILogger::ELL_ERROR, elem_i); + } + + ctx->logger->log("TestSharedResources Complete", ILogger::ELL_INFO); + }; + + ASSERT_SUCCESS(cu.pcuLaunchHostFunc(stream, cudaCallback, &ctx)); + ASSERT_SUCCESS(cu.pcuStreamSynchronize(stream)); + + ASSERT_SUCCESS(cu.pcuModuleUnload(module)); + ASSERT_SUCCESS(cu.pcuStreamDestroy_v2(stream)); + } + + void testDestruction() + { + + auto commandPool = m_device->createCommandPool(queue->getFamilyIndex(), IGPUCommandPool::CREATE_FLAGS::RESET_COMMAND_BUFFER_BIT); + constexpr auto ElementCount = 1024; + constexpr auto BufferSize = ElementCount * sizeof(int); + auto& cu = cudaHandler->getCUDAFunctionTable(); smart_refctd_ptr escaped; { - IDeviceMemoryBacked::SDeviceMemoryRequirements reqs = { - .size = size, - .memoryTypeBits = m_physicalDevice->getDeviceLocalMemoryTypeBits(), - .alignmentLog2 = 10, - }; - - 
auto memory = m_device->allocate(reqs, 0, IDeviceMemoryAllocation::E_MEMORY_ALLOCATE_FLAGS::EMAF_NONE, CCUDADevice::EXTERNAL_MEMORY_HANDLE_TYPE).memory; - - auto tmpBuf = createExternalBuffer(memory.get()); - auto staging = createStaging(); - + core::smart_refctd_ptr cudaMemory; + ASSERT_SUCCESS(cudaDevice->createExportableMemory(&cudaMemory, { .size = BufferSize, .alignment = sizeof(float), .location = CU_MEM_LOCATION_TYPE_DEVICE })); + escaped = cudaMemory->exportAsMemory(m_device.get()); + if (!escaped) logFail("Fail to export CUDA memory!"); + + auto tmpBuf = createExternalBuffer(escaped.get()); + auto staging = createStaging(BufferSize); + auto ptr = (uint32_t*)staging->getBoundMemory().memory->getMappedPointer(); - for (uint32_t i = 0; i < size / 4; ++i) + for (uint32_t i = 0; i < ElementCount; ++i) ptr[i] = i; - - smart_refctd_ptr cmd; - commandPool->createCommandBuffers(IGPUCommandPool::BUFFER_LEVEL::PRIMARY, 1, &cmd); - cmd->begin(IGPUCommandBuffer::USAGE::ONE_TIME_SUBMIT_BIT); - IGPUCommandBuffer::SBufferCopy region = { .size = size }; - assert(cmd->copyBuffer(staging.get(), tmpBuf.get(), 1, ®ion)); - cmd->end(); - IQueue::SSubmitInfo::SCommandBufferInfo cmdInfo = { cmd.get() }; - IQueue::SSubmitInfo submitInfo = { .commandBuffers = {&cmdInfo, &cmdInfo + 1} }; - queue->submit({ &submitInfo,&submitInfo + 1 }); - m_device->waitIdle(); - escaped = m_device->allocate(reqs, 0, IDeviceMemoryAllocation::E_MEMORY_ALLOCATE_FLAGS::EMAF_NONE, CCUDADevice::EXTERNAL_MEMORY_HANDLE_TYPE, memory->getCreationParams().externalHandle).memory; - } - - { - constexpr size_t M = 32; - auto staging = createStaging(size * M); - - auto ptr = (uint32_t*)staging->getBoundMemory().memory->getMappedPointer(); - for (uint32_t i = 0; i < (M * size) / 4; ++i) - ptr[i] = rand(); - - std::vector> cmd(1 << 10); - commandPool->createCommandBuffers(IGPUCommandPool::BUFFER_LEVEL::PRIMARY, 1 << 10, cmd.data()); - - for (size_t i = 0; i < 1 << 10; ++i) - { - 
IDeviceMemoryBacked::SDeviceMemoryRequirements reqs = { - .size = size * M, - .memoryTypeBits = m_physicalDevice->getDeviceLocalMemoryTypeBits(), - .alignmentLog2 = 10, - }; - RE: - auto memory = m_device->allocate(reqs, 0, IDeviceMemoryAllocation::E_MEMORY_ALLOCATE_FLAGS::EMAF_NONE, CCUDADevice::EXTERNAL_MEMORY_HANDLE_TYPE).memory; - - if (!memory) - { - m_device->waitIdle(); - for (size_t j = 0; j < i; ++j) - cmd[j] = 0; - goto END; - } - assert(memory); - auto tmpBuf = createExternalBuffer(memory.get()); - - cmd[i]->begin(IGPUCommandBuffer::USAGE::ONE_TIME_SUBMIT_BIT); - IGPUCommandBuffer::SBufferCopy region = { .size = size * M }; - assert(cmd[i]->copyBuffer(staging.get(), tmpBuf.get(), 1, ®ion)); - cmd[i]->end(); - IQueue::SSubmitInfo::SCommandBufferInfo cmdInfo = { cmd[i].get() }; - IQueue::SSubmitInfo submitInfo = { .commandBuffers = {&cmdInfo, &cmdInfo + 1} }; - assert(IQueue::RESULT::SUCCESS == queue->submit({ &submitInfo,&submitInfo + 1 })); - } - END: + + const auto semaphore = m_device->createSemaphore(0); + IQueue::SSubmitInfo::SSemaphoreInfo semInfo; + semInfo.semaphore = semaphore.get(); + semInfo.stageMask = PIPELINE_STAGE_FLAGS::ALL_COMMANDS_BITS; + semInfo.value = 1; + + smart_refctd_ptr cmdBuffer; + commandPool->createCommandBuffers(IGPUCommandPool::BUFFER_LEVEL::PRIMARY, 1, &cmdBuffer); + cmdBuffer->begin(IGPUCommandBuffer::USAGE::ONE_TIME_SUBMIT_BIT); + IGPUCommandBuffer::SBufferCopy region = { .size = BufferSize }; + assert(cmdBuffer->copyBuffer(staging.get(), tmpBuf.get(), 1, ®ion)); + cmdBuffer->end(); + IQueue::SSubmitInfo::SCommandBufferInfo cmdInfo = { cmdBuffer.get() }; + const IQueue::SSubmitInfo submitInfo = { + .commandBuffers = {&cmdInfo, &cmdInfo + 1}, + .signalSemaphores = {&semInfo, 1} + }; + auto qre = queue->submit({ &submitInfo, &submitInfo + 1 }); + assert(IQueue::RESULT::SUCCESS == qre); m_device->waitIdle(); - } - + } + { auto tmpBuf = createExternalBuffer(escaped.get()); - auto staging = createStaging(); - + auto staging = 
createStaging(BufferSize); + + const auto semaphore = m_device->createSemaphore(0); + IQueue::SSubmitInfo::SSemaphoreInfo semInfo; + semInfo.semaphore = semaphore.get(); + semInfo.stageMask = PIPELINE_STAGE_FLAGS::ALL_COMMANDS_BITS; + semInfo.value = 1; + smart_refctd_ptr cmd; commandPool->createCommandBuffers(IGPUCommandPool::BUFFER_LEVEL::PRIMARY, 1, &cmd); cmd->begin(IGPUCommandBuffer::USAGE::ONE_TIME_SUBMIT_BIT); - IGPUCommandBuffer::SBufferCopy region = { .size = size }; + IGPUCommandBuffer::SBufferCopy region = { .size = BufferSize }; assert(cmd->copyBuffer(tmpBuf.get(), staging.get(), 1, ®ion)); cmd->end(); IQueue::SSubmitInfo::SCommandBufferInfo cmdInfo = { cmd.get() }; - IQueue::SSubmitInfo submitInfo = { .commandBuffers = {&cmdInfo, &cmdInfo + 1} }; - auto qre = queue->submit({ &submitInfo,&submitInfo + 1 }); + const IQueue::SSubmitInfo submitInfo = { + .commandBuffers = {&cmdInfo, &cmdInfo + 1}, + .signalSemaphores = {&semInfo, 1} + }; + auto qre = queue->submit({ &submitInfo, &submitInfo + 1 }); assert(IQueue::RESULT::SUCCESS == qre); + m_device->waitIdle(); - - auto& ptr = *(std::array*)staging->getBoundMemory().memory->getMappedPointer(); - for (uint32_t i = 0; i < size / 4; ++i) - assert(ptr[i] == i); + + auto& ptr = *(std::array*)staging->getBoundMemory().memory->getMappedPointer(); + for (uint32_t i = 0; i < ElementCount; ++i) + { + if (ptr[i] != i) logFail("Test Destruction: Element %d is incorrect", i); + } + m_logger->log("Test Destruction complete", ILogger::ELL_INFO); } + // { + // constexpr size_t M = 32; + // auto staging = createStaging(size * M); + // + // auto ptr = (uint32_t*)staging->getBoundMemory().memory->getMappedPointer(); + // for (uint32_t i = 0; i < (M * size) / 4; ++i) + // ptr[i] = rand(); + // + // std::vector> cmd(1 << 10); + // commandPool->createCommandBuffers(IGPUCommandPool::BUFFER_LEVEL::PRIMARY, 1 << 10, cmd.data()); + // + // for (size_t i = 0; i < 1 << 10; ++i) + // { + // 
IDeviceMemoryBacked::SDeviceMemoryRequirements reqs = { + // .size = size * M, + // .memoryTypeBits = m_physicalDevice->getDeviceLocalMemoryTypeBits(), + // .alignmentLog2 = 10, + // }; + // RE: + // auto memory = m_device->allocate(reqs, 0, IDeviceMemoryAllocation::E_MEMORY_ALLOCATE_FLAGS::EMAF_NONE, CCUDADevice::EXTERNAL_MEMORY_HANDLE_TYPE).memory; + // + // if (!memory) + // { + // m_device->waitIdle(); + // for (size_t j = 0; j < i; ++j) + // cmd[j] = 0; + // goto END; + // } + // assert(memory); + // auto tmpBuf = createExternalBuffer(memory.get()); + // + // cmd[i]->begin(IGPUCommandBuffer::USAGE::ONE_TIME_SUBMIT_BIT); + // IGPUCommandBuffer::SBufferCopy region = { .size = size * M }; + // assert(cmd[i]->copyBuffer(staging.get(), tmpBuf.get(), 1, ®ion)); + // cmd[i]->end(); + // IQueue::SSubmitInfo::SCommandBufferInfo cmdInfo = { cmd[i].get() }; + // IQueue::SSubmitInfo submitInfo = { .commandBuffers = {&cmdInfo, &cmdInfo + 1} }; + // assert(IQueue::RESULT::SUCCESS == queue->submit({ &submitInfo,&submitInfo + 1 })); + // } + // END: + // m_device->waitIdle(); + // } + + } + + void testLargeAllocations() + { + // TODO(kevin): Calculate BufferSize that is big enough to fill the machine VRAM + constexpr auto BufferSize = 1024; + IDeviceMemoryBacked::SDeviceMemoryRequirements reqs = { + .size = BufferSize, + .memoryTypeBits = m_physicalDevice->getDeviceLocalMemoryTypeBits(), + .alignmentLog2 = 10, + }; + + for (size_t i = 0; i < (1 << 8); ++i) + { + auto memory = m_device->allocate(reqs, 0, IDeviceMemoryAllocation::E_MEMORY_ALLOCATE_FLAGS::EMAF_NONE, CCUDADevice::EXTERNAL_MEMORY_HANDLE_TYPE).memory; + assert(memory); + auto tmpBuf = createExternalBuffer(memory.get()); + } } From 03d2ce251e39cd58057a52d6728ec73484f0216d Mon Sep 17 00:00:00 2001 From: kevyuu Date: Fri, 24 Apr 2026 00:52:33 +0700 Subject: [PATCH 4/9] Update to follow latest commit on main repo --- 76_CudaInterop/main.cpp | 22 ++++++++++++++-------- 1 file changed, 14 insertions(+), 8 deletions(-) 
diff --git a/76_CudaInterop/main.cpp b/76_CudaInterop/main.cpp index 2a64f9428..2c4f819b2 100644 --- a/76_CudaInterop/main.cpp +++ b/76_CudaInterop/main.cpp @@ -96,6 +96,9 @@ class CUDA2VKApp : public virtual MonoDeviceApplication, BuiltinResourcesApplica if (!cudaDevice) return logFail("Could not create a CUDA Device!"); + + queue = getComputeQueue(); + testSharedResource(); testDestruction(); testLargeAllocations(); @@ -193,7 +196,7 @@ class CUDA2VKApp : public virtual MonoDeviceApplication, BuiltinResourcesApplica for (auto input_i = 0; input_i < InputCount; input_i++) { // create and allocate CUmem with CUDA and slap it inside a simple IReferenceCounted wrapper - ASSERT_SUCCESS(cudaDevice->createExportableMemory(&cudaInputMemories[input_i], { .size = BufferSize, .alignment = sizeof(float), .location = CU_MEM_LOCATION_TYPE_DEVICE })); + cudaInputMemories[input_i] = cudaDevice->createExportableMemory({ .size = BufferSize, .alignment = sizeof(float), .location = CU_MEM_LOCATION_TYPE_DEVICE }); vulkanMemories[input_i] = cudaInputMemories[input_i]->exportAsMemory(m_device.get(), nullptr); vulkanInputBuffers[input_i] = createExternalBuffer(vulkanMemories[input_i].get()); inputStagingBuffers[input_i] = createStaging(BufferSize); @@ -205,15 +208,18 @@ class CUDA2VKApp : public virtual MonoDeviceApplication, BuiltinResourcesApplica outputBufferParams.externalHandleTypes = CCUDADevice::EXTERNAL_MEMORY_HANDLE_TYPE; const auto outputBuf = m_device->createBuffer(std::move(outputBufferParams)); auto outputMemReq = outputBuf->getMemoryReqs(); + auto allocation = m_device->allocate(outputMemReq, outputBuf.get(), IDeviceMemoryAllocation::EMAF_NONE, CCUDADevice::EXTERNAL_MEMORY_HANDLE_TYPE); - core::smart_refctd_ptr cudaOutputMemory; - ASSERT_SUCCESS(cudaDevice->importExternalMemory(&cudaOutputMemory, allocation.memory.get())); + const auto cudaOutputMemory = cudaDevice->importExternalMemory(core::smart_refctd_ptr(allocation.memory)); + if (!cudaOutputMemory) + logFail("Fail to 
import Vulkan Memory into CUDA!"); ISemaphore::SCreationParams semParams; semParams.externalHandleTypes = ISemaphore::EHT_OPAQUE_WIN32; auto semaphore = m_device->createSemaphore(0, std::move(semParams)); - core::smart_refctd_ptr cudaSemaphore; - ASSERT_SUCCESS(cudaDevice->importExternalSemaphore(&cudaSemaphore, semaphore.get())); + const auto cudaSemaphore = cudaDevice->importExternalSemaphore(core::smart_refctd_ptr(semaphore)); + if (!cudaSemaphore) + logFail("Fail to import Vulkan Semaphore into CUDA!"); std::array, 2> cmd; auto commandPool = m_device->createCommandPool(queue->getFamilyIndex(), IGPUCommandPool::CREATE_FLAGS::RESET_COMMAND_BUFFER_BIT); @@ -414,15 +420,15 @@ class CUDA2VKApp : public virtual MonoDeviceApplication, BuiltinResourcesApplica void testDestruction() { - auto commandPool = m_device->createCommandPool(queue->getFamilyIndex(), IGPUCommandPool::CREATE_FLAGS::RESET_COMMAND_BUFFER_BIT); constexpr auto ElementCount = 1024; constexpr auto BufferSize = ElementCount * sizeof(int); auto& cu = cudaHandler->getCUDAFunctionTable(); smart_refctd_ptr escaped; { - core::smart_refctd_ptr cudaMemory; - ASSERT_SUCCESS(cudaDevice->createExportableMemory(&cudaMemory, { .size = BufferSize, .alignment = sizeof(float), .location = CU_MEM_LOCATION_TYPE_DEVICE })); + const auto cudaMemory = cudaDevice->createExportableMemory({ .size = BufferSize, .alignment = sizeof(float), .location = CU_MEM_LOCATION_TYPE_DEVICE }); + if (!cudaMemory) logFail("Fail to create exportable memory!"); + escaped = cudaMemory->exportAsMemory(m_device.get()); if (!escaped) logFail("Fail to export CUDA memory!"); From 1e120e8956181d8de7931f1fd2e8bb350a046c2a Mon Sep 17 00:00:00 2001 From: kevyuu Date: Sat, 25 Apr 2026 17:18:45 +0700 Subject: [PATCH 5/9] Fix ex 67 due to changes in memory allocation --- 67_RayQueryGeometry/main.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/67_RayQueryGeometry/main.cpp b/67_RayQueryGeometry/main.cpp index 63346ac4c..2f196e140 100644 
--- a/67_RayQueryGeometry/main.cpp +++ b/67_RayQueryGeometry/main.cpp @@ -664,7 +664,7 @@ class RayQueryGeometryApp final : public SimpleWindowedApplication, public Built auto retval = device->allocate(info); // map what is mappable by default so ReBAR checks succeed if (retval.isValid() && retval.memory->isMappable()) - retval.memory->map({.offset=0,.length=info.size}); + retval.memory->map({.offset=0,.length=info.allocationSize}); return retval; } From fc00a68b3dec9f4c3ff81419ea77e5f85f5ff4ce Mon Sep 17 00:00:00 2001 From: kevyuu Date: Thu, 30 Apr 2026 15:03:58 +0700 Subject: [PATCH 6/9] ASSERT_SUCCESS into ASSERT_CUDA_SUCCESS --- 76_CudaInterop/main.cpp | 31 +++++++++++++++---------------- 1 file changed, 15 insertions(+), 16 deletions(-) diff --git a/76_CudaInterop/main.cpp b/76_CudaInterop/main.cpp index 2c4f819b2..8231586d5 100644 --- a/76_CudaInterop/main.cpp +++ b/76_CudaInterop/main.cpp @@ -40,8 +40,7 @@ bool check_nv_err(auto err, auto& cudaHandler, auto& logger, auto file, auto lin return true; } -#define ASSERT_SUCCESS(expr) { auto re = check_cuda_err((expr), cu, m_logger, __FILE__, __LINE__); assert(re); } -#define ASSERT_SUCCESS_NV(expr, log) { auto re = check_nv_err((expr), cudaHandler, m_logger, __FILE__, __LINE__, log); assert(re); } +#define ASSERT_CUDA_SUCCESS_NV(expr, log) { auto re = check_nv_err((expr), cudaHandler, m_logger, __FILE__, __LINE__, log); assert(re); } using namespace nbl::core; @@ -156,7 +155,7 @@ class CUDA2VKApp : public virtual MonoDeviceApplication, BuiltinResourcesApplica std::string log; auto [ptx_, res] = cudaHandler->compileDirectlyToPTX(std::string((const char*)source->getPointer(), source->getSize()), "app_resources/vectorAdd_kernel.cu", cudaDevice->geDefaultCompileOptions(), 0, 0, 0, &log); - ASSERT_SUCCESS_NV(res, log); + ASSERT_CUDA_SUCCESS_NV(res, log); ptx = std::move(ptx_); } @@ -167,9 +166,9 @@ class CUDA2VKApp : public virtual MonoDeviceApplication, BuiltinResourcesApplica CUfunction kernel; CUstream stream; - 
ASSERT_SUCCESS(cu.pcuModuleLoadDataEx(&module, ptx->getPointer(), 0u, nullptr, nullptr)); - ASSERT_SUCCESS(cu.pcuModuleGetFunction(&kernel, module, "vectorAdd")); - ASSERT_SUCCESS(cu.pcuStreamCreate(&stream, CU_STREAM_NON_BLOCKING)); + ASSERT_CUDA_SUCCESS(cu.pcuModuleLoadDataEx(&module, ptx->getPointer(), 0u, nullptr, nullptr), cudaHandler); + ASSERT_CUDA_SUCCESS(cu.pcuModuleGetFunction(&kernel, module, "vectorAdd"), cudaHandler); + ASSERT_CUDA_SUCCESS(cu.pcuStreamCreate(&stream, CU_STREAM_NON_BLOCKING), cudaHandler); // CPU memory which we fill with random numbers between [-1,1] that will be copied to corresponding cudaMemory std::array, 2> cpuBufs; @@ -279,17 +278,17 @@ class CUDA2VKApp : public virtual MonoDeviceApplication, BuiltinResourcesApplica }; auto numElements = &NumElements; void* parameters[] = { &ptrs[0], &ptrs[1], &ptrs[2], &numElements }; - ASSERT_SUCCESS(cu.pcuMemcpyHtoDAsync_v2(ptrs[0], cpuBufs[0]->getPointer(), BufferSize, stream)); - ASSERT_SUCCESS(cu.pcuMemcpyHtoDAsync_v2(ptrs[1], cpuBufs[1]->getPointer(), BufferSize, stream)); + ASSERT_CUDA_SUCCESS(cu.pcuMemcpyHtoDAsync_v2(ptrs[0], cpuBufs[0]->getPointer(), BufferSize, stream), cudaHandler); + ASSERT_CUDA_SUCCESS(cu.pcuMemcpyHtoDAsync_v2(ptrs[1], cpuBufs[1]->getPointer(), BufferSize, stream), cudaHandler); auto semaphore = cudaSemaphore->getInternalObject(); const CUDA_EXTERNAL_SEMAPHORE_WAIT_PARAMS waitParams = { .params = {.fence = {.value = 1 } } }; - ASSERT_SUCCESS(cu.pcuWaitExternalSemaphoresAsync(&semaphore, &waitParams, 1, stream)); // Wait for release op from vulkan - ASSERT_SUCCESS(cu.pcuLaunchKernel(kernel, GridDim[0], GridDim[1], GridDim[2], BlockDim[0], BlockDim[1], BlockDim[2], 0, stream, parameters, nullptr)); + ASSERT_CUDA_SUCCESS(cu.pcuWaitExternalSemaphoresAsync(&semaphore, &waitParams, 1, stream), cudaHandler); // Wait for release op from vulkan + ASSERT_CUDA_SUCCESS(cu.pcuLaunchKernel(kernel, GridDim[0], GridDim[1], GridDim[2], BlockDim[0], BlockDim[1], BlockDim[2], 0, 
stream, parameters, nullptr), cudaHandler); const CUDA_EXTERNAL_SEMAPHORE_SIGNAL_PARAMS signalParams = { .params = {.fence = {.value = 2 } } }; - ASSERT_SUCCESS(cu.pcuSignalExternalSemaphoresAsync(&semaphore, &signalParams, 1, stream)); // Signal the imported semaphore + ASSERT_CUDA_SUCCESS(cu.pcuSignalExternalSemaphoresAsync(&semaphore, &signalParams, 1, stream), cudaHandler); // Signal the imported semaphore } - ASSERT_SUCCESS(cu.pcuStreamSynchronize(stream)); + ASSERT_CUDA_SUCCESS(cu.pcuStreamSynchronize(stream), cudaHandler); // After the cuda kernel has signalled our exported vk semaphore, we will download the results through the buffer imported from CUDA { @@ -411,11 +410,11 @@ class CUDA2VKApp : public virtual MonoDeviceApplication, BuiltinResourcesApplica ctx->logger->log("TestSharedResources Complete", ILogger::ELL_INFO); }; - ASSERT_SUCCESS(cu.pcuLaunchHostFunc(stream, cudaCallback, &ctx)); - ASSERT_SUCCESS(cu.pcuStreamSynchronize(stream)); + ASSERT_CUDA_SUCCESS(cu.pcuLaunchHostFunc(stream, cudaCallback, &ctx), cudaHandler); + ASSERT_CUDA_SUCCESS(cu.pcuStreamSynchronize(stream), cudaHandler); - ASSERT_SUCCESS(cu.pcuModuleUnload(module)); - ASSERT_SUCCESS(cu.pcuStreamDestroy_v2(stream)); + ASSERT_CUDA_SUCCESS(cu.pcuModuleUnload(module), cudaHandler); + ASSERT_CUDA_SUCCESS(cu.pcuStreamDestroy_v2(stream), cudaHandler); } void testDestruction() From 00572257f2370be17e118f3186ea032119e186cd Mon Sep 17 00:00:00 2001 From: kevyuu Date: Mon, 4 May 2026 14:22:22 +0700 Subject: [PATCH 7/9] Refactor ASSERT_CUDA_SUCCESS --- 76_CudaInterop/main.cpp | 17 ++--------------- 1 file changed, 2 insertions(+), 15 deletions(-) diff --git a/76_CudaInterop/main.cpp b/76_CudaInterop/main.cpp index 8231586d5..84dbac39f 100644 --- a/76_CudaInterop/main.cpp +++ b/76_CudaInterop/main.cpp @@ -16,19 +16,6 @@ The start of the main function starts like in most other example. We ask the user for the desired renderer and start it up. 
*/ -bool check_cuda_err(cudaError_enum err, auto& cu, auto& logger, auto file, auto line) -{ - if (auto re = err; CUDA_SUCCESS != re) - { - const char* name = 0, * str = 0; - cu.pcuGetErrorName(re, &name); - cu.pcuGetErrorString(re, &str); - logger->log("%s:%d %s:\n\t%s\n", system::ILogger::ELL_ERROR, file, line, name, str); - return false; - } - return true; -} - bool check_nv_err(auto err, auto& cudaHandler, auto& logger, auto file, auto line, std::string const& log) { if (auto re = err; NVRTC_SUCCESS != re) @@ -40,7 +27,7 @@ bool check_nv_err(auto err, auto& cudaHandler, auto& logger, auto file, auto lin return true; } -#define ASSERT_CUDA_SUCCESS_NV(expr, log) { auto re = check_nv_err((expr), cudaHandler, m_logger, __FILE__, __LINE__, log); assert(re); } +#define ASSERT_NV_SUCCESS(expr, log) { auto re = check_nv_err((expr), cudaHandler, m_logger, __FILE__, __LINE__, log); assert(re); } using namespace nbl::core; @@ -155,7 +142,7 @@ class CUDA2VKApp : public virtual MonoDeviceApplication, BuiltinResourcesApplica std::string log; auto [ptx_, res] = cudaHandler->compileDirectlyToPTX(std::string((const char*)source->getPointer(), source->getSize()), "app_resources/vectorAdd_kernel.cu", cudaDevice->geDefaultCompileOptions(), 0, 0, 0, &log); - ASSERT_CUDA_SUCCESS_NV(res, log); + ASSERT_NV_SUCCESS(res, log); ptx = std::move(ptx_); } From 82d05923f15c09f1f1de771c14b9c1b89c5ca28b Mon Sep 17 00:00:00 2001 From: kevyuu Date: Mon, 4 May 2026 14:22:49 +0700 Subject: [PATCH 8/9] Slight naming refactor --- 76_CudaInterop/main.cpp | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/76_CudaInterop/main.cpp b/76_CudaInterop/main.cpp index 84dbac39f..5fd8151bf 100644 --- a/76_CudaInterop/main.cpp +++ b/76_CudaInterop/main.cpp @@ -85,7 +85,7 @@ class CUDA2VKApp : public virtual MonoDeviceApplication, BuiltinResourcesApplica queue = getComputeQueue(); - testSharedResource(); + testVectorAddKernel(); testDestruction(); testLargeAllocations(); @@ -120,7 +120,7 @@ 
class CUDA2VKApp : public virtual MonoDeviceApplication, BuiltinResourcesApplica return buf; }; - void testSharedResource() + void testVectorAddKernel() { static constexpr uint32_t GridDim[3] = { 4096,1,1 }; static constexpr uint32_t BlockDim[3] = { 1024,1,1 }; @@ -389,8 +389,7 @@ class CUDA2VKApp : public virtual MonoDeviceApplication, BuiltinResourcesApplica const auto output = outputs[elem_i]; const auto expected = input1 + input2; const auto diff = abs(output - expected); - bool re = diff < 0.01; - if (!re) + if (!(diff < 0.01)) ctx->logger->log("TestSharedResources: Element at index %d is incorrect!", ILogger::ELL_ERROR, elem_i); } From a229db2993e35af7b09c8fd5393b8e16d7ff6435 Mon Sep 17 00:00:00 2001 From: kevyuu Date: Mon, 4 May 2026 14:24:35 +0700 Subject: [PATCH 9/9] Remove unused commented code --- 76_CudaInterop/main.cpp | 43 ----------------------------------------- 1 file changed, 43 deletions(-) diff --git a/76_CudaInterop/main.cpp b/76_CudaInterop/main.cpp index 5fd8151bf..dfd214384 100644 --- a/76_CudaInterop/main.cpp +++ b/76_CudaInterop/main.cpp @@ -480,49 +480,6 @@ class CUDA2VKApp : public virtual MonoDeviceApplication, BuiltinResourcesApplica m_logger->log("Test Destruction complete", ILogger::ELL_INFO); } - // { - // constexpr size_t M = 32; - // auto staging = createStaging(size * M); - // - // auto ptr = (uint32_t*)staging->getBoundMemory().memory->getMappedPointer(); - // for (uint32_t i = 0; i < (M * size) / 4; ++i) - // ptr[i] = rand(); - // - // std::vector> cmd(1 << 10); - // commandPool->createCommandBuffers(IGPUCommandPool::BUFFER_LEVEL::PRIMARY, 1 << 10, cmd.data()); - // - // for (size_t i = 0; i < 1 << 10; ++i) - // { - // IDeviceMemoryBacked::SDeviceMemoryRequirements reqs = { - // .size = size * M, - // .memoryTypeBits = m_physicalDevice->getDeviceLocalMemoryTypeBits(), - // .alignmentLog2 = 10, - // }; - // RE: - // auto memory = m_device->allocate(reqs, 0, IDeviceMemoryAllocation::E_MEMORY_ALLOCATE_FLAGS::EMAF_NONE, 
CCUDADevice::EXTERNAL_MEMORY_HANDLE_TYPE).memory; - // - // if (!memory) - // { - // m_device->waitIdle(); - // for (size_t j = 0; j < i; ++j) - // cmd[j] = 0; - // goto END; - // } - // assert(memory); - // auto tmpBuf = createExternalBuffer(memory.get()); - // - // cmd[i]->begin(IGPUCommandBuffer::USAGE::ONE_TIME_SUBMIT_BIT); - // IGPUCommandBuffer::SBufferCopy region = { .size = size * M }; - // assert(cmd[i]->copyBuffer(staging.get(), tmpBuf.get(), 1, ®ion)); - // cmd[i]->end(); - // IQueue::SSubmitInfo::SCommandBufferInfo cmdInfo = { cmd[i].get() }; - // IQueue::SSubmitInfo submitInfo = { .commandBuffers = {&cmdInfo, &cmdInfo + 1} }; - // assert(IQueue::RESULT::SUCCESS == queue->submit({ &submitInfo,&submitInfo + 1 })); - // } - // END: - // m_device->waitIdle(); - // } - } void testLargeAllocations()