diff --git a/CMakeLists.txt b/CMakeLists.txt index a93a86a4f..b3f986b46 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -106,6 +106,7 @@ if(NBL_BUILD_EXAMPLES) add_subdirectory(70_FLIPFluids) add_subdirectory(71_RayTracingPipeline) add_subdirectory(72_CooperativeBinarySearch) + add_subdirectory(XX_NewFFT) if (NBL_BUILD_MITSUBA_LOADER) add_subdirectory(73_GeometryInspector) diff --git a/XX_NewFFT/CMakeLists.txt b/XX_NewFFT/CMakeLists.txt new file mode 100644 index 000000000..6b6304ed8 --- /dev/null +++ b/XX_NewFFT/CMakeLists.txt @@ -0,0 +1,62 @@ +include(common RESULT_VARIABLE RES) +if(NOT RES) + message(FATAL_ERROR "common.cmake not found. Should be in {repo_root}/cmake directory") +endif() + +nbl_create_executable_project("" "" "" "" "${NBL_EXECUTABLE_PROJECT_CREATION_PCH_TARGET}") + +if(NBL_EMBED_BUILTIN_RESOURCES) + set(_BR_TARGET_ ${EXECUTABLE_NAME}_builtinResourceData) + set(RESOURCE_DIR "app_resources") + + get_filename_component(_SEARCH_DIRECTORIES_ "${CMAKE_CURRENT_SOURCE_DIR}" ABSOLUTE) + get_filename_component(_OUTPUT_DIRECTORY_SOURCE_ "${CMAKE_CURRENT_BINARY_DIR}/src" ABSOLUTE) + get_filename_component(_OUTPUT_DIRECTORY_HEADER_ "${CMAKE_CURRENT_BINARY_DIR}/include" ABSOLUTE) + + file(GLOB_RECURSE BUILTIN_RESOURCE_FILES RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}/${RESOURCE_DIR}" CONFIGURE_DEPENDS "${CMAKE_CURRENT_SOURCE_DIR}/${RESOURCE_DIR}/*") + foreach(RES_FILE ${BUILTIN_RESOURCE_FILES}) + LIST_BUILTIN_RESOURCE(RESOURCES_TO_EMBED "${RES_FILE}") + endforeach() + + ADD_CUSTOM_BUILTIN_RESOURCES(${_BR_TARGET_} RESOURCES_TO_EMBED "${_SEARCH_DIRECTORIES_}" "${RESOURCE_DIR}" "nbl::this_example::builtin" "${_OUTPUT_DIRECTORY_HEADER_}" "${_OUTPUT_DIRECTORY_SOURCE_}") + + LINK_BUILTIN_RESOURCES_TO_TARGET(${EXECUTABLE_NAME} ${_BR_TARGET_}) +endif() + +set(OUTPUT_DIRECTORY "${CMAKE_CURRENT_BINARY_DIR}/auto-gen") + +set(SM 6_8) +set(JSON [=[ +[ + { + "INPUT": "app_resources/shader.comp.hlsl", + "KEY": "shader", + } +] +]=]) +string(CONFIGURE "${JSON}" JSON) + +set(COMPILE_OPTIONS + -I "${CMAKE_CURRENT_SOURCE_DIR}" + -T lib_${SM} +) + +NBL_CREATE_NSC_COMPILE_RULES( + TARGET ${EXECUTABLE_NAME}SPIRV + LINK_TO ${EXECUTABLE_NAME} + BINARY_DIR ${OUTPUT_DIRECTORY} + MOUNT_POINT_DEFINE NBL_THIS_EXAMPLE_BUILD_MOUNT_POINT + COMMON_OPTIONS ${COMPILE_OPTIONS} + OUTPUT_VAR KEYS + INCLUDE nbl/this_example/builtin/build/spirv/keys.hpp + NAMESPACE nbl::this_example::builtin::build + INPUTS ${JSON} +) + +NBL_CREATE_RESOURCE_ARCHIVE( + NAMESPACE nbl::this_example::builtin::build + TARGET ${EXECUTABLE_NAME}_builtinsBuild + LINK_TO ${EXECUTABLE_NAME} + BIND ${OUTPUT_DIRECTORY} + BUILTINS ${KEYS} +) diff --git a/XX_NewFFT/app_resources/common.hlsl b/XX_NewFFT/app_resources/common.hlsl new file mode 100644 index 000000000..8cd0cf44f --- /dev/null +++ b/XX_NewFFT/app_resources/common.hlsl @@ -0,0 +1,29 @@ +#include "nbl/builtin/hlsl/cpp_compat.hlsl" +#include "nbl/builtin/hlsl/workgroup2/fft.hlsl" + +using scalar_t = nbl::hlsl::float32_t; + +struct PushConstantData +{ + uint64_t deviceBufferAddress; +}; + +NBL_CONSTEXPR uint32_t WorkgroupSizeLog2 = 9; +NBL_CONSTEXPR uint32_t WorkgroupSize = uint32_t(1) << WorkgroupSizeLog2; +NBL_CONSTEXPR uint32_t SubgroupSizeLog2 = 5; // hardcoded to my nvidia gpu, should be queried at compilation time +NBL_CONSTEXPR uint32_t SubgroupSize = uint32_t(1) << SubgroupSizeLog2; + +NBL_CONSTEXPR uint32_t Radix2ElementsPerInvocationPerChannelLog2 = 1; +NBL_CONSTEXPR uint32_t ExtraPrimeFactor = uint32_t(1); +NBL_CONSTEXPR uint32_t ElementsPerInvocationPerChannel = ExtraPrimeFactor * (uint32_t(1) << Radix2ElementsPerInvocationPerChannelLog2); + +NBL_CONSTEXPR uint32_t Channels = 1; +NBL_CONSTEXPR uint32_t complexElementCountPerChannel = ElementsPerInvocationPerChannel * (uint32_t(1) << WorkgroupSizeLog2); +NBL_CONSTEXPR uint32_t complexElementCount = Channels * complexElementCountPerChannel; + +NBL_CONSTEXPR uint16_t InnerVirtualChannels = Channels * (ElementsPerInvocationPerChannel >> 1); +NBL_CONSTEXPR uint32_t ShuffledVirtualChannelsPerRound = nbl::hlsl::mpl::min_v; + +NBL_CONSTEXPR bool ShareTwiddles = true; + +using ConstevalParameters = workgroup2::fft::ConstevalParameters; \ No newline at end of file diff --git a/XX_NewFFT/app_resources/shader.comp.hlsl b/XX_NewFFT/app_resources/shader.comp.hlsl new file mode 100644 index 000000000..c1db53f30 --- /dev/null +++ b/XX_NewFFT/app_resources/shader.comp.hlsl @@ -0,0 +1,134 @@ +#include "common.hlsl" +#include "nbl/builtin/hlsl/subgroup2/fft.hlsl" +#include "nbl/builtin/hlsl/workgroup2/fft.hlsl" +#include "nbl/builtin/hlsl/workgroup/basic.hlsl" + +[[vk::push_constant]] PushConstantData pushConstants; + +using namespace nbl::hlsl; + +//using ConstevalParameters = workgroup::fft::ConstevalParameters; + +groupshared uint32_t sharedmem[4 * ((sizeof(complex_t) / sizeof(uint32_t)) << WorkgroupSizeLog2) ]; + +struct SharedMemoryAccessor +{ + template + void set(IndexType idx, AccessType value) + { + sharedmem[idx] = value; + } + + template + void get(IndexType idx, NBL_REF_ARG(AccessType) value) + { + value = sharedmem[idx]; + } + + void workgroupExecutionAndMemoryBarrier() + { + glsl::barrier(); + } + +}; + +// Almost a LegacyBdaAccessor, but since we need `uint32_t index` getter and setter it's the same as writing one ourselves + +struct Accessor +{ + static Accessor create(const uint64_t address) + { + Accessor accessor; + accessor.address = address; + return accessor; + } + + // TODO: can't use our own BDA yet, because it doesn't support the types `workgroup::FFT` will invoke these templates with + template + void get(const IndexType index, NBL_REF_ARG(AccessType) value) + { + value = vk::RawBufferLoad(address + index * sizeof(AccessType)); + } + + template + void set(const IndexType index, const AccessType value) + { + vk::RawBufferStore(address + index * sizeof(AccessType), value); + } + + uint64_t address; +}; + + +template +struct InvocationElementsAccessor +{ + scalar_t real[Channels][Size]; + scalar_t imag[Channels][Size]; + + void get(uint32_t channel, uint32_t pair, NBL_REF_ARG(complex_t) value) + { + value.real(real[channel][pair]); + value.imag(imag[channel][pair]); + } + + void set(uint32_t channel, uint32_t pair, NBL_CONST_REF_ARG(complex_t) value) + { + real[channel][pair] = value.real(); + imag[channel][pair] = value.imag(); + } +}; + +using _InvocationElementsAccessor = InvocationElementsAccessor; +using ElementsAccessorAdaptor = workgroup2::fft::WorkgroupRadix2AccessorAdaptor; + +//[numthreads(ConstevalParameters::WorkgroupSize,1,1)] +[numthreads(WorkgroupSize, 1, 1)] +[shader("compute")] +void main(uint32_t3 ID : SV_DispatchThreadID) +{ + // global mem read write + Accessor accessor = Accessor::create(pushConstants.deviceBufferAddress); + // Load elements into the accessor + _InvocationElementsAccessor loElementAccessor; + ElementsAccessorAdaptor loAcc = ElementsAccessorAdaptor::create(loElementAccessor); + _InvocationElementsAccessor hiElementAccessor; + ElementsAccessorAdaptor hiAcc = ElementsAccessorAdaptor::create(hiElementAccessor); + + // Set up the memory adaptor + SharedMemoryAccessor sharedmemAccessor; + //using adaptor_t = accessor_adaptors::StructureOfArrays; + //adaptor_t sharedmemAdaptor; + //sharedmemAdaptor.accessor = sharedmemAccessor; + + using FFT = workgroup2::impl::InnerFFT; + using IFFT = workgroup2::impl::InnerFFT; + + // Invert last channel to ensure ping pong works + [unroll] + for (uint32_t pair = 0u; pair < ElementsPerInvocationPerChannel / 2; pair++) + { + complex_t lo, hi; + accessor.get(uint32_t(workgroup::SubgroupContiguousIndex()) + 2 * pair * WorkgroupSize, lo); + loAcc.set(pair, lo); + accessor.get(uint32_t(workgroup::SubgroupContiguousIndex()) + (2 * pair + 1) * WorkgroupSize, hi); + hiAcc.set(pair, hi); + //printf("Pair %d is lo: %f, %f hi: %f, %f", pair, lo.real(), lo.imag(), hi.real(), hi.imag()); + //printf("SharedmemSize: %d", 4 * ((sizeof(complex_t) / sizeof(uint32_t)) << WorkgroupSizeLog2)); + //printf("ShuffleRounds: %d", ConstevalParameters::ShuffleRounds); + } + + FFT::__call(loAcc, hiAcc, sharedmemAccessor); + sharedmemAccessor.workgroupExecutionAndMemoryBarrier(); + IFFT::__call(loAcc, hiAcc, sharedmemAccessor); + + [unroll] + for (uint32_t pair = 0u; pair < ElementsPerInvocationPerChannel / 2; pair++) + { + complex_t lo, hi; + loAcc.get(pair, lo); + accessor.set(uint32_t(workgroup::SubgroupContiguousIndex()) + 2 * pair * WorkgroupSize, lo); + hiAcc.get(pair, hi); + accessor.set(uint32_t(workgroup::SubgroupContiguousIndex()) + (2 * pair + 1) * WorkgroupSize, hi); + } +} \ No newline at end of file diff --git a/XX_NewFFT/config.json.template b/XX_NewFFT/config.json.template new file mode 100644 index 000000000..717d05d53 --- /dev/null +++ b/XX_NewFFT/config.json.template @@ -0,0 +1,28 @@ +{ + "enableParallelBuild": true, + "threadsPerBuildProcess" : 2, + "isExecuted": false, + "scriptPath": "", + "cmake": { + "configurations": [ "Release", "Debug", "RelWithDebInfo" ], + "buildModes": [], + "requiredOptions": [] + }, + "profiles": [ + { + "backend": "vulkan", // should be none + "platform": "windows", + "buildModes": [], + "runConfiguration": "Release", // we also need to run in Debug nad RWDI because foundational example + "gpuArchitectures": [] + } + ], + "dependencies": [], + "data": [ + { + "dependencies": [], + "command": [""], + "outputs": [] + } + ] +} \ No newline at end of file diff --git a/XX_NewFFT/main.cpp b/XX_NewFFT/main.cpp new file mode 100644 index 000000000..087117f47 --- /dev/null +++ b/XX_NewFFT/main.cpp @@ -0,0 +1,491 @@ +// Copyright (C) 2018-2024 - DevSH Graphics Programming Sp. z O.O. +// This file is part of the "Nabla Engine". +// For conditions of distribution and use, see copyright notice in nabla.h + +#include "nbl/this_example/builtin/build/spirv/keys.hpp" + +#include "nbl/examples/examples.hpp" + +using namespace nbl; +using namespace nbl::core; +using namespace nbl::hlsl; +using namespace nbl::system; +using namespace nbl::asset; +using namespace nbl::ui; +using namespace nbl::video; +using namespace nbl::examples; + +#include "app_resources/common.hlsl" +#include "nbl/builtin/hlsl/bit.hlsl" +#include "nbl/builtin/hlsl/random/xoroshiro.hlsl" + +// Function implemented in workgroup2::FFTIndexingUtils is meant to be fast based on the observation that reordering can still be performed fast in the case of +// a single prime factor. However, we need to test that the implemented fast version matches the real ordering +template +void DIFOrderTester() +{ + using IndexingUtils = nbl::hlsl::workgroup2::FFTIndexingUtils; + using IndexingUtilsHelper = typename IndexingUtils::helper_t; + const uint32_t FFTSize = IndexingUtils::FFTSize; + const uint32_t Radix2FFTSizeLog2 = IndexingUtils::Radix2FFTSizeLog2; + const uint32_t Radix2FFTSize = IndexingUtils::Radix2FFTSize; + + // Check fast div correctness + if constexpr (ExtraPrimeFactor > 1) + { + bool correct = true; + for (auto idx = 0u; idx < FFTSize; idx++) + { + if (idx / ExtraPrimeFactor != IndexingUtilsHelper::fastDiv(idx)) correct = false; + } + std::cout << "Fast div test " << (correct ? "passed\n" : "did not pass\n"); + } + + // Check whether the forward ordering is computed properly + { + bool correct = true; + for (auto idx = 0u; idx < FFTSize; idx++) + { + uint32_t fastIdx = IndexingUtilsHelper::mapLaneToFreq(idx); + if constexpr (ExtraPrimeFactor == 1) + { + if (fastIdx != nbl::hlsl::bitReverseAs(idx, Radix2FFTSizeLog2)) correct = false; + } + else + { + uint32_t index = idx; + std::vector digits; + for (auto i = 0u; i < Radix2FFTSizeLog2; i++) + { + digits.push_back(index & 1); + index >>= 1; + } + digits.push_back(index); + // Reconstruct mapping + uint32_t correctIdx = 0; + uint32_t multiplier = ExtraPrimeFactor * Radix2FFTSize; + for (auto i = 0u; i < Radix2FFTSizeLog2; i++) + { + multiplier >>= 1; + correctIdx += multiplier * digits[i]; + } + multiplier /= ExtraPrimeFactor; + correctIdx += multiplier * digits[Radix2FFTSizeLog2]; + if (fastIdx != correctIdx) + correct = false; + } + } + std::cout << "Forward test " << (correct ? "passed\n" : "did not pass\n"); + } + + // Check whether the inverse actually computes the inverse + { + bool correct = true; + for (auto idx = 0; idx < FFTSize; idx++) + { + if (idx != IndexingUtilsHelper::mapFreqToLane(IndexingUtilsHelper::mapLaneToFreq(idx))) correct = false; + } + std::cout << "Inverse test " << (correct ? "passed\n" : "did not pass\n"); + } +} + +// Simple showcase of how to run FFT on a 1D array +class FFT_Test final : public application_templates::MonoDeviceApplication, public BuiltinResourcesApplication +{ + using device_base_t = application_templates::MonoDeviceApplication; + using asset_base_t = BuiltinResourcesApplication; + + smart_refctd_ptr m_pipeline; + + smart_refctd_ptr m_utils; + + nbl::video::StreamingTransientDataBufferMT<>* m_upStreamingBuffer; + StreamingTransientDataBufferMT<>* m_downStreamingBuffer; + smart_refctd_ptr m_deviceLocalBuffer; + + // These are Buffer Device Addresses + uint64_t m_upStreamingBufferAddress; + uint64_t m_downStreamingBufferAddress; + uint64_t m_deviceLocalBufferAddress; + + // You can ask the `nbl::core::GeneralpurposeAddressAllocator` used internally by the Streaming Buffers give out offsets aligned to a certain multiple (not only Power of Two!) + uint32_t m_alignment; + + // This example really lets the advantages of a timeline semaphore shine through! + smart_refctd_ptr m_timeline; + uint64_t semaphorValue = 0; + +public: + // Yay thanks to multiple inheritance we cannot forward ctors anymore + FFT_Test(const path& _localInputCWD, const path& _localOutputCWD, const path& _sharedInputCWD, const path& _sharedOutputCWD) : + system::IApplicationFramework(_localInputCWD, _localOutputCWD, _sharedInputCWD, _sharedOutputCWD) {} + + inline core::smart_refctd_ptr createShader(const char* includeMainName) + { + auto HLSLShader = core::make_smart_refctd_ptr((std::string("#include \"") + includeMainName + "\"\n").c_str(), + IShader::E_CONTENT_TYPE::ECT_HLSL, + includeMainName); + assert(HLSLShader); + + ILogicalDevice::SShaderCreationParameters shaderCreationParams{ .source = HLSLShader.get(), + .preprocessedOutputPath = (localOutputCWD / "preprocessed.hlsl").string(), + .spvOutputPath = (localOutputCWD / "out.spv").string() }; +#ifndef _NBL_DEBUG + ISPIRVOptimizer::E_OPTIMIZER_PASS optPasses = ISPIRVOptimizer::EOP_STRIP_DEBUG_INFO; + auto opt = make_smart_refctd_ptr(std::span(&optPasses, 1)); + shaderCreationParams.optimizer = opt.get(); + shaderCreationParams.optimizerIsExtraPasses = true; +#endif + return m_device->compileShader(shaderCreationParams); + } + + // useful for debugging compiler issues + inline core::smart_refctd_ptr createSpirvShader(const char* spirvName) + { + core::smart_refctd_ptr shaderBuffer; + { + core::smart_refctd_ptr shaderReadFile; + system::ISystem::future_t> future; + m_system->createFile(future, spirvName, system::IFile::ECF_READ); + if (future.wait()) + { + future.acquire().move_into(shaderReadFile); + if (shaderReadFile) + { + const size_t size = shaderReadFile->getSize(); + if (size > 0ull) + { + asset::IBuffer::SCreationParams bufferCreationParams{ .size = size }; + asset::ICPUBuffer::SCreationParams foo; + foo = bufferCreationParams; + shaderBuffer = ICPUBuffer::create(std::move(foo)); + system::IFile::success_t succ; + shaderReadFile->read(succ, shaderBuffer->getPointer(), 0, size); + if (!succ) + m_logger->log("Failed Reading From Shader File.", ILogger::ELL_ERROR); + } + } + else + { + m_logger->log("Failed Opening Shader File.", ILogger::ELL_ERROR); + } + } + else + { + m_logger->log("Failed Opening Shader Cache File.", ILogger::ELL_ERROR); + } + } + + auto SPIRVShader = core::make_smart_refctd_ptr(std::move(shaderBuffer), IShader::E_CONTENT_TYPE::ECT_SPIRV, spirvName); + assert(SPIRVShader); + + return m_device->compileShader({ .source = SPIRVShader.get() }); + } + + // we stuff all our work here because its a "single shot" app + bool onAppInitialized(smart_refctd_ptr&& system) override + { + // Remember to call the base class initialization! + if (!device_base_t::onAppInitialized(smart_refctd_ptr(system))) + return false; + if (!asset_base_t::onAppInitialized(std::move(system))) + return false; + + smart_refctd_ptr shader = createShader("app_resources/shader.comp.hlsl"); + // DEBUG + //smart_refctd_ptr shader = createSpirvShader("app_resources/optimized.spv"); + if (!shader) + return logFail("Invalid shader!"); + + // Create massive upload/download buffers + constexpr uint32_t DownstreamBufferSize = sizeof(scalar_t) << 23; + constexpr uint32_t UpstreamBufferSize = sizeof(scalar_t) << 23; + + m_utils = IUtilities::create(smart_refctd_ptr(m_device), smart_refctd_ptr(m_logger), DownstreamBufferSize, UpstreamBufferSize); + if (!m_utils) + return logFail("Failed to create Utilities!"); + m_upStreamingBuffer = m_utils->getDefaultUpStreamingBuffer(); + m_downStreamingBuffer = m_utils->getDefaultDownStreamingBuffer(); + m_upStreamingBufferAddress = m_upStreamingBuffer->getBuffer()->getDeviceAddress(); + m_downStreamingBufferAddress = m_downStreamingBuffer->getBuffer()->getDeviceAddress(); + + // Create device-local buffer + { + const uint32_t scalarElementCount = 2 * complexElementCount; + IGPUBuffer::SCreationParams deviceLocalBufferParams = {}; + + IQueue* const queue = getComputeQueue(); + uint32_t queueFamilyIndex = queue->getFamilyIndex(); + + deviceLocalBufferParams.queueFamilyIndexCount = 1; + deviceLocalBufferParams.queueFamilyIndices = &queueFamilyIndex; + deviceLocalBufferParams.size = sizeof(scalar_t) * scalarElementCount; + deviceLocalBufferParams.usage = nbl::asset::IBuffer::E_USAGE_FLAGS::EUF_TRANSFER_SRC_BIT | nbl::asset::IBuffer::E_USAGE_FLAGS::EUF_TRANSFER_DST_BIT | nbl::asset::IBuffer::E_USAGE_FLAGS::EUF_SHADER_DEVICE_ADDRESS_BIT; + + m_deviceLocalBuffer = m_device->createBuffer(std::move(deviceLocalBufferParams)); + auto mreqs = m_deviceLocalBuffer->getMemoryReqs(); + mreqs.memoryTypeBits &= m_device->getPhysicalDevice()->getDeviceLocalMemoryTypeBits(); + auto gpubufMem = m_device->allocate(mreqs, m_deviceLocalBuffer.get(), IDeviceMemoryAllocation::E_MEMORY_ALLOCATE_FLAGS::EMAF_DEVICE_ADDRESS_BIT); + + m_deviceLocalBufferAddress = m_deviceLocalBuffer.get()->getDeviceAddress(); + } + + const nbl::asset::SPushConstantRange pcRange = { .stageFlags = IShader::E_SHADER_STAGE::ESS_COMPUTE,.offset = 0,.size = sizeof(PushConstantData) }; + + { + auto layout = m_device->createPipelineLayout({ &pcRange,1 }); + IGPUComputePipeline::SCreationParams params = {}; + params.layout = layout.get(); + params.shader.shader = shader.get(); + params.shader.entryPoint = "main"; + params.shader.requiredSubgroupSize = static_cast(hlsl::findMSB(m_physicalDevice->getLimits().maxSubgroupSize)); + params.cached.requireFullSubgroups = true; + if (!m_device->createComputePipelines(nullptr, { ¶ms,1 }, &m_pipeline)) + return logFail("Failed to create compute pipeline!\n"); + } + + const auto& deviceLimits = m_device->getPhysicalDevice()->getLimits(); + // The ranges of non-coherent mapped memory you flush or invalidate need to be aligned. You'll often see a value of 64 reported by devices + // which just happens to coincide with a CPU cache line size. So we ask our streaming buffers during allocation to give us properly aligned offsets. + // Sidenote: For SSBOs, UBOs, BufferViews, Vertex Buffer Bindings, Acceleration Structure BDAs, Shader Binding Tables, Descriptor Buffers, etc. + // there is also a requirement to bind buffers at offsets which have a certain alignment. Memory binding to Buffers and Images also has those. + // We'll align to max of coherent atom size even if the memory is coherent, + // and we also need to take into account BDA shader loads need to be aligned to the type being loaded. + m_alignment = core::max(deviceLimits.nonCoherentAtomSize, alignof(float)); + + // Semaphor used here to know the FFT is done before download + m_timeline = m_device->createSemaphore(semaphorValue); + + IQueue* const queue = getComputeQueue(); + + // Note that I'm using the sample struct with methods that have identical code which compiles as both C++ and HLSL + auto rng = nbl::hlsl::Xoroshiro64StarStar::construct({ semaphorValue ^ 0xdeadbeefu,std::hash()(_NBL_APP_NAME_) }); + + const uint32_t scalarElementCount = 2 * complexElementCount; + const uint32_t inputSize = sizeof(scalar_t) * scalarElementCount; + + // Just need a single suballocation in this example + const uint32_t AllocationCount = 1; + + // It comes with a certain drawback that you need to remember to initialize your "yet unallocated" offsets to the Invalid value + // this is to allow a set of allocations to fail, and you to re-try after doing something to free up space without repacking args. + auto inputOffset = m_upStreamingBuffer->invalid_value; + + // We always just wait till an allocation becomes possible (during allocation previous "latched" frees get their latch conditions polled) + // Freeing of Streaming Buffer Allocations can and should be deferred until an associated polled event signals done (more on that later). + std::chrono::steady_clock::time_point waitTill(std::chrono::years(45)); + // note that the API takes a time-point not a duration, because there are multiple waits and preemptions possible, so the durations wouldn't add up properly + m_upStreamingBuffer->multi_allocate(waitTill, AllocationCount, &inputOffset, &inputSize, &m_alignment); + + // Run DIF Ordering test + DIFOrderTester(); + + // Generate our data in-place on the allocated staging buffer. Packing is interleaved in this example! + { + auto* const inputPtr = reinterpret_cast(reinterpret_cast(m_upStreamingBuffer->getBufferPointer()) + inputOffset); + std::cout << "Begin array CPU\n"; + for (auto channel = 0; channel < Channels; channel++) + { + std::cout << "Begin channel " << channel << "\n"; + for (auto j = 0; j < complexElementCountPerChannel; j++) + { + //Random array + + //scalar_t x = rng() / scalar_t(nbl::hlsl::numeric_limits::max), y = rng() / scalar_t(nbl::hlsl::numeric_limits::max); + //#define DIVIDE + + + //FFT( (1,0), (0,0), (0,0),... ) = (1,0), (1,0), (1,0),... + + + scalar_t x = j > 0 ? 0.f : 1.f, y = 0; + + // FFT( (c,0), (c,0), (c,0),... ) = (Nc,0), (0,0), (0,0),... + + + //scalar_t x = 1.f, y = 0.f; + + inputPtr[2 * complexElementCountPerChannel * channel + 2 * j] = x; + inputPtr[2 * complexElementCountPerChannel * channel + 2 * j + 1] = y; + std::cout << "(" << x << ", " << y << "), "; + } + std::cout << "\nEnd channel " << channel << "\n"; + } + /* + for (auto j = 0; j < complexElementCount; j++) + { + //Random array + + scalar_t x = rng() / scalar_t(nbl::hlsl::numeric_limits::max), y = rng() / scalar_t(nbl::hlsl::numeric_limits::max); + #define DIVIDE + + //FFT( (1,0), (0,0), (0,0),... ) = (1,0), (1,0), (1,0),... + + + //scalar_t x = j > 0 ? 0.f : 1.f, y = 0; + + // FFT( (c,0), (c,0), (c,0),... ) = (Nc,0), (0,0), (0,0),... + + + //scalar_t x = 1.f, y = 0.f; + + inputPtr[2 * j] = x; + inputPtr[2 * j + 1] = y; + std::cout << "(" << x << ", " << y << "), "; + } + */ + std::cout << "\nEnd array CPU\n"; + + // Always remember to flush! + if (m_upStreamingBuffer->needsManualFlushOrInvalidate()) + { + const auto bound = m_upStreamingBuffer->getBuffer()->getBoundMemory(); + const ILogicalDevice::MappedMemoryRange range(bound.memory, bound.offset + inputOffset, inputSize); + m_device->flushMappedMemoryRanges(1, &range); + } + } + + // finally allocate our output range + const uint32_t outputSize = inputSize; + + auto outputOffset = m_downStreamingBuffer->invalid_value; + m_downStreamingBuffer->multi_allocate(waitTill, AllocationCount, &outputOffset, &outputSize, &m_alignment); + + smart_refctd_ptr cmdbuf; + { + smart_refctd_ptr cmdpool = m_device->createCommandPool(queue->getFamilyIndex(), IGPUCommandPool::CREATE_FLAGS::TRANSIENT_BIT); + if (!cmdpool->createCommandBuffers(IGPUCommandPool::BUFFER_LEVEL::PRIMARY, 1u, &cmdbuf)) { + return logFail("Failed to create Command Buffers!\n"); + } + cmdpool->createCommandBuffers(IGPUCommandPool::BUFFER_LEVEL::PRIMARY, { &cmdbuf,1 }, core::smart_refctd_ptr(m_logger)); + cmdbuf->begin(IGPUCommandBuffer::USAGE::ONE_TIME_SUBMIT_BIT); + cmdbuf->bindComputePipeline(m_pipeline.get()); + // This is the new fun part, pushing constants + const PushConstantData pc = {.deviceBufferAddress = m_deviceLocalBufferAddress}; + IGPUCommandBuffer::SBufferCopy copyInfo = {}; + copyInfo.srcOffset = 0; + copyInfo.dstOffset = 0; + copyInfo.size = m_deviceLocalBuffer->getSize(); + cmdbuf->copyBuffer(m_upStreamingBuffer->getBuffer(), m_deviceLocalBuffer.get(), 1, ©Info); + cmdbuf->pushConstants(m_pipeline->getLayout(), IShader::E_SHADER_STAGE::ESS_COMPUTE, 0u, sizeof(pc), &pc); + // Remember we do a single workgroup per 1D array in these parts + cmdbuf->dispatch(1, 1, 1); + + // Pipeline barrier: wait for FFT shader to be done before copying to downstream buffer + IGPUCommandBuffer::SPipelineBarrierDependencyInfo pipelineBarrierInfo = {}; + + decltype(pipelineBarrierInfo)::buffer_barrier_t barrier = {}; + pipelineBarrierInfo.bufBarriers = { &barrier, 1u }; + + barrier.range.buffer = m_deviceLocalBuffer; + + barrier.barrier.dep.srcStageMask = PIPELINE_STAGE_FLAGS::COMPUTE_SHADER_BIT; + barrier.barrier.dep.srcAccessMask = ACCESS_FLAGS::MEMORY_WRITE_BITS; + barrier.barrier.dep.dstStageMask = PIPELINE_STAGE_FLAGS::COPY_BIT; + barrier.barrier.dep.dstAccessMask = ACCESS_FLAGS::MEMORY_READ_BITS; + + cmdbuf->pipelineBarrier(asset::E_DEPENDENCY_FLAGS(0), pipelineBarrierInfo); + cmdbuf->copyBuffer(m_deviceLocalBuffer.get(), m_downStreamingBuffer->getBuffer(), 1, ©Info); + cmdbuf->end(); + } + + semaphorValue++; + { + const IQueue::SSubmitInfo::SCommandBufferInfo cmdbufInfo = + { + .cmdbuf = cmdbuf.get() + }; + const IQueue::SSubmitInfo::SSemaphoreInfo signalInfo = + { + .semaphore = m_timeline.get(), + .value = semaphorValue, + .stageMask = asset::PIPELINE_STAGE_FLAGS::COMPUTE_SHADER_BIT + }; + + const IQueue::SSubmitInfo submitInfo = { + .waitSemaphores = {}, + .commandBuffers = {&cmdbufInfo,1}, + .signalSemaphores = {&signalInfo,1} + }; + + m_api->startCapture(); + queue->submit({ &submitInfo,1 }); + m_api->endCapture(); + } + + // We let all latches know what semaphore and counter value has to be passed for the functors to execute + const ISemaphore::SWaitInfo futureWait = { m_timeline.get(),semaphorValue }; + + // As promised, we can defer an upstreaming buffer deallocation until a fence is signalled + // You can also attach an additional optional IReferenceCounted derived object to hold onto until deallocation. + m_upStreamingBuffer->multi_deallocate(AllocationCount, &inputOffset, &inputSize, futureWait); + + // Now a new and even more advanced usage of the latched events, we make our own refcounted object with a custom destructor and latch that like we did the commandbuffer. + // Instead of making our own and duplicating logic, we'll use one from IUtilities meant for down-staging memory. + // Its nice because it will also remember to invalidate our memory mapping if its not coherent. + auto latchedConsumer = make_smart_refctd_ptr( + IDeviceMemoryAllocation::MemoryRange(outputOffset, outputSize), + // Note the use of capture by-value [=] and not by-reference [&] because this lambda will be called asynchronously whenever the event signals + [=](const size_t dstOffset, const void* bufSrc, const size_t size)->void + { + // The unused variable is used for letting the consumer know the subsection of the output we've managed to download + // But here we're sure we can get the whole thing in one go because we allocated the whole range ourselves. + assert(dstOffset == 0 && size == outputSize); + + std::cout << "Begin array GPU\n"; + scalar_t* const data = reinterpret_cast(const_cast(bufSrc)); + for (auto channel = 0; channel < Channels; channel++) + { + std::cout << "Begin channel " << channel << "\n"; + for (auto j = 0; j < complexElementCountPerChannel; j++) + { + std::cout << "(" << data[2 * complexElementCountPerChannel * channel + 2 * j] << ", " << data[2 * complexElementCountPerChannel * channel + 2 * j + 1] << "), "; + } + std::cout << "\nEnd channel " << channel << "\n"; + } + + /* + #ifdef DIVIDE + for (auto j = 0; j < complexElementCount; j++) + { + std::cout << "(" << data[2 * j] / complexElementCount << ", " << data[2 * j + 1] / complexElementCount << "), "; + } + #else + for (auto j = 0; j < complexElementCount; j++) + { + std::cout << "(" << data[2 * j] << ", " << data[2 * j + 1] << "), "; + } + #endif + */ + + std::cout << "\nEnd array GPU\n"; + }, + // Its also necessary to hold onto the commandbuffer, even though we take care to not reset the parent pool, because if it + // hits its destructor, our automated reference counting will drop all references to objects used in the recorded commands. + // It could also be latched in the upstreaming deallocate, because its the same fence. + std::move(cmdbuf), m_downStreamingBuffer + ); + // We put a function we want to execute + m_downStreamingBuffer->multi_deallocate(AllocationCount, &outputOffset, &outputSize, futureWait, &latchedConsumer.get()); + + return true; + } + + // One-shot App + bool keepRunning() override { return false; } + + // One-shot App + void workLoopBody() override{} + + // Cleanup + bool onAppTerminated() override + { + // Need to make sure that there are no events outstanding if we want all lambdas to eventually execute before `onAppTerminated` + // (the destructors of the Command Pool Cache and Streaming buffers will still wait for all lambda events to drain) + while (m_downStreamingBuffer->cull_frees()) {} + return device_base_t::onAppTerminated(); + } +}; + + +NBL_MAIN_FUNC(FFT_Test) \ No newline at end of file diff --git a/XX_NewFFT/pipeline.groovy b/XX_NewFFT/pipeline.groovy new file mode 100644 index 000000000..1a7b043a4 --- /dev/null +++ b/XX_NewFFT/pipeline.groovy @@ -0,0 +1,50 @@ +import org.DevshGraphicsProgramming.Agent +import org.DevshGraphicsProgramming.BuilderInfo +import org.DevshGraphicsProgramming.IBuilder + +class CStreamingAndBufferDeviceAddressBuilder extends IBuilder +{ + public CStreamingAndBufferDeviceAddressBuilder(Agent _agent, _info) + { + super(_agent, _info) + } + + @Override + public boolean prepare(Map axisMapping) + { + return true + } + + @Override + public boolean build(Map axisMapping) + { + IBuilder.CONFIGURATION config = axisMapping.get("CONFIGURATION") + IBuilder.BUILD_TYPE buildType = axisMapping.get("BUILD_TYPE") + + def nameOfBuildDirectory = getNameOfBuildDirectory(buildType) + def nameOfConfig = getNameOfConfig(config) + + agent.execute("cmake --build ${info.rootProjectPath}/${nameOfBuildDirectory}/${info.targetProjectPathRelativeToRoot} --target ${info.targetBaseName} --config ${nameOfConfig} -j12 -v") + + return true + } + + @Override + public boolean test(Map axisMapping) + { + return true + } + + @Override + public boolean install(Map axisMapping) + { + return true + } +} + +def create(Agent _agent, _info) +{ + return new CStreamingAndBufferDeviceAddressBuilder(_agent, _info) +} + +return this \ No newline at end of file