From a6337f84afd36d24a40b7d391515707df1e2a0ac Mon Sep 17 00:00:00 2001 From: Fletterio Date: Tue, 24 Mar 2026 21:38:17 -0300 Subject: [PATCH 1/8] Add new FFT example --- CMakeLists.txt | 1 + XX_NewFFT/CMakeLists.txt | 62 +++++ XX_NewFFT/app_resources/common.hlsl | 12 + XX_NewFFT/app_resources/shader.comp.hlsl | 74 ++++++ XX_NewFFT/config.json.template | 28 ++ XX_NewFFT/main.cpp | 322 +++++++++++++++++++++++ XX_NewFFT/pipeline.groovy | 50 ++++ 7 files changed, 549 insertions(+) create mode 100644 XX_NewFFT/CMakeLists.txt create mode 100644 XX_NewFFT/app_resources/common.hlsl create mode 100644 XX_NewFFT/app_resources/shader.comp.hlsl create mode 100644 XX_NewFFT/config.json.template create mode 100644 XX_NewFFT/main.cpp create mode 100644 XX_NewFFT/pipeline.groovy diff --git a/CMakeLists.txt b/CMakeLists.txt index a93a86a4f..b3f986b46 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -106,6 +106,7 @@ if(NBL_BUILD_EXAMPLES) add_subdirectory(70_FLIPFluids) add_subdirectory(71_RayTracingPipeline) add_subdirectory(72_CooperativeBinarySearch) + add_subdirectory(XX_NewFFT) if (NBL_BUILD_MITSUBA_LOADER) add_subdirectory(73_GeometryInspector) diff --git a/XX_NewFFT/CMakeLists.txt b/XX_NewFFT/CMakeLists.txt new file mode 100644 index 000000000..6b6304ed8 --- /dev/null +++ b/XX_NewFFT/CMakeLists.txt @@ -0,0 +1,62 @@ +include(common RESULT_VARIABLE RES) +if(NOT RES) + message(FATAL_ERROR "common.cmake not found. Should be in {repo_root}/cmake directory") +endif() + +nbl_create_executable_project("" "" "" "" "${NBL_EXECUTABLE_PROJECT_CREATION_PCH_TARGET}") + +if(NBL_EMBED_BUILTIN_RESOURCES) + set(_BR_TARGET_ ${EXECUTABLE_NAME}_builtinResourceData) + set(RESOURCE_DIR "app_resources") + + get_filename_component(_SEARCH_DIRECTORIES_ "${CMAKE_CURRENT_SOURCE_DIR}" ABSOLUTE) + get_filename_component(_OUTPUT_DIRECTORY_SOURCE_ "${CMAKE_CURRENT_BINARY_DIR}/src" ABSOLUTE) + get_filename_component(_OUTPUT_DIRECTORY_HEADER_ "${CMAKE_CURRENT_BINARY_DIR}/include" ABSOLUTE) + + file(GLOB_RECURSE BUILTIN_RESOURCE_FILES RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}/${RESOURCE_DIR}" CONFIGURE_DEPENDS "${CMAKE_CURRENT_SOURCE_DIR}/${RESOURCE_DIR}/*") + foreach(RES_FILE ${BUILTIN_RESOURCE_FILES}) + LIST_BUILTIN_RESOURCE(RESOURCES_TO_EMBED "${RES_FILE}") + endforeach() + + ADD_CUSTOM_BUILTIN_RESOURCES(${_BR_TARGET_} RESOURCES_TO_EMBED "${_SEARCH_DIRECTORIES_}" "${RESOURCE_DIR}" "nbl::this_example::builtin" "${_OUTPUT_DIRECTORY_HEADER_}" "${_OUTPUT_DIRECTORY_SOURCE_}") + + LINK_BUILTIN_RESOURCES_TO_TARGET(${EXECUTABLE_NAME} ${_BR_TARGET_}) +endif() + +set(OUTPUT_DIRECTORY "${CMAKE_CURRENT_BINARY_DIR}/auto-gen") + +set(SM 6_8) +set(JSON [=[ +[ + { + "INPUT": "app_resources/shader.comp.hlsl", + "KEY": "shader", + } +] +]=]) +string(CONFIGURE "${JSON}" JSON) + +set(COMPILE_OPTIONS + -I "${CMAKE_CURRENT_SOURCE_DIR}" + -T lib_${SM} +) + +NBL_CREATE_NSC_COMPILE_RULES( + TARGET ${EXECUTABLE_NAME}SPIRV + LINK_TO ${EXECUTABLE_NAME} + BINARY_DIR ${OUTPUT_DIRECTORY} + MOUNT_POINT_DEFINE NBL_THIS_EXAMPLE_BUILD_MOUNT_POINT + COMMON_OPTIONS ${COMPILE_OPTIONS} + OUTPUT_VAR KEYS + INCLUDE nbl/this_example/builtin/build/spirv/keys.hpp + NAMESPACE nbl::this_example::builtin::build + INPUTS ${JSON} +) + +NBL_CREATE_RESOURCE_ARCHIVE( + NAMESPACE nbl::this_example::builtin::build + TARGET ${EXECUTABLE_NAME}_builtinsBuild + LINK_TO ${EXECUTABLE_NAME} + BIND ${OUTPUT_DIRECTORY} + BUILTINS ${KEYS} +) diff --git a/XX_NewFFT/app_resources/common.hlsl b/XX_NewFFT/app_resources/common.hlsl new file mode 100644 index 000000000..dc6b96e71 --- /dev/null +++ b/XX_NewFFT/app_resources/common.hlsl @@ -0,0 +1,12 @@ +#include "nbl/builtin/hlsl/cpp_compat.hlsl" + +using scalar_t = nbl::hlsl::float32_t; + +struct PushConstantData +{ + uint64_t deviceBufferAddress; +}; + +NBL_CONSTEXPR uint32_t WorkgroupSizeLog2 = 6; +NBL_CONSTEXPR uint32_t ElementsPerThreadLog2 = 3; +NBL_CONSTEXPR uint32_t complexElementCount = uint32_t(1) << (WorkgroupSizeLog2 + ElementsPerThreadLog2); \ No newline at end of file diff --git a/XX_NewFFT/app_resources/shader.comp.hlsl b/XX_NewFFT/app_resources/shader.comp.hlsl new file mode 100644 index 000000000..7c86f50b4 --- /dev/null +++ b/XX_NewFFT/app_resources/shader.comp.hlsl @@ -0,0 +1,74 @@ +#include "common.hlsl" +#include "nbl/builtin/hlsl/workgroup/fft.hlsl" + +[[vk::push_constant]] PushConstantData pushConstants; + +using namespace nbl::hlsl; + +using ConstevalParameters = workgroup::fft::ConstevalParameters; + +groupshared uint32_t sharedmem[ ConstevalParameters::SharedMemoryDWORDs]; + +// Users MUST define this method for FFT to work +uint32_t3 glsl::gl_WorkGroupSize() { return uint32_t3(uint32_t(ConstevalParameters::WorkgroupSize), 1, 1); } + +struct SharedMemoryAccessor +{ + template + void set(IndexType idx, AccessType value) + { + sharedmem[idx] = value; + } + + template + void get(IndexType idx, NBL_REF_ARG(AccessType) value) + { + value = sharedmem[idx]; + } + + void workgroupExecutionAndMemoryBarrier() + { + glsl::barrier(); + } + +}; + +// Almost a LegacyBdaAccessor, but since we need `uint32_t index` getter and setter it's the same as writing one ourselves +struct Accessor +{ + static Accessor create(const uint64_t address) + { + Accessor accessor; + accessor.address = address; + return accessor; + } + + // TODO: can't use our own BDA yet, because it doesn't support the types `workgroup::FFT` will invoke these templates with + template + void get(const IndexType index, NBL_REF_ARG(AccessType) value) + { + value = vk::RawBufferLoad(address + index * sizeof(AccessType)); + } + + template + void set(const IndexType index, const AccessType value) + { + vk::RawBufferStore(address + index * sizeof(AccessType), value); + } + + uint64_t address; +}; + +[numthreads(ConstevalParameters::WorkgroupSize,1,1)] +[shader("compute")] +void main(uint32_t3 ID : SV_DispatchThreadID) +{ + Accessor accessor = Accessor::create(pushConstants.deviceBufferAddress); + SharedMemoryAccessor sharedmemAccessor; + + // FFT + + workgroup::FFT::template __call(accessor, sharedmemAccessor); + sharedmemAccessor.workgroupExecutionAndMemoryBarrier(); + workgroup::FFT::template __call(accessor, sharedmemAccessor); +} \ No newline at end of file diff --git a/XX_NewFFT/config.json.template b/XX_NewFFT/config.json.template new file mode 100644 index 000000000..717d05d53 --- /dev/null +++ b/XX_NewFFT/config.json.template @@ -0,0 +1,28 @@ +{ + "enableParallelBuild": true, + "threadsPerBuildProcess" : 2, + "isExecuted": false, + "scriptPath": "", + "cmake": { + "configurations": [ "Release", "Debug", "RelWithDebInfo" ], + "buildModes": [], + "requiredOptions": [] + }, + "profiles": [ + { + "backend": "vulkan", // should be none + "platform": "windows", + "buildModes": [], + "runConfiguration": "Release", // we also need to run in Debug nad RWDI because foundational example + "gpuArchitectures": [] + } + ], + "dependencies": [], + "data": [ + { + "dependencies": [], + "command": [""], + "outputs": [] + } + ] +} \ No newline at end of file diff --git a/XX_NewFFT/main.cpp b/XX_NewFFT/main.cpp new file mode 100644 index 000000000..49d157a38 --- /dev/null +++ b/XX_NewFFT/main.cpp @@ -0,0 +1,322 @@ +// Copyright (C) 2018-2024 - DevSH Graphics Programming Sp. z O.O. +// This file is part of the "Nabla Engine". +// For conditions of distribution and use, see copyright notice in nabla.h + +#include "nbl/this_example/builtin/build/spirv/keys.hpp" + +#include "nbl/examples/examples.hpp" + +using namespace nbl; +using namespace nbl::core; +using namespace nbl::hlsl; +using namespace nbl::system; +using namespace nbl::asset; +using namespace nbl::ui; +using namespace nbl::video; +using namespace nbl::examples; + +#include "app_resources/common.hlsl" +#include "nbl/builtin/hlsl/bit.hlsl" +#include "nbl/builtin/hlsl/random/xoroshiro.hlsl" + + +// Simple showcase of how to run FFT on a 1D array +class FFT_Test final : public application_templates::MonoDeviceApplication, public BuiltinResourcesApplication +{ + using device_base_t = application_templates::MonoDeviceApplication; + using asset_base_t = BuiltinResourcesApplication; + + smart_refctd_ptr m_pipeline; + + smart_refctd_ptr m_utils; + + nbl::video::StreamingTransientDataBufferMT<>* m_upStreamingBuffer; + StreamingTransientDataBufferMT<>* m_downStreamingBuffer; + smart_refctd_ptr m_deviceLocalBuffer; + + // These are Buffer Device Addresses + uint64_t m_upStreamingBufferAddress; + uint64_t m_downStreamingBufferAddress; + uint64_t m_deviceLocalBufferAddress; + + // You can ask the `nbl::core::GeneralpurposeAddressAllocator` used internally by the Streaming Buffers give out offsets aligned to a certain multiple (not only Power of Two!) + uint32_t m_alignment; + + // This example really lets the advantages of a timeline semaphore shine through! + smart_refctd_ptr m_timeline; + uint64_t semaphorValue = 0; + +public: + // Yay thanks to multiple inheritance we cannot forward ctors anymore + FFT_Test(const path& _localInputCWD, const path& _localOutputCWD, const path& _sharedInputCWD, const path& _sharedOutputCWD) : + system::IApplicationFramework(_localInputCWD, _localOutputCWD, _sharedInputCWD, _sharedOutputCWD) {} + + // we stuff all our work here because its a "single shot" app + bool onAppInitialized(smart_refctd_ptr&& system) override + { + // Remember to call the base class initialization! + if (!device_base_t::onAppInitialized(smart_refctd_ptr(system))) + return false; + if (!asset_base_t::onAppInitialized(std::move(system))) + return false; + + smart_refctd_ptr shader; + { + IAssetLoader::SAssetLoadParams lp = {}; + lp.logger = m_logger.get(); + lp.workingDirectory = "app_resources"; // virtual root + auto key = nbl::this_example::builtin::build::get_spirv_key<"shader">(m_device.get()); + auto assetBundle = m_assetMgr->getAsset(key.data(), lp); + const auto assets = assetBundle.getContents(); + if (assets.empty()) + return logFail("Could not load shader!"); + + // Cast down the asset to its proper type + shader = IAsset::castDown(assets[0]); + + if (!shader) + return logFail("Invalid shader!"); + } + + // Create massive upload/download buffers + constexpr uint32_t DownstreamBufferSize = sizeof(scalar_t) << 23; + constexpr uint32_t UpstreamBufferSize = sizeof(scalar_t) << 23; + + m_utils = IUtilities::create(smart_refctd_ptr(m_device), smart_refctd_ptr(m_logger), DownstreamBufferSize, UpstreamBufferSize); + if (!m_utils) + return logFail("Failed to create Utilities!"); + m_upStreamingBuffer = m_utils->getDefaultUpStreamingBuffer(); + m_downStreamingBuffer = m_utils->getDefaultDownStreamingBuffer(); + m_upStreamingBufferAddress = m_upStreamingBuffer->getBuffer()->getDeviceAddress(); + m_downStreamingBufferAddress = m_downStreamingBuffer->getBuffer()->getDeviceAddress(); + + // Create device-local buffer + { + const uint32_t scalarElementCount = 2 * complexElementCount; + IGPUBuffer::SCreationParams deviceLocalBufferParams = {}; + + IQueue* const queue = getComputeQueue(); + uint32_t queueFamilyIndex = queue->getFamilyIndex(); + + deviceLocalBufferParams.queueFamilyIndexCount = 1; + deviceLocalBufferParams.queueFamilyIndices = &queueFamilyIndex; + deviceLocalBufferParams.size = sizeof(scalar_t) * scalarElementCount; + deviceLocalBufferParams.usage = nbl::asset::IBuffer::E_USAGE_FLAGS::EUF_TRANSFER_SRC_BIT | nbl::asset::IBuffer::E_USAGE_FLAGS::EUF_TRANSFER_DST_BIT | nbl::asset::IBuffer::E_USAGE_FLAGS::EUF_SHADER_DEVICE_ADDRESS_BIT; + + m_deviceLocalBuffer = m_device->createBuffer(std::move(deviceLocalBufferParams)); + auto mreqs = m_deviceLocalBuffer->getMemoryReqs(); + mreqs.memoryTypeBits &= m_device->getPhysicalDevice()->getDeviceLocalMemoryTypeBits(); + auto gpubufMem = m_device->allocate(mreqs, m_deviceLocalBuffer.get(), IDeviceMemoryAllocation::E_MEMORY_ALLOCATE_FLAGS::EMAF_DEVICE_ADDRESS_BIT); + + m_deviceLocalBufferAddress = m_deviceLocalBuffer.get()->getDeviceAddress(); + } + + const nbl::asset::SPushConstantRange pcRange = { .stageFlags = IShader::E_SHADER_STAGE::ESS_COMPUTE,.offset = 0,.size = sizeof(PushConstantData) }; + + { + auto layout = m_device->createPipelineLayout({ &pcRange,1 }); + IGPUComputePipeline::SCreationParams params = {}; + params.layout = layout.get(); + params.shader.shader = shader.get(); + params.shader.entryPoint = "main"; + params.shader.requiredSubgroupSize = static_cast(hlsl::findMSB(m_physicalDevice->getLimits().maxSubgroupSize)); + params.cached.requireFullSubgroups = true; + if (!m_device->createComputePipelines(nullptr, { ¶ms,1 }, &m_pipeline)) + return logFail("Failed to create compute pipeline!\n"); + } + + const auto& deviceLimits = m_device->getPhysicalDevice()->getLimits(); + // The ranges of non-coherent mapped memory you flush or invalidate need to be aligned. You'll often see a value of 64 reported by devices + // which just happens to coincide with a CPU cache line size. So we ask our streaming buffers during allocation to give us properly aligned offsets. + // Sidenote: For SSBOs, UBOs, BufferViews, Vertex Buffer Bindings, Acceleration Structure BDAs, Shader Binding Tables, Descriptor Buffers, etc. + // there is also a requirement to bind buffers at offsets which have a certain alignment. Memory binding to Buffers and Images also has those. + // We'll align to max of coherent atom size even if the memory is coherent, + // and we also need to take into account BDA shader loads need to be aligned to the type being loaded. + m_alignment = core::max(deviceLimits.nonCoherentAtomSize, alignof(float)); + + // Semaphor used here to know the FFT is done before download + m_timeline = m_device->createSemaphore(semaphorValue); + + IQueue* const queue = getComputeQueue(); + + // Note that I'm using the sample struct with methods that have identical code which compiles as both C++ and HLSL + auto rng = nbl::hlsl::Xoroshiro64StarStar::construct({ semaphorValue ^ 0xdeadbeefu,std::hash()(_NBL_APP_NAME_) }); + + const uint32_t scalarElementCount = 2 * complexElementCount; + const uint32_t inputSize = sizeof(scalar_t) * scalarElementCount; + + // Just need a single suballocation in this example + const uint32_t AllocationCount = 1; + + // It comes with a certain drawback that you need to remember to initialize your "yet unallocated" offsets to the Invalid value + // this is to allow a set of allocations to fail, and you to re-try after doing something to free up space without repacking args. + auto inputOffset = m_upStreamingBuffer->invalid_value; + + // We always just wait till an allocation becomes possible (during allocation previous "latched" frees get their latch conditions polled) + // Freeing of Streaming Buffer Allocations can and should be deferred until an associated polled event signals done (more on that later). + std::chrono::steady_clock::time_point waitTill(std::chrono::years(45)); + // note that the API takes a time-point not a duration, because there are multiple waits and preemptions possible, so the durations wouldn't add up properly + m_upStreamingBuffer->multi_allocate(waitTill, AllocationCount, &inputOffset, &inputSize, &m_alignment); + + // Generate our data in-place on the allocated staging buffer. Packing is interleaved in this example! + { + auto* const inputPtr = reinterpret_cast(reinterpret_cast(m_upStreamingBuffer->getBufferPointer()) + inputOffset); + std::cout << "Begin array CPU\n"; + for (auto j = 0; j < complexElementCount; j++) + { + //Random array + + //scalar_t x = rng() / scalar_t(nbl::hlsl::numeric_limits::max), y = rng() / scalar_t(nbl::hlsl::numeric_limits::max); + + // FFT( (1,0), (0,0), (0,0),... ) = (1,0), (1,0), (1,0),... + + + scalar_t x = j > 0 ? 0.f : 1.f; + scalar_t y = 0; + + + // FFT( (c,0), (c,0), (c,0),... ) = (Nc,0), (0,0), (0,0),... + + /* + scalar_t x = 1.f; + scalar_t y = 0.f; + */ + + inputPtr[2 * j] = x; + inputPtr[2 * j + 1] = y; + std::cout << "(" << x << ", " << y << "), "; + } + std::cout << "\nEnd array CPU\n"; + // Always remember to flush! + if (m_upStreamingBuffer->needsManualFlushOrInvalidate()) + { + const auto bound = m_upStreamingBuffer->getBuffer()->getBoundMemory(); + const ILogicalDevice::MappedMemoryRange range(bound.memory, bound.offset + inputOffset, inputSize); + m_device->flushMappedMemoryRanges(1, &range); + } + } + + // finally allocate our output range + const uint32_t outputSize = inputSize; + + auto outputOffset = m_downStreamingBuffer->invalid_value; + m_downStreamingBuffer->multi_allocate(waitTill, AllocationCount, &outputOffset, &outputSize, &m_alignment); + + smart_refctd_ptr cmdbuf; + { + smart_refctd_ptr cmdpool = m_device->createCommandPool(queue->getFamilyIndex(), IGPUCommandPool::CREATE_FLAGS::TRANSIENT_BIT); + if (!cmdpool->createCommandBuffers(IGPUCommandPool::BUFFER_LEVEL::PRIMARY, 1u, &cmdbuf)) { + return logFail("Failed to create Command Buffers!\n"); + } + cmdpool->createCommandBuffers(IGPUCommandPool::BUFFER_LEVEL::PRIMARY, { &cmdbuf,1 }, core::smart_refctd_ptr(m_logger)); + cmdbuf->begin(IGPUCommandBuffer::USAGE::ONE_TIME_SUBMIT_BIT); + cmdbuf->bindComputePipeline(m_pipeline.get()); + // This is the new fun part, pushing constants + const PushConstantData pc = {.deviceBufferAddress = m_deviceLocalBufferAddress}; + IGPUCommandBuffer::SBufferCopy copyInfo = {}; + copyInfo.srcOffset = 0; + copyInfo.dstOffset = 0; + copyInfo.size = m_deviceLocalBuffer->getSize(); + cmdbuf->copyBuffer(m_upStreamingBuffer->getBuffer(), m_deviceLocalBuffer.get(), 1, ©Info); + cmdbuf->pushConstants(m_pipeline->getLayout(), IShader::E_SHADER_STAGE::ESS_COMPUTE, 0u, sizeof(pc), &pc); + // Remember we do a single workgroup per 1D array in these parts + cmdbuf->dispatch(1, 1, 1); + + // Pipeline barrier: wait for FFT shader to be done before copying to downstream buffer + IGPUCommandBuffer::SPipelineBarrierDependencyInfo pipelineBarrierInfo = {}; + + decltype(pipelineBarrierInfo)::buffer_barrier_t barrier = {}; + pipelineBarrierInfo.bufBarriers = { &barrier, 1u }; + + barrier.range.buffer = m_deviceLocalBuffer; + + barrier.barrier.dep.srcStageMask = PIPELINE_STAGE_FLAGS::COMPUTE_SHADER_BIT; + barrier.barrier.dep.srcAccessMask = ACCESS_FLAGS::MEMORY_WRITE_BITS; + barrier.barrier.dep.dstStageMask = PIPELINE_STAGE_FLAGS::COPY_BIT; + barrier.barrier.dep.dstAccessMask = ACCESS_FLAGS::MEMORY_READ_BITS; + + cmdbuf->pipelineBarrier(asset::E_DEPENDENCY_FLAGS(0), pipelineBarrierInfo); + cmdbuf->copyBuffer(m_deviceLocalBuffer.get(), m_downStreamingBuffer->getBuffer(), 1, ©Info); + cmdbuf->end(); + } + + semaphorValue++; + { + const IQueue::SSubmitInfo::SCommandBufferInfo cmdbufInfo = + { + .cmdbuf = cmdbuf.get() + }; + const IQueue::SSubmitInfo::SSemaphoreInfo signalInfo = + { + .semaphore = m_timeline.get(), + .value = semaphorValue, + .stageMask = asset::PIPELINE_STAGE_FLAGS::COMPUTE_SHADER_BIT + }; + + const IQueue::SSubmitInfo submitInfo = { + .waitSemaphores = {}, + .commandBuffers = {&cmdbufInfo,1}, + .signalSemaphores = {&signalInfo,1} + }; + + m_api->startCapture(); + queue->submit({ &submitInfo,1 }); + m_api->endCapture(); + } + + // We let all latches know what semaphore and counter value has to be passed for the functors to execute + const ISemaphore::SWaitInfo futureWait = { m_timeline.get(),semaphorValue }; + + // As promised, we can defer an upstreaming buffer deallocation until a fence is signalled + // You can also attach an additional optional IReferenceCounted derived object to hold onto until deallocation. + m_upStreamingBuffer->multi_deallocate(AllocationCount, &inputOffset, &inputSize, futureWait); + + // Now a new and even more advanced usage of the latched events, we make our own refcounted object with a custom destructor and latch that like we did the commandbuffer. + // Instead of making our own and duplicating logic, we'll use one from IUtilities meant for down-staging memory. + // Its nice because it will also remember to invalidate our memory mapping if its not coherent. + auto latchedConsumer = make_smart_refctd_ptr( + IDeviceMemoryAllocation::MemoryRange(outputOffset, outputSize), + // Note the use of capture by-value [=] and not by-reference [&] because this lambda will be called asynchronously whenever the event signals + [=](const size_t dstOffset, const void* bufSrc, const size_t size)->void + { + // The unused variable is used for letting the consumer know the subsection of the output we've managed to download + // But here we're sure we can get the whole thing in one go because we allocated the whole range ourselves. + assert(dstOffset == 0 && size == outputSize); + + std::cout << "Begin array GPU\n"; + scalar_t* const data = reinterpret_cast(const_cast(bufSrc)); + for (auto i = 0u; i < complexElementCount; i++) { + std::cout << "(" << data[2 * i] << ", " << data[2 * i + 1] << "), "; + } + + std::cout << "\nEnd array GPU\n"; + }, + // Its also necessary to hold onto the commandbuffer, even though we take care to not reset the parent pool, because if it + // hits its destructor, our automated reference counting will drop all references to objects used in the recorded commands. + // It could also be latched in the upstreaming deallocate, because its the same fence. + std::move(cmdbuf), m_downStreamingBuffer + ); + // We put a function we want to execute + m_downStreamingBuffer->multi_deallocate(AllocationCount, &outputOffset, &outputSize, futureWait, &latchedConsumer.get()); + + return true; + } + + // One-shot App + bool keepRunning() override { return false; } + + // One-shot App + void workLoopBody() override{} + + // Cleanup + bool onAppTerminated() override + { + // Need to make sure that there are no events outstanding if we want all lambdas to eventually execute before `onAppTerminated` + // (the destructors of the Command Pool Cache and Streaming buffers will still wait for all lambda events to drain) + while (m_downStreamingBuffer->cull_frees()) {} + return device_base_t::onAppTerminated(); + } +}; + + +NBL_MAIN_FUNC(FFT_Test) \ No newline at end of file diff --git a/XX_NewFFT/pipeline.groovy b/XX_NewFFT/pipeline.groovy new file mode 100644 index 000000000..1a7b043a4 --- /dev/null +++ b/XX_NewFFT/pipeline.groovy @@ -0,0 +1,50 @@ +import org.DevshGraphicsProgramming.Agent +import org.DevshGraphicsProgramming.BuilderInfo +import org.DevshGraphicsProgramming.IBuilder + +class CStreamingAndBufferDeviceAddressBuilder extends IBuilder +{ + public CStreamingAndBufferDeviceAddressBuilder(Agent _agent, _info) + { + super(_agent, _info) + } + + @Override + public boolean prepare(Map axisMapping) + { + return true + } + + @Override + public boolean build(Map axisMapping) + { + IBuilder.CONFIGURATION config = axisMapping.get("CONFIGURATION") + IBuilder.BUILD_TYPE buildType = axisMapping.get("BUILD_TYPE") + + def nameOfBuildDirectory = getNameOfBuildDirectory(buildType) + def nameOfConfig = getNameOfConfig(config) + + agent.execute("cmake --build ${info.rootProjectPath}/${nameOfBuildDirectory}/${info.targetProjectPathRelativeToRoot} --target ${info.targetBaseName} --config ${nameOfConfig} -j12 -v") + + return true + } + + @Override + public boolean test(Map axisMapping) + { + return true + } + + @Override + public boolean install(Map axisMapping) + { + return true + } +} + +def create(Agent _agent, _info) +{ + return new CStreamingAndBufferDeviceAddressBuilder(_agent, _info) +} + +return this \ No newline at end of file From 1d787b3725970ae8c706c858fc7b675faff952a1 Mon Sep 17 00:00:00 2001 From: Fletterio Date: Wed, 8 Apr 2026 16:36:35 -0300 Subject: [PATCH 2/8] Subgroup runs --- XX_NewFFT/app_resources/common.hlsl | 14 ++- XX_NewFFT/app_resources/shader.comp.hlsl | 70 ++++++++++++--- XX_NewFFT/main.cpp | 107 +++++++++++++++++++---- 3 files changed, 163 insertions(+), 28 deletions(-) diff --git a/XX_NewFFT/app_resources/common.hlsl b/XX_NewFFT/app_resources/common.hlsl index dc6b96e71..2391f9ee1 100644 --- a/XX_NewFFT/app_resources/common.hlsl +++ b/XX_NewFFT/app_resources/common.hlsl @@ -1,4 +1,5 @@ #include "nbl/builtin/hlsl/cpp_compat.hlsl" +#include "nbl/builtin/hlsl/workgroup2/fft.hlsl" using scalar_t = nbl::hlsl::float32_t; @@ -7,6 +8,13 @@ struct PushConstantData uint64_t deviceBufferAddress; }; -NBL_CONSTEXPR uint32_t WorkgroupSizeLog2 = 6; -NBL_CONSTEXPR uint32_t ElementsPerThreadLog2 = 3; -NBL_CONSTEXPR uint32_t complexElementCount = uint32_t(1) << (WorkgroupSizeLog2 + ElementsPerThreadLog2); \ No newline at end of file +NBL_CONSTEXPR uint32_t WorkgroupSizeLog2 = 5; +NBL_CONSTEXPR uint32_t WorkgroupSize = uint32_t(1) << WorkgroupSizeLog2; +NBL_CONSTEXPR uint32_t SubgroupSizeLog2 = 5; // hardcoded to my nvidia gpu, should be queried at compilation time +NBL_CONSTEXPR uint32_t SubgroupSize = uint32_t(1) << SubgroupSizeLog2; + +NBL_CONSTEXPR uint32_t Radix2ElementsPerInvocationLog2 = 1; +NBL_CONSTEXPR uint32_t ExtraPrimeFactor = uint32_t(1); +NBL_CONSTEXPR uint32_t ElementsPerThread = ExtraPrimeFactor * (uint32_t(1) << Radix2ElementsPerInvocationLog2); + +NBL_CONSTEXPR uint32_t complexElementCount = ElementsPerThread * (uint32_t(1) << WorkgroupSizeLog2); \ No newline at end of file diff --git a/XX_NewFFT/app_resources/shader.comp.hlsl b/XX_NewFFT/app_resources/shader.comp.hlsl index 7c86f50b4..acf35be42 100644 --- a/XX_NewFFT/app_resources/shader.comp.hlsl +++ b/XX_NewFFT/app_resources/shader.comp.hlsl @@ -1,17 +1,20 @@ #include "common.hlsl" -#include "nbl/builtin/hlsl/workgroup/fft.hlsl" +#include "nbl/builtin/hlsl/subgroup2/fft.hlsl" +#include "nbl/builtin/hlsl/workgroup2/fft.hlsl" +#include "nbl/builtin/hlsl/workgroup/basic.hlsl" [[vk::push_constant]] PushConstantData pushConstants; using namespace nbl::hlsl; -using ConstevalParameters = workgroup::fft::ConstevalParameters; +//using ConstevalParameters = workgroup::fft::ConstevalParameters; -groupshared uint32_t sharedmem[ ConstevalParameters::SharedMemoryDWORDs]; +//groupshared uint32_t sharedmem[ ConstevalParameters::SharedMemoryDWORDs]; // Users MUST define this method for FFT to work -uint32_t3 glsl::gl_WorkGroupSize() { return uint32_t3(uint32_t(ConstevalParameters::WorkgroupSize), 1, 1); } +//uint32_t3 glsl::gl_WorkGroupSize() { return uint32_t3(uint32_t(ConstevalParameters::WorkgroupSize), 1, 1); } +/* struct SharedMemoryAccessor { template @@ -32,8 +35,10 @@ struct SharedMemoryAccessor } }; +*/ // Almost a LegacyBdaAccessor, but since we need `uint32_t index` getter and setter it's the same as writing one ourselves + struct Accessor { static Accessor create(const uint64_t address) @@ -59,16 +64,61 @@ struct Accessor uint64_t address; }; -[numthreads(ConstevalParameters::WorkgroupSize,1,1)] + +template +struct InvocationElementsAccessor +{ + float32_t real[Size]; + float32_t imag[Size]; + + void get(uint32_t channel, NBL_REF_ARG(complex_t) value) + { + value.real(real[channel]); + value.imag(imag[channel]); + } + + void set(uint32_t channel, complex_t value) + { + real[channel] = value.real(); + imag[channel] = value.imag(); + } +}; + +//[numthreads(ConstevalParameters::WorkgroupSize,1,1)] +[numthreads(WorkgroupSize, 1, 1)] [shader("compute")] void main(uint32_t3 ID : SV_DispatchThreadID) { + // global mem read write Accessor accessor = Accessor::create(pushConstants.deviceBufferAddress); - SharedMemoryAccessor sharedmemAccessor; + // Load elements into the accessor + InvocationElementsAccessor loAcc; + InvocationElementsAccessor hiAcc; + + using IndexingUtils = workgroup2::FFTIndexingUtils; - // FFT + [unroll] + for (uint32_t pair = 0u; pair < ElementsPerThread / 2; pair++) + { + complex_t lo, hi; + accessor.get(glsl::gl_SubgroupInvocationID() + 2 * pair * SubgroupSize, lo); + loAcc.set(pair, lo); + accessor.get(glsl::gl_SubgroupInvocationID() + (2 * pair + 1) * SubgroupSize, hi); + hiAcc.set(pair, hi); + } + //subgroup2::FFT::__call(0, ElementsPerThread / 2 - 1, loAcc, hiAcc); + //subgroup2::FFT::__call(0, ElementsPerThread / 2 - 1, loAcc, hiAcc); + //subgroup2::FFT::__callInterleaved<1, WorkgroupSize>(WorkgroupSize, 1, 0, ElementsPerThread / 2 - 1, loAcc, hiAcc); + //subgroup2::FFT::__callInterleaved<1, WorkgroupSize>(1, WorkgroupSize, 1, 0, ElementsPerThread / 2 - 1, loAcc, hiAcc); - workgroup::FFT::template __call(accessor, sharedmemAccessor); - sharedmemAccessor.workgroupExecutionAndMemoryBarrier(); - workgroup::FFT::template __call(accessor, sharedmemAccessor); + [unroll] + for (uint32_t pair = 0u; pair < ElementsPerThread / 2; pair++) + { + complex_t lo, hi; + loAcc.get(pair, lo); + accessor.set(glsl::gl_SubgroupInvocationID() + 2 * pair * SubgroupSize, lo); + hiAcc.get(pair, hi); + accessor.set(glsl::gl_SubgroupInvocationID() + (2 * pair + 1) * SubgroupSize, hi); + } + } \ No newline at end of file diff --git a/XX_NewFFT/main.cpp b/XX_NewFFT/main.cpp index 49d157a38..f539e3a5b 100644 --- a/XX_NewFFT/main.cpp +++ b/XX_NewFFT/main.cpp @@ -19,6 +19,74 @@ using namespace nbl::examples; #include "nbl/builtin/hlsl/bit.hlsl" #include "nbl/builtin/hlsl/random/xoroshiro.hlsl" +// Function implemented in workgroup2::FFTIndexingUtils is meant to be fast based on the observation that reordering can still be performed fast in the case of +// a single prime factor. However, we need to test that the implemented fast version matches the real ordering +template +void DIFOrderTester() +{ + using IndexingUtils = nbl::hlsl::workgroup2::FFTIndexingUtils; + using IndexingUtilsHelper = typename IndexingUtils::helper_t; + const uint32_t FFTSize = IndexingUtils::FFTSize; + const uint32_t Radix2FFTSizeLog2 = IndexingUtils::Radix2FFTSizeLog2; + const uint32_t Radix2FFTSize = IndexingUtils::Radix2FFTSize; + + // Check fast div correctness + { + bool correct = true; + for (auto idx = 0u; idx < FFTSize; idx++) + { + if (idx / ExtraPrimeFactor != IndexingUtilsHelper::fastDiv(idx)) correct = false; + } + std::cout << "Fast div test " << (correct ? "passed\n" : "did not pass\n"); + } + + // Check whether the forward ordering is computed properly + { + bool correct = true; + for (auto idx = 0u; idx < FFTSize; idx++) + { + uint32_t fastIdx = IndexingUtilsHelper::mapLaneToFreq(idx); + if constexpr (ExtraPrimeFactor == 1) + { + if (fastIdx != nbl::hlsl::bitReverseAs(idx, Radix2FFTSizeLog2)) correct = false; + } + else + { + uint32_t index = idx; + std::vector digits; + for (auto i = 0u; i < Radix2FFTSizeLog2; i++) + { + digits.push_back(index & 1); + index >>= 1; + } + digits.push_back(index); + // Reconstruct mapping + uint32_t correctIdx = 0; + uint32_t multiplier = ExtraPrimeFactor * Radix2FFTSize; + for (auto i = 0u; i < Radix2FFTSizeLog2; i++) + { + multiplier >>= 1; + correctIdx += multiplier * digits[i]; + } + multiplier /= ExtraPrimeFactor; + correctIdx += multiplier * digits[Radix2FFTSizeLog2]; + if (fastIdx != correctIdx) + correct = false; + } + } + std::cout << "Forward test " << (correct ? "passed\n" : "did not pass\n"); + } + + // Check whether the inverse actually computes the inverse + { + bool correct = true; + for (auto idx = 0; idx < FFTSize; idx++) + { + if (idx != IndexingUtilsHelper::mapFreqToLane(IndexingUtilsHelper::mapLaneToFreq(idx))) correct = false; + } + std::cout << "Inverse test " << (correct ? "passed\n" : "did not pass\n"); + } +} // Simple showcase of how to run FFT on a 1D array class FFT_Test final : public application_templates::MonoDeviceApplication, public BuiltinResourcesApplication @@ -158,6 +226,9 @@ class FFT_Test final : public application_templates::MonoDeviceApplication, publ // note that the API takes a time-point not a duration, because there are multiple waits and preemptions possible, so the durations wouldn't add up properly m_upStreamingBuffer->multi_allocate(waitTill, AllocationCount, &inputOffset, &inputSize, &m_alignment); + // Run DIF Ordering test + DIFOrderTester(); + // Generate our data in-place on the allocated staging buffer. Packing is interleaved in this example! { auto* const inputPtr = reinterpret_cast(reinterpret_cast(m_upStreamingBuffer->getBufferPointer()) + inputOffset); @@ -166,27 +237,25 @@ class FFT_Test final : public application_templates::MonoDeviceApplication, publ { //Random array - //scalar_t x = rng() / scalar_t(nbl::hlsl::numeric_limits::max), y = rng() / scalar_t(nbl::hlsl::numeric_limits::max); + scalar_t x = rng() / scalar_t(nbl::hlsl::numeric_limits::max), y = rng() / scalar_t(nbl::hlsl::numeric_limits::max); + #define DIVIDE + + //FFT( (1,0), (0,0), (0,0),... ) = (1,0), (1,0), (1,0),... - // FFT( (1,0), (0,0), (0,0),... ) = (1,0), (1,0), (1,0),... - - - scalar_t x = j > 0 ? 0.f : 1.f; - scalar_t y = 0; - - + + //scalar_t x = j > 0 ? 0.f : 1.f, y = 0; + // FFT( (c,0), (c,0), (c,0),... ) = (Nc,0), (0,0), (0,0),... - - /* - scalar_t x = 1.f; - scalar_t y = 0.f; - */ + + + //scalar_t x = 1.f, y = 0.f; inputPtr[2 * j] = x; inputPtr[2 * j + 1] = y; std::cout << "(" << x << ", " << y << "), "; } std::cout << "\nEnd array CPU\n"; + // Always remember to flush! if (m_upStreamingBuffer->needsManualFlushOrInvalidate()) { @@ -285,9 +354,17 @@ class FFT_Test final : public application_templates::MonoDeviceApplication, publ std::cout << "Begin array GPU\n"; scalar_t* const data = reinterpret_cast(const_cast(bufSrc)); - for (auto i = 0u; i < complexElementCount; i++) { - std::cout << "(" << data[2 * i] << ", " << data[2 * i + 1] << "), "; + #ifdef DIVIDE + for (auto j = 0; j < complexElementCount; j++) + { + std::cout << "(" << data[2 * j] / complexElementCount << ", " << data[2 * j + 1] / complexElementCount << "), "; + } + #else + for (auto j = 0; j < complexElementCount; j++) + { + std::cout << "(" << data[2 * j] << ", " << data[2 * j + 1] << "), "; } + #endif std::cout << "\nEnd array GPU\n"; }, From d84bcfcb04960d5d69b47fe969e10c676ac8ca86 Mon Sep 17 00:00:00 2001 From: Fletterio Date: Thu, 9 Apr 2026 21:33:59 -0300 Subject: [PATCH 3/8] Working, missing divisionpolicy and proper channel count --- XX_NewFFT/app_resources/common.hlsl | 12 +++-- XX_NewFFT/app_resources/shader.comp.hlsl | 59 ++++++++++++++++++------ XX_NewFFT/main.cpp | 44 +++++++++++++++++- 3 files changed, 97 insertions(+), 18 deletions(-) diff --git a/XX_NewFFT/app_resources/common.hlsl b/XX_NewFFT/app_resources/common.hlsl index 2391f9ee1..4d1f916d9 100644 --- a/XX_NewFFT/app_resources/common.hlsl +++ b/XX_NewFFT/app_resources/common.hlsl @@ -8,13 +8,17 @@ struct PushConstantData uint64_t deviceBufferAddress; }; -NBL_CONSTEXPR uint32_t WorkgroupSizeLog2 = 5; +NBL_CONSTEXPR uint32_t WorkgroupSizeLog2 = 9; NBL_CONSTEXPR uint32_t WorkgroupSize = uint32_t(1) << WorkgroupSizeLog2; NBL_CONSTEXPR uint32_t SubgroupSizeLog2 = 5; // hardcoded to my nvidia gpu, should be queried at compilation time NBL_CONSTEXPR uint32_t SubgroupSize = uint32_t(1) << SubgroupSizeLog2; -NBL_CONSTEXPR uint32_t Radix2ElementsPerInvocationLog2 = 1; -NBL_CONSTEXPR uint32_t ExtraPrimeFactor = uint32_t(1); +NBL_CONSTEXPR uint32_t Radix2ElementsPerInvocationLog2 = 2; +NBL_CONSTEXPR uint32_t ExtraPrimeFactor = uint32_t(5); NBL_CONSTEXPR uint32_t ElementsPerThread = ExtraPrimeFactor * (uint32_t(1) << Radix2ElementsPerInvocationLog2); -NBL_CONSTEXPR uint32_t complexElementCount = ElementsPerThread * (uint32_t(1) << WorkgroupSizeLog2); \ No newline at end of file +NBL_CONSTEXPR uint32_t complexElementCount = ElementsPerThread * (uint32_t(1) << WorkgroupSizeLog2); +NBL_CONSTEXPR uint32_t complexElementCountPerChannel = uint32_t(1) << (WorkgroupSizeLog2 + 1); +NBL_CONSTEXPR uint32_t Channels = ElementsPerThread / 2; + +NBL_CONSTEXPR uint32_t ShuffledChannelsPerRound = nbl::hlsl::mpl::min_v; \ No newline at end of file diff --git a/XX_NewFFT/app_resources/shader.comp.hlsl b/XX_NewFFT/app_resources/shader.comp.hlsl index acf35be42..d9633de33 100644 --- a/XX_NewFFT/app_resources/shader.comp.hlsl +++ b/XX_NewFFT/app_resources/shader.comp.hlsl @@ -9,12 +9,11 @@ using namespace nbl::hlsl; //using ConstevalParameters = workgroup::fft::ConstevalParameters; -//groupshared uint32_t sharedmem[ ConstevalParameters::SharedMemoryDWORDs]; +groupshared uint32_t sharedmem[4 * ((sizeof(complex_t) / sizeof(uint32_t)) << WorkgroupSizeLog2) ]; // Users MUST define this method for FFT to work //uint32_t3 glsl::gl_WorkGroupSize() { return uint32_t3(uint32_t(ConstevalParameters::WorkgroupSize), 1, 1); } -/* struct SharedMemoryAccessor { template @@ -35,7 +34,6 @@ struct SharedMemoryAccessor } }; -*/ // Almost a LegacyBdaAccessor, but since we need `uint32_t index` getter and setter it's the same as writing one ourselves @@ -95,30 +93,65 @@ void main(uint32_t3 ID : SV_DispatchThreadID) InvocationElementsAccessor loAcc; InvocationElementsAccessor hiAcc; - using IndexingUtils = workgroup2::FFTIndexingUtils; + // Set up the memory adaptor + SharedMemoryAccessor sharedmemAccessor; + //using adaptor_t = accessor_adaptors::StructureOfArrays; + //adaptor_t sharedmemAdaptor; + //sharedmemAdaptor.accessor = sharedmemAccessor; + using ConstevalParameters = workgroup2::fft::ConstevalParameters; + using FFT = workgroup2::FFT; + using IFFT = workgroup2::FFT; + + // Invert last channel to ensure ping pong works [unroll] for (uint32_t pair = 0u; pair < ElementsPerThread / 2; pair++) { complex_t lo, hi; - accessor.get(glsl::gl_SubgroupInvocationID() + 2 * pair * SubgroupSize, lo); + accessor.get(uint32_t(workgroup::SubgroupContiguousIndex()) + 2 * pair * WorkgroupSize, lo); loAcc.set(pair, lo); - accessor.get(glsl::gl_SubgroupInvocationID() + (2 * pair + 1) * SubgroupSize, hi); + accessor.get(uint32_t(workgroup::SubgroupContiguousIndex()) + (2 * pair + 1) * WorkgroupSize, hi); hiAcc.set(pair, hi); + //printf("Pair %d is lo: %f, %f hi: %f, %f", pair, lo.real(), lo.imag(), hi.real(), hi.imag()); + //printf("SharedmemSize: %d", 4 * ((sizeof(complex_t) / sizeof(uint32_t)) << WorkgroupSizeLog2)); + //printf("ShuffleRounds: %d", ConstevalParameters::ShuffleRounds); + } + + FFT::__call(loAcc, hiAcc, sharedmemAccessor); + sharedmemAccessor.workgroupExecutionAndMemoryBarrier(); + IFFT::__call(loAcc, hiAcc, sharedmemAccessor); + + [unroll] + for (uint32_t pair = 0u; pair < ElementsPerThread / 2; pair++) + { + complex_t lo, hi; + loAcc.get(pair, lo); + accessor.set(uint32_t(workgroup::SubgroupContiguousIndex()) + 2 * pair * WorkgroupSize, lo); + hiAcc.get(pair, hi); + accessor.set(uint32_t(workgroup::SubgroupContiguousIndex()) + (2 * pair + 1) * WorkgroupSize, hi); } - //subgroup2::FFT::__call(0, ElementsPerThread / 2 - 1, loAcc, hiAcc); - //subgroup2::FFT::__call(0, ElementsPerThread / 2 - 1, loAcc, hiAcc); - //subgroup2::FFT::__callInterleaved<1, WorkgroupSize>(WorkgroupSize, 1, 0, ElementsPerThread / 2 - 1, loAcc, hiAcc); - //subgroup2::FFT::__callInterleaved<1, WorkgroupSize>(1, WorkgroupSize, 1, 0, ElementsPerThread / 2 - 1, loAcc, hiAcc); + + /* + [unroll] + for (uint32_t pair = 0u; pair < ElementsPerThread / 2; pair++) + { + complex_t lo, hi; + accessor.get(uint32_t(workgroup::SubgroupContiguousIndex()) + 2 * pair * WorkgroupSize, lo); + loAcc.set(pair, lo); + accessor.get(uint32_t(workgroup::SubgroupContiguousIndex()) + (2 * pair + 1) * WorkgroupSize, hi); + hiAcc.set(pair, hi); + } + + Exchanger::__call(0, Channels - 1, loAcc, hiAcc, TestStride, sharedmemAdaptor, false); [unroll] for (uint32_t pair = 0u; pair < ElementsPerThread / 2; pair++) { complex_t lo, hi; loAcc.get(pair, lo); - accessor.set(glsl::gl_SubgroupInvocationID() + 2 * pair * SubgroupSize, lo); + accessor.set(uint32_t(workgroup::SubgroupContiguousIndex()) + 2 * pair * WorkgroupSize, lo); hiAcc.get(pair, hi); - accessor.set(glsl::gl_SubgroupInvocationID() + (2 * pair + 1) * SubgroupSize, hi); + accessor.set(uint32_t(workgroup::SubgroupContiguousIndex()) + (2 * pair + 1) * WorkgroupSize, hi); } - + */ } \ No newline at end of file diff --git a/XX_NewFFT/main.cpp b/XX_NewFFT/main.cpp index f539e3a5b..3089caf82 100644 --- a/XX_NewFFT/main.cpp +++ b/XX_NewFFT/main.cpp @@ -31,6 +31,7 @@ void DIFOrderTester() const uint32_t Radix2FFTSize = IndexingUtils::Radix2FFTSize; // Check fast div correctness + if constexpr (ExtraPrimeFactor > 1) { bool correct = true; for (auto idx = 0u; idx < FFTSize; idx++) @@ -233,6 +234,34 @@ class FFT_Test final : public application_templates::MonoDeviceApplication, publ { auto* const inputPtr = reinterpret_cast(reinterpret_cast(m_upStreamingBuffer->getBufferPointer()) + inputOffset); std::cout << "Begin array CPU\n"; + for (auto channel = 0; channel < Channels; channel++) + { + std::cout << "Begin channel " << channel << "\n"; + for (auto j = 0; j < complexElementCountPerChannel; j++) + { + //Random array + + //scalar_t x = rng() / scalar_t(nbl::hlsl::numeric_limits::max), y = rng() / scalar_t(nbl::hlsl::numeric_limits::max); + //#define DIVIDE + + + //FFT( (1,0), (0,0), (0,0),... ) = (1,0), (1,0), (1,0),... + + + scalar_t x = j > 0 ? 0.f : 1.f, y = 0; + + // FFT( (c,0), (c,0), (c,0),... ) = (Nc,0), (0,0), (0,0),... + + + //scalar_t x = 1.f, y = 0.f; + + inputPtr[2 * complexElementCountPerChannel * channel + 2 * j] = x; + inputPtr[2 * complexElementCountPerChannel * channel + 2 * j + 1] = y; + std::cout << "(" << x << ", " << y << "), "; + } + std::cout << "\nEnd channel " << channel << "\n"; + } + /* for (auto j = 0; j < complexElementCount; j++) { //Random array @@ -254,6 +283,7 @@ class FFT_Test final : public application_templates::MonoDeviceApplication, publ inputPtr[2 * j + 1] = y; std::cout << "(" << x << ", " << y << "), "; } + */ std::cout << "\nEnd array CPU\n"; // Always remember to flush! @@ -354,6 +384,17 @@ class FFT_Test final : public application_templates::MonoDeviceApplication, publ std::cout << "Begin array GPU\n"; scalar_t* const data = reinterpret_cast(const_cast(bufSrc)); + for (auto channel = 0; channel < Channels; channel++) + { + std::cout << "Begin channel " << channel << "\n"; + for (auto j = 0; j < complexElementCountPerChannel; j++) + { + std::cout << "(" << data[2 * complexElementCountPerChannel * channel + 2 * j] << ", " << data[2 * complexElementCountPerChannel * channel + 2 * j + 1] << "), "; + } + std::cout << "\nEnd channel " << channel << "\n"; + } + + /* #ifdef DIVIDE for (auto j = 0; j < complexElementCount; j++) { @@ -364,7 +405,8 @@ class FFT_Test final : public application_templates::MonoDeviceApplication, publ { std::cout << "(" << data[2 * j] << ", " << data[2 * j + 1] << "), "; } - #endif + #endif + */ std::cout << "\nEnd array GPU\n"; }, From 2e6ea2fb6f573c7c89414e3c60b14df034d8c822 Mon Sep 17 00:00:00 2001 From: Fletterio Date: Tue, 14 Apr 2026 17:10:54 -0300 Subject: [PATCH 4/8] Division policy and twiddle opts still missing. Ran into DXC and Nvidia compiler bugs --- XX_NewFFT/app_resources/common.hlsl | 4 ++-- XX_NewFFT/app_resources/shader.comp.hlsl | 4 ++-- XX_NewFFT/main.cpp | 16 ++++++++++++++++ 3 files changed, 20 insertions(+), 4 deletions(-) diff --git a/XX_NewFFT/app_resources/common.hlsl b/XX_NewFFT/app_resources/common.hlsl index 4d1f916d9..8272894ac 100644 --- a/XX_NewFFT/app_resources/common.hlsl +++ b/XX_NewFFT/app_resources/common.hlsl @@ -13,8 +13,8 @@ NBL_CONSTEXPR uint32_t WorkgroupSize = uint32_t(1) << WorkgroupSizeLog2; NBL_CONSTEXPR uint32_t SubgroupSizeLog2 = 5; // hardcoded to my nvidia gpu, should be queried at compilation time NBL_CONSTEXPR uint32_t SubgroupSize = uint32_t(1) << SubgroupSizeLog2; -NBL_CONSTEXPR uint32_t Radix2ElementsPerInvocationLog2 = 2; -NBL_CONSTEXPR uint32_t ExtraPrimeFactor = uint32_t(5); +NBL_CONSTEXPR uint32_t Radix2ElementsPerInvocationLog2 = 1; +NBL_CONSTEXPR uint32_t ExtraPrimeFactor = uint32_t(1); NBL_CONSTEXPR uint32_t ElementsPerThread = ExtraPrimeFactor * (uint32_t(1) << Radix2ElementsPerInvocationLog2); NBL_CONSTEXPR uint32_t complexElementCount = ElementsPerThread * (uint32_t(1) << WorkgroupSizeLog2); diff --git a/XX_NewFFT/app_resources/shader.comp.hlsl b/XX_NewFFT/app_resources/shader.comp.hlsl index d9633de33..90a45c8a0 100644 --- a/XX_NewFFT/app_resources/shader.comp.hlsl +++ b/XX_NewFFT/app_resources/shader.comp.hlsl @@ -118,8 +118,8 @@ void main(uint32_t3 ID : SV_DispatchThreadID) } FFT::__call(loAcc, hiAcc, sharedmemAccessor); - sharedmemAccessor.workgroupExecutionAndMemoryBarrier(); - IFFT::__call(loAcc, hiAcc, sharedmemAccessor); + //sharedmemAccessor.workgroupExecutionAndMemoryBarrier(); + //IFFT::__call(loAcc, hiAcc, sharedmemAccessor); [unroll] for (uint32_t pair = 0u; pair < ElementsPerThread / 2; pair++) diff --git a/XX_NewFFT/main.cpp b/XX_NewFFT/main.cpp index 3089caf82..b2f8f8db8 100644 --- a/XX_NewFFT/main.cpp +++ b/XX_NewFFT/main.cpp @@ -120,6 +120,22 @@ class FFT_Test final : public application_templates::MonoDeviceApplication, publ FFT_Test(const path& _localInputCWD, const path& _localOutputCWD, const path& _sharedInputCWD, const path& _sharedOutputCWD) : system::IApplicationFramework(_localInputCWD, _localOutputCWD, _sharedInputCWD, _sharedOutputCWD) {} + inline core::smart_refctd_ptr createShader(const std::string& includeMainName) + { + auto HLSLShader = core::make_smart_refctd_ptr(("#include \"" + includeMainName + "\"\n").c_str(), + IShader::E_CONTENT_TYPE::ECT_HLSL, + includeMainName); + assert(HLSLShader); + +#ifndef _NBL_DEBUG + ISPIRVOptimizer::E_OPTIMIZER_PASS optPasses = ISPIRVOptimizer::EOP_STRIP_DEBUG_INFO; + auto opt = make_smart_refctd_ptr(std::span(&optPasses, 1)); + return m_device->compileShader({ HLSLShader.get(), opt.get() }); +#else + return m_device->compileShader({ HLSLShader.get() }); +#endif + } + // we stuff all our work here because its a "single shot" app bool onAppInitialized(smart_refctd_ptr&& system) override { From accdf13e75b2158544ef27b17919e190a96d6fa9 Mon Sep 17 00:00:00 2001 From: Fletterio Date: Wed, 22 Apr 2026 00:01:38 -0300 Subject: [PATCH 5/8] No major changes --- XX_NewFFT/app_resources/shader.comp.hlsl | 27 -------- XX_NewFFT/main.cpp | 78 +++++++++++++++++------- 2 files changed, 56 insertions(+), 49 deletions(-) diff --git a/XX_NewFFT/app_resources/shader.comp.hlsl b/XX_NewFFT/app_resources/shader.comp.hlsl index 90a45c8a0..18f46ab75 100644 --- a/XX_NewFFT/app_resources/shader.comp.hlsl +++ b/XX_NewFFT/app_resources/shader.comp.hlsl @@ -11,9 +11,6 @@ using namespace nbl::hlsl; groupshared uint32_t sharedmem[4 * ((sizeof(complex_t) / sizeof(uint32_t)) << WorkgroupSizeLog2) ]; -// Users MUST define this method for FFT to work -//uint32_t3 glsl::gl_WorkGroupSize() { return uint32_t3(uint32_t(ConstevalParameters::WorkgroupSize), 1, 1); } - struct SharedMemoryAccessor { template @@ -130,28 +127,4 @@ void main(uint32_t3 ID : SV_DispatchThreadID) hiAcc.get(pair, hi); accessor.set(uint32_t(workgroup::SubgroupContiguousIndex()) + (2 * pair + 1) * WorkgroupSize, hi); } - - /* - [unroll] - for (uint32_t pair = 0u; pair < ElementsPerThread / 2; pair++) - { - complex_t lo, hi; - accessor.get(uint32_t(workgroup::SubgroupContiguousIndex()) + 2 * pair * WorkgroupSize, lo); - loAcc.set(pair, lo); - accessor.get(uint32_t(workgroup::SubgroupContiguousIndex()) + (2 * pair + 1) * WorkgroupSize, hi); - hiAcc.set(pair, hi); - } - - Exchanger::__call(0, Channels - 1, loAcc, hiAcc, TestStride, sharedmemAdaptor, false); - - [unroll] - for (uint32_t pair = 0u; pair < ElementsPerThread / 2; pair++) - { - complex_t lo, hi; - loAcc.get(pair, lo); - accessor.set(uint32_t(workgroup::SubgroupContiguousIndex()) + 2 * pair * WorkgroupSize, lo); - hiAcc.get(pair, hi); - accessor.set(uint32_t(workgroup::SubgroupContiguousIndex()) + (2 * pair + 1) * WorkgroupSize, hi); - } - */ } \ No newline at end of file diff --git a/XX_NewFFT/main.cpp b/XX_NewFFT/main.cpp index b2f8f8db8..c61b354eb 100644 --- a/XX_NewFFT/main.cpp +++ b/XX_NewFFT/main.cpp @@ -120,20 +120,66 @@ class FFT_Test final : public application_templates::MonoDeviceApplication, publ FFT_Test(const path& _localInputCWD, const path& _localOutputCWD, const path& _sharedInputCWD, const path& _sharedOutputCWD) : system::IApplicationFramework(_localInputCWD, _localOutputCWD, _sharedInputCWD, _sharedOutputCWD) {} - inline core::smart_refctd_ptr createShader(const std::string& includeMainName) + inline core::smart_refctd_ptr createShader(const char* includeMainName) { - auto HLSLShader = core::make_smart_refctd_ptr(("#include \"" + includeMainName + "\"\n").c_str(), + auto HLSLShader = core::make_smart_refctd_ptr((std::string("#include \"") + includeMainName + "\"\n").c_str(), IShader::E_CONTENT_TYPE::ECT_HLSL, includeMainName); assert(HLSLShader); + ILogicalDevice::SShaderCreationParameters shaderCreationParams{ .source = HLSLShader.get(), + .preprocessedOutputPath = (localOutputCWD / "preprocessed.hlsl").string(), + .spvOutputPath = (localOutputCWD / "out.spv").string() }; #ifndef _NBL_DEBUG ISPIRVOptimizer::E_OPTIMIZER_PASS optPasses = ISPIRVOptimizer::EOP_STRIP_DEBUG_INFO; auto opt = make_smart_refctd_ptr(std::span(&optPasses, 1)); - return m_device->compileShader({ HLSLShader.get(), opt.get() }); -#else - return m_device->compileShader({ HLSLShader.get() }); + shaderCreationParams.optimizer = opt.get(); + shaderCreationParams.optimizerIsExtraPasses = true; #endif + return m_device->compileShader(shaderCreationParams); + } + + // useful for debugging compiler issues + inline core::smart_refctd_ptr createSpirvShader(const char* spirvName) + { + core::smart_refctd_ptr shaderBuffer; + { + core::smart_refctd_ptr shaderReadFile; + system::ISystem::future_t> future; + m_system->createFile(future, spirvName, system::IFile::ECF_READ); + if (future.wait()) + { + future.acquire().move_into(shaderReadFile); + if (shaderReadFile) + { + const size_t size = shaderReadFile->getSize(); + if (size > 0ull) + { + asset::IBuffer::SCreationParams bufferCreationParams{ .size = size }; + asset::ICPUBuffer::SCreationParams foo; + foo = bufferCreationParams; + shaderBuffer = ICPUBuffer::create(std::move(foo)); + system::IFile::success_t succ; + shaderReadFile->read(succ, shaderBuffer->getPointer(), 0, size); + if (!succ) + m_logger->log("Failed Reading From Shader File.", ILogger::ELL_ERROR); + } + } + else + { + m_logger->log("Failed Opening Shader File.", ILogger::ELL_ERROR); + } + } + else + { + m_logger->log("Failed Opening Shader Cache File.", ILogger::ELL_ERROR); + } + } + + auto SPIRVShader = core::make_smart_refctd_ptr(std::move(shaderBuffer), IShader::E_CONTENT_TYPE::ECT_SPIRV, spirvName); + assert(SPIRVShader); + + return m_device->compileShader({ .source = SPIRVShader.get() }); } // we stuff all our work here because its a "single shot" app @@ -145,23 +191,11 @@ class FFT_Test final : public application_templates::MonoDeviceApplication, publ if (!asset_base_t::onAppInitialized(std::move(system))) return false; - smart_refctd_ptr shader; - { - IAssetLoader::SAssetLoadParams lp = {}; - lp.logger = m_logger.get(); - lp.workingDirectory = "app_resources"; // virtual root - auto key = nbl::this_example::builtin::build::get_spirv_key<"shader">(m_device.get()); - auto assetBundle = m_assetMgr->getAsset(key.data(), lp); - const auto assets = assetBundle.getContents(); - if (assets.empty()) - return logFail("Could not load shader!"); - - // Cast down the asset to its proper type - shader = IAsset::castDown(assets[0]); - - if (!shader) - return logFail("Invalid shader!"); - } + smart_refctd_ptr shader = createShader("app_resources/shader.comp.hlsl"); + // DEBUG + //smart_refctd_ptr shader = createSpirvShader("app_resources/optimized.spv"); + if (!shader) + return logFail("Invalid shader!"); // Create massive upload/download buffers constexpr uint32_t DownstreamBufferSize = sizeof(scalar_t) << 23; From f466d64c96374cb9c80769c664c7aba99f228a69 Mon Sep 17 00:00:00 2001 From: Fletterio Date: Tue, 5 May 2026 00:52:54 -0300 Subject: [PATCH 6/8] Halfway added subgroup shared twiddles --- XX_NewFFT/app_resources/shader.comp.hlsl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/XX_NewFFT/app_resources/shader.comp.hlsl b/XX_NewFFT/app_resources/shader.comp.hlsl index 18f46ab75..5663395a8 100644 --- a/XX_NewFFT/app_resources/shader.comp.hlsl +++ b/XX_NewFFT/app_resources/shader.comp.hlsl @@ -96,7 +96,7 @@ void main(uint32_t3 ID : SV_DispatchThreadID) //adaptor_t sharedmemAdaptor; //sharedmemAdaptor.accessor = sharedmemAccessor; - using ConstevalParameters = workgroup2::fft::ConstevalParameters; + using ConstevalParameters = workgroup2::fft::ConstevalParameters; using FFT = workgroup2::FFT; using IFFT = workgroup2::FFT; From a69d8e8d5ab45d6bc58986dafa777539689517f1 Mon Sep 17 00:00:00 2001 From: Fletterio Date: Tue, 5 May 2026 20:19:56 -0300 Subject: [PATCH 7/8] Implementing channels --- XX_NewFFT/app_resources/common.hlsl | 4 +++- XX_NewFFT/app_resources/shader.comp.hlsl | 6 +++--- 2 files changed, 6 insertions(+), 4 deletions(-) diff --git a/XX_NewFFT/app_resources/common.hlsl b/XX_NewFFT/app_resources/common.hlsl index 8272894ac..39d12f994 100644 --- a/XX_NewFFT/app_resources/common.hlsl +++ b/XX_NewFFT/app_resources/common.hlsl @@ -21,4 +21,6 @@ NBL_CONSTEXPR uint32_t complexElementCount = ElementsPerThread * (uint32_t(1) << NBL_CONSTEXPR uint32_t complexElementCountPerChannel = uint32_t(1) << (WorkgroupSizeLog2 + 1); NBL_CONSTEXPR uint32_t Channels = ElementsPerThread / 2; -NBL_CONSTEXPR uint32_t ShuffledChannelsPerRound = nbl::hlsl::mpl::min_v; \ No newline at end of file +NBL_CONSTEXPR uint32_t ShuffledChannelsPerRound = nbl::hlsl::mpl::min_v; + +NBL_CONSTEXPR bool ShareTwiddles = true; \ No newline at end of file diff --git a/XX_NewFFT/app_resources/shader.comp.hlsl b/XX_NewFFT/app_resources/shader.comp.hlsl index 5663395a8..9933591fb 100644 --- a/XX_NewFFT/app_resources/shader.comp.hlsl +++ b/XX_NewFFT/app_resources/shader.comp.hlsl @@ -96,7 +96,7 @@ void main(uint32_t3 ID : SV_DispatchThreadID) //adaptor_t sharedmemAdaptor; //sharedmemAdaptor.accessor = sharedmemAccessor; - using ConstevalParameters = workgroup2::fft::ConstevalParameters; + using ConstevalParameters = workgroup2::fft::ConstevalParameters; using FFT = workgroup2::FFT; using IFFT = workgroup2::FFT; @@ -115,8 +115,8 @@ void main(uint32_t3 ID : SV_DispatchThreadID) } FFT::__call(loAcc, hiAcc, sharedmemAccessor); - //sharedmemAccessor.workgroupExecutionAndMemoryBarrier(); - //IFFT::__call(loAcc, hiAcc, sharedmemAccessor); + sharedmemAccessor.workgroupExecutionAndMemoryBarrier(); + IFFT::__call(loAcc, hiAcc, sharedmemAccessor); [unroll] for (uint32_t pair = 0u; pair < ElementsPerThread / 2; pair++) From 1d6aae3f97998e36199645a6b08b76e39189b615 Mon Sep 17 00:00:00 2001 From: Fletterio Date: Wed, 6 May 2026 18:07:43 -0300 Subject: [PATCH 8/8] Channels added, lacks support on cpp side for further examples --- XX_NewFFT/app_resources/common.hlsl | 17 ++++++----- XX_NewFFT/app_resources/shader.comp.hlsl | 38 +++++++++++++----------- XX_NewFFT/main.cpp | 2 +- 3 files changed, 32 insertions(+), 25 deletions(-) diff --git a/XX_NewFFT/app_resources/common.hlsl b/XX_NewFFT/app_resources/common.hlsl index 39d12f994..8cd0cf44f 100644 --- a/XX_NewFFT/app_resources/common.hlsl +++ b/XX_NewFFT/app_resources/common.hlsl @@ -13,14 +13,17 @@ NBL_CONSTEXPR uint32_t WorkgroupSize = uint32_t(1) << WorkgroupSizeLog2; NBL_CONSTEXPR uint32_t SubgroupSizeLog2 = 5; // hardcoded to my nvidia gpu, should be queried at compilation time NBL_CONSTEXPR uint32_t SubgroupSize = uint32_t(1) << SubgroupSizeLog2; -NBL_CONSTEXPR uint32_t Radix2ElementsPerInvocationLog2 = 1; +NBL_CONSTEXPR uint32_t Radix2ElementsPerInvocationPerChannelLog2 = 1; NBL_CONSTEXPR uint32_t ExtraPrimeFactor = uint32_t(1); -NBL_CONSTEXPR uint32_t ElementsPerThread = ExtraPrimeFactor * (uint32_t(1) << Radix2ElementsPerInvocationLog2); +NBL_CONSTEXPR uint32_t ElementsPerInvocationPerChannel = ExtraPrimeFactor * (uint32_t(1) << Radix2ElementsPerInvocationPerChannelLog2); -NBL_CONSTEXPR uint32_t complexElementCount = ElementsPerThread * (uint32_t(1) << WorkgroupSizeLog2); -NBL_CONSTEXPR uint32_t complexElementCountPerChannel = uint32_t(1) << (WorkgroupSizeLog2 + 1); -NBL_CONSTEXPR uint32_t Channels = ElementsPerThread / 2; +NBL_CONSTEXPR uint32_t Channels = 1; +NBL_CONSTEXPR uint32_t complexElementCountPerChannel = ElementsPerInvocationPerChannel * (uint32_t(1) << WorkgroupSizeLog2); +NBL_CONSTEXPR uint32_t complexElementCount = Channels * complexElementCountPerChannel; -NBL_CONSTEXPR uint32_t ShuffledChannelsPerRound = nbl::hlsl::mpl::min_v; +NBL_CONSTEXPR uint16_t InnerVirtualChannels = Channels * (ElementsPerInvocationPerChannel >> 1); +NBL_CONSTEXPR uint32_t ShuffledVirtualChannelsPerRound = nbl::hlsl::mpl::min_v; -NBL_CONSTEXPR bool ShareTwiddles = true; \ No newline at end of file +NBL_CONSTEXPR bool ShareTwiddles = true; + +using ConstevalParameters = workgroup2::fft::ConstevalParameters; \ No newline at end of file diff --git a/XX_NewFFT/app_resources/shader.comp.hlsl b/XX_NewFFT/app_resources/shader.comp.hlsl index 9933591fb..c1db53f30 100644 --- a/XX_NewFFT/app_resources/shader.comp.hlsl +++ b/XX_NewFFT/app_resources/shader.comp.hlsl @@ -9,7 +9,7 @@ using namespace nbl::hlsl; //using ConstevalParameters = workgroup::fft::ConstevalParameters; -groupshared uint32_t sharedmem[4 * ((sizeof(complex_t) / sizeof(uint32_t)) << WorkgroupSizeLog2) ]; +groupshared uint32_t sharedmem[4 * ((sizeof(complex_t) / sizeof(uint32_t)) << WorkgroupSizeLog2) ]; struct SharedMemoryAccessor { @@ -60,25 +60,28 @@ struct Accessor }; -template +template struct InvocationElementsAccessor { - float32_t real[Size]; - float32_t imag[Size]; + scalar_t real[Channels][Size]; + scalar_t imag[Channels][Size]; - void get(uint32_t channel, NBL_REF_ARG(complex_t) value) + void get(uint32_t channel, uint32_t pair, NBL_REF_ARG(complex_t) value) { - value.real(real[channel]); - value.imag(imag[channel]); + value.real(real[channel][pair]); + value.imag(imag[channel][pair]); } - void set(uint32_t channel, complex_t value) + void set(uint32_t channel, uint32_t pair, NBL_CONST_REF_ARG(complex_t) value) { - real[channel] = value.real(); - imag[channel] = value.imag(); + real[channel][pair] = value.real(); + imag[channel][pair] = value.imag(); } }; +using _InvocationElementsAccessor = InvocationElementsAccessor; +using ElementsAccessorAdaptor = workgroup2::fft::WorkgroupRadix2AccessorAdaptor; + //[numthreads(ConstevalParameters::WorkgroupSize,1,1)] [numthreads(WorkgroupSize, 1, 1)] [shader("compute")] @@ -87,8 +90,10 @@ void main(uint32_t3 ID : SV_DispatchThreadID) // global mem read write Accessor accessor = Accessor::create(pushConstants.deviceBufferAddress); // Load elements into the accessor - InvocationElementsAccessor loAcc; - InvocationElementsAccessor hiAcc; + _InvocationElementsAccessor loElementAccessor; + ElementsAccessorAdaptor loAcc = ElementsAccessorAdaptor::create(loElementAccessor); + _InvocationElementsAccessor hiElementAccessor; + ElementsAccessorAdaptor hiAcc = ElementsAccessorAdaptor::create(hiElementAccessor); // Set up the memory adaptor SharedMemoryAccessor sharedmemAccessor; @@ -96,13 +101,12 @@ void main(uint32_t3 ID : SV_DispatchThreadID) //adaptor_t sharedmemAdaptor; //sharedmemAdaptor.accessor = sharedmemAccessor; - using ConstevalParameters = workgroup2::fft::ConstevalParameters; - using FFT = workgroup2::FFT; - using IFFT = workgroup2::FFT; + using FFT = workgroup2::impl::InnerFFT; + using IFFT = workgroup2::impl::InnerFFT; // Invert last channel to ensure ping pong works [unroll] - for (uint32_t pair = 0u; pair < ElementsPerThread / 2; pair++) + for (uint32_t pair = 0u; pair < ElementsPerInvocationPerChannel / 2; pair++) { complex_t lo, hi; accessor.get(uint32_t(workgroup::SubgroupContiguousIndex()) + 2 * pair * WorkgroupSize, lo); @@ -119,7 +123,7 @@ void main(uint32_t3 ID : SV_DispatchThreadID) IFFT::__call(loAcc, hiAcc, sharedmemAccessor); [unroll] - for (uint32_t pair = 0u; pair < ElementsPerThread / 2; pair++) + for (uint32_t pair = 0u; pair < ElementsPerInvocationPerChannel / 2; pair++) { complex_t lo, hi; loAcc.get(pair, lo); diff --git a/XX_NewFFT/main.cpp b/XX_NewFFT/main.cpp index c61b354eb..087117f47 100644 --- a/XX_NewFFT/main.cpp +++ b/XX_NewFFT/main.cpp @@ -278,7 +278,7 @@ class FFT_Test final : public application_templates::MonoDeviceApplication, publ m_upStreamingBuffer->multi_allocate(waitTill, AllocationCount, &inputOffset, &inputSize, &m_alignment); // Run DIF Ordering test - DIFOrderTester(); + DIFOrderTester(); // Generate our data in-place on the allocated staging buffer. Packing is interleaved in this example! {