From a6337f84afd36d24a40b7d391515707df1e2a0ac Mon Sep 17 00:00:00 2001
From: Fletterio <fj.letterio@gmail.com>
Date: Tue, 24 Mar 2026 21:38:17 -0300
Subject: [PATCH 1/8] Add new FFT example

---
 CMakeLists.txt                           |   1 +
 XX_NewFFT/CMakeLists.txt                 |  62 +++++
 XX_NewFFT/app_resources/common.hlsl      |  12 +
 XX_NewFFT/app_resources/shader.comp.hlsl |  74 ++++++
 XX_NewFFT/config.json.template           |  28 ++
 XX_NewFFT/main.cpp                       | 322 +++++++++++++++++++++++
 XX_NewFFT/pipeline.groovy                |  50 ++++
 7 files changed, 549 insertions(+)
 create mode 100644 XX_NewFFT/CMakeLists.txt
 create mode 100644 XX_NewFFT/app_resources/common.hlsl
 create mode 100644 XX_NewFFT/app_resources/shader.comp.hlsl
 create mode 100644 XX_NewFFT/config.json.template
 create mode 100644 XX_NewFFT/main.cpp
 create mode 100644 XX_NewFFT/pipeline.groovy

diff --git a/CMakeLists.txt b/CMakeLists.txt
index a93a86a4f..b3f986b46 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -106,6 +106,7 @@ if(NBL_BUILD_EXAMPLES)
   	add_subdirectory(70_FLIPFluids)
 	add_subdirectory(71_RayTracingPipeline)
 	add_subdirectory(72_CooperativeBinarySearch)
+	add_subdirectory(XX_NewFFT)
 
 	if (NBL_BUILD_MITSUBA_LOADER)
 		add_subdirectory(73_GeometryInspector)
diff --git a/XX_NewFFT/CMakeLists.txt b/XX_NewFFT/CMakeLists.txt
new file mode 100644
index 000000000..6b6304ed8
--- /dev/null
+++ b/XX_NewFFT/CMakeLists.txt
@@ -0,0 +1,62 @@
+include(common RESULT_VARIABLE RES)
+if(NOT RES)
+	message(FATAL_ERROR "common.cmake not found. Should be in {repo_root}/cmake directory")
+endif()
+
+nbl_create_executable_project("" "" "" "" "${NBL_EXECUTABLE_PROJECT_CREATION_PCH_TARGET}")
+
+if(NBL_EMBED_BUILTIN_RESOURCES)
+	set(_BR_TARGET_ ${EXECUTABLE_NAME}_builtinResourceData)
+	set(RESOURCE_DIR "app_resources")
+
+	get_filename_component(_SEARCH_DIRECTORIES_ "${CMAKE_CURRENT_SOURCE_DIR}" ABSOLUTE)
+	get_filename_component(_OUTPUT_DIRECTORY_SOURCE_ "${CMAKE_CURRENT_BINARY_DIR}/src" ABSOLUTE)
+	get_filename_component(_OUTPUT_DIRECTORY_HEADER_ "${CMAKE_CURRENT_BINARY_DIR}/include" ABSOLUTE)
+
+    file(GLOB_RECURSE BUILTIN_RESOURCE_FILES RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}/${RESOURCE_DIR}" CONFIGURE_DEPENDS "${CMAKE_CURRENT_SOURCE_DIR}/${RESOURCE_DIR}/*")
+    foreach(RES_FILE ${BUILTIN_RESOURCE_FILES})
+      LIST_BUILTIN_RESOURCE(RESOURCES_TO_EMBED "${RES_FILE}")
+    endforeach()
+
+	ADD_CUSTOM_BUILTIN_RESOURCES(${_BR_TARGET_} RESOURCES_TO_EMBED "${_SEARCH_DIRECTORIES_}" "${RESOURCE_DIR}" "nbl::this_example::builtin" "${_OUTPUT_DIRECTORY_HEADER_}" "${_OUTPUT_DIRECTORY_SOURCE_}")
+
+	LINK_BUILTIN_RESOURCES_TO_TARGET(${EXECUTABLE_NAME} ${_BR_TARGET_})
+endif()
+
+set(OUTPUT_DIRECTORY "${CMAKE_CURRENT_BINARY_DIR}/auto-gen")
+
+set(SM 6_8)
+set(JSON [=[
+[
+    {
+        "INPUT": "app_resources/shader.comp.hlsl",
+        "KEY": "shader",
+    }
+]
+]=])
+string(CONFIGURE "${JSON}" JSON)
+
+set(COMPILE_OPTIONS
+    -I "${CMAKE_CURRENT_SOURCE_DIR}"
+    -T lib_${SM}
+)
+
+NBL_CREATE_NSC_COMPILE_RULES(
+    TARGET ${EXECUTABLE_NAME}SPIRV
+    LINK_TO ${EXECUTABLE_NAME}
+    BINARY_DIR ${OUTPUT_DIRECTORY}
+    MOUNT_POINT_DEFINE NBL_THIS_EXAMPLE_BUILD_MOUNT_POINT
+    COMMON_OPTIONS ${COMPILE_OPTIONS}
+    OUTPUT_VAR KEYS
+    INCLUDE nbl/this_example/builtin/build/spirv/keys.hpp
+    NAMESPACE nbl::this_example::builtin::build
+    INPUTS ${JSON}
+)
+
+NBL_CREATE_RESOURCE_ARCHIVE(
+    NAMESPACE nbl::this_example::builtin::build
+    TARGET ${EXECUTABLE_NAME}_builtinsBuild
+    LINK_TO ${EXECUTABLE_NAME}
+    BIND ${OUTPUT_DIRECTORY}
+    BUILTINS ${KEYS}
+)
diff --git a/XX_NewFFT/app_resources/common.hlsl b/XX_NewFFT/app_resources/common.hlsl
new file mode 100644
index 000000000..dc6b96e71
--- /dev/null
+++ b/XX_NewFFT/app_resources/common.hlsl
@@ -0,0 +1,12 @@
+#include "nbl/builtin/hlsl/cpp_compat.hlsl"
+
+using scalar_t = nbl::hlsl::float32_t;
+
+struct PushConstantData
+{
+	uint64_t deviceBufferAddress;
+};
+
+NBL_CONSTEXPR uint32_t WorkgroupSizeLog2 = 6;
+NBL_CONSTEXPR uint32_t ElementsPerThreadLog2 = 3;
+NBL_CONSTEXPR uint32_t complexElementCount = uint32_t(1) << (WorkgroupSizeLog2 + ElementsPerThreadLog2);
\ No newline at end of file
diff --git a/XX_NewFFT/app_resources/shader.comp.hlsl b/XX_NewFFT/app_resources/shader.comp.hlsl
new file mode 100644
index 000000000..7c86f50b4
--- /dev/null
+++ b/XX_NewFFT/app_resources/shader.comp.hlsl
@@ -0,0 +1,74 @@
+#include "common.hlsl"
+#include "nbl/builtin/hlsl/workgroup/fft.hlsl"
+
+[[vk::push_constant]] PushConstantData pushConstants;
+
+using namespace nbl::hlsl;
+
+using ConstevalParameters = workgroup::fft::ConstevalParameters<ElementsPerThreadLog2, WorkgroupSizeLog2, scalar_t>;
+
+groupshared uint32_t sharedmem[ ConstevalParameters::SharedMemoryDWORDs];
+
+// Users MUST define this method for FFT to work
+uint32_t3 glsl::gl_WorkGroupSize() { return uint32_t3(uint32_t(ConstevalParameters::WorkgroupSize), 1, 1); }
+
+struct SharedMemoryAccessor 
+{
+	template <typename AccessType, typename IndexType>
+	void set(IndexType idx, AccessType value)
+	{
+		sharedmem[idx] = value;
+	}
+
+	template <typename AccessType, typename IndexType>
+	void get(IndexType idx, NBL_REF_ARG(AccessType) value)
+	{
+		value = sharedmem[idx];
+	}
+
+	void workgroupExecutionAndMemoryBarrier()
+	{
+		glsl::barrier();
+	}
+
+};
+
+// Almost a LegacyBdaAccessor, but since we need `uint32_t index` getter and setter it's the same as writing one ourselves
+struct Accessor
+{
+	static Accessor create(const uint64_t address)
+    {
+        Accessor accessor;
+        accessor.address = address;
+        return accessor;
+    }
+
+	// TODO: can't use our own BDA yet, because it doesn't support the types `workgroup::FFT` will invoke these templates with
+	template <typename AccessType, typename IndexType>
+	void get(const IndexType index, NBL_REF_ARG(AccessType) value)
+	{
+		value = vk::RawBufferLoad<AccessType>(address + index * sizeof(AccessType));
+	}
+
+	template <typename AccessType, typename IndexType>
+	void set(const IndexType index, const AccessType value)
+	{
+		vk::RawBufferStore<AccessType>(address + index * sizeof(AccessType), value);
+	}
+
+	uint64_t address;
+};
+
+[numthreads(ConstevalParameters::WorkgroupSize,1,1)]
+[shader("compute")]
+void main(uint32_t3 ID : SV_DispatchThreadID)
+{
+	Accessor accessor = Accessor::create(pushConstants.deviceBufferAddress);
+	SharedMemoryAccessor sharedmemAccessor;
+
+	// FFT
+
+	workgroup::FFT<false, ConstevalParameters>::template __call<Accessor, SharedMemoryAccessor>(accessor, sharedmemAccessor);
+	sharedmemAccessor.workgroupExecutionAndMemoryBarrier();
+	workgroup::FFT<true, ConstevalParameters>::template __call<Accessor, SharedMemoryAccessor>(accessor, sharedmemAccessor);	
+}
\ No newline at end of file
diff --git a/XX_NewFFT/config.json.template b/XX_NewFFT/config.json.template
new file mode 100644
index 000000000..717d05d53
--- /dev/null
+++ b/XX_NewFFT/config.json.template
@@ -0,0 +1,28 @@
+{
+  "enableParallelBuild": true,
+  "threadsPerBuildProcess" : 2,
+  "isExecuted": false,
+  "scriptPath": "",
+  "cmake": {
+    "configurations": [ "Release", "Debug", "RelWithDebInfo" ],
+    "buildModes": [],
+    "requiredOptions": []
+  }, 
+  "profiles": [
+    {
+      "backend": "vulkan", // should be none
+      "platform": "windows",
+      "buildModes": [],
+      "runConfiguration": "Release", // we also need to run in Debug and RWDI because foundational example
+      "gpuArchitectures": []
+    }
+  ],
+  "dependencies": [],
+  "data": [
+    {
+      "dependencies": [],
+      "command": [""],
+      "outputs": []
+    }
+  ]
+}
\ No newline at end of file
diff --git a/XX_NewFFT/main.cpp b/XX_NewFFT/main.cpp
new file mode 100644
index 000000000..49d157a38
--- /dev/null
+++ b/XX_NewFFT/main.cpp
@@ -0,0 +1,322 @@
+// Copyright (C) 2018-2024 - DevSH Graphics Programming Sp. z O.O.
+// This file is part of the "Nabla Engine".
+// For conditions of distribution and use, see copyright notice in nabla.h
+
+#include "nbl/this_example/builtin/build/spirv/keys.hpp"
+
+#include "nbl/examples/examples.hpp"
+
+using namespace nbl;
+using namespace nbl::core;
+using namespace nbl::hlsl;
+using namespace nbl::system;
+using namespace nbl::asset;
+using namespace nbl::ui;
+using namespace nbl::video;
+using namespace nbl::examples;
+
+#include "app_resources/common.hlsl"
+#include "nbl/builtin/hlsl/bit.hlsl"
+#include "nbl/builtin/hlsl/random/xoroshiro.hlsl"
+
+
+// Simple showcase of how to run FFT on a 1D array
+class FFT_Test final : public application_templates::MonoDeviceApplication, public BuiltinResourcesApplication
+{
+	using device_base_t = application_templates::MonoDeviceApplication;
+	using asset_base_t = BuiltinResourcesApplication;
+
+	smart_refctd_ptr<IGPUComputePipeline> m_pipeline;
+
+	smart_refctd_ptr<nbl::video::IUtilities> m_utils;
+
+	nbl::video::StreamingTransientDataBufferMT<>* m_upStreamingBuffer;
+	StreamingTransientDataBufferMT<>* m_downStreamingBuffer;
+	smart_refctd_ptr<nbl::video::IGPUBuffer> m_deviceLocalBuffer;
+
+	// These are Buffer Device Addresses
+	uint64_t m_upStreamingBufferAddress;
+	uint64_t m_downStreamingBufferAddress;
+	uint64_t m_deviceLocalBufferAddress;
+
+	// You can ask the `nbl::core::GeneralpurposeAddressAllocator` used internally by the Streaming Buffers to give out offsets aligned to a certain multiple (not only Power of Two!)
+	uint32_t m_alignment;
+
+	// This example really lets the advantages of a timeline semaphore shine through!
+	smart_refctd_ptr<ISemaphore> m_timeline;
+	uint64_t semaphorValue = 0;
+
+public:
+	// Yay thanks to multiple inheritance we cannot forward ctors anymore
+	FFT_Test(const path& _localInputCWD, const path& _localOutputCWD, const path& _sharedInputCWD, const path& _sharedOutputCWD) :
+		system::IApplicationFramework(_localInputCWD, _localOutputCWD, _sharedInputCWD, _sharedOutputCWD) {}
+
+	// we stuff all our work here because its a "single shot" app
+	bool onAppInitialized(smart_refctd_ptr<ISystem>&& system) override
+	{
+		// Remember to call the base class initialization!
+		if (!device_base_t::onAppInitialized(smart_refctd_ptr(system)))
+			return false;
+		if (!asset_base_t::onAppInitialized(std::move(system)))
+			return false;
+
+		smart_refctd_ptr<IShader> shader;
+		{
+			IAssetLoader::SAssetLoadParams lp = {};
+			lp.logger = m_logger.get();
+			lp.workingDirectory = "app_resources"; // virtual root
+			auto key = nbl::this_example::builtin::build::get_spirv_key<"shader">(m_device.get());
+			auto assetBundle = m_assetMgr->getAsset(key.data(), lp);
+			const auto assets = assetBundle.getContents();
+			if (assets.empty())
+				return logFail("Could not load shader!");
+
+			// Cast down the asset to its proper type
+			shader = IAsset::castDown<IShader>(assets[0]);
+			
+			if (!shader)
+				return logFail("Invalid shader!");
+		}
+
+		// Create massive upload/download buffers
+		constexpr uint32_t DownstreamBufferSize = sizeof(scalar_t) << 23;
+		constexpr uint32_t UpstreamBufferSize = sizeof(scalar_t) << 23;
+
+		m_utils = IUtilities::create(smart_refctd_ptr(m_device), smart_refctd_ptr(m_logger), DownstreamBufferSize, UpstreamBufferSize);
+		if (!m_utils)
+			return logFail("Failed to create Utilities!");
+		m_upStreamingBuffer = m_utils->getDefaultUpStreamingBuffer();
+		m_downStreamingBuffer = m_utils->getDefaultDownStreamingBuffer();
+		m_upStreamingBufferAddress = m_upStreamingBuffer->getBuffer()->getDeviceAddress();
+		m_downStreamingBufferAddress = m_downStreamingBuffer->getBuffer()->getDeviceAddress();
+
+		// Create device-local buffer
+		{
+			const uint32_t scalarElementCount = 2 * complexElementCount;
+			IQueue* const queue = getComputeQueue();
+			uint32_t queueFamilyIndex = queue->getFamilyIndex();
+			IGPUBuffer::SCreationParams deviceLocalBufferParams = {};
+			deviceLocalBufferParams.queueFamilyIndexCount = 1;
+			deviceLocalBufferParams.queueFamilyIndices = &queueFamilyIndex;
+			deviceLocalBufferParams.size = sizeof(scalar_t) * scalarElementCount;
+			deviceLocalBufferParams.usage = nbl::asset::IBuffer::E_USAGE_FLAGS::EUF_TRANSFER_SRC_BIT | nbl::asset::IBuffer::E_USAGE_FLAGS::EUF_TRANSFER_DST_BIT | nbl::asset::IBuffer::E_USAGE_FLAGS::EUF_SHADER_DEVICE_ADDRESS_BIT;
+			m_deviceLocalBuffer = m_device->createBuffer(std::move(deviceLocalBufferParams));
+			if (!m_deviceLocalBuffer) // the buffer is dereferenced right below, so bail out before touching a null pointer
+				return logFail("Failed to create the device-local buffer!\n");
+			auto mreqs = m_deviceLocalBuffer->getMemoryReqs();
+			mreqs.memoryTypeBits &= m_device->getPhysicalDevice()->getDeviceLocalMemoryTypeBits();
+			auto gpubufMem = m_device->allocate(mreqs, m_deviceLocalBuffer.get(), IDeviceMemoryAllocation::E_MEMORY_ALLOCATE_FLAGS::EMAF_DEVICE_ADDRESS_BIT);
+			if (!gpubufMem.isValid()) // without bound memory the device address queried below is meaningless
+				return logFail("Failed to allocate device memory for the device-local buffer!\n");
+			m_deviceLocalBufferAddress = m_deviceLocalBuffer.get()->getDeviceAddress();
+		}
+		
+		const nbl::asset::SPushConstantRange pcRange = { .stageFlags = IShader::E_SHADER_STAGE::ESS_COMPUTE,.offset = 0,.size = sizeof(PushConstantData) };
+
+		{
+			auto layout = m_device->createPipelineLayout({ &pcRange,1 });
+			IGPUComputePipeline::SCreationParams params = {};
+			params.layout = layout.get();
+			params.shader.shader = shader.get();
+			params.shader.entryPoint = "main";
+			params.shader.requiredSubgroupSize = static_cast<IPipelineBase::SUBGROUP_SIZE>(hlsl::findMSB(m_physicalDevice->getLimits().maxSubgroupSize));
+			params.cached.requireFullSubgroups = true;
+			if (!m_device->createComputePipelines(nullptr, { &params,1 }, &m_pipeline))
+				return logFail("Failed to create compute pipeline!\n");
+		}
+
+		const auto& deviceLimits = m_device->getPhysicalDevice()->getLimits();
+		// The ranges of non-coherent mapped memory you flush or invalidate need to be aligned. You'll often see a value of 64 reported by devices
+		// which just happens to coincide with a CPU cache line size. So we ask our streaming buffers during allocation to give us properly aligned offsets.
+		// Sidenote: For SSBOs, UBOs, BufferViews, Vertex Buffer Bindings, Acceleration Structure BDAs, Shader Binding Tables, Descriptor Buffers, etc.
+		// there is also a requirement to bind buffers at offsets which have a certain alignment. Memory binding to Buffers and Images also has those.
+		// We'll align to max of coherent atom size even if the memory is coherent,
+		// and we also need to take into account BDA shader loads need to be aligned to the type being loaded.
+		m_alignment = core::max(deviceLimits.nonCoherentAtomSize, alignof(float));
+
+		// Semaphore used here to know the FFT is done before download
+		m_timeline = m_device->createSemaphore(semaphorValue);
+
+		IQueue* const queue = getComputeQueue();
+
+		// Note that I'm using the sample struct with methods that have identical code which compiles as both C++ and HLSL
+		auto rng = nbl::hlsl::Xoroshiro64StarStar::construct({ semaphorValue ^ 0xdeadbeefu,std::hash<string>()(_NBL_APP_NAME_) });
+
+		const uint32_t scalarElementCount = 2 * complexElementCount;
+		const uint32_t inputSize = sizeof(scalar_t) * scalarElementCount;
+
+		// Just need a single suballocation in this example
+		const uint32_t AllocationCount = 1;
+
+		// It comes with a certain drawback that you need to remember to initialize your "yet unallocated" offsets to the Invalid value
+		// this is to allow a set of allocations to fail, and you to re-try after doing something to free up space without repacking args.
+		auto inputOffset = m_upStreamingBuffer->invalid_value;
+
+		// We always just wait till an allocation becomes possible (during allocation previous "latched" frees get their latch conditions polled)
+		// Freeing of Streaming Buffer Allocations can and should be deferred until an associated polled event signals done (more on that later).
+		std::chrono::steady_clock::time_point waitTill(std::chrono::years(45));
+		// note that the API takes a time-point not a duration, because there are multiple waits and preemptions possible, so the durations wouldn't add up properly
+		m_upStreamingBuffer->multi_allocate(waitTill, AllocationCount, &inputOffset, &inputSize, &m_alignment);
+
+		// Generate our data in-place on the allocated staging buffer. Packing is interleaved in this example!
+		{
+			auto* const inputPtr = reinterpret_cast<scalar_t*>(reinterpret_cast<uint8_t*>(m_upStreamingBuffer->getBufferPointer()) + inputOffset);
+			std::cout << "Begin array CPU\n";
+			for (auto j = 0; j < complexElementCount; j++)
+			{
+				//Random array
+
+				//scalar_t x = rng() / scalar_t(nbl::hlsl::numeric_limits<decltype(rng())>::max), y = rng() / scalar_t(nbl::hlsl::numeric_limits<decltype(rng())>::max);
+
+				// FFT( (1,0), (0,0), (0,0),... ) = (1,0), (1,0), (1,0),...
+				
+				
+				scalar_t x = j > 0 ? 0.f : 1.f;
+				scalar_t y = 0;
+				
+				
+				// FFT( (c,0), (c,0), (c,0),... ) = (Nc,0), (0,0), (0,0),...
+				
+				/*
+				scalar_t x = 1.f;
+				scalar_t y = 0.f;
+				*/
+
+				inputPtr[2 * j] = x;
+				inputPtr[2 * j + 1] = y;
+				std::cout << "(" << x << ", " << y << "), ";
+			}
+			std::cout << "\nEnd array CPU\n";
+			// Always remember to flush!
+			if (m_upStreamingBuffer->needsManualFlushOrInvalidate())
+			{
+				const auto bound = m_upStreamingBuffer->getBuffer()->getBoundMemory();
+				const ILogicalDevice::MappedMemoryRange range(bound.memory, bound.offset + inputOffset, inputSize);
+				m_device->flushMappedMemoryRanges(1, &range);
+			}
+		}
+
+		// finally allocate our output range
+		const uint32_t outputSize = inputSize;
+
+		auto outputOffset = m_downStreamingBuffer->invalid_value;
+		m_downStreamingBuffer->multi_allocate(waitTill, AllocationCount, &outputOffset, &outputSize, &m_alignment);
+
+		smart_refctd_ptr<IGPUCommandBuffer> cmdbuf;
+		{
+			smart_refctd_ptr<nbl::video::IGPUCommandPool> cmdpool = m_device->createCommandPool(queue->getFamilyIndex(), IGPUCommandPool::CREATE_FLAGS::TRANSIENT_BIT);
+			if (!cmdpool->createCommandBuffers(IGPUCommandPool::BUFFER_LEVEL::PRIMARY, 1u, &cmdbuf))
+				return logFail("Failed to create Command Buffers!\n");
+			cmdbuf->begin(IGPUCommandBuffer::USAGE::ONE_TIME_SUBMIT_BIT);
+			cmdbuf->bindComputePipeline(m_pipeline.get());
+			// Upload: the generated input sits at `inputOffset` inside the upstreaming buffer, which need not be 0
+			IGPUCommandBuffer::SBufferCopy copyInfo = {};
+			copyInfo.srcOffset = inputOffset;
+			copyInfo.dstOffset = 0;
+			copyInfo.size = m_deviceLocalBuffer->getSize();
+			cmdbuf->copyBuffer(m_upStreamingBuffer->getBuffer(), m_deviceLocalBuffer.get(), 1, &copyInfo);
+			IGPUCommandBuffer::SPipelineBarrierDependencyInfo pipelineBarrierInfo = {};
+			decltype(pipelineBarrierInfo)::buffer_barrier_t barrier = {};
+			pipelineBarrierInfo.bufBarriers = { &barrier, 1u };
+			barrier.range.buffer = m_deviceLocalBuffer;
+			barrier.barrier.dep.srcAccessMask = ACCESS_FLAGS::MEMORY_WRITE_BITS;
+			barrier.barrier.dep.dstAccessMask = ACCESS_FLAGS::MEMORY_READ_BITS;
+			// First dependency: the upload copy must complete before the FFT shader reads the device-local buffer
+			barrier.barrier.dep.srcStageMask = PIPELINE_STAGE_FLAGS::COPY_BIT;
+			barrier.barrier.dep.dstStageMask = PIPELINE_STAGE_FLAGS::COMPUTE_SHADER_BIT;
+			cmdbuf->pipelineBarrier(asset::E_DEPENDENCY_FLAGS(0), pipelineBarrierInfo);
+			// This is the new fun part, pushing constants
+			const PushConstantData pc = {.deviceBufferAddress = m_deviceLocalBufferAddress};
+			cmdbuf->pushConstants(m_pipeline->getLayout(), IShader::E_SHADER_STAGE::ESS_COMPUTE, 0u, sizeof(pc), &pc);
+			// Remember we do a single workgroup per 1D array in these parts
+			cmdbuf->dispatch(1, 1, 1);
+			// Second dependency: wait for FFT shader to be done before copying to downstream buffer (access masks stay write -> read)
+			barrier.barrier.dep.srcStageMask = PIPELINE_STAGE_FLAGS::COMPUTE_SHADER_BIT;
+			barrier.barrier.dep.dstStageMask = PIPELINE_STAGE_FLAGS::COPY_BIT;
+			cmdbuf->pipelineBarrier(asset::E_DEPENDENCY_FLAGS(0), pipelineBarrierInfo);
+			// Download: write the result at `outputOffset`, the start of the range allocated from the downstreaming buffer
+			copyInfo.srcOffset = 0;
+			copyInfo.dstOffset = outputOffset;
+			cmdbuf->copyBuffer(m_deviceLocalBuffer.get(), m_downStreamingBuffer->getBuffer(), 1, &copyInfo);
+			cmdbuf->end();
+		}
+
+		semaphorValue++;
+		{
+			const IQueue::SSubmitInfo::SCommandBufferInfo cmdbufInfo =
+			{
+				.cmdbuf = cmdbuf.get()
+			};
+			// The last recorded command is the copy into the downstreaming buffer, so the signal must cover the copy stage
+			const IQueue::SSubmitInfo::SSemaphoreInfo signalInfo =
+			{
+				.semaphore = m_timeline.get(),
+				.value = semaphorValue,
+				.stageMask = asset::PIPELINE_STAGE_FLAGS::COPY_BIT
+			};
+			const IQueue::SSubmitInfo submitInfo = {
+				.waitSemaphores = {},
+				.commandBuffers = {&cmdbufInfo,1},
+				.signalSemaphores = {&signalInfo,1}
+			};
+
+			m_api->startCapture();
+			queue->submit({ &submitInfo,1 });
+			m_api->endCapture();
+		}
+
+		// We let all latches know what semaphore and counter value has to be passed for the functors to execute
+		const ISemaphore::SWaitInfo futureWait = { m_timeline.get(),semaphorValue };
+
+		// As promised, we can defer an upstreaming buffer deallocation until a fence is signalled
+		// You can also attach an additional optional IReferenceCounted derived object to hold onto until deallocation.
+		m_upStreamingBuffer->multi_deallocate(AllocationCount, &inputOffset, &inputSize, futureWait);
+
+		// Now a new and even more advanced usage of the latched events, we make our own refcounted object with a custom destructor and latch that like we did the commandbuffer.
+		// Instead of making our own and duplicating logic, we'll use one from IUtilities meant for down-staging memory.
+		// Its nice because it will also remember to invalidate our memory mapping if its not coherent.
+		auto latchedConsumer = make_smart_refctd_ptr<IUtilities::CDownstreamingDataConsumer>(
+			IDeviceMemoryAllocation::MemoryRange(outputOffset, outputSize),
+			// Note the use of capture by-value [=] and not by-reference [&] because this lambda will be called asynchronously whenever the event signals
+			[=](const size_t dstOffset, const void* bufSrc, const size_t size)->void
+			{
+				// The unused variable is used for letting the consumer know the subsection of the output we've managed to download
+				// But here we're sure we can get the whole thing in one go because we allocated the whole range ourselves.
+				assert(dstOffset == 0 && size == outputSize);
+
+				std::cout << "Begin array GPU\n";
+				scalar_t* const data = reinterpret_cast<scalar_t*>(const_cast<void*>(bufSrc));
+				for (auto i = 0u; i < complexElementCount; i++) {
+					std::cout << "(" << data[2 * i] << ", " << data[2 * i + 1] << "), ";
+				}
+
+				std::cout << "\nEnd array GPU\n";
+			},
+			// Its also necessary to hold onto the commandbuffer, even though we take care to not reset the parent pool, because if it
+			// hits its destructor, our automated reference counting will drop all references to objects used in the recorded commands.
+			// It could also be latched in the upstreaming deallocate, because its the same fence.
+			std::move(cmdbuf), m_downStreamingBuffer
+		);
+		// We put a function we want to execute 
+		m_downStreamingBuffer->multi_deallocate(AllocationCount, &outputOffset, &outputSize, futureWait, &latchedConsumer.get());
+
+		return true;
+	}
+
+	// One-shot App
+	bool keepRunning() override { return false; }
+
+	// One-shot App
+	void workLoopBody() override{}
+
+	// Cleanup
+	bool onAppTerminated() override
+	{
+		// Need to make sure that there are no events outstanding if we want all lambdas to eventually execute before `onAppTerminated`
+		// (the destructors of the Command Pool Cache and Streaming buffers will still wait for all lambda events to drain)
+		while (m_downStreamingBuffer->cull_frees()) {}
+		return device_base_t::onAppTerminated();
+	}
+};
+
+
+NBL_MAIN_FUNC(FFT_Test)
\ No newline at end of file
diff --git a/XX_NewFFT/pipeline.groovy b/XX_NewFFT/pipeline.groovy
new file mode 100644
index 000000000..1a7b043a4
--- /dev/null
+++ b/XX_NewFFT/pipeline.groovy
@@ -0,0 +1,50 @@
+import org.DevshGraphicsProgramming.Agent
+import org.DevshGraphicsProgramming.BuilderInfo
+import org.DevshGraphicsProgramming.IBuilder
+
+class CStreamingAndBufferDeviceAddressBuilder extends IBuilder
+{
+	public CStreamingAndBufferDeviceAddressBuilder(Agent _agent, _info)
+	{
+		super(_agent, _info)
+	}
+	
+	@Override
+	public boolean prepare(Map axisMapping)
+	{
+		return true
+	}
+	
+	@Override
+  	public boolean build(Map axisMapping)
+	{
+		IBuilder.CONFIGURATION config = axisMapping.get("CONFIGURATION")
+		IBuilder.BUILD_TYPE buildType = axisMapping.get("BUILD_TYPE")
+		
+		def nameOfBuildDirectory = getNameOfBuildDirectory(buildType)
+		def nameOfConfig = getNameOfConfig(config)
+		
+		agent.execute("cmake --build ${info.rootProjectPath}/${nameOfBuildDirectory}/${info.targetProjectPathRelativeToRoot} --target ${info.targetBaseName} --config ${nameOfConfig} -j12 -v")
+		
+		return true
+	}
+	
+	@Override
+  	public boolean test(Map axisMapping)
+	{
+		return true
+	}
+	
+	@Override
+	public boolean install(Map axisMapping)
+	{
+		return true
+	}
+}
+
+def create(Agent _agent, _info)
+{
+	return new CStreamingAndBufferDeviceAddressBuilder(_agent, _info)
+}
+
+return this
\ No newline at end of file

From 1d787b3725970ae8c706c858fc7b675faff952a1 Mon Sep 17 00:00:00 2001
From: Fletterio <fj.letterio@gmail.com>
Date: Wed, 8 Apr 2026 16:36:35 -0300
Subject: [PATCH 2/8] Subgroup runs

---
 XX_NewFFT/app_resources/common.hlsl      |  14 ++-
 XX_NewFFT/app_resources/shader.comp.hlsl |  70 ++++++++++++---
 XX_NewFFT/main.cpp                       | 107 +++++++++++++++++++----
 3 files changed, 163 insertions(+), 28 deletions(-)

diff --git a/XX_NewFFT/app_resources/common.hlsl b/XX_NewFFT/app_resources/common.hlsl
index dc6b96e71..2391f9ee1 100644
--- a/XX_NewFFT/app_resources/common.hlsl
+++ b/XX_NewFFT/app_resources/common.hlsl
@@ -1,4 +1,5 @@
 #include "nbl/builtin/hlsl/cpp_compat.hlsl"
+#include "nbl/builtin/hlsl/workgroup2/fft.hlsl"
 
 using scalar_t = nbl::hlsl::float32_t;
 
@@ -7,6 +8,13 @@ struct PushConstantData
 	uint64_t deviceBufferAddress;
 };
 
-NBL_CONSTEXPR uint32_t WorkgroupSizeLog2 = 6;
-NBL_CONSTEXPR uint32_t ElementsPerThreadLog2 = 3;
-NBL_CONSTEXPR uint32_t complexElementCount = uint32_t(1) << (WorkgroupSizeLog2 + ElementsPerThreadLog2);
\ No newline at end of file
+NBL_CONSTEXPR uint32_t WorkgroupSizeLog2 = 5;
+NBL_CONSTEXPR uint32_t WorkgroupSize = uint32_t(1) << WorkgroupSizeLog2;
+NBL_CONSTEXPR uint32_t SubgroupSizeLog2 = 5; // hardcoded to my nvidia gpu, should be queried at compilation time
+NBL_CONSTEXPR uint32_t SubgroupSize = uint32_t(1) << SubgroupSizeLog2;
+
+NBL_CONSTEXPR uint32_t Radix2ElementsPerInvocationLog2 = 1;
+NBL_CONSTEXPR uint32_t ExtraPrimeFactor = uint32_t(1);
+NBL_CONSTEXPR uint32_t ElementsPerThread = ExtraPrimeFactor * (uint32_t(1) << Radix2ElementsPerInvocationLog2);
+
+NBL_CONSTEXPR uint32_t complexElementCount = ElementsPerThread * (uint32_t(1) << WorkgroupSizeLog2);
\ No newline at end of file
diff --git a/XX_NewFFT/app_resources/shader.comp.hlsl b/XX_NewFFT/app_resources/shader.comp.hlsl
index 7c86f50b4..acf35be42 100644
--- a/XX_NewFFT/app_resources/shader.comp.hlsl
+++ b/XX_NewFFT/app_resources/shader.comp.hlsl
@@ -1,17 +1,20 @@
 #include "common.hlsl"
-#include "nbl/builtin/hlsl/workgroup/fft.hlsl"
+#include "nbl/builtin/hlsl/subgroup2/fft.hlsl"
+#include "nbl/builtin/hlsl/workgroup2/fft.hlsl"
+#include "nbl/builtin/hlsl/workgroup/basic.hlsl"
 
 [[vk::push_constant]] PushConstantData pushConstants;
 
 using namespace nbl::hlsl;
 
-using ConstevalParameters = workgroup::fft::ConstevalParameters<ElementsPerThreadLog2, WorkgroupSizeLog2, scalar_t>;
+//using ConstevalParameters = workgroup::fft::ConstevalParameters<ElementsPerThreadLog2, WorkgroupSizeLog2, scalar_t>;
 
-groupshared uint32_t sharedmem[ ConstevalParameters::SharedMemoryDWORDs];
+//groupshared uint32_t sharedmem[ ConstevalParameters::SharedMemoryDWORDs];
 
 // Users MUST define this method for FFT to work
-uint32_t3 glsl::gl_WorkGroupSize() { return uint32_t3(uint32_t(ConstevalParameters::WorkgroupSize), 1, 1); }
+//uint32_t3 glsl::gl_WorkGroupSize() { return uint32_t3(uint32_t(ConstevalParameters::WorkgroupSize), 1, 1); }
 
+/*
 struct SharedMemoryAccessor 
 {
 	template <typename AccessType, typename IndexType>
@@ -32,8 +35,10 @@ struct SharedMemoryAccessor
 	}
 
 };
+*/
 
 // Almost a LegacyBdaAccessor, but since we need `uint32_t index` getter and setter it's the same as writing one ourselves
+
 struct Accessor
 {
 	static Accessor create(const uint64_t address)
@@ -59,16 +64,61 @@ struct Accessor
 	uint64_t address;
 };
 
-[numthreads(ConstevalParameters::WorkgroupSize,1,1)]
+
+template<uint16_t Size>
+struct InvocationElementsAccessor
+{
+	float32_t real[Size];
+	float32_t imag[Size];
+
+	void get(uint32_t channel, NBL_REF_ARG(complex_t<float32_t>) value)
+	{
+		value.real(real[channel]);
+		value.imag(imag[channel]);
+	}
+
+	void set(uint32_t channel, complex_t<float32_t> value)
+	{
+		real[channel] = value.real();
+		imag[channel] = value.imag();
+	}
+};
+
+//[numthreads(ConstevalParameters::WorkgroupSize,1,1)]
+[numthreads(WorkgroupSize, 1, 1)]
 [shader("compute")]
 void main(uint32_t3 ID : SV_DispatchThreadID)
 {
+	// global mem read write
 	Accessor accessor = Accessor::create(pushConstants.deviceBufferAddress);
-	SharedMemoryAccessor sharedmemAccessor;
+	// Load elements into the accessor
+	InvocationElementsAccessor<ElementsPerThread / 2> loAcc;
+	InvocationElementsAccessor<ElementsPerThread / 2> hiAcc;
+
+	using IndexingUtils = workgroup2::FFTIndexingUtils<Radix2ElementsPerInvocationLog2, WorkgroupSizeLog2, ExtraPrimeFactor>;
 
-	// FFT
+	[unroll]
+	for (uint32_t pair = 0u; pair < ElementsPerThread / 2; pair++)
+	{
+		complex_t<float32_t> lo, hi;
+		accessor.get(glsl::gl_SubgroupInvocationID() + 2 * pair * SubgroupSize, lo);
+		loAcc.set(pair, lo);
+		accessor.get(glsl::gl_SubgroupInvocationID() + (2 * pair + 1) * SubgroupSize, hi);
+		hiAcc.set(pair, hi);
+	}
+	//subgroup2::FFT<SubgroupSize, true, float32_t>::__call(0, ElementsPerThread / 2 - 1, loAcc, hiAcc);
+	//subgroup2::FFT<SubgroupSize, false, float32_t>::__call(0, ElementsPerThread / 2 - 1, loAcc, hiAcc);
+	//subgroup2::FFT<SubgroupSize, false, float32_t>::__callInterleaved<1, WorkgroupSize>(WorkgroupSize, 1, 0, ElementsPerThread / 2 - 1, loAcc, hiAcc);
+	//subgroup2::FFT<SubgroupSize, true, float32_t>::__callInterleaved<1, WorkgroupSize>(1, WorkgroupSize, 1, 0, ElementsPerThread / 2 - 1, loAcc, hiAcc);
 
-	workgroup::FFT<false, ConstevalParameters>::template __call<Accessor, SharedMemoryAccessor>(accessor, sharedmemAccessor);
-	sharedmemAccessor.workgroupExecutionAndMemoryBarrier();
-	workgroup::FFT<true, ConstevalParameters>::template __call<Accessor, SharedMemoryAccessor>(accessor, sharedmemAccessor);	
+	[unroll]
+	for (uint32_t pair = 0u; pair < ElementsPerThread / 2; pair++)
+	{
+		complex_t<float32_t> lo, hi;
+		loAcc.get(pair, lo);
+		accessor.set(glsl::gl_SubgroupInvocationID() + 2 * pair * SubgroupSize, lo);
+		hiAcc.get(pair, hi);
+		accessor.set(glsl::gl_SubgroupInvocationID() + (2 * pair + 1) * SubgroupSize, hi);
+	}
+	
 }
\ No newline at end of file
diff --git a/XX_NewFFT/main.cpp b/XX_NewFFT/main.cpp
index 49d157a38..f539e3a5b 100644
--- a/XX_NewFFT/main.cpp
+++ b/XX_NewFFT/main.cpp
@@ -19,6 +19,74 @@ using namespace nbl::examples;
 #include "nbl/builtin/hlsl/bit.hlsl"
 #include "nbl/builtin/hlsl/random/xoroshiro.hlsl"
 
+// workgroup2::FFTIndexingUtils maps lanes to frequencies with a fast path, based on the observation that the reordering can still be computed
+// cheaply when there is a single extra prime factor. This tester checks on the CPU that the fast version matches a straightforward reference ordering
+template<uint32_t Radix2ElementsPerInvocationLog2, uint32_t WorkgroupSizeLog2, uint32_t ExtraPrimeFactor>
+void DIFOrderTester()
+{
+	using IndexingUtils = nbl::hlsl::workgroup2::FFTIndexingUtils<Radix2ElementsPerInvocationLog2, WorkgroupSizeLog2, ExtraPrimeFactor>;
+	using IndexingUtilsHelper = typename IndexingUtils::helper_t;
+	const uint32_t FFTSize = IndexingUtils::FFTSize;
+	const uint32_t Radix2FFTSizeLog2 = IndexingUtils::Radix2FFTSizeLog2;
+	const uint32_t Radix2FFTSize = IndexingUtils::Radix2FFTSize;
+
+	// Check fast div correctness
+	{
+		bool correct = true;
+		for (auto idx = 0u; idx < FFTSize; idx++)
+		{
+			if (idx / ExtraPrimeFactor != IndexingUtilsHelper::fastDiv(idx)) correct = false;
+		}
+		std::cout << "Fast div test " << (correct ? "passed\n" : "did not pass\n");
+	}
+
+	// Check the fast lane->frequency mapping against a reference ordering computed digit by digit
+	{
+		bool correct = true;
+		for (auto idx = 0u; idx < FFTSize; idx++)
+		{
+			uint32_t fastIdx = IndexingUtilsHelper::mapLaneToFreq(idx);
+			if constexpr (ExtraPrimeFactor == 1) // no extra factor: the reference is a plain bit reversal
+			{
+				if (fastIdx != nbl::hlsl::bitReverseAs<uint32_t>(idx, Radix2FFTSizeLog2)) correct = false;
+			}
+			else
+			{
+				uint32_t index = idx;
+				std::vector<uint32_t> digits; // digits of idx, least significant first
+				for (auto i = 0u; i < Radix2FFTSizeLog2; i++)
+				{
+					digits.push_back(index & 1);
+					index >>= 1;
+				}
+				digits.push_back(index); // whatever remains above the radix-2 bits is the final digit
+				// Reconstruct mapping: the least significant digit receives the largest weight, i.e. the digit order is reversed
+				uint32_t correctIdx = 0;
+				uint32_t multiplier = ExtraPrimeFactor * Radix2FFTSize;
+				for (auto i = 0u; i < Radix2FFTSizeLog2; i++)
+				{
+					multiplier >>= 1;
+					correctIdx += multiplier * digits[i];
+				}
+				multiplier /= ExtraPrimeFactor;
+				correctIdx += multiplier * digits[Radix2FFTSizeLog2];
+				if (fastIdx != correctIdx)
+					correct = false;
+			}
+		}
+		std::cout << "Forward test " << (correct ? "passed\n" : "did not pass\n");
+	}
+
+	// Check that mapFreqToLane undoes mapLaneToFreq for every index
+	{
+		bool correct = true;
+		for (auto idx = 0u; idx < FFTSize; idx++) // unsigned like the loops above, avoids a signed/unsigned comparison
+		{
+			if (idx != IndexingUtilsHelper::mapFreqToLane(IndexingUtilsHelper::mapLaneToFreq(idx))) correct = false;
+		}
+		std::cout << "Inverse test " << (correct ? "passed\n" : "did not pass\n");
+	}
+}
 
 // Simple showcase of how to run FFT on a 1D array
 class FFT_Test final : public application_templates::MonoDeviceApplication, public BuiltinResourcesApplication
@@ -158,6 +226,9 @@ class FFT_Test final : public application_templates::MonoDeviceApplication, publ
 		// note that the API takes a time-point not a duration, because there are multiple waits and preemptions possible, so the durations wouldn't add up properly
 		m_upStreamingBuffer->multi_allocate(waitTill, AllocationCount, &inputOffset, &inputSize, &m_alignment);
 
+		// Run DIF Ordering test
+		DIFOrderTester<Radix2ElementsPerInvocationLog2, WorkgroupSizeLog2, ExtraPrimeFactor>();
+
 		// Generate our data in-place on the allocated staging buffer. Packing is interleaved in this example!
 		{
 			auto* const inputPtr = reinterpret_cast<scalar_t*>(reinterpret_cast<uint8_t*>(m_upStreamingBuffer->getBufferPointer()) + inputOffset);
@@ -166,27 +237,25 @@ class FFT_Test final : public application_templates::MonoDeviceApplication, publ
 			{
 				//Random array
 
-				//scalar_t x = rng() / scalar_t(nbl::hlsl::numeric_limits<decltype(rng())>::max), y = rng() / scalar_t(nbl::hlsl::numeric_limits<decltype(rng())>::max);
+				scalar_t x = rng() / scalar_t(nbl::hlsl::numeric_limits<decltype(rng())>::max), y = rng() / scalar_t(nbl::hlsl::numeric_limits<decltype(rng())>::max);
+				#define DIVIDE
+
+				//FFT( (1,0), (0,0), (0,0),... ) = (1,0), (1,0), (1,0),...
 
-				// FFT( (1,0), (0,0), (0,0),... ) = (1,0), (1,0), (1,0),...
-				
-				
-				scalar_t x = j > 0 ? 0.f : 1.f;
-				scalar_t y = 0;
-				
-				
+
+				//scalar_t x = j > 0 ? 0.f : 1.f, y = 0;
+					
 				// FFT( (c,0), (c,0), (c,0),... ) = (Nc,0), (0,0), (0,0),...
-				
-				/*
-				scalar_t x = 1.f;
-				scalar_t y = 0.f;
-				*/
+
+
+				//scalar_t x = 1.f, y = 0.f;
 
 				inputPtr[2 * j] = x;
 				inputPtr[2 * j + 1] = y;
 				std::cout << "(" << x << ", " << y << "), ";
 			}
 			std::cout << "\nEnd array CPU\n";
+			
 			// Always remember to flush!
 			if (m_upStreamingBuffer->needsManualFlushOrInvalidate())
 			{
@@ -285,9 +354,17 @@ class FFT_Test final : public application_templates::MonoDeviceApplication, publ
 
 				std::cout << "Begin array GPU\n";
 				scalar_t* const data = reinterpret_cast<scalar_t*>(const_cast<void*>(bufSrc));
-				for (auto i = 0u; i < complexElementCount; i++) {
-					std::cout << "(" << data[2 * i] << ", " << data[2 * i + 1] << "), ";
+				#ifdef DIVIDE
+				for (auto j = 0; j < complexElementCount; j++)
+				{
+					std::cout << "(" << data[2 * j] / complexElementCount << ", " << data[2 * j + 1] / complexElementCount << "), ";
+				}
+				#else
+				for (auto j = 0; j < complexElementCount; j++)
+				{
+					std::cout << "(" << data[2 * j] << ", " << data[2 * j + 1] << "), ";
 				}
+				#endif			
 
 				std::cout << "\nEnd array GPU\n";
 			},

From d84bcfcb04960d5d69b47fe969e10c676ac8ca86 Mon Sep 17 00:00:00 2001
From: Fletterio <fj.letterio@gmail.com>
Date: Thu, 9 Apr 2026 21:33:59 -0300
Subject: [PATCH 3/8] Working, missing divisionpolicy and proper channel count

---
 XX_NewFFT/app_resources/common.hlsl      | 12 +++--
 XX_NewFFT/app_resources/shader.comp.hlsl | 59 ++++++++++++++++++------
 XX_NewFFT/main.cpp                       | 44 +++++++++++++++++-
 3 files changed, 97 insertions(+), 18 deletions(-)

diff --git a/XX_NewFFT/app_resources/common.hlsl b/XX_NewFFT/app_resources/common.hlsl
index 2391f9ee1..4d1f916d9 100644
--- a/XX_NewFFT/app_resources/common.hlsl
+++ b/XX_NewFFT/app_resources/common.hlsl
@@ -8,13 +8,17 @@ struct PushConstantData
 	uint64_t deviceBufferAddress;
 };
 
-NBL_CONSTEXPR uint32_t WorkgroupSizeLog2 = 5;
+NBL_CONSTEXPR uint32_t WorkgroupSizeLog2 = 9;
 NBL_CONSTEXPR uint32_t WorkgroupSize = uint32_t(1) << WorkgroupSizeLog2;
 NBL_CONSTEXPR uint32_t SubgroupSizeLog2 = 5; // hardcoded to my nvidia gpu, should be queried at compilation time
 NBL_CONSTEXPR uint32_t SubgroupSize = uint32_t(1) << SubgroupSizeLog2;
 
-NBL_CONSTEXPR uint32_t Radix2ElementsPerInvocationLog2 = 1;
-NBL_CONSTEXPR uint32_t ExtraPrimeFactor = uint32_t(1);
+NBL_CONSTEXPR uint32_t Radix2ElementsPerInvocationLog2 = 2;
+NBL_CONSTEXPR uint32_t ExtraPrimeFactor = uint32_t(5);
 NBL_CONSTEXPR uint32_t ElementsPerThread = ExtraPrimeFactor * (uint32_t(1) << Radix2ElementsPerInvocationLog2);
 
-NBL_CONSTEXPR uint32_t complexElementCount = ElementsPerThread * (uint32_t(1) << WorkgroupSizeLog2);
\ No newline at end of file
+NBL_CONSTEXPR uint32_t complexElementCount = ElementsPerThread * (uint32_t(1) << WorkgroupSizeLog2);
+NBL_CONSTEXPR uint32_t complexElementCountPerChannel = uint32_t(1) << (WorkgroupSizeLog2 + 1);
+NBL_CONSTEXPR uint32_t Channels = ElementsPerThread / 2;
+
+NBL_CONSTEXPR uint32_t ShuffledChannelsPerRound = nbl::hlsl::mpl::min_v<uint32_t, Channels, 4>;
\ No newline at end of file
diff --git a/XX_NewFFT/app_resources/shader.comp.hlsl b/XX_NewFFT/app_resources/shader.comp.hlsl
index acf35be42..d9633de33 100644
--- a/XX_NewFFT/app_resources/shader.comp.hlsl
+++ b/XX_NewFFT/app_resources/shader.comp.hlsl
@@ -9,12 +9,11 @@ using namespace nbl::hlsl;
 
 //using ConstevalParameters = workgroup::fft::ConstevalParameters<ElementsPerThreadLog2, WorkgroupSizeLog2, scalar_t>;
 
-//groupshared uint32_t sharedmem[ ConstevalParameters::SharedMemoryDWORDs];
+groupshared uint32_t sharedmem[4 * ((sizeof(complex_t<float32_t>) / sizeof(uint32_t)) << WorkgroupSizeLog2) ];
 
 // Users MUST define this method for FFT to work
 //uint32_t3 glsl::gl_WorkGroupSize() { return uint32_t3(uint32_t(ConstevalParameters::WorkgroupSize), 1, 1); }
 
-/*
 struct SharedMemoryAccessor 
 {
 	template <typename AccessType, typename IndexType>
@@ -35,7 +34,6 @@ struct SharedMemoryAccessor
 	}
 
 };
-*/
 
 // Almost a LegacyBdaAccessor, but since we need `uint32_t index` getter and setter it's the same as writing one ourselves
 
@@ -95,30 +93,65 @@ void main(uint32_t3 ID : SV_DispatchThreadID)
 	InvocationElementsAccessor<ElementsPerThread / 2> loAcc;
 	InvocationElementsAccessor<ElementsPerThread / 2> hiAcc;
 
-	using IndexingUtils = workgroup2::FFTIndexingUtils<Radix2ElementsPerInvocationLog2, WorkgroupSizeLog2, ExtraPrimeFactor>;
+	// Set up the memory adaptor
+	SharedMemoryAccessor sharedmemAccessor;
+	//using adaptor_t = accessor_adaptors::StructureOfArrays<SharedMemoryAccessor, uint32_t, uint32_t, 1, WorkgroupSize>;
+	//adaptor_t sharedmemAdaptor;
+	//sharedmemAdaptor.accessor = sharedmemAccessor;
 
+	using ConstevalParameters = workgroup2::fft::ConstevalParameters<ElementsPerThread, SubgroupSizeLog2, WorkgroupSizeLog2, ShuffledChannelsPerRound, false, 0, float32_t>;
+	using FFT = workgroup2::FFT<false, ConstevalParameters>;
+	using IFFT = workgroup2::FFT<true, ConstevalParameters>;
+
+	// Invert last channel to ensure ping pong works
 	[unroll]
 	for (uint32_t pair = 0u; pair < ElementsPerThread / 2; pair++)
 	{
 		complex_t<float32_t> lo, hi;
-		accessor.get(glsl::gl_SubgroupInvocationID() + 2 * pair * SubgroupSize, lo);
+		accessor.get(uint32_t(workgroup::SubgroupContiguousIndex()) + 2 * pair * WorkgroupSize, lo);
 		loAcc.set(pair, lo);
-		accessor.get(glsl::gl_SubgroupInvocationID() + (2 * pair + 1) * SubgroupSize, hi);
+		accessor.get(uint32_t(workgroup::SubgroupContiguousIndex()) + (2 * pair + 1) * WorkgroupSize, hi);
 		hiAcc.set(pair, hi);
+		//printf("Pair %d is lo: %f, %f hi: %f, %f", pair, lo.real(), lo.imag(), hi.real(), hi.imag());
+		//printf("SharedmemSize: %d", 4 * ((sizeof(complex_t<float32_t>) / sizeof(uint32_t)) << WorkgroupSizeLog2));
+		//printf("ShuffleRounds: %d", ConstevalParameters::ShuffleRounds);
+	}
+
+	FFT::__call(loAcc, hiAcc, sharedmemAccessor);
+	sharedmemAccessor.workgroupExecutionAndMemoryBarrier();
+	IFFT::__call(loAcc, hiAcc, sharedmemAccessor);
+
+	[unroll]
+	for (uint32_t pair = 0u; pair < ElementsPerThread / 2; pair++)
+	{
+		complex_t<float32_t> lo, hi;
+		loAcc.get(pair, lo);
+		accessor.set(uint32_t(workgroup::SubgroupContiguousIndex()) + 2 * pair * WorkgroupSize, lo);
+		hiAcc.get(pair, hi);
+		accessor.set(uint32_t(workgroup::SubgroupContiguousIndex()) + (2 * pair + 1) * WorkgroupSize, hi);
 	}
-	//subgroup2::FFT<SubgroupSize, true, float32_t>::__call(0, ElementsPerThread / 2 - 1, loAcc, hiAcc);
-	//subgroup2::FFT<SubgroupSize, false, float32_t>::__call(0, ElementsPerThread / 2 - 1, loAcc, hiAcc);
-	//subgroup2::FFT<SubgroupSize, false, float32_t>::__callInterleaved<1, WorkgroupSize>(WorkgroupSize, 1, 0, ElementsPerThread / 2 - 1, loAcc, hiAcc);
-	//subgroup2::FFT<SubgroupSize, true, float32_t>::__callInterleaved<1, WorkgroupSize>(1, WorkgroupSize, 1, 0, ElementsPerThread / 2 - 1, loAcc, hiAcc);
+
+	/*
+	[unroll]
+	for (uint32_t pair = 0u; pair < ElementsPerThread / 2; pair++)
+	{
+		complex_t<float32_t> lo, hi;
+		accessor.get(uint32_t(workgroup::SubgroupContiguousIndex()) + 2 * pair * WorkgroupSize, lo);
+		loAcc.set(pair, lo);
+		accessor.get(uint32_t(workgroup::SubgroupContiguousIndex()) + (2 * pair + 1) * WorkgroupSize, hi);
+		hiAcc.set(pair, hi);
+	}
+
+	Exchanger::__call(0, Channels - 1, loAcc, hiAcc, TestStride, sharedmemAdaptor, false);
 
 	[unroll]
 	for (uint32_t pair = 0u; pair < ElementsPerThread / 2; pair++)
 	{
 		complex_t<float32_t> lo, hi;
 		loAcc.get(pair, lo);
-		accessor.set(glsl::gl_SubgroupInvocationID() + 2 * pair * SubgroupSize, lo);
+		accessor.set(uint32_t(workgroup::SubgroupContiguousIndex()) + 2 * pair * WorkgroupSize, lo);
 		hiAcc.get(pair, hi);
-		accessor.set(glsl::gl_SubgroupInvocationID() + (2 * pair + 1) * SubgroupSize, hi);
+		accessor.set(uint32_t(workgroup::SubgroupContiguousIndex()) + (2 * pair + 1) * WorkgroupSize, hi);
 	}
-	
+	*/
 }
\ No newline at end of file
diff --git a/XX_NewFFT/main.cpp b/XX_NewFFT/main.cpp
index f539e3a5b..3089caf82 100644
--- a/XX_NewFFT/main.cpp
+++ b/XX_NewFFT/main.cpp
@@ -31,6 +31,7 @@ void DIFOrderTester()
 	const uint32_t Radix2FFTSize = IndexingUtils::Radix2FFTSize;
 
 	// Check fast div correctness
+	if constexpr (ExtraPrimeFactor > 1)
 	{
 		bool correct = true;
 		for (auto idx = 0u; idx < FFTSize; idx++)
@@ -233,6 +234,34 @@ class FFT_Test final : public application_templates::MonoDeviceApplication, publ
 		{
 			auto* const inputPtr = reinterpret_cast<scalar_t*>(reinterpret_cast<uint8_t*>(m_upStreamingBuffer->getBufferPointer()) + inputOffset);
 			std::cout << "Begin array CPU\n";
+			for (auto channel = 0; channel < Channels; channel++)
+			{
+				std::cout << "Begin channel " << channel << "\n";
+				for (auto j = 0; j < complexElementCountPerChannel; j++)
+				{
+					//Random array
+
+					//scalar_t x = rng() / scalar_t(nbl::hlsl::numeric_limits<decltype(rng())>::max), y = rng() / scalar_t(nbl::hlsl::numeric_limits<decltype(rng())>::max);
+					//#define DIVIDE
+
+
+					//FFT( (1,0), (0,0), (0,0),... ) = (1,0), (1,0), (1,0),...
+
+
+					scalar_t x = j > 0 ? 0.f : 1.f, y = 0;
+
+					// FFT( (c,0), (c,0), (c,0),... ) = (Nc,0), (0,0), (0,0),...
+
+
+					//scalar_t x = 1.f, y = 0.f;
+
+					inputPtr[2 * complexElementCountPerChannel * channel + 2 * j] = x;
+					inputPtr[2 * complexElementCountPerChannel * channel + 2 * j + 1] = y;
+					std::cout << "(" << x << ", " << y << "), ";
+				}
+				std::cout << "\nEnd channel " << channel << "\n";
+			}
+			/*
 			for (auto j = 0; j < complexElementCount; j++)
 			{
 				//Random array
@@ -254,6 +283,7 @@ class FFT_Test final : public application_templates::MonoDeviceApplication, publ
 				inputPtr[2 * j + 1] = y;
 				std::cout << "(" << x << ", " << y << "), ";
 			}
+			*/
 			std::cout << "\nEnd array CPU\n";
 			
 			// Always remember to flush!
@@ -354,6 +384,17 @@ class FFT_Test final : public application_templates::MonoDeviceApplication, publ
 
 				std::cout << "Begin array GPU\n";
 				scalar_t* const data = reinterpret_cast<scalar_t*>(const_cast<void*>(bufSrc));
+				for (auto channel = 0; channel < Channels; channel++)
+				{
+					std::cout << "Begin channel " << channel << "\n";
+					for (auto j = 0; j < complexElementCountPerChannel; j++)
+					{
+						std::cout << "(" << data[2 * complexElementCountPerChannel * channel + 2 * j] << ", " << data[2 * complexElementCountPerChannel * channel + 2 * j + 1] << "), ";
+					}
+					std::cout << "\nEnd channel " << channel << "\n";
+				}
+
+				/*
 				#ifdef DIVIDE
 				for (auto j = 0; j < complexElementCount; j++)
 				{
@@ -364,7 +405,8 @@ class FFT_Test final : public application_templates::MonoDeviceApplication, publ
 				{
 					std::cout << "(" << data[2 * j] << ", " << data[2 * j + 1] << "), ";
 				}
-				#endif			
+				#endif		
+				*/
 
 				std::cout << "\nEnd array GPU\n";
 			},

From 2e6ea2fb6f573c7c89414e3c60b14df034d8c822 Mon Sep 17 00:00:00 2001
From: Fletterio <fj.letterio@gmail.com>
Date: Tue, 14 Apr 2026 17:10:54 -0300
Subject: [PATCH 4/8] Division policy and twiddle opts still missing. Ran into
 DXC and Nvidia compiler bugs

---
 XX_NewFFT/app_resources/common.hlsl      |  4 ++--
 XX_NewFFT/app_resources/shader.comp.hlsl |  4 ++--
 XX_NewFFT/main.cpp                       | 16 ++++++++++++++++
 3 files changed, 20 insertions(+), 4 deletions(-)

diff --git a/XX_NewFFT/app_resources/common.hlsl b/XX_NewFFT/app_resources/common.hlsl
index 4d1f916d9..8272894ac 100644
--- a/XX_NewFFT/app_resources/common.hlsl
+++ b/XX_NewFFT/app_resources/common.hlsl
@@ -13,8 +13,8 @@ NBL_CONSTEXPR uint32_t WorkgroupSize = uint32_t(1) << WorkgroupSizeLog2;
 NBL_CONSTEXPR uint32_t SubgroupSizeLog2 = 5; // hardcoded to my nvidia gpu, should be queried at compilation time
 NBL_CONSTEXPR uint32_t SubgroupSize = uint32_t(1) << SubgroupSizeLog2;
 
-NBL_CONSTEXPR uint32_t Radix2ElementsPerInvocationLog2 = 2;
-NBL_CONSTEXPR uint32_t ExtraPrimeFactor = uint32_t(5);
+NBL_CONSTEXPR uint32_t Radix2ElementsPerInvocationLog2 = 1;
+NBL_CONSTEXPR uint32_t ExtraPrimeFactor = uint32_t(1);
 NBL_CONSTEXPR uint32_t ElementsPerThread = ExtraPrimeFactor * (uint32_t(1) << Radix2ElementsPerInvocationLog2);
 
 NBL_CONSTEXPR uint32_t complexElementCount = ElementsPerThread * (uint32_t(1) << WorkgroupSizeLog2);
diff --git a/XX_NewFFT/app_resources/shader.comp.hlsl b/XX_NewFFT/app_resources/shader.comp.hlsl
index d9633de33..90a45c8a0 100644
--- a/XX_NewFFT/app_resources/shader.comp.hlsl
+++ b/XX_NewFFT/app_resources/shader.comp.hlsl
@@ -118,8 +118,8 @@ void main(uint32_t3 ID : SV_DispatchThreadID)
 	}
 
 	FFT::__call(loAcc, hiAcc, sharedmemAccessor);
-	sharedmemAccessor.workgroupExecutionAndMemoryBarrier();
-	IFFT::__call(loAcc, hiAcc, sharedmemAccessor);
+	//sharedmemAccessor.workgroupExecutionAndMemoryBarrier();
+	//IFFT::__call(loAcc, hiAcc, sharedmemAccessor);
 
 	[unroll]
 	for (uint32_t pair = 0u; pair < ElementsPerThread / 2; pair++)
diff --git a/XX_NewFFT/main.cpp b/XX_NewFFT/main.cpp
index 3089caf82..b2f8f8db8 100644
--- a/XX_NewFFT/main.cpp
+++ b/XX_NewFFT/main.cpp
@@ -120,6 +120,22 @@ class FFT_Test final : public application_templates::MonoDeviceApplication, publ
 	FFT_Test(const path& _localInputCWD, const path& _localOutputCWD, const path& _sharedInputCWD, const path& _sharedOutputCWD) :
 		system::IApplicationFramework(_localInputCWD, _localOutputCWD, _sharedInputCWD, _sharedOutputCWD) {}
 
+	inline core::smart_refctd_ptr<IShader> createShader(const std::string& includeMainName)
+	{
+		auto HLSLShader = core::make_smart_refctd_ptr<IShader>(("#include \"" + includeMainName + "\"\n").c_str(),
+			IShader::E_CONTENT_TYPE::ECT_HLSL,
+			includeMainName);
+		assert(HLSLShader);
+
+#ifndef _NBL_DEBUG
+		ISPIRVOptimizer::E_OPTIMIZER_PASS optPasses = ISPIRVOptimizer::EOP_STRIP_DEBUG_INFO;
+		auto opt = make_smart_refctd_ptr<ISPIRVOptimizer>(std::span<ISPIRVOptimizer::E_OPTIMIZER_PASS>(&optPasses, 1));
+		return m_device->compileShader({ HLSLShader.get(), opt.get() });
+#else 
+		return m_device->compileShader({ HLSLShader.get() });
+#endif
+	}
+
 	// we stuff all our work here because its a "single shot" app
 	bool onAppInitialized(smart_refctd_ptr<ISystem>&& system) override
 	{

From accdf13e75b2158544ef27b17919e190a96d6fa9 Mon Sep 17 00:00:00 2001
From: Fletterio <fj.letterio@gmail.com>
Date: Wed, 22 Apr 2026 00:01:38 -0300
Subject: [PATCH 5/8] No major changes

---
 XX_NewFFT/app_resources/shader.comp.hlsl | 27 --------
 XX_NewFFT/main.cpp                       | 78 +++++++++++++++++-------
 2 files changed, 56 insertions(+), 49 deletions(-)

diff --git a/XX_NewFFT/app_resources/shader.comp.hlsl b/XX_NewFFT/app_resources/shader.comp.hlsl
index 90a45c8a0..18f46ab75 100644
--- a/XX_NewFFT/app_resources/shader.comp.hlsl
+++ b/XX_NewFFT/app_resources/shader.comp.hlsl
@@ -11,9 +11,6 @@ using namespace nbl::hlsl;
 
 groupshared uint32_t sharedmem[4 * ((sizeof(complex_t<float32_t>) / sizeof(uint32_t)) << WorkgroupSizeLog2) ];
 
-// Users MUST define this method for FFT to work
-//uint32_t3 glsl::gl_WorkGroupSize() { return uint32_t3(uint32_t(ConstevalParameters::WorkgroupSize), 1, 1); }
-
 struct SharedMemoryAccessor 
 {
 	template <typename AccessType, typename IndexType>
@@ -130,28 +127,4 @@ void main(uint32_t3 ID : SV_DispatchThreadID)
 		hiAcc.get(pair, hi);
 		accessor.set(uint32_t(workgroup::SubgroupContiguousIndex()) + (2 * pair + 1) * WorkgroupSize, hi);
 	}
-
-	/*
-	[unroll]
-	for (uint32_t pair = 0u; pair < ElementsPerThread / 2; pair++)
-	{
-		complex_t<float32_t> lo, hi;
-		accessor.get(uint32_t(workgroup::SubgroupContiguousIndex()) + 2 * pair * WorkgroupSize, lo);
-		loAcc.set(pair, lo);
-		accessor.get(uint32_t(workgroup::SubgroupContiguousIndex()) + (2 * pair + 1) * WorkgroupSize, hi);
-		hiAcc.set(pair, hi);
-	}
-
-	Exchanger::__call(0, Channels - 1, loAcc, hiAcc, TestStride, sharedmemAdaptor, false);
-
-	[unroll]
-	for (uint32_t pair = 0u; pair < ElementsPerThread / 2; pair++)
-	{
-		complex_t<float32_t> lo, hi;
-		loAcc.get(pair, lo);
-		accessor.set(uint32_t(workgroup::SubgroupContiguousIndex()) + 2 * pair * WorkgroupSize, lo);
-		hiAcc.get(pair, hi);
-		accessor.set(uint32_t(workgroup::SubgroupContiguousIndex()) + (2 * pair + 1) * WorkgroupSize, hi);
-	}
-	*/
 }
\ No newline at end of file
diff --git a/XX_NewFFT/main.cpp b/XX_NewFFT/main.cpp
index b2f8f8db8..c61b354eb 100644
--- a/XX_NewFFT/main.cpp
+++ b/XX_NewFFT/main.cpp
@@ -120,20 +120,66 @@ class FFT_Test final : public application_templates::MonoDeviceApplication, publ
 	FFT_Test(const path& _localInputCWD, const path& _localOutputCWD, const path& _sharedInputCWD, const path& _sharedOutputCWD) :
 		system::IApplicationFramework(_localInputCWD, _localOutputCWD, _sharedInputCWD, _sharedOutputCWD) {}
 
-	inline core::smart_refctd_ptr<IShader> createShader(const std::string& includeMainName)
+	inline core::smart_refctd_ptr<IShader> createShader(const char* includeMainName)
 	{
-		auto HLSLShader = core::make_smart_refctd_ptr<IShader>(("#include \"" + includeMainName + "\"\n").c_str(),
+		auto HLSLShader = core::make_smart_refctd_ptr<IShader>((std::string("#include \"") + includeMainName + "\"\n").c_str(),
 			IShader::E_CONTENT_TYPE::ECT_HLSL,
 			includeMainName);
 		assert(HLSLShader);
 
+		ILogicalDevice::SShaderCreationParameters shaderCreationParams{ .source = HLSLShader.get(),
+																		.preprocessedOutputPath = (localOutputCWD / "preprocessed.hlsl").string(),
+																		.spvOutputPath = (localOutputCWD / "out.spv").string() };
 #ifndef _NBL_DEBUG
 		ISPIRVOptimizer::E_OPTIMIZER_PASS optPasses = ISPIRVOptimizer::EOP_STRIP_DEBUG_INFO;
 		auto opt = make_smart_refctd_ptr<ISPIRVOptimizer>(std::span<ISPIRVOptimizer::E_OPTIMIZER_PASS>(&optPasses, 1));
-		return m_device->compileShader({ HLSLShader.get(), opt.get() });
-#else 
-		return m_device->compileShader({ HLSLShader.get() });
+		shaderCreationParams.optimizer = opt.get();
+		shaderCreationParams.optimizerIsExtraPasses = true;
 #endif
+		return m_device->compileShader(shaderCreationParams);
+	}
+	
+	// Loads an already compiled SPIR-V binary from `spirvName` and creates a shader from it (null on failure); useful for debugging compiler issues
+	inline core::smart_refctd_ptr<IShader> createSpirvShader(const char* spirvName)
+	{
+		// Every failure path logs the reason and yields a null shader instead of compiling an invalid buffer
+		const auto fail = [this](const char* message) -> core::smart_refctd_ptr<IShader>
+		{
+			m_logger->log(message, ILogger::ELL_ERROR);
+			return nullptr;
+		};
+
+		core::smart_refctd_ptr<system::IFile> shaderReadFile;
+		{
+			system::ISystem::future_t<core::smart_refctd_ptr<system::IFile>> future;
+			m_system->createFile(future, spirvName, system::IFile::ECF_READ);
+			if (!future.wait())
+				return fail("Failed Opening Shader Cache File.");
+			future.acquire().move_into(shaderReadFile);
+		}
+		if (!shaderReadFile)
+			return fail("Failed Opening Shader File.");
+
+		// A zero-sized file would leave the buffer null, which must not be wrapped into an IShader
+		const size_t size = shaderReadFile->getSize();
+		if (size == 0ull)
+			return fail("Shader File Is Empty.");
+
+		asset::IBuffer::SCreationParams bufferCreationParams{ .size = size };
+		asset::ICPUBuffer::SCreationParams cpuBufferCreationParams;
+		cpuBufferCreationParams = bufferCreationParams;
+		core::smart_refctd_ptr<ICPUBuffer> shaderBuffer = ICPUBuffer::create(std::move(cpuBufferCreationParams));
+		if (!shaderBuffer)
+			return fail("Failed Allocating Shader Buffer.");
+
+		system::IFile::success_t succ;
+		shaderReadFile->read(succ, shaderBuffer->getPointer(), 0, size);
+		if (!succ)
+			return fail("Failed Reading From Shader File.");
+
+		auto SPIRVShader = core::make_smart_refctd_ptr<IShader>(std::move(shaderBuffer), IShader::E_CONTENT_TYPE::ECT_SPIRV, spirvName);
+		assert(SPIRVShader);
+		return m_device->compileShader({ .source = SPIRVShader.get() });
 	}
 
 	// we stuff all our work here because its a "single shot" app
@@ -145,23 +191,11 @@ class FFT_Test final : public application_templates::MonoDeviceApplication, publ
 		if (!asset_base_t::onAppInitialized(std::move(system)))
 			return false;
 
-		smart_refctd_ptr<IShader> shader;
-		{
-			IAssetLoader::SAssetLoadParams lp = {};
-			lp.logger = m_logger.get();
-			lp.workingDirectory = "app_resources"; // virtual root
-			auto key = nbl::this_example::builtin::build::get_spirv_key<"shader">(m_device.get());
-			auto assetBundle = m_assetMgr->getAsset(key.data(), lp);
-			const auto assets = assetBundle.getContents();
-			if (assets.empty())
-				return logFail("Could not load shader!");
-
-			// Cast down the asset to its proper type
-			shader = IAsset::castDown<IShader>(assets[0]);
-			
-			if (!shader)
-				return logFail("Invalid shader!");
-		}
+		smart_refctd_ptr<IShader> shader = createShader("app_resources/shader.comp.hlsl");
+		// DEBUG
+		//smart_refctd_ptr<IShader> shader = createSpirvShader("app_resources/optimized.spv");
+		if (!shader)
+			return logFail("Invalid shader!");
 
 		// Create massive upload/download buffers
 		constexpr uint32_t DownstreamBufferSize = sizeof(scalar_t) << 23;

From f466d64c96374cb9c80769c664c7aba99f228a69 Mon Sep 17 00:00:00 2001
From: Fletterio <fj.letterio@gmail.com>
Date: Tue, 5 May 2026 00:52:54 -0300
Subject: [PATCH 6/8] Halfway added subgroup shared twiddles

---
 XX_NewFFT/app_resources/shader.comp.hlsl | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/XX_NewFFT/app_resources/shader.comp.hlsl b/XX_NewFFT/app_resources/shader.comp.hlsl
index 18f46ab75..5663395a8 100644
--- a/XX_NewFFT/app_resources/shader.comp.hlsl
+++ b/XX_NewFFT/app_resources/shader.comp.hlsl
@@ -96,7 +96,7 @@ void main(uint32_t3 ID : SV_DispatchThreadID)
 	//adaptor_t sharedmemAdaptor;
 	//sharedmemAdaptor.accessor = sharedmemAccessor;
 
-	using ConstevalParameters = workgroup2::fft::ConstevalParameters<ElementsPerThread, SubgroupSizeLog2, WorkgroupSizeLog2, ShuffledChannelsPerRound, false, 0, float32_t>;
+	using ConstevalParameters = workgroup2::fft::ConstevalParameters<ElementsPerThread, SubgroupSizeLog2, WorkgroupSizeLog2, ShuffledChannelsPerRound, false, 0, true, float32_t>;
 	using FFT = workgroup2::FFT<false, ConstevalParameters>;
 	using IFFT = workgroup2::FFT<true, ConstevalParameters>;
 

From a69d8e8d5ab45d6bc58986dafa777539689517f1 Mon Sep 17 00:00:00 2001
From: Fletterio <fj.letterio@gmail.com>
Date: Tue, 5 May 2026 20:19:56 -0300
Subject: [PATCH 7/8] Implementing channels

---
 XX_NewFFT/app_resources/common.hlsl      | 4 +++-
 XX_NewFFT/app_resources/shader.comp.hlsl | 6 +++---
 2 files changed, 6 insertions(+), 4 deletions(-)

diff --git a/XX_NewFFT/app_resources/common.hlsl b/XX_NewFFT/app_resources/common.hlsl
index 8272894ac..39d12f994 100644
--- a/XX_NewFFT/app_resources/common.hlsl
+++ b/XX_NewFFT/app_resources/common.hlsl
@@ -21,4 +21,6 @@ NBL_CONSTEXPR uint32_t complexElementCount = ElementsPerThread * (uint32_t(1) <<
 NBL_CONSTEXPR uint32_t complexElementCountPerChannel = uint32_t(1) << (WorkgroupSizeLog2 + 1);
 NBL_CONSTEXPR uint32_t Channels = ElementsPerThread / 2;
 
-NBL_CONSTEXPR uint32_t ShuffledChannelsPerRound = nbl::hlsl::mpl::min_v<uint32_t, Channels, 4>;
\ No newline at end of file
+NBL_CONSTEXPR uint32_t ShuffledChannelsPerRound = nbl::hlsl::mpl::min_v<uint32_t, Channels, 4>;
+
+NBL_CONSTEXPR bool ShareTwiddles = true;
\ No newline at end of file
diff --git a/XX_NewFFT/app_resources/shader.comp.hlsl b/XX_NewFFT/app_resources/shader.comp.hlsl
index 5663395a8..9933591fb 100644
--- a/XX_NewFFT/app_resources/shader.comp.hlsl
+++ b/XX_NewFFT/app_resources/shader.comp.hlsl
@@ -96,7 +96,7 @@ void main(uint32_t3 ID : SV_DispatchThreadID)
 	//adaptor_t sharedmemAdaptor;
 	//sharedmemAdaptor.accessor = sharedmemAccessor;
 
-	using ConstevalParameters = workgroup2::fft::ConstevalParameters<ElementsPerThread, SubgroupSizeLog2, WorkgroupSizeLog2, ShuffledChannelsPerRound, false, 0, true, float32_t>;
+	using ConstevalParameters = workgroup2::fft::ConstevalParameters<ElementsPerThread, SubgroupSizeLog2, WorkgroupSizeLog2, ShuffledChannelsPerRound, false, 0, ShareTwiddles, float32_t>;
 	using FFT = workgroup2::FFT<false, ConstevalParameters>;
 	using IFFT = workgroup2::FFT<true, ConstevalParameters>;
 
@@ -115,8 +115,8 @@ void main(uint32_t3 ID : SV_DispatchThreadID)
 	}
 
 	FFT::__call(loAcc, hiAcc, sharedmemAccessor);
-	//sharedmemAccessor.workgroupExecutionAndMemoryBarrier();
-	//IFFT::__call(loAcc, hiAcc, sharedmemAccessor);
+	sharedmemAccessor.workgroupExecutionAndMemoryBarrier();
+	IFFT::__call(loAcc, hiAcc, sharedmemAccessor);
 
 	[unroll]
 	for (uint32_t pair = 0u; pair < ElementsPerThread / 2; pair++)

From 1d6aae3f97998e36199645a6b08b76e39189b615 Mon Sep 17 00:00:00 2001
From: Fletterio <fj.letterio@gmail.com>
Date: Wed, 6 May 2026 18:07:43 -0300
Subject: [PATCH 8/8] Channels added, lacks support on cpp side for further
 examples

---
 XX_NewFFT/app_resources/common.hlsl      | 17 ++++++-----
 XX_NewFFT/app_resources/shader.comp.hlsl | 38 +++++++++++++-----------
 XX_NewFFT/main.cpp                       |  2 +-
 3 files changed, 32 insertions(+), 25 deletions(-)

diff --git a/XX_NewFFT/app_resources/common.hlsl b/XX_NewFFT/app_resources/common.hlsl
index 39d12f994..8cd0cf44f 100644
--- a/XX_NewFFT/app_resources/common.hlsl
+++ b/XX_NewFFT/app_resources/common.hlsl
@@ -13,14 +13,17 @@ NBL_CONSTEXPR uint32_t WorkgroupSize = uint32_t(1) << WorkgroupSizeLog2;
 NBL_CONSTEXPR uint32_t SubgroupSizeLog2 = 5; // hardcoded to my nvidia gpu, should be queried at compilation time
 NBL_CONSTEXPR uint32_t SubgroupSize = uint32_t(1) << SubgroupSizeLog2;
 
-NBL_CONSTEXPR uint32_t Radix2ElementsPerInvocationLog2 = 1;
+NBL_CONSTEXPR uint32_t Radix2ElementsPerInvocationPerChannelLog2 = 1;
 NBL_CONSTEXPR uint32_t ExtraPrimeFactor = uint32_t(1);
-NBL_CONSTEXPR uint32_t ElementsPerThread = ExtraPrimeFactor * (uint32_t(1) << Radix2ElementsPerInvocationLog2);
+NBL_CONSTEXPR uint32_t ElementsPerInvocationPerChannel = ExtraPrimeFactor * (uint32_t(1) << Radix2ElementsPerInvocationPerChannelLog2);
 
-NBL_CONSTEXPR uint32_t complexElementCount = ElementsPerThread * (uint32_t(1) << WorkgroupSizeLog2);
-NBL_CONSTEXPR uint32_t complexElementCountPerChannel = uint32_t(1) << (WorkgroupSizeLog2 + 1);
-NBL_CONSTEXPR uint32_t Channels = ElementsPerThread / 2;
+NBL_CONSTEXPR uint32_t Channels = 1;
+NBL_CONSTEXPR uint32_t complexElementCountPerChannel = ElementsPerInvocationPerChannel * (uint32_t(1) << WorkgroupSizeLog2);
+NBL_CONSTEXPR uint32_t complexElementCount = Channels * complexElementCountPerChannel;
 
-NBL_CONSTEXPR uint32_t ShuffledChannelsPerRound = nbl::hlsl::mpl::min_v<uint32_t, Channels, 4>;
+NBL_CONSTEXPR uint16_t InnerVirtualChannels = Channels * (ElementsPerInvocationPerChannel >> 1);
+NBL_CONSTEXPR uint32_t ShuffledVirtualChannelsPerRound = nbl::hlsl::mpl::min_v<uint32_t, InnerVirtualChannels, 4>;
 
-NBL_CONSTEXPR bool ShareTwiddles = true;
\ No newline at end of file
+NBL_CONSTEXPR bool ShareTwiddles = true;
+
+using ConstevalParameters = nbl::hlsl::workgroup2::fft::ConstevalParameters<ElementsPerInvocationPerChannel, Channels, SubgroupSizeLog2, WorkgroupSizeLog2, ShuffledVirtualChannelsPerRound, false, true, 0, scalar_t>; // fully qualified, like `nbl::hlsl::mpl::min_v` above, so it resolves without `using namespace nbl::hlsl`
\ No newline at end of file
diff --git a/XX_NewFFT/app_resources/shader.comp.hlsl b/XX_NewFFT/app_resources/shader.comp.hlsl
index 9933591fb..c1db53f30 100644
--- a/XX_NewFFT/app_resources/shader.comp.hlsl
+++ b/XX_NewFFT/app_resources/shader.comp.hlsl
@@ -9,7 +9,7 @@ using namespace nbl::hlsl;
 
 //using ConstevalParameters = workgroup::fft::ConstevalParameters<ElementsPerThreadLog2, WorkgroupSizeLog2, scalar_t>;
 
-groupshared uint32_t sharedmem[4 * ((sizeof(complex_t<float32_t>) / sizeof(uint32_t)) << WorkgroupSizeLog2) ];
+groupshared uint32_t sharedmem[4 * ((sizeof(complex_t<scalar_t>) / sizeof(uint32_t)) << WorkgroupSizeLog2) ];
 
 struct SharedMemoryAccessor 
 {
@@ -60,25 +60,28 @@ struct Accessor
 };
 
 
-template<uint16_t Size>
+template<uint16_t Channels, uint16_t Size>
 struct InvocationElementsAccessor
 {
-	float32_t real[Size];
-	float32_t imag[Size];
+	scalar_t real[Channels][Size];
+	scalar_t imag[Channels][Size];
 
-	void get(uint32_t channel, NBL_REF_ARG(complex_t<float32_t>) value)
+	void get(uint32_t channel, uint32_t pair, NBL_REF_ARG(complex_t<scalar_t>) value)
 	{
-		value.real(real[channel]);
-		value.imag(imag[channel]);
+		value.real(real[channel][pair]);
+		value.imag(imag[channel][pair]);
 	}
 
-	void set(uint32_t channel, complex_t<float32_t> value)
+	void set(uint32_t channel, uint32_t pair, NBL_CONST_REF_ARG(complex_t<scalar_t>) value)
 	{
-		real[channel] = value.real();
-		imag[channel] = value.imag();
+		real[channel][pair] = value.real();
+		imag[channel][pair] = value.imag();
 	}
 };
 
+using _InvocationElementsAccessor = InvocationElementsAccessor<Channels, ElementsPerInvocationPerChannel / 2>;
+using ElementsAccessorAdaptor = workgroup2::fft::WorkgroupRadix2AccessorAdaptor<Channels, scalar_t, _InvocationElementsAccessor>;
+
 //[numthreads(ConstevalParameters::WorkgroupSize,1,1)]
 [numthreads(WorkgroupSize, 1, 1)]
 [shader("compute")]
@@ -87,8 +90,10 @@ void main(uint32_t3 ID : SV_DispatchThreadID)
 	// global mem read write
 	Accessor accessor = Accessor::create(pushConstants.deviceBufferAddress);
 	// Load elements into the accessor
-	InvocationElementsAccessor<ElementsPerThread / 2> loAcc;
-	InvocationElementsAccessor<ElementsPerThread / 2> hiAcc;
+	_InvocationElementsAccessor loElementAccessor;
+	ElementsAccessorAdaptor loAcc = ElementsAccessorAdaptor::create(loElementAccessor);
+	_InvocationElementsAccessor hiElementAccessor;
+	ElementsAccessorAdaptor hiAcc = ElementsAccessorAdaptor::create(hiElementAccessor);
 
 	// Set up the memory adaptor
 	SharedMemoryAccessor sharedmemAccessor;
@@ -96,13 +101,12 @@ void main(uint32_t3 ID : SV_DispatchThreadID)
 	//adaptor_t sharedmemAdaptor;
 	//sharedmemAdaptor.accessor = sharedmemAccessor;
 
-	using ConstevalParameters = workgroup2::fft::ConstevalParameters<ElementsPerThread, SubgroupSizeLog2, WorkgroupSizeLog2, ShuffledChannelsPerRound, false, 0, ShareTwiddles, float32_t>;
-	using FFT = workgroup2::FFT<false, ConstevalParameters>;
-	using IFFT = workgroup2::FFT<true, ConstevalParameters>;
+	using FFT = workgroup2::impl::InnerFFT<false, ConstevalParameters>;
+	using IFFT = workgroup2::impl::InnerFFT<true, ConstevalParameters>;
 
 	// Invert last channel to ensure ping pong works
 	[unroll]
-	for (uint32_t pair = 0u; pair < ElementsPerThread / 2; pair++)
+	for (uint32_t pair = 0u; pair < ElementsPerInvocationPerChannel / 2; pair++)
 	{
 		complex_t<float32_t> lo, hi;
 		accessor.get(uint32_t(workgroup::SubgroupContiguousIndex()) + 2 * pair * WorkgroupSize, lo);
@@ -119,7 +123,7 @@ void main(uint32_t3 ID : SV_DispatchThreadID)
 	IFFT::__call(loAcc, hiAcc, sharedmemAccessor);
 
 	[unroll]
-	for (uint32_t pair = 0u; pair < ElementsPerThread / 2; pair++)
+	for (uint32_t pair = 0u; pair < ElementsPerInvocationPerChannel / 2; pair++)
 	{
 		complex_t<float32_t> lo, hi;
 		loAcc.get(pair, lo);
diff --git a/XX_NewFFT/main.cpp b/XX_NewFFT/main.cpp
index c61b354eb..087117f47 100644
--- a/XX_NewFFT/main.cpp
+++ b/XX_NewFFT/main.cpp
@@ -278,7 +278,7 @@ class FFT_Test final : public application_templates::MonoDeviceApplication, publ
 		m_upStreamingBuffer->multi_allocate(waitTill, AllocationCount, &inputOffset, &inputSize, &m_alignment);
 
 		// Run DIF Ordering test
-		DIFOrderTester<Radix2ElementsPerInvocationLog2, WorkgroupSizeLog2, ExtraPrimeFactor>();
+		DIFOrderTester<Radix2ElementsPerInvocationPerChannelLog2, WorkgroupSizeLog2, ExtraPrimeFactor>();
 
 		// Generate our data in-place on the allocated staging buffer. Packing is interleaved in this example!
 		{