diff --git a/CMakeLists.txt b/CMakeLists.txt
index a93a86a4f..b3f986b46 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -106,6 +106,7 @@ if(NBL_BUILD_EXAMPLES)
   	add_subdirectory(70_FLIPFluids)
 	add_subdirectory(71_RayTracingPipeline)
 	add_subdirectory(72_CooperativeBinarySearch)
+	add_subdirectory(XX_NewFFT)
 
 	if (NBL_BUILD_MITSUBA_LOADER)
 		add_subdirectory(73_GeometryInspector)
diff --git a/XX_NewFFT/CMakeLists.txt b/XX_NewFFT/CMakeLists.txt
new file mode 100644
index 000000000..6b6304ed8
--- /dev/null
+++ b/XX_NewFFT/CMakeLists.txt
@@ -0,0 +1,62 @@
+include(common RESULT_VARIABLE RES) # RES receives the path of the included module, or NOTFOUND
+if(NOT RES)
+	message(FATAL_ERROR "common.cmake not found. Should be in {repo_root}/cmake directory")
+endif()
+# Create this example's executable project: the first four arguments are passed empty, the fifth forwards the PCH target
+nbl_create_executable_project("" "" "" "" "${NBL_EXECUTABLE_PROJECT_CREATION_PCH_TARGET}")
+# Optional: embed every file under app_resources/ into the executable (EXECUTABLE_NAME is presumably set by the call above)
+if(NBL_EMBED_BUILTIN_RESOURCES)
+	set(_BR_TARGET_ ${EXECUTABLE_NAME}_builtinResourceData)
+	set(RESOURCE_DIR "app_resources")
+	# Absolute forms of the source directory and of the src/ and include/ directories inside the binary directory
+	get_filename_component(_SEARCH_DIRECTORIES_ "${CMAKE_CURRENT_SOURCE_DIR}" ABSOLUTE)
+	get_filename_component(_OUTPUT_DIRECTORY_SOURCE_ "${CMAKE_CURRENT_BINARY_DIR}/src" ABSOLUTE)
+	get_filename_component(_OUTPUT_DIRECTORY_HEADER_ "${CMAKE_CURRENT_BINARY_DIR}/include" ABSOLUTE)
+	# Glob all resource files (paths relative to app_resources/); CONFIGURE_DEPENDS re-checks the glob at build time
+    file(GLOB_RECURSE BUILTIN_RESOURCE_FILES RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}/${RESOURCE_DIR}" CONFIGURE_DEPENDS "${CMAKE_CURRENT_SOURCE_DIR}/${RESOURCE_DIR}/*")
+    foreach(RES_FILE ${BUILTIN_RESOURCE_FILES})
+      LIST_BUILTIN_RESOURCE(RESOURCES_TO_EMBED "${RES_FILE}")
+    endforeach()
+	# Generate the builtin-resource target under the nbl::this_example::builtin namespace from the collected list
+	ADD_CUSTOM_BUILTIN_RESOURCES(${_BR_TARGET_} RESOURCES_TO_EMBED "${_SEARCH_DIRECTORIES_}" "${RESOURCE_DIR}" "nbl::this_example::builtin" "${_OUTPUT_DIRECTORY_HEADER_}" "${_OUTPUT_DIRECTORY_SOURCE_}")
+	# Make the executable consume the generated resource target
+	LINK_BUILTIN_RESOURCES_TO_TARGET(${EXECUTABLE_NAME} ${_BR_TARGET_})
+endif()
+
+set(OUTPUT_DIRECTORY "${CMAKE_CURRENT_BINARY_DIR}/auto-gen") # generated shader artifacts are placed under the binary dir
+# SM is the shader model used in the target profile; JSON lists the shaders to precompile (strict JSON, no trailing commas)
+set(SM 6_8)
+set(JSON [=[
+[
+    {
+        "INPUT": "app_resources/shader.comp.hlsl",
+        "KEY": "shader"
+    }
+]
+]=])
+string(CONFIGURE "${JSON}" JSON)
+
+set(COMPILE_OPTIONS
+    -I "${CMAKE_CURRENT_SOURCE_DIR}"
+    -T lib_${SM} # library target profile built from the SM variable set earlier in this file
+)
+# Precompile the shaders listed in JSON into OUTPUT_DIRECTORY; the resulting keys are returned in KEYS
+NBL_CREATE_NSC_COMPILE_RULES(
+    TARGET ${EXECUTABLE_NAME}SPIRV
+    LINK_TO ${EXECUTABLE_NAME}
+    BINARY_DIR ${OUTPUT_DIRECTORY}
+    MOUNT_POINT_DEFINE NBL_THIS_EXAMPLE_BUILD_MOUNT_POINT
+    COMMON_OPTIONS ${COMPILE_OPTIONS}
+    OUTPUT_VAR KEYS
+    INCLUDE nbl/this_example/builtin/build/spirv/keys.hpp
+    NAMESPACE nbl::this_example::builtin::build
+    INPUTS ${JSON}
+)
+# Bundle the outputs named by KEYS (bound to OUTPUT_DIRECTORY) into a resource archive linked to the executable
+NBL_CREATE_RESOURCE_ARCHIVE(
+    NAMESPACE nbl::this_example::builtin::build
+    TARGET ${EXECUTABLE_NAME}_builtinsBuild
+    LINK_TO ${EXECUTABLE_NAME}
+    BIND ${OUTPUT_DIRECTORY}
+    BUILTINS ${KEYS}
+)
diff --git a/XX_NewFFT/app_resources/common.hlsl b/XX_NewFFT/app_resources/common.hlsl
new file mode 100644
index 000000000..8cd0cf44f
--- /dev/null
+++ b/XX_NewFFT/app_resources/common.hlsl
@@ -0,0 +1,29 @@
+#include "nbl/builtin/hlsl/cpp_compat.hlsl"
+#include "nbl/builtin/hlsl/workgroup2/fft.hlsl"
+
+using scalar_t = nbl::hlsl::float32_t;
+
+struct PushConstantData // push-constant block; this header is included by both the shader and main.cpp
+{
+	uint64_t deviceBufferAddress; // buffer device address the shader loads its input from and stores its result to
+};
+
+NBL_CONSTEXPR uint32_t WorkgroupSizeLog2 = 9; // compile-time FFT configuration shared by the shader and the host code
+NBL_CONSTEXPR uint32_t WorkgroupSize = uint32_t(1) << WorkgroupSizeLog2;
+NBL_CONSTEXPR uint32_t SubgroupSizeLog2 = 5; // hardcoded to my nvidia gpu, should be queried at compilation time
+NBL_CONSTEXPR uint32_t SubgroupSize = uint32_t(1) << SubgroupSizeLog2;
+// Complex elements each invocation holds per channel: a power of two times one optional extra factor
+NBL_CONSTEXPR uint32_t Radix2ElementsPerInvocationPerChannelLog2 = 1;
+NBL_CONSTEXPR uint32_t ExtraPrimeFactor = uint32_t(1);
+NBL_CONSTEXPR uint32_t ElementsPerInvocationPerChannel = ExtraPrimeFactor * (uint32_t(1) << Radix2ElementsPerInvocationPerChannelLog2);
+// Total problem size: every invocation of the workgroup contributes its elements, once per channel
+NBL_CONSTEXPR uint32_t Channels = 1;
+NBL_CONSTEXPR uint32_t complexElementCountPerChannel = ElementsPerInvocationPerChannel * WorkgroupSize;
+NBL_CONSTEXPR uint32_t complexElementCount = Channels * complexElementCountPerChannel;
+// One virtual channel per lo/hi pair per channel; at most 4 of them are handled per shuffle round
+NBL_CONSTEXPR uint16_t InnerVirtualChannels = Channels * (ElementsPerInvocationPerChannel >> 1);
+NBL_CONSTEXPR uint32_t ShuffledVirtualChannelsPerRound = nbl::hlsl::mpl::min_v<uint32_t, InnerVirtualChannels, 4>;
+// NOTE(review): ShareTwiddles is not referenced by the alias below - confirm whether one of its literal booleans should be this
+NBL_CONSTEXPR bool ShareTwiddles = true;
+// Fully qualified on purpose: this header is included before any `using namespace nbl::hlsl;` in the shader
+using ConstevalParameters = nbl::hlsl::workgroup2::fft::ConstevalParameters<ElementsPerInvocationPerChannel, Channels, SubgroupSizeLog2, WorkgroupSizeLog2, ShuffledVirtualChannelsPerRound, false, true, 0, scalar_t>;
\ No newline at end of file
diff --git a/XX_NewFFT/app_resources/shader.comp.hlsl b/XX_NewFFT/app_resources/shader.comp.hlsl
new file mode 100644
index 000000000..c1db53f30
--- /dev/null
+++ b/XX_NewFFT/app_resources/shader.comp.hlsl
@@ -0,0 +1,134 @@
+#include "common.hlsl"
+#include "nbl/builtin/hlsl/subgroup2/fft.hlsl"
+#include "nbl/builtin/hlsl/workgroup2/fft.hlsl"
+#include "nbl/builtin/hlsl/workgroup/basic.hlsl"
+
+[[vk::push_constant]] PushConstantData pushConstants;
+
+using namespace nbl::hlsl;
+
+//using ConstevalParameters = workgroup::fft::ConstevalParameters<ElementsPerThreadLog2, WorkgroupSizeLog2, scalar_t>;
+
+groupshared uint32_t sharedmem[4 * ((sizeof(complex_t<scalar_t>) / sizeof(uint32_t)) << WorkgroupSizeLog2) ];
+
+struct SharedMemoryAccessor
+{
+	// Reads the entry at `slot` of the groupshared `sharedmem` array into `result`
+	template <typename AccessType, typename IndexType>
+	void get(IndexType slot, NBL_REF_ARG(AccessType) result)
+	{
+		result = sharedmem[slot];
+	}
+	// Writes `data` into the entry at `slot` of the groupshared `sharedmem` array
+	template <typename AccessType, typename IndexType>
+	void set(IndexType slot, AccessType data)
+	{
+		sharedmem[slot] = data;
+	}
+	// Barrier hook the FFT code calls between shared-memory phases; simply forwards to glsl::barrier()
+	void workgroupExecutionAndMemoryBarrier()
+	{
+		glsl::barrier();
+	}
+};
+
+// Almost a LegacyBdaAccessor, but since we need `uint32_t index` getter and setter it's the same as writing one ourselves
+
+struct Accessor
+{
+	static Accessor create(const uint64_t address)
+	{
+		Accessor accessor;
+		accessor.address = address;
+		return accessor;
+	}
+	// Loads element `index` of an array of `AccessType` starting at `address`; the offset is widened to 64 bits before scaling
+	// TODO: can't use our own BDA yet, because it doesn't support the types `workgroup::FFT` will invoke these templates with
+	template <typename AccessType, typename IndexType>
+	void get(const IndexType index, NBL_REF_ARG(AccessType) value)
+	{
+		value = vk::RawBufferLoad<AccessType>(address + uint64_t(index) * sizeof(AccessType));
+	}
+	// Stores `value` as element `index` of the same array, with the same 64-bit offset computation
+	template <typename AccessType, typename IndexType>
+	void set(const IndexType index, const AccessType value)
+	{
+		vk::RawBufferStore<AccessType>(address + uint64_t(index) * sizeof(AccessType), value);
+	}
+	// Base buffer device address all element offsets are relative to
+	uint64_t address;
+};
+
+
+template<uint16_t ChannelCount, uint16_t PairCount>
+struct InvocationElementsAccessor
+{
+	scalar_t real[ChannelCount][PairCount];
+	scalar_t imag[ChannelCount][PairCount];
+	// Splits `element` into its real and imaginary parts and stores them at [channelIx][pairIx]
+	void set(uint32_t channelIx, uint32_t pairIx, NBL_CONST_REF_ARG(complex_t<scalar_t>) element)
+	{
+		real[channelIx][pairIx] = element.real();
+		imag[channelIx][pairIx] = element.imag();
+	}
+	// Rebuilds the complex number stored at [channelIx][pairIx] into `element`
+	void get(uint32_t channelIx, uint32_t pairIx, NBL_REF_ARG(complex_t<scalar_t>) element)
+	{
+		element.real(real[channelIx][pairIx]);
+		element.imag(imag[channelIx][pairIx]);
+	}
+};
+
+using _InvocationElementsAccessor = InvocationElementsAccessor<Channels, ElementsPerInvocationPerChannel / 2>; // per-invocation storage: one slot per lo/hi pair, for every channel
+using ElementsAccessorAdaptor = workgroup2::fft::WorkgroupRadix2AccessorAdaptor<Channels, scalar_t, _InvocationElementsAccessor>; // adaptor type through which `main` hands that storage to the FFT
+
+// One workgroup loads its slice of the buffer, runs the forward then the inverse inner FFT on it and writes the result back in place
+[numthreads(WorkgroupSize, 1, 1)]
+[shader("compute")]
+void main(uint32_t3 ID : SV_DispatchThreadID)
+{
+	// Global memory is reached through the buffer device address passed in the push constants
+	Accessor accessor = Accessor::create(pushConstants.deviceBufferAddress);
+	// Per-invocation storage for the lo and hi halves, each wrapped in the adaptor the FFT is called with
+	_InvocationElementsAccessor loElementAccessor;
+	ElementsAccessorAdaptor loAcc = ElementsAccessorAdaptor::create(loElementAccessor);
+	_InvocationElementsAccessor hiElementAccessor;
+	ElementsAccessorAdaptor hiAcc = ElementsAccessorAdaptor::create(hiElementAccessor);
+
+	// Accessor over the groupshared scratch array declared at the top of this file
+	SharedMemoryAccessor sharedmemAccessor;
+
+	using FFT = workgroup2::impl::InnerFFT<false, ConstevalParameters>;
+	using IFFT = workgroup2::impl::InnerFFT<true, ConstevalParameters>;
+
+	// Computed once: the same invocation index addresses every load and store below
+	const uint32_t invocationIndex = uint32_t(workgroup::SubgroupContiguousIndex());
+
+	// Load: pair `p` takes its lo element from stripe 2p and its hi element from stripe 2p+1, stripes being WorkgroupSize wide
+	[unroll]
+	for (uint32_t pair = 0u; pair < ElementsPerInvocationPerChannel / 2; pair++)
+	{
+		complex_t<scalar_t> lo, hi;
+		accessor.get(invocationIndex + 2 * pair * WorkgroupSize, lo);
+		loAcc.set(pair, lo);
+		accessor.get(invocationIndex + (2 * pair + 1) * WorkgroupSize, hi);
+		hiAcc.set(pair, hi);
+	}
+
+	// Forward transform, then the inverse one on the same data
+	FFT::__call(loAcc, hiAcc, sharedmemAccessor);
+	// Barrier between the two transforms since both go through the same shared memory accessor
+	sharedmemAccessor.workgroupExecutionAndMemoryBarrier();
+	IFFT::__call(loAcc, hiAcc, sharedmemAccessor);
+
+	// Store: mirror of the load loop, writing every element back to the address it was read from
+	[unroll]
+	for (uint32_t pair = 0u; pair < ElementsPerInvocationPerChannel / 2; pair++)
+	{
+		complex_t<scalar_t> lo, hi;
+		loAcc.get(pair, lo);
+		accessor.set(invocationIndex + 2 * pair * WorkgroupSize, lo);
+		hiAcc.get(pair, hi);
+		accessor.set(invocationIndex + (2 * pair + 1) * WorkgroupSize, hi);
+	}
+}
\ No newline at end of file
diff --git a/XX_NewFFT/config.json.template b/XX_NewFFT/config.json.template
new file mode 100644
index 000000000..717d05d53
--- /dev/null
+++ b/XX_NewFFT/config.json.template
@@ -0,0 +1,28 @@
+{
+  "enableParallelBuild": true,
+  "threadsPerBuildProcess" : 2,
+  "isExecuted": false,
+  "scriptPath": "",
+  "cmake": {
+    "configurations": [ "Release", "Debug", "RelWithDebInfo" ],
+    "buildModes": [],
+    "requiredOptions": []
+  }, 
+  "profiles": [
+    {
+      "backend": "vulkan", // should be none
+      "platform": "windows",
+      "buildModes": [],
+      "runConfiguration": "Release", // we also need to run in Debug and RelWithDebInfo because this is a foundational example
+      "gpuArchitectures": []
+    }
+  ],
+  "dependencies": [],
+  "data": [
+    {
+      "dependencies": [],
+      "command": [""],
+      "outputs": []
+    }
+  ]
+}
\ No newline at end of file
diff --git a/XX_NewFFT/main.cpp b/XX_NewFFT/main.cpp
new file mode 100644
index 000000000..087117f47
--- /dev/null
+++ b/XX_NewFFT/main.cpp
@@ -0,0 +1,491 @@
+// Copyright (C) 2018-2024 - DevSH Graphics Programming Sp. z O.O.
+// This file is part of the "Nabla Engine".
+// For conditions of distribution and use, see copyright notice in nabla.h
+
+#include "nbl/this_example/builtin/build/spirv/keys.hpp"
+
+#include "nbl/examples/examples.hpp"
+
+using namespace nbl;
+using namespace nbl::core;
+using namespace nbl::hlsl;
+using namespace nbl::system;
+using namespace nbl::asset;
+using namespace nbl::ui;
+using namespace nbl::video;
+using namespace nbl::examples;
+
+#include "app_resources/common.hlsl"
+#include "nbl/builtin/hlsl/bit.hlsl"
+#include "nbl/builtin/hlsl/random/xoroshiro.hlsl"
+
+// Function implemented in workgroup2::FFTIndexingUtils is meant to be fast based on the observation that reordering can still be performed fast in the case of
+// a single prime factor. However, we need to test that the implemented fast version matches the real ordering
+template<uint32_t Radix2ElementsPerInvocationLog2, uint32_t WorkgroupSizeLog2, uint32_t ExtraPrimeFactor>
+void DIFOrderTester()
+{
+	using IndexingUtils = nbl::hlsl::workgroup2::FFTIndexingUtils<Radix2ElementsPerInvocationLog2, WorkgroupSizeLog2, ExtraPrimeFactor>;
+	using IndexingUtilsHelper = typename IndexingUtils::helper_t;
+	const uint32_t FFTSize = IndexingUtils::FFTSize;
+	const uint32_t Radix2FFTSizeLog2 = IndexingUtils::Radix2FFTSizeLog2;
+	const uint32_t Radix2FFTSize = IndexingUtils::Radix2FFTSize;
+	// Runs up to three brute-force checks over every index in [0, FFTSize) and prints one pass/fail line per check
+	// Check fast div correctness (only meaningful when there is an extra factor to divide by)
+	if constexpr (ExtraPrimeFactor > 1)
+	{
+		bool correct = true;
+		for (auto idx = 0u; idx < FFTSize; idx++)
+		{
+			if (idx / ExtraPrimeFactor != IndexingUtilsHelper::fastDiv(idx)) correct = false;
+		}
+		std::cout << "Fast div test " << (correct ? "passed\n" : "did not pass\n");
+	}
+
+	// Check whether the forward ordering is computed properly
+	{
+		bool correct = true;
+		for (auto idx = 0u; idx < FFTSize; idx++)
+		{
+			uint32_t fastIdx = IndexingUtilsHelper::mapLaneToFreq(idx);
+			if constexpr (ExtraPrimeFactor == 1)
+			{
+				if (fastIdx != nbl::hlsl::bitReverseAs<uint32_t>(idx, Radix2FFTSizeLog2)) correct = false;
+			}
+			else
+			{
+				uint32_t index = idx;
+				std::vector<uint32_t> digits;
+				for (auto i = 0u; i < Radix2FFTSizeLog2; i++)
+				{
+					digits.push_back(index & 1);
+					index >>= 1;
+				}
+				digits.push_back(index); // whatever is left above the binary digits is the last, mixed-radix digit
+				// Reconstruct mapping: the digits get reversed weights, the least significant bit getting the largest one
+				uint32_t correctIdx = 0;
+				uint32_t multiplier = ExtraPrimeFactor * Radix2FFTSize;
+				for (auto i = 0u; i < Radix2FFTSizeLog2; i++)
+				{
+					multiplier >>= 1;
+					correctIdx += multiplier * digits[i];
+				}
+				multiplier /= ExtraPrimeFactor; // the weight left over for the last digit
+				correctIdx += multiplier * digits[Radix2FFTSizeLog2];
+				if (fastIdx != correctIdx)
+					correct = false;
+			}
+		}
+		std::cout << "Forward test " << (correct ? "passed\n" : "did not pass\n");
+	}
+
+	// Check whether the inverse actually computes the inverse
+	{
+		bool correct = true;
+		for (auto idx = 0u; idx < FFTSize; idx++) // unsigned like the loops above, so no signed/unsigned comparisons
+		{
+			if (idx != IndexingUtilsHelper::mapFreqToLane(IndexingUtilsHelper::mapLaneToFreq(idx))) correct = false;
+		}
+		std::cout << "Inverse test " << (correct ? "passed\n" : "did not pass\n");
+	}
+}
+
+// Simple showcase of how to run FFT on a 1D array
+class FFT_Test final : public application_templates::MonoDeviceApplication, public BuiltinResourcesApplication
+{
+	using device_base_t = application_templates::MonoDeviceApplication;
+	using asset_base_t = BuiltinResourcesApplication;
+
+	smart_refctd_ptr<IGPUComputePipeline> m_pipeline;
+	// Utilities object the default up/down streaming buffers below are obtained from
+	smart_refctd_ptr<nbl::video::IUtilities> m_utils;
+	// Raw pointers, null until `onAppInitialized` has fetched them from `m_utils`
+	nbl::video::StreamingTransientDataBufferMT<>* m_upStreamingBuffer = nullptr;
+	StreamingTransientDataBufferMT<>* m_downStreamingBuffer = nullptr;
+	smart_refctd_ptr<nbl::video::IGPUBuffer> m_deviceLocalBuffer;
+
+	// These are Buffer Device Addresses, zero until the corresponding buffer exists
+	uint64_t m_upStreamingBufferAddress = 0;
+	uint64_t m_downStreamingBufferAddress = 0;
+	uint64_t m_deviceLocalBufferAddress = 0;
+
+	// You can ask the `nbl::core::GeneralpurposeAddressAllocator` used internally by the Streaming Buffers to give out offsets aligned to a certain multiple (not only Power of Two!)
+	uint32_t m_alignment = 0;
+
+	// This example really lets the advantages of a timeline semaphore shine through!
+	smart_refctd_ptr<ISemaphore> m_timeline;
+	uint64_t semaphorValue = 0;
+
+public:
+	// Yay thanks to multiple inheritance we cannot forward ctors anymore
+	FFT_Test(const path& _localInputCWD, const path& _localOutputCWD, const path& _sharedInputCWD, const path& _sharedOutputCWD) :
+		system::IApplicationFramework(_localInputCWD, _localOutputCWD, _sharedInputCWD, _sharedOutputCWD) {} // forwards the four working-directory paths; no other base or member is initialized explicitly here
+
+	inline core::smart_refctd_ptr<IShader> createShader(const char* includeMainName)
+	{
+		// The HLSL source is a one-line stub that merely #includes the requested file
+		const std::string stub = std::string("#include \"") + includeMainName + "\"\n";
+		auto source = core::make_smart_refctd_ptr<IShader>(stub.c_str(), IShader::E_CONTENT_TYPE::ECT_HLSL, includeMainName);
+		assert(source);
+		ILogicalDevice::SShaderCreationParameters creationParams{ .source = source.get(),
+																  .preprocessedOutputPath = (localOutputCWD / "preprocessed.hlsl").string(),
+																  .spvOutputPath = (localOutputCWD / "out.spv").string() };
+#ifndef _NBL_DEBUG
+		// Outside of debug builds an optimizer with the single EOP_STRIP_DEBUG_INFO pass is attached as extra passes
+		ISPIRVOptimizer::E_OPTIMIZER_PASS stripPass = ISPIRVOptimizer::EOP_STRIP_DEBUG_INFO;
+		auto optimizer = make_smart_refctd_ptr<ISPIRVOptimizer>(std::span<ISPIRVOptimizer::E_OPTIMIZER_PASS>(&stripPass, 1));
+		creationParams.optimizer = optimizer.get();
+		creationParams.optimizerIsExtraPasses = true;
+#endif
+		return m_device->compileShader(creationParams);
+	}
+	
+	// useful for debugging compiler issues
+	inline core::smart_refctd_ptr<IShader> createSpirvShader(const char* spirvName)
+	{
+		// Read the whole SPIR-V file into a CPU buffer; every failure path leaves `shaderBuffer` null
+		core::smart_refctd_ptr<ICPUBuffer> shaderBuffer;
+		{
+			core::smart_refctd_ptr<system::IFile> shaderReadFile;
+			system::ISystem::future_t<core::smart_refctd_ptr<system::IFile>> future;
+			m_system->createFile(future, spirvName, system::IFile::ECF_READ);
+			if (future.wait())
+			{
+				future.acquire().move_into(shaderReadFile);
+				if (shaderReadFile)
+				{
+					const size_t size = shaderReadFile->getSize();
+					if (size > 0ull)
+					{
+						asset::IBuffer::SCreationParams bufferCreationParams{ .size = size };
+						asset::ICPUBuffer::SCreationParams cpuBufferParams;
+						cpuBufferParams = bufferCreationParams;
+						shaderBuffer = ICPUBuffer::create(std::move(cpuBufferParams));
+						system::IFile::success_t succ;
+						shaderReadFile->read(succ, shaderBuffer->getPointer(), 0, size);
+						if (!succ)
+						{
+							m_logger->log("Failed Reading From Shader File.", ILogger::ELL_ERROR);
+							shaderBuffer = nullptr; // do not compile a partially read buffer
+						}
+					}
+				}
+				else
+					m_logger->log("Failed Opening Shader File.", ILogger::ELL_ERROR);
+			}
+			else
+				m_logger->log("Failed Opening Shader Cache File.", ILogger::ELL_ERROR);
+		}
+		if (!shaderBuffer) // nothing usable was loaded: report failure instead of wrapping a null buffer in a shader
+			return nullptr;
+		auto SPIRVShader = core::make_smart_refctd_ptr<IShader>(std::move(shaderBuffer), IShader::E_CONTENT_TYPE::ECT_SPIRV, spirvName);
+		assert(SPIRVShader);
+		return m_device->compileShader({ .source = SPIRVShader.get() });
+	}
+
+	// we stuff all our work here because its a "single shot" app
+	bool onAppInitialized(smart_refctd_ptr<ISystem>&& system) override
+	{
+		// Remember to call the base class initialization!
+		if (!device_base_t::onAppInitialized(smart_refctd_ptr(system)))
+			return false;
+		if (!asset_base_t::onAppInitialized(std::move(system)))
+			return false;
+
+		smart_refctd_ptr<IShader> shader = createShader("app_resources/shader.comp.hlsl");
+		// DEBUG
+		//smart_refctd_ptr<IShader> shader = createSpirvShader("app_resources/optimized.spv");
+		if (!shader)
+			return logFail("Invalid shader!");
+
+		// Create massive upload/download buffers
+		constexpr uint32_t DownstreamBufferSize = sizeof(scalar_t) << 23;
+		constexpr uint32_t UpstreamBufferSize = sizeof(scalar_t) << 23;
+
+		m_utils = IUtilities::create(smart_refctd_ptr(m_device), smart_refctd_ptr(m_logger), DownstreamBufferSize, UpstreamBufferSize);
+		if (!m_utils)
+			return logFail("Failed to create Utilities!");
+		m_upStreamingBuffer = m_utils->getDefaultUpStreamingBuffer();
+		m_downStreamingBuffer = m_utils->getDefaultDownStreamingBuffer();
+		m_upStreamingBufferAddress = m_upStreamingBuffer->getBuffer()->getDeviceAddress();
+		m_downStreamingBufferAddress = m_downStreamingBuffer->getBuffer()->getDeviceAddress();
+
+		// Create device-local buffer
+		{
+			const uint32_t scalarElementCount = 2 * complexElementCount;
+			IGPUBuffer::SCreationParams deviceLocalBufferParams = {};
+
+			IQueue* const queue = getComputeQueue();
+			uint32_t queueFamilyIndex = queue->getFamilyIndex();
+
+			deviceLocalBufferParams.queueFamilyIndexCount = 1;
+			deviceLocalBufferParams.queueFamilyIndices = &queueFamilyIndex;
+			deviceLocalBufferParams.size = sizeof(scalar_t) * scalarElementCount;
+			deviceLocalBufferParams.usage = nbl::asset::IBuffer::E_USAGE_FLAGS::EUF_TRANSFER_SRC_BIT | nbl::asset::IBuffer::E_USAGE_FLAGS::EUF_TRANSFER_DST_BIT | nbl::asset::IBuffer::E_USAGE_FLAGS::EUF_SHADER_DEVICE_ADDRESS_BIT;
+
+			m_deviceLocalBuffer = m_device->createBuffer(std::move(deviceLocalBufferParams));
+			if (!m_deviceLocalBuffer)
+				return logFail("Failed to create the device-local buffer!\n");
+			auto mreqs = m_deviceLocalBuffer->getMemoryReqs();
+			mreqs.memoryTypeBits &= m_device->getPhysicalDevice()->getDeviceLocalMemoryTypeBits();
+			auto gpubufMem = m_device->allocate(mreqs, m_deviceLocalBuffer.get(), IDeviceMemoryAllocation::E_MEMORY_ALLOCATE_FLAGS::EMAF_DEVICE_ADDRESS_BIT);
+			if (!gpubufMem.isValid())
+				return logFail("Failed to allocate memory for the device-local buffer!\n");
+
+			m_deviceLocalBufferAddress = m_deviceLocalBuffer.get()->getDeviceAddress();
+		}
+
+		const nbl::asset::SPushConstantRange pcRange = { .stageFlags = IShader::E_SHADER_STAGE::ESS_COMPUTE,.offset = 0,.size = sizeof(PushConstantData) };
+
+		{
+			auto layout = m_device->createPipelineLayout({ &pcRange,1 });
+			IGPUComputePipeline::SCreationParams params = {};
+			params.layout = layout.get();
+			params.shader.shader = shader.get();
+			params.shader.entryPoint = "main";
+			params.shader.requiredSubgroupSize = static_cast<IPipelineBase::SUBGROUP_SIZE>(SubgroupSizeLog2); // must be the size the shader hardcodes, not the device maximum
+			params.cached.requireFullSubgroups = true;
+			if (!m_device->createComputePipelines(nullptr, { &params,1 }, &m_pipeline))
+				return logFail("Failed to create compute pipeline!\n");
+		}
+
+		const auto& deviceLimits = m_device->getPhysicalDevice()->getLimits();
+		// The ranges of non-coherent mapped memory you flush or invalidate need to be aligned. You'll often see a value of 64 reported by devices
+		// which just happens to coincide with a CPU cache line size. So we ask our streaming buffers during allocation to give us properly aligned offsets.
+		// Sidenote: For SSBOs, UBOs, BufferViews, Vertex Buffer Bindings, Acceleration Structure BDAs, Shader Binding Tables, Descriptor Buffers, etc.
+		// there is also a requirement to bind buffers at offsets which have a certain alignment. Memory binding to Buffers and Images also has those.
+		// We'll align to max of coherent atom size even if the memory is coherent,
+		// and we also need to take into account BDA shader loads need to be aligned to the type being loaded.
+		m_alignment = core::max(deviceLimits.nonCoherentAtomSize, alignof(float));
+
+		// Semaphore used here to know the FFT is done before download
+		m_timeline = m_device->createSemaphore(semaphorValue);
+
+		IQueue* const queue = getComputeQueue();
+
+		// Note that I'm using the sample struct with methods that have identical code which compiles as both C++ and HLSL
+		auto rng = nbl::hlsl::Xoroshiro64StarStar::construct({ semaphorValue ^ 0xdeadbeefu,std::hash<string>()(_NBL_APP_NAME_) });
+
+		const uint32_t scalarElementCount = 2 * complexElementCount;
+		const uint32_t inputSize = sizeof(scalar_t) * scalarElementCount;
+
+		// Just need a single suballocation in this example
+		const uint32_t AllocationCount = 1;
+
+		// It comes with a certain drawback that you need to remember to initialize your "yet unallocated" offsets to the Invalid value
+		// this is to allow a set of allocations to fail, and you to re-try after doing something to free up space without repacking args.
+		auto inputOffset = m_upStreamingBuffer->invalid_value;
+
+		// We always just wait till an allocation becomes possible (during allocation previous "latched" frees get their latch conditions polled)
+		// Freeing of Streaming Buffer Allocations can and should be deferred until an associated polled event signals done (more on that later).
+		std::chrono::steady_clock::time_point waitTill(std::chrono::years(45));
+		// note that the API takes a time-point not a duration, because there are multiple waits and preemptions possible, so the durations wouldn't add up properly
+		m_upStreamingBuffer->multi_allocate(waitTill, AllocationCount, &inputOffset, &inputSize, &m_alignment);
+		if (inputOffset == m_upStreamingBuffer->invalid_value)
+			return logFail("Failed to allocate the input range in the upstreaming buffer!\n");
+
+		// Run DIF Ordering test
+		DIFOrderTester<Radix2ElementsPerInvocationPerChannelLog2, WorkgroupSizeLog2, ExtraPrimeFactor>();
+
+		// Generate our data in-place on the allocated staging buffer. Packing is interleaved in this example!
+		{
+			auto* const inputPtr = reinterpret_cast<scalar_t*>(reinterpret_cast<uint8_t*>(m_upStreamingBuffer->getBufferPointer()) + inputOffset);
+			std::cout << "Begin array CPU\n";
+			for (auto channel = 0; channel < Channels; channel++)
+			{
+				std::cout << "Begin channel " << channel << "\n";
+				for (auto j = 0; j < complexElementCountPerChannel; j++)
+				{
+					//Random array
+
+					//scalar_t x = rng() / scalar_t(nbl::hlsl::numeric_limits<decltype(rng())>::max), y = rng() / scalar_t(nbl::hlsl::numeric_limits<decltype(rng())>::max);
+					//#define DIVIDE
+
+
+					//FFT( (1,0), (0,0), (0,0),... ) = (1,0), (1,0), (1,0),...
+
+
+					scalar_t x = j > 0 ? 0.f : 1.f, y = 0;
+
+					// FFT( (c,0), (c,0), (c,0),... ) = (Nc,0), (0,0), (0,0),...
+
+
+					//scalar_t x = 1.f, y = 0.f;
+
+					inputPtr[2 * complexElementCountPerChannel * channel + 2 * j] = x;
+					inputPtr[2 * complexElementCountPerChannel * channel + 2 * j + 1] = y;
+					std::cout << "(" << x << ", " << y << "), ";
+				}
+				std::cout << "\nEnd channel " << channel << "\n";
+			}
+			std::cout << "\nEnd array CPU\n";
+
+			// Always remember to flush!
+			if (m_upStreamingBuffer->needsManualFlushOrInvalidate())
+			{
+				const auto bound = m_upStreamingBuffer->getBuffer()->getBoundMemory();
+				const ILogicalDevice::MappedMemoryRange range(bound.memory, bound.offset + inputOffset, inputSize);
+				m_device->flushMappedMemoryRanges(1, &range);
+			}
+		}
+
+		// finally allocate our output range
+		const uint32_t outputSize = inputSize;
+
+		auto outputOffset = m_downStreamingBuffer->invalid_value;
+		m_downStreamingBuffer->multi_allocate(waitTill, AllocationCount, &outputOffset, &outputSize, &m_alignment);
+		if (outputOffset == m_downStreamingBuffer->invalid_value)
+			return logFail("Failed to allocate the output range in the downstreaming buffer!\n");
+
+		smart_refctd_ptr<IGPUCommandBuffer> cmdbuf;
+		{
+			smart_refctd_ptr<nbl::video::IGPUCommandPool> cmdpool = m_device->createCommandPool(queue->getFamilyIndex(), IGPUCommandPool::CREATE_FLAGS::TRANSIENT_BIT);
+			if (!cmdpool)
+				return logFail("Failed to create Command Pool!\n");
+			cmdpool->createCommandBuffers(IGPUCommandPool::BUFFER_LEVEL::PRIMARY, { &cmdbuf,1 }, core::smart_refctd_ptr(m_logger));
+			if (!cmdbuf)
+				return logFail("Failed to create Command Buffers!\n");
+			cmdbuf->begin(IGPUCommandBuffer::USAGE::ONE_TIME_SUBMIT_BIT);
+			cmdbuf->bindComputePipeline(m_pipeline.get());
+			// This is the new fun part, pushing constants
+			const PushConstantData pc = {.deviceBufferAddress = m_deviceLocalBufferAddress};
+
+			// Both barriers below guard the same device-local buffer, only their stage/access masks differ
+			IGPUCommandBuffer::SPipelineBarrierDependencyInfo pipelineBarrierInfo = {};
+			decltype(pipelineBarrierInfo)::buffer_barrier_t barrier = {};
+			pipelineBarrierInfo.bufBarriers = { &barrier, 1u };
+			barrier.range.buffer = m_deviceLocalBuffer;
+
+			// Upload: the input lives at `inputOffset` inside the upstreaming buffer, not at its start
+			IGPUCommandBuffer::SBufferCopy uploadCopy = {};
+			uploadCopy.srcOffset = inputOffset;
+			uploadCopy.dstOffset = 0;
+			uploadCopy.size = inputSize;
+			cmdbuf->copyBuffer(m_upStreamingBuffer->getBuffer(), m_deviceLocalBuffer.get(), 1, &uploadCopy);
+
+			// Pipeline barrier: the upload must have landed before the FFT shader reads the buffer
+			barrier.barrier.dep.srcStageMask = PIPELINE_STAGE_FLAGS::COPY_BIT;
+			barrier.barrier.dep.srcAccessMask = ACCESS_FLAGS::MEMORY_WRITE_BITS;
+			barrier.barrier.dep.dstStageMask = PIPELINE_STAGE_FLAGS::COMPUTE_SHADER_BIT;
+			barrier.barrier.dep.dstAccessMask = ACCESS_FLAGS::MEMORY_READ_BITS;
+			cmdbuf->pipelineBarrier(asset::E_DEPENDENCY_FLAGS(0), pipelineBarrierInfo);
+
+			cmdbuf->pushConstants(m_pipeline->getLayout(), IShader::E_SHADER_STAGE::ESS_COMPUTE, 0u, sizeof(pc), &pc);
+			// Remember we do a single workgroup per 1D array in these parts
+			cmdbuf->dispatch(1, 1, 1);
+
+			// Pipeline barrier: wait for FFT shader to be done before copying to downstream buffer
+			barrier.barrier.dep.srcStageMask = PIPELINE_STAGE_FLAGS::COMPUTE_SHADER_BIT;
+			barrier.barrier.dep.srcAccessMask = ACCESS_FLAGS::MEMORY_WRITE_BITS;
+			barrier.barrier.dep.dstStageMask = PIPELINE_STAGE_FLAGS::COPY_BIT;
+			barrier.barrier.dep.dstAccessMask = ACCESS_FLAGS::MEMORY_READ_BITS;
+			cmdbuf->pipelineBarrier(asset::E_DEPENDENCY_FLAGS(0), pipelineBarrierInfo);
+
+			// Download: write into the range we allocated at `outputOffset`, which is where the consumer below reads from
+			IGPUCommandBuffer::SBufferCopy downloadCopy = {};
+			downloadCopy.srcOffset = 0;
+			downloadCopy.dstOffset = outputOffset;
+			downloadCopy.size = outputSize;
+			cmdbuf->copyBuffer(m_deviceLocalBuffer.get(), m_downStreamingBuffer->getBuffer(), 1, &downloadCopy);
+			cmdbuf->end();
+		}
+
+		semaphorValue++;
+		{
+			const IQueue::SSubmitInfo::SCommandBufferInfo cmdbufInfo =
+			{
+				.cmdbuf = cmdbuf.get()
+			};
+			const IQueue::SSubmitInfo::SSemaphoreInfo signalInfo =
+			{
+				.semaphore = m_timeline.get(),
+				.value = semaphorValue,
+				.stageMask = asset::PIPELINE_STAGE_FLAGS::COPY_BIT // the last recorded command is the download copy, so signal once the copies are done
+			};
+
+			const IQueue::SSubmitInfo submitInfo = {
+				.waitSemaphores = {},
+				.commandBuffers = {&cmdbufInfo,1},
+				.signalSemaphores = {&signalInfo,1}
+			};
+
+			m_api->startCapture();
+			queue->submit({ &submitInfo,1 });
+			m_api->endCapture();
+		}
+
+		// We let all latches know what semaphore and counter value has to be passed for the functors to execute
+		const ISemaphore::SWaitInfo futureWait = { m_timeline.get(),semaphorValue };
+
+		// As promised, we can defer an upstreaming buffer deallocation until a fence is signalled
+		// You can also attach an additional optional IReferenceCounted derived object to hold onto until deallocation.
+		m_upStreamingBuffer->multi_deallocate(AllocationCount, &inputOffset, &inputSize, futureWait);
+
+		// Now a new and even more advanced usage of the latched events, we make our own refcounted object with a custom destructor and latch that like we did the commandbuffer.
+		// Instead of making our own and duplicating logic, we'll use one from IUtilities meant for down-staging memory.
+		// Its nice because it will also remember to invalidate our memory mapping if its not coherent.
+		auto latchedConsumer = make_smart_refctd_ptr<IUtilities::CDownstreamingDataConsumer>(
+			IDeviceMemoryAllocation::MemoryRange(outputOffset, outputSize),
+			// Note the use of capture by-value [=] and not by-reference [&] because this lambda will be called asynchronously whenever the event signals
+			[=](const size_t dstOffset, const void* bufSrc, const size_t size)->void
+			{
+				// The unused variable is used for letting the consumer know the subsection of the output we've managed to download
+				// But here we're sure we can get the whole thing in one go because we allocated the whole range ourselves.
+				assert(dstOffset == 0 && size == outputSize);
+
+				std::cout << "Begin array GPU\n";
+				scalar_t* const data = reinterpret_cast<scalar_t*>(const_cast<void*>(bufSrc));
+				for (auto channel = 0; channel < Channels; channel++)
+				{
+					std::cout << "Begin channel " << channel << "\n";
+					for (auto j = 0; j < complexElementCountPerChannel; j++)
+					{
+						std::cout << "(" << data[2 * complexElementCountPerChannel * channel + 2 * j] << ", " << data[2 * complexElementCountPerChannel * channel + 2 * j + 1] << "), ";
+					}
+					std::cout << "\nEnd channel " << channel << "\n";
+				}
+
+				/*
+				#ifdef DIVIDE
+				for (auto j = 0; j < complexElementCount; j++)
+				{
+					std::cout << "(" << data[2 * j] / complexElementCount << ", " << data[2 * j + 1] / complexElementCount << "), ";
+				}
+				#else
+				for (auto j = 0; j < complexElementCount; j++)
+				{
+					std::cout << "(" << data[2 * j] << ", " << data[2 * j + 1] << "), ";
+				}
+				#endif		
+				*/
+
+				std::cout << "\nEnd array GPU\n";
+			},
+			// Its also necessary to hold onto the commandbuffer, even though we take care to not reset the parent pool, because if it
+			// hits its destructor, our automated reference counting will drop all references to objects used in the recorded commands.
+			// It could also be latched in the upstreaming deallocate, because its the same fence.
+			std::move(cmdbuf), m_downStreamingBuffer
+		);
+		// We put a function we want to execute 
+		m_downStreamingBuffer->multi_deallocate(AllocationCount, &outputOffset, &outputSize, futureWait, &latchedConsumer.get());
+
+		return true;
+	}
+
+	// One-shot App
+	bool keepRunning() override { return false; } // always false: all of the work already happened in onAppInitialized
+
+	// One-shot App
+	void workLoopBody() override{} // intentionally empty, keepRunning() never requests an iteration
+
+	// Cleanup
+	bool onAppTerminated() override
+	{
+		// Keep culling the downstreaming buffer's deferred frees until it reports none left, so that the latched
+		// lambdas get to run before the base class termination (the destructors would still wait for them otherwise)
+		for (bool pending = true; pending; pending = static_cast<bool>(m_downStreamingBuffer->cull_frees())) {}
+		return device_base_t::onAppTerminated();
+	}
+};
+
+
+NBL_MAIN_FUNC(FFT_Test)
\ No newline at end of file
diff --git a/XX_NewFFT/pipeline.groovy b/XX_NewFFT/pipeline.groovy
new file mode 100644
index 000000000..1a7b043a4
--- /dev/null
+++ b/XX_NewFFT/pipeline.groovy
@@ -0,0 +1,50 @@
+import org.DevshGraphicsProgramming.Agent
+import org.DevshGraphicsProgramming.BuilderInfo
+import org.DevshGraphicsProgramming.IBuilder
+
+class CStreamingAndBufferDeviceAddressBuilder extends IBuilder
+{
+	public CStreamingAndBufferDeviceAddressBuilder(Agent _agent, _info)
+	{
+		super(_agent, _info)
+	}
+	// Nothing to prepare for this example, always reports success
+	@Override
+	public boolean prepare(Map axisMapping)
+	{
+		return true
+	}
+	// Builds this example's target with `cmake --build` for the configuration and build type in the axis mapping
+	@Override
+	public boolean build(Map axisMapping)
+	{
+		// Read the two axes this step depends on
+		IBuilder.CONFIGURATION configuration = axisMapping.get("CONFIGURATION")
+		IBuilder.BUILD_TYPE typeOfBuild = axisMapping.get("BUILD_TYPE")
+		def buildDirName = getNameOfBuildDirectory(typeOfBuild)
+		def configName = getNameOfConfig(configuration)
+		def buildTree = "${info.rootProjectPath}/${buildDirName}/${info.targetProjectPathRelativeToRoot}"
+		agent.execute("cmake --build ${buildTree} --target ${info.targetBaseName} --config ${configName} -j12 -v")
+		// The result of the command is not inspected, success is reported unconditionally
+		return true
+	}
+	// No tests are run for this example
+	@Override
+	public boolean test(Map axisMapping)
+	{
+		return true
+	}
+	// Nothing is installed for this example
+	@Override
+	public boolean install(Map axisMapping)
+	{
+		return true
+	}
+}
+
+def create(Agent _agent, _info) // constructs and returns the builder defined above for the given agent and info
+{
+	return new CStreamingAndBufferDeviceAddressBuilder(_agent, _info)
+}
+
+return this
\ No newline at end of file