Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -106,6 +106,7 @@ if(NBL_BUILD_EXAMPLES)
add_subdirectory(70_FLIPFluids)
add_subdirectory(71_RayTracingPipeline)
add_subdirectory(72_CooperativeBinarySearch)
add_subdirectory(XX_NewFFT)

if (NBL_BUILD_MITSUBA_LOADER)
add_subdirectory(73_GeometryInspector)
Expand Down
62 changes: 62 additions & 0 deletions XX_NewFFT/CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,62 @@
# Pull in the shared `common` CMake module. OPTIONAL keeps include() from
# aborting on its own when the module cannot be found, so the explicit check
# below is actually reachable and reports where the file is expected to live.
include(common OPTIONAL RESULT_VARIABLE RES)
if(NOT RES)
    message(FATAL_ERROR "common.cmake not found. Should be in {repo_root}/cmake directory")
endif()

# Create the executable project for this example, forwarding the PCH target.
# NOTE(review): the rest of this script relies on ${EXECUTABLE_NAME}, presumably
# defined by this call - confirm against the helper's definition.
nbl_create_executable_project("" "" "" "" "${NBL_EXECUTABLE_PROJECT_CREATION_PCH_TARGET}")

# Optional step: when NBL_EMBED_BUILTIN_RESOURCES is on, every file found under
# app_resources/ is registered as a builtin resource and the resulting target
# is linked into the example executable.
if(NBL_EMBED_BUILTIN_RESOURCES)
    set(RESOURCE_DIR "app_resources")
    set(_BR_TARGET_ ${EXECUTABLE_NAME}_builtinResourceData)

    # Absolute paths passed to the builtin-resource helper below.
    get_filename_component(_OUTPUT_DIRECTORY_HEADER_ "${CMAKE_CURRENT_BINARY_DIR}/include" ABSOLUTE)
    get_filename_component(_OUTPUT_DIRECTORY_SOURCE_ "${CMAKE_CURRENT_BINARY_DIR}/src" ABSOLUTE)
    get_filename_component(_SEARCH_DIRECTORIES_ "${CMAKE_CURRENT_SOURCE_DIR}" ABSOLUTE)

    # Collect resource files relative to the resource directory; CONFIGURE_DEPENDS
    # re-runs the glob at build time so newly added files are noticed.
    file(
        GLOB_RECURSE BUILTIN_RESOURCE_FILES
        RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}/${RESOURCE_DIR}"
        CONFIGURE_DEPENDS
        "${CMAKE_CURRENT_SOURCE_DIR}/${RESOURCE_DIR}/*"
    )
    foreach(RES_FILE IN LISTS BUILTIN_RESOURCE_FILES)
        LIST_BUILTIN_RESOURCE(RESOURCES_TO_EMBED "${RES_FILE}")
    endforeach()

    ADD_CUSTOM_BUILTIN_RESOURCES(
        ${_BR_TARGET_}
        RESOURCES_TO_EMBED
        "${_SEARCH_DIRECTORIES_}"
        "${RESOURCE_DIR}"
        "nbl::this_example::builtin"
        "${_OUTPUT_DIRECTORY_HEADER_}"
        "${_OUTPUT_DIRECTORY_SOURCE_}"
    )

    LINK_BUILTIN_RESOURCES_TO_TARGET(${EXECUTABLE_NAME} ${_BR_TARGET_})
endif()

# Directory (inside the build tree) used as BINARY_DIR for the shader compile
# rules and as the BIND directory of the resource archive below.
set(OUTPUT_DIRECTORY "${CMAKE_CURRENT_BINARY_DIR}/auto-gen")

# Shader model suffix used to build the `-T lib_<SM>` target profile.
set(SM 6_8)

# Shader inputs handed to NBL_CREATE_NSC_COMPILE_RULES.
# Kept as strict JSON (no trailing comma after the last member) so that it
# parses regardless of how lenient the JSON reader consuming it is.
set(JSON [=[
[
    {
        "INPUT": "app_resources/shader.comp.hlsl",
        "KEY": "shader"
    }
]
]=])
# Expands any ${VAR}/@VAR@ references inside the JSON text (none at present).
string(CONFIGURE "${JSON}" JSON)

# Options shared by every compiled input: include root and target profile.
set(COMPILE_OPTIONS
    -I "${CMAKE_CURRENT_SOURCE_DIR}"
    -T lib_${SM}
)

# Compile the listed shaders and link the result to the example; the helper
# reports its generated keys through the KEYS output variable.
NBL_CREATE_NSC_COMPILE_RULES(
    TARGET ${EXECUTABLE_NAME}SPIRV
    LINK_TO ${EXECUTABLE_NAME}
    BINARY_DIR ${OUTPUT_DIRECTORY}
    MOUNT_POINT_DEFINE NBL_THIS_EXAMPLE_BUILD_MOUNT_POINT
    COMMON_OPTIONS ${COMPILE_OPTIONS}
    OUTPUT_VAR KEYS
    INCLUDE nbl/this_example/builtin/build/spirv/keys.hpp
    NAMESPACE nbl::this_example::builtin::build
    INPUTS ${JSON}
)

# Bundle the outputs identified by KEYS into a resource archive bound to the
# same output directory and link it to the example.
NBL_CREATE_RESOURCE_ARCHIVE(
    NAMESPACE nbl::this_example::builtin::build
    TARGET ${EXECUTABLE_NAME}_builtinsBuild
    LINK_TO ${EXECUTABLE_NAME}
    BIND ${OUTPUT_DIRECTORY}
    BUILTINS ${KEYS}
)
29 changes: 29 additions & 0 deletions XX_NewFFT/app_resources/common.hlsl
Original file line number Diff line number Diff line change
@@ -0,0 +1,29 @@
#include "nbl/builtin/hlsl/cpp_compat.hlsl"
#include "nbl/builtin/hlsl/workgroup2/fft.hlsl"

// Scalar type used for the real and imaginary parts of every FFT element.
using scalar_t = nbl::hlsl::float32_t;

// Push constant block: carries the device address of the buffer that the
// shader reads its input from and writes its output to.
struct PushConstantData
{
    uint64_t deviceBufferAddress;
};

// Workgroup and subgroup sizes, expressed as log2 plus the derived count.
NBL_CONSTEXPR uint32_t WorkgroupSizeLog2 = 9;
NBL_CONSTEXPR uint32_t WorkgroupSize = uint32_t(1) << WorkgroupSizeLog2;
NBL_CONSTEXPR uint32_t SubgroupSizeLog2 = 5; // hardcoded to the author's NVIDIA GPU, should be queried at compilation time
NBL_CONSTEXPR uint32_t SubgroupSize = uint32_t(1) << SubgroupSizeLog2;

// Elements handled by each invocation per channel: a power-of-two part times
// an extra factor (currently 1).
NBL_CONSTEXPR uint32_t Radix2ElementsPerInvocationPerChannelLog2 = 1;
NBL_CONSTEXPR uint32_t ExtraPrimeFactor = uint32_t(1);
NBL_CONSTEXPR uint32_t ElementsPerInvocationPerChannel = ExtraPrimeFactor * (uint32_t(1) << Radix2ElementsPerInvocationPerChannelLog2);

// Total complex element counts processed by one workgroup.
NBL_CONSTEXPR uint32_t Channels = 1;
NBL_CONSTEXPR uint32_t complexElementCountPerChannel = ElementsPerInvocationPerChannel * (uint32_t(1) << WorkgroupSizeLog2);
NBL_CONSTEXPR uint32_t complexElementCount = Channels * complexElementCountPerChannel;

// One "virtual channel" per lo/hi pair held by an invocation; at most 4 of
// them are passed as the ShuffledVirtualChannelsPerRound parameter.
NBL_CONSTEXPR uint16_t InnerVirtualChannels = Channels * (ElementsPerInvocationPerChannel >> 1);
NBL_CONSTEXPR uint32_t ShuffledVirtualChannelsPerRound = nbl::hlsl::mpl::min_v<uint32_t, InnerVirtualChannels, 4>;

// NOTE(review): ShareTwiddles is not referenced by the alias below, which
// passes literal booleans instead - confirm whether it should be wired in.
NBL_CONSTEXPR bool ShareTwiddles = true;

// Fully qualified like every other library name in this header: no
// `using namespace nbl::hlsl` is in effect at this point when the header is
// included first.
using ConstevalParameters = nbl::hlsl::workgroup2::fft::ConstevalParameters<ElementsPerInvocationPerChannel, Channels, SubgroupSizeLog2, WorkgroupSizeLog2, ShuffledVirtualChannelsPerRound, false, true, 0, scalar_t>;
134 changes: 134 additions & 0 deletions XX_NewFFT/app_resources/shader.comp.hlsl
Original file line number Diff line number Diff line change
@@ -0,0 +1,134 @@
#include "common.hlsl"
#include "nbl/builtin/hlsl/subgroup2/fft.hlsl"
#include "nbl/builtin/hlsl/workgroup2/fft.hlsl"
#include "nbl/builtin/hlsl/workgroup/basic.hlsl"

// Push constants for this dispatch; supplies the device address of the data buffer.
[[vk::push_constant]] PushConstantData pushConstants;

using namespace nbl::hlsl;

// Workgroup-shared scratch storage, sized in 32-bit words:
// 4 * (words per complex_t<scalar_t>) * 2^WorkgroupSizeLog2.
groupshared uint32_t sharedmem[4 * ((sizeof(complex_t<scalar_t>) / sizeof(uint32_t)) << WorkgroupSizeLog2) ];

// Thin accessor over the `sharedmem` groupshared array: indexed reads and
// writes plus a workgroup barrier.
struct SharedMemoryAccessor
{
    // Reads the word at `idx` from shared memory into `value`.
    template <typename AccessType, typename IndexType>
    void get(IndexType idx, NBL_REF_ARG(AccessType) value)
    {
        value = sharedmem[idx];
    }

    // Writes `value` to the word at `idx` in shared memory.
    template <typename AccessType, typename IndexType>
    void set(IndexType idx, AccessType value)
    {
        sharedmem[idx] = value;
    }

    // Barrier hook; forwards to glsl::barrier().
    void workgroupExecutionAndMemoryBarrier()
    {
        glsl::barrier();
    }
};

// Almost a LegacyBdaAccessor, but since we need a `uint32_t index` getter and setter
// it is no more work to write our own.
// Addresses elements of a buffer through a raw 64-bit device address.
struct Accessor
{
    // Base device address of the buffer.
    uint64_t address;

    // Builds an accessor rooted at `address`.
    static Accessor create(const uint64_t address)
    {
        Accessor result;
        result.address = address;
        return result;
    }

    // TODO: can't use our own BDA yet, because it doesn't support the types `workgroup::FFT` will invoke these templates with
    // Stores `value` as the `index`-th AccessType-sized element after the base address.
    template <typename AccessType, typename IndexType>
    void set(const IndexType index, const AccessType value)
    {
        const uint64_t elementAddress = address + index * sizeof(AccessType);
        vk::RawBufferStore<AccessType>(elementAddress, value);
    }

    // Loads the `index`-th AccessType-sized element after the base address into `value`.
    template <typename AccessType, typename IndexType>
    void get(const IndexType index, NBL_REF_ARG(AccessType) value)
    {
        const uint64_t elementAddress = address + index * sizeof(AccessType);
        value = vk::RawBufferLoad<AccessType>(elementAddress);
    }
};


// Per-invocation storage for complex values, kept as separate real and
// imaginary arrays indexed by [channel][pair].
template<uint16_t ChannelCount, uint16_t PairCount>
struct InvocationElementsAccessor
{
    scalar_t real[ChannelCount][PairCount];
    scalar_t imag[ChannelCount][PairCount];

    // Stores both components of `value` at (channel, pair).
    void set(uint32_t channel, uint32_t pair, NBL_CONST_REF_ARG(complex_t<scalar_t>) value)
    {
        real[channel][pair] = value.real();
        imag[channel][pair] = value.imag();
    }

    // Fills `value` with the components stored at (channel, pair).
    void get(uint32_t channel, uint32_t pair, NBL_REF_ARG(complex_t<scalar_t>) value)
    {
        value.real(real[channel][pair]);
        value.imag(imag[channel][pair]);
    }
};

// Per-invocation element storage: ElementsPerInvocationPerChannel / 2 complex values for each channel.
using _InvocationElementsAccessor = InvocationElementsAccessor<Channels, ElementsPerInvocationPerChannel / 2>;
// Library adaptor wrapped around that storage; main() uses it through set(pair, value) / get(pair, value).
using ElementsAccessorAdaptor = workgroup2::fft::WorkgroupRadix2AccessorAdaptor<Channels, scalar_t, _InvocationElementsAccessor>;

// Compute entry point. Each invocation loads its lo/hi complex elements from
// the buffer, runs the transform aliased as FFT followed by the one aliased
// as IFFT on them, and writes the results back to the same buffer locations.
[numthreads(WorkgroupSize, 1, 1)]
[shader("compute")]
void main(uint32_t3 ID : SV_DispatchThreadID)
{
    // Global memory access through the device address from the push constants.
    Accessor bufferAccessor = Accessor::create(pushConstants.deviceBufferAddress);

    // Per-invocation storage for the "lo" and "hi" halves, each wrapped in the radix-2 adaptor.
    _InvocationElementsAccessor loStorage;
    ElementsAccessorAdaptor loAcc = ElementsAccessorAdaptor::create(loStorage);
    _InvocationElementsAccessor hiStorage;
    ElementsAccessorAdaptor hiAcc = ElementsAccessorAdaptor::create(hiStorage);

    // Shared memory accessor handed to both transforms.
    SharedMemoryAccessor sharedmemAccessor;

    using FFT = workgroup2::impl::InnerFFT<false, ConstevalParameters>;
    using IFFT = workgroup2::impl::InnerFFT<true, ConstevalParameters>;

    // This invocation's index; buffer elements are strided by WorkgroupSize.
    const uint32_t invocationIndex = uint32_t(workgroup::SubgroupContiguousIndex());

    // Load: even strides feed the lo accessor, odd strides feed the hi accessor.
    [unroll]
    for (uint32_t pair = 0u; pair < ElementsPerInvocationPerChannel / 2; pair++)
    {
        const uint32_t loIndex = invocationIndex + 2 * pair * WorkgroupSize;
        const uint32_t hiIndex = invocationIndex + (2 * pair + 1) * WorkgroupSize;

        complex_t<scalar_t> lo, hi;
        bufferAccessor.get(loIndex, lo);
        loAcc.set(pair, lo);
        bufferAccessor.get(hiIndex, hi);
        hiAcc.set(pair, hi);
    }

    // Run FFT, then IFFT, with a workgroup barrier in between.
    FFT::__call(loAcc, hiAcc, sharedmemAccessor);
    sharedmemAccessor.workgroupExecutionAndMemoryBarrier();
    IFFT::__call(loAcc, hiAcc, sharedmemAccessor);

    // Store: mirror of the load loop, writing back to the same indices.
    [unroll]
    for (uint32_t pair = 0u; pair < ElementsPerInvocationPerChannel / 2; pair++)
    {
        const uint32_t loIndex = invocationIndex + 2 * pair * WorkgroupSize;
        const uint32_t hiIndex = invocationIndex + (2 * pair + 1) * WorkgroupSize;

        complex_t<scalar_t> lo, hi;
        loAcc.get(pair, lo);
        bufferAccessor.set(loIndex, lo);
        hiAcc.get(pair, hi);
        bufferAccessor.set(hiIndex, hi);
    }
}
28 changes: 28 additions & 0 deletions XX_NewFFT/config.json.template
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
{
"enableParallelBuild": true,
"threadsPerBuildProcess" : 2,
"isExecuted": false,
"scriptPath": "",
"cmake": {
"configurations": [ "Release", "Debug", "RelWithDebInfo" ],
"buildModes": [],
"requiredOptions": []
},
"profiles": [
{
"backend": "vulkan", // should be none
"platform": "windows",
"buildModes": [],
"runConfiguration": "Release", // we also need to run in Debug and RWDI because this is a foundational example
"gpuArchitectures": []
}
],
"dependencies": [],
"data": [
{
"dependencies": [],
"command": [""],
"outputs": []
}
]
}
Loading