Skip to content

Commit bb6b7a6

Browse files
committed
GPU: Add GPUCA_DETERMINISTIC_NO_FTZ
1 parent c40b6e4 commit bb6b7a6

9 files changed

Lines changed: 69 additions & 44 deletions

File tree

GPU/GPUTracking/Base/GPUReconstruction.cxx

Lines changed: 10 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -263,17 +263,17 @@ int32_t GPUReconstruction::InitPhaseBeforeDevice()
263263
if (GetProcessingSettings().debugLevel > 0) {
264264
mProcessingSettings->recoTaskTiming = true;
265265
}
266-
if (GetProcessingSettings().deterministicGPUReconstruction == -1) {
266+
bool detMode = false;
267267
#ifdef GPUCA_DETERMINISTIC_MODE
268-
mProcessingSettings->deterministicGPUReconstruction = 1;
269-
#else
270-
mProcessingSettings->deterministicGPUReconstruction = GetProcessingSettings().debugLevel >= 6;
268+
detMode = true;
271269
#endif
270+
if (GetProcessingSettings().deterministicGPUReconstruction == -1) {
271+
mProcessingSettings->deterministicGPUReconstruction = detMode ? 1 : (GetProcessingSettings().debugLevel >= 6);
272272
}
273273
if (GetProcessingSettings().deterministicGPUReconstruction) {
274-
#ifndef GPUCA_DETERMINISTIC_MODE
275-
GPUError("WARNING, deterministicGPUReconstruction needs GPUCA_DETERMINISTIC_MODE for being fully deterministic, without only most indeterminism by concurrency is removed, but floating point effects remain!");
276-
#endif
274+
if (!detMode) {
275+
GPUError("WARNING, deterministicGPUReconstruction needs GPUCA_DETERMINISTIC_MODE for being fully deterministic, without only most indeterminism by concurrency is removed, but floating point effects remain!");
276+
}
277277
if (mProcessingSettings->debugLevel >= 6 && ((mProcessingSettings->debugMask + 1) & mProcessingSettings->debugMask)) {
278278
GPUError("WARNING: debugMask %d - debug output might not be deterministic with intermediate steps missing", mProcessingSettings->debugMask);
279279
}
@@ -283,9 +283,9 @@ int32_t GPUReconstruction::InitPhaseBeforeDevice()
283283
}
284284
mProcessingSettings->rtc.deterministic = 1;
285285
} else {
286-
#ifdef GPUCA_DETERMINISTIC_MODE
287-
GPUError("WARNING, compiled with GPUCA_DETERMINISTIC_MODE but deterministicGPUReconstruction not set, only compile-time determinism and deterministic math enforced, not fully deterministic!");
288-
#endif
286+
if (detMode) {
287+
GPUError("WARNING, compiled with GPUCA_DETERMINISTIC_MODE but deterministicGPUReconstruction not set, only compile-time determinism and deterministic math enforced, not fully deterministic!");
288+
}
289289
}
290290
if (GetProcessingSettings().deterministicGPUReconstruction && GetProcessingSettings().debugLevel >= 6) {
291291
mProcessingSettings->nTPCClustererLanes = 1;

GPU/GPUTracking/DataCompression/GPUTPCCompressionKernels.cxx

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -271,7 +271,7 @@ GPUdii() void GPUTPCCompressionKernels::Thread<GPUTPCCompressionKernels::step1un
271271
#ifdef GPUCA_GPUCODE
272272
static_assert(GPUCA_GET_THREAD_COUNT(GPUCA_LB_GPUTPCCompressionKernels_step1unattached) * 2 <= constants::TPC_COMP_CHUNK_SIZE);
273273
#endif
274-
#ifdef GPUCA_DETERMINISTIC_MODE // Not using GPUCA_DETERMINISTIC_CODE, which is enforced in TPC compression
274+
#ifdef GPUCA_DETERMINISTIC_MODE
275275
CAAlgo::sortInBlock(sortBuffer, sortBuffer + count, GPUTPCCompressionKernels_Compare<GPUSettings::SortZPadTime>(clusters->clusters[iSector][iRow]));
276276
#else // GPUCA_DETERMINISTIC_MODE
277277
if (param.rec.tpc.compressionSortOrder == GPUSettings::SortZPadTime) {

GPU/GPUTracking/Definitions/GPUSettingsList.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -613,7 +613,7 @@ AddOption(inputcontrolmem, uint64_t, 0, "inputMemory", 0, "Use predefined input
613613
AddOption(cpuAffinity, int32_t, -1, "", 0, "Pin CPU affinity to this CPU core", min(-1))
614614
AddOption(fifoScheduler, bool, false, "", 0, "Use FIFO realtime scheduler", message("Setting FIFO scheduler: %s"))
615615
AddOption(fpe, int8_t, -1, "", 0, "Trap on floating point exceptions (-1 = if no ffast-math)")
616-
AddOption(flushDenormals, bool, true, "", 0, "Enable FTZ and DAZ (Flush all denormals to zero)")
616+
AddOption(flushDenormals, int8_t, -1, "", 0, "Enable FTZ and DAZ (Flush all denormals to zero), -1 = enable automatically if not prevented by deterministic mode")
617617
AddOption(solenoidBzNominalGPU, float, -1e6f, "", 0, "Field strength of solenoid Bz in kGaus")
618618
AddOption(constBz, bool, false, "", 0, "Force constand Bz")
619619
AddOption(overrideMaxTimebin, bool, false, "", 0, "Override max time bin setting for continuous data with max time bin in time frame")

GPU/GPUTracking/Standalone/Benchmark/CMakeLists.txt

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -35,6 +35,10 @@ install(DIRECTORY ../tools DESTINATION ${CMAKE_INSTALL_DATAROOTDIR}/GPU)
3535
install(DIRECTORY ../../Definitions/Parameters/ DESTINATION ${CMAKE_INSTALL_DATAROOTDIR}/GPU/tools FILES_MATCHING REGEX "\\.(python|sh|cmake)")
3636
target_compile_definitions(${targetName} PRIVATE $<TARGET_PROPERTY:O2::GPUTracking,COMPILE_DEFINITIONS>)
3737

38+
if(GPUCA_DETERMINISTIC_MODE GREATER_EQUAL ${GPUCA_DETERMINISTIC_MODE_MAP_NO_FAST_MATH} AND GPUCA_DETERMINISTIC_NO_FTZ)
39+
target_compile_definitions(${targetName} PRIVATE GPUCA_DETERMINISTIC_NO_FTZ)
40+
endif()
41+
3842
if(ROOT_FOUND)
3943
target_sources(${targetName} PRIVATE ../../qa/genEvents.cxx)
4044
endif()

GPU/GPUTracking/Standalone/Benchmark/standalone.cxx

Lines changed: 8 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -141,7 +141,14 @@ int32_t ReadConfiguration(int argc, char** argv)
141141
#endif
142142
feenableexcept(FE_INVALID | FE_DIVBYZERO | FE_OVERFLOW);
143143
}
144-
if (configStandalone.flushDenormals) {
144+
bool detMode = false, noFTZMode = false;
145+
#ifdef GPUCA_DETERMINISTIC_MODE
146+
detMode = true;
147+
#endif
148+
#ifdef GPUCA_DETERMINISTIC_NO_FTZ
149+
noFTZMode = true;
150+
#endif
151+
if (configStandalone.flushDenormals >= 1 || (configStandalone.flushDenormals == -1 && (configStandalone.proc.deterministicGPUReconstruction >= 1 || (configStandalone.proc.deterministicGPUReconstruction == -1 && detMode)) && !noFTZMode)) {
145152
disable_denormals();
146153
}
147154

GPU/GPUTracking/Standalone/cmake/config.cmake

Lines changed: 24 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -19,26 +19,27 @@ set(GPUCA_CONFIG_VC 1)
1919
set(GPUCA_CONFIG_FMT 1)
2020
set(GPUCA_CONFIG_ROOT 1)
2121
set(GPUCA_CONFIG_ONNX 0)
22-
set(GPUCA_BUILD_EVENT_DISPLAY 1)
23-
set(GPUCA_BUILD_EVENT_DISPLAY_FREETYPE 1)
24-
set(GPUCA_BUILD_EVENT_DISPLAY_VULKAN 1)
25-
set(GPUCA_BUILD_EVENT_DISPLAY_WAYLAND 1)
26-
set(GPUCA_BUILD_EVENT_DISPLAY_QT 1)
27-
set(GPUCA_CONFIG_GL3W 0)
28-
set(GPUCA_CONFIG_O2 1)
29-
set(GPUCA_BUILD_DEBUG 0)
30-
set(GPUCA_BUILD_DEBUG_SANITIZE 0)
31-
set(GPUCA_BUILD_DEBUG_HOSTONLY 0)
32-
set(GPUCA_DETERMINISTIC_MODE 0) # OFF / NO_FAST_MATH / OPTO2 / GPU / WHOLEO2
33-
#set(GPUCA_CUDA_GCCBIN c++-14)
34-
#set(GPUCA_OPENCL_CLANGBIN clang-20)
35-
set(HIP_AMDGPUTARGET "default") # "gfx906;gfx908;gfx90a"
36-
set(CUDA_COMPUTETARGET "default") # 86 89
37-
#set(GPUCA_CUDA_COMPILE_MODE perkernel) # onefile / perkernel / rtc
38-
#set(GPUCA_HIP_COMPILE_MODE perkernel)
39-
#set(GPUCA_RTC_NO_COMPILED_KERNELS 1)
40-
#set(GPUCA_KERNEL_RESOURCE_USAGE_VERBOSE 1)
41-
#set(GPUCA_CONFIG_COMPILER gcc) # gcc / clang
42-
#set(GPUCA_CONFIG_WERROR 1)
43-
#add_definitions(-DGPUCA_GPU_DEBUG_PRINT)
44-
#set(GPUCA_OVERRIDE_PARAMETER_FILE "foo.csv")
22+
set(GPUCA_BUILD_EVENT_DISPLAY 1) # Enable compilation of event display
23+
set(GPUCA_BUILD_EVENT_DISPLAY_FREETYPE 1) # Use FreeType library to render fonts for event display
24+
set(GPUCA_BUILD_EVENT_DISPLAY_VULKAN 1) # Enable Vulkan backend for event display (otherwise only OpenGL / Win32)
25+
set(GPUCA_BUILD_EVENT_DISPLAY_WAYLAND 1) # Enable native wayland frontend for event display
26+
set(GPUCA_BUILD_EVENT_DISPLAY_QT 1) # Use QT for Event Display GUI
27+
set(GPUCA_CONFIG_GL3W 0) # Use GL3W instead of glew
28+
set(GPUCA_CONFIG_O2 1) # Compile for O2 data, 0 for Run 2 data
29+
set(GPUCA_BUILD_DEBUG 0) # Enable debug mode (-O0, -ggdb, enable asserts)
30+
set(GPUCA_BUILD_DEBUG_SANITIZE 0) # Enable undefined behavior and address sanitizers
31+
set(GPUCA_BUILD_DEBUG_HOSTONLY 0) # Only compile host code in debug mode, GPU code compiled normally
32+
set(GPUCA_DETERMINISTIC_MODE 0) # OFF / NO_FAST_MATH / OPTO2 / GPU / WHOLEO2
33+
set(GPUCA_DETERMINISTIC_NO_FTZ 0) # If 1 and deterministic mode active, do not apply flush denormals to zero
34+
#set(GPUCA_CUDA_GCCBIN c++-14) # Override which GCC to use for CUDA
35+
#set(GPUCA_OPENCL_CLANGBIN clang-20) # Override which clang to use for OpenCL
36+
set(HIP_AMDGPUTARGET "default") # Set AMD GPU targets to compile for: e.g. "gfx906;gfx908;gfx90a"
37+
set(CUDA_COMPUTETARGET "default") # Set NVIDIA GPU targets to compile for: e.g. "89;120"
38+
#set(GPUCA_CUDA_COMPILE_MODE perkernel) # Mode to compile kernels for CUDA: onefile / perkernel / rtc
39+
#set(GPUCA_HIP_COMPILE_MODE perkernel) # Mode to compile kernels for HIP: onefile / perkernel / rtc
40+
#set(GPUCA_RTC_NO_COMPILED_KERNELS 1) # Do not compile "perkernel" kernels at compile time, support only RTC
41+
#set(GPUCA_KERNEL_RESOURCE_USAGE_VERBOSE 1) # Verbose resource usage output during kernel compilation
42+
#set(GPUCA_CONFIG_COMPILER gcc) # Compiler to use for standalone compilation: gcc / clang
43+
#set(GPUCA_CONFIG_WERROR 1) # Enforce Werror
44+
#add_definitions(-DGPUCA_GPU_DEBUG_PRINT) # Enable LOG(...) macros and GPUInfo(...) etc. in GPU code
45+
#set(GPUCA_OVERRIDE_PARAMETER_FILE "foo.csv") # Override the CSV or JSON file that contains GPU parameters

GPU/GPUTracking/utils/qmaths_helpers.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -26,7 +26,7 @@
2626

2727
static void disable_denormals()
2828
{
29-
#if !(defined(__ARM_NEON) || defined(__aarch64__)) && __has_include(<xmmintrin.h>) // clang-format off
29+
#if !(defined(__ARM_NEON) || defined(__aarch64__)) && __has_include(<xmmintrin.h>)
3030
#if defined(_MM_FLUSH_ZERO_OFF) && defined(_MM_DENORMALS_ZERO_ON)
3131
_MM_SET_FLUSH_ZERO_MODE(_MM_FLUSH_ZERO_ON);
3232
_MM_SET_DENORMALS_ZERO_MODE(_MM_DENORMALS_ZERO_ON);

GPU/documentation/deterministic-mode.md

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -30,3 +30,9 @@ Beyond comparing only the number of clusters and number of tracks, it is also po
3030
It will create a dump containing all (most) intermediate results in text form, which can be compared. The output file is called `CPU.out` if using the CPU backend, and `GPU.out` for the GPU backend.
3131
Note that the dump files will be huge and the processing will be slow and consume much more memory than normal with `--debug 6`. It has been tested with datasets containing up to 50 Pb-Pb collisions, and might fail for larger data.
3232
If the deterministic mode is used with both compile-time and runtime activation, the dump files should be 100% identical and can simply be compared with `diff`.
33+
34+
By default, the deterministic mode will apply flush-to-zero and denormals-are-zero to denormal floats.
35+
This can be disabled via `-DGPUCA_DETERMINISTIC_NO_FTZ=1`.
36+
Note that some GPUs cannot do precise float computation with denormals flushed to zero, while other GPUs do not support denormals at all.
37+
Thus, comparing deterministic CPU and GPU results might require this setting to be explicitly enabled or disabled, depending on the GPU.
38+
CPU results for the 2 cases will always differ, since the floating point math will be slightly different.

dependencies/FindO2GPU.cmake

Lines changed: 14 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,7 @@
1010
# or submit itself to any jurisdiction.
1111

1212
# NOTE!!!! - Whenever this file is changed, move it over to alidist/resources
13-
# FindO2GPU.cmake Version 14
13+
# FindO2GPU.cmake Version 15
1414

1515
set(CUDA_COMPUTETARGET_DEFAULT_FULL 80-real 86-real 89-real 120-real 75-virtual)
1616
set(HIP_AMDGPUTARGET_DEFAULT_FULL gfx906;gfx908)
@@ -137,16 +137,23 @@ elseif(NOT GPUCA_DETERMINISTIC_MODE MATCHES "^[0-9]+$")
137137
endif()
138138
set(GPUCA_DETERMINISTIC_MODE ${GPUCA_DETERMINISTIC_MODE_MAP_${GPUCA_DETERMINISTIC_MODE}})
139139
endif()
140-
if (CMAKE_SYSTEM_NAME MATCHES Darwin OR NOT CMAKE_SYSTEM_PROCESSOR MATCHES "(x86)|(X86)|(amd64)|(AMD64)")
140+
if(GPUCA_DETERMINISTIC_NO_FTZ)
141141
set(GPUCA_CXX_DENORMALS_FLAGS "")
142+
set(GPUCA_CUDA_DENORMALS_FLAGS "--ftz=false")
143+
set(GPUCA_OCL_DENORMALS_FLAGS "")
144+
set(GPUCA_HIP_DENORMALS_FLAGS "-fno-gpu-flush-denormals-to-zero")
142145
else()
143-
set(GPUCA_CXX_DENORMALS_FLAGS "-mdaz-ftz")
146+
if (CMAKE_SYSTEM_NAME MATCHES Darwin OR NOT CMAKE_SYSTEM_PROCESSOR MATCHES "(x86)|(X86)|(amd64)|(AMD64)")
147+
set(GPUCA_CXX_DENORMALS_FLAGS "")
148+
else()
149+
set(GPUCA_CXX_DENORMALS_FLAGS "-mdaz-ftz")
150+
endif()
151+
set(GPUCA_CUDA_DENORMALS_FLAGS "--ftz=true")
152+
set(GPUCA_OCL_DENORMALS_FLAGS "-cl-denorms-are-zero")
153+
set(GPUCA_HIP_DENORMALS_FLAGS "-fgpu-flush-denormals-to-zero")
144154
endif()
145-
set(GPUCA_CUDA_DENORMALS_FLAGS "--ftz=true")
146-
set(GPUCA_OCL_DENORMALS_FLAGS "-cl-denorms-are-zero")
147-
set(GPUCA_HIP_DENORMALS_FLAGS "-fgpu-flush-denormals-to-zero")
148155
set(GPUCA_CXX_NO_FAST_MATH_FLAGS "-fno-fast-math -ffp-contract=off")
149-
set(GPUCA_CUDA_NO_FAST_MATH_FLAGS "--prec-div=true --prec-sqrt=true --fmad false")
156+
set(GPUCA_CUDA_NO_FAST_MATH_FLAGS "--prec-div=true --prec-sqrt=true --fmad false -Xcompiler -fno-fast-math -Xcompiler -ffp-contract=off")
150157
set(GPUCA_OCL_NO_FAST_MATH_FLAGS -cl-fp32-correctly-rounded-divide-sqrt )
151158
if(GPUCA_DETERMINISTIC_MODE GREATER_EQUAL ${GPUCA_DETERMINISTIC_MODE_MAP_WHOLEO2})
152159
add_definitions(-DGPUCA_DETERMINISTIC_MODE)

0 commit comments

Comments
 (0)