diff --git a/.github/scripts/setup_cuda.ps1 b/.github/scripts/setup_cuda.ps1 index 21cc2303c..2304e1217 100644 --- a/.github/scripts/setup_cuda.ps1 +++ b/.github/scripts/setup_cuda.ps1 @@ -108,4 +108,4 @@ Write-Output "CMAKE_CUDA_COMPILER=$dst\bin\nvcc.exe" >> $env:GITHUB_ENV Write-Output "NVCC_APPEND_FLAGS=-allow-unsupported-compiler" >> $env:GITHUB_ENV Write-Output "CUDA_VERSION=$CUDA_VERSION_FULL" >> $env:GITHUB_ENV -Write-Output "Setup completed." \ No newline at end of file +Write-Output "Setup completed." diff --git a/.github/workflows/compile.yml b/.github/workflows/compile.yml index c86de84f1..45c3f3fbc 100644 --- a/.github/workflows/compile.yml +++ b/.github/workflows/compile.yml @@ -60,7 +60,7 @@ jobs: # compile QuEST with all combinations of below flags matrix: - os: [windows-latest, ubuntu-latest, macos-latest] + os: [windows-2022, ubuntu-latest, macos-latest] precision: [1, 2, 4] omp: [ON, OFF] mpi: [ON, OFF] @@ -80,7 +80,7 @@ jobs: - os: macos-latest compiler: clang++ deprecated: ON - - os: windows-latest + - os: windows-2022 compiler: cl deprecated: OFF @@ -107,7 +107,7 @@ jobs: # cannot use cuquantum on Windows or MacOS - cuquantum: ON - os: windows-latest + os: windows-2022 - cuquantum: ON os: macos-latest @@ -130,14 +130,14 @@ jobs: mpilib: 'msmpi' # MacOS: [MPICH, OpenMPI] - os: macos-latest mpilib: 'impi' - - os: windows-latest + - os: windows-2022 mpilib: 'mpich' # Windows: [Intel MPI, MS MPI] - - os: windows-latest + - os: windows-2022 mpilib: 'ompi' # cannot presently install HIP on Windows CI (times out) - hip: ON - os: windows-latest + os: windows-2022 # cannot presently compile HIP + MPI; the linker fails with # "undefined reference to 'vtable for thrust::system::system_error' @@ -261,7 +261,7 @@ jobs: # run all compiled isolated examples to test for link-time errors, # continuing if any fail (since some deliberately fail) - name: Run isolated examples (Windows) - if: ${{ matrix.os == 'windows-latest' }} + if: ${{ matrix.os == 'windows-2022' }} working-directory: ${{ env.isolated_dir }}/Release/ shell: pwsh run: | @@ -271,7 +271,7 @@ jobs: & $_.FullName } - name: Run isolated examples (Unix) - if: ${{ matrix.os != 'windows-latest' }} + if: ${{ matrix.os != 'windows-2022' }} working-directory: ${{ env.isolated_dir }} run: | for fn in *_c *_cpp; do @@ -281,7 +281,7 @@ jobs: # run all compiled 'automated' examples - name: Run automated examples (Windows) - if: ${{ matrix.os == 'windows-latest' }} + if: ${{ matrix.os == 'windows-2022' }} working-directory: ${{ env.automated_dir }}/Release/ shell: pwsh run: | @@ -291,7 +291,7 @@ jobs: & $_.FullName } - name: Run automated examples (Unix) - if: ${{ matrix.os != 'windows-latest' }} + if: ${{ matrix.os != 'windows-2022' }} working-directory: ${{ env.automated_dir }} run: | for fn in *_c *_cpp; do diff --git a/quest/src/gpu/gpu_kernels.cuh b/quest/src/gpu/gpu_kernels.cuh index b6954f701..65f277483 100644 --- a/quest/src/gpu/gpu_kernels.cuh +++ b/quest/src/gpu/gpu_kernels.cuh @@ -12,6 +12,7 @@ * mapped to HIP symbols by cuda_to_hip.h * * @author Tyson Jones + * @author James Richings (optimised away qubit-list allocs) * @author Ania (Anna) Brown (developed QuEST v1 logic) */ @@ -29,6 +30,13 @@ #error "A file being compiled somehow included gpu_kernels.hpp despite QuEST not being compiled in GPU-accelerated mode." #endif + + +/* + * OPTIMISATION MACROS + */ + + // cuda keyword 'register' is misinterpreted by HIP #if defined(__NVCC__) #define REGISTER register @@ -37,11 +45,20 @@ #endif +// optimise qubit-list passing in CUDA v11.7+ (we round to 12), benefitting CC >= 7.0 +#if defined(__NVCC__) && defined(__CUDACC_VER_MAJOR__) && (__CUDACC_VER_MAJOR__ >= 12) + #define _GRID_CONST_OPT __grid_constant__ +#else + #define _GRID_CONST_OPT +#endif + + /* * THREAD MANAGEMENT */ + __forceinline__ __device__ qindex getThreadInd() { return blockIdx.x*blockDim.x + threadIdx.x; } @@ -86,15 +103,15 @@ __forceinline__ __device__ int cudaGetBitMaskParity(qindex mask) { template __global__ void kernel_statevec_packAmpsIntoBuffer( gpu_qcomp* amps, gpu_qcomp* buffer, qindex numThreads, - int* qubits, int numQubits, qindex qubitStateMask + _GRID_CONST_OPT const List64 qubits, qindex qubitStateMask ) { GET_THREAD_IND(n, numThreads); // use template param to compile-time unroll loop in insertBits() - SET_VAR_AT_COMPILE_TIME(int, numBits, NumCtrls, numQubits); + SET_VAR_AT_COMPILE_TIME(int, numBits, NumCtrls, qubits.size()); // i = nth local index where qubits are active - qindex i = insertBitsWithMaskedValues(n, qubits, numBits, qubitStateMask); + qindex i = insertBitsWithMaskedValues(n, qubits.data(), numBits, qubitStateMask); // caller offsets buffer by sub-buffer send-index buffer[n] = amps[i]; @@ -125,16 +142,20 @@ __global__ void kernel_statevec_packPairSummedAmpsIntoBuffer( template __global__ void kernel_statevec_anyCtrlSwap_subA( gpu_qcomp* amps, qindex numThreads, - int* ctrlsAndTargs, int numCtrls, qindex ctrlsAndTargsMask, int targ1, int targ2 + _GRID_CONST_OPT const List64 ctrlsAndTargs, qindex ctrlsAndTargsMask, + int targ1, int targ2 ) { GET_THREAD_IND(n, numThreads); + // beware ctrlsAndTargs contains the two targets + constexpr int numTargs = 2; + // use template param to compile-time unroll loop in insertBits() - SET_VAR_AT_COMPILE_TIME(int, numCtrlBits, NumCtrls, numCtrls); - int numQubitBits = 2 + numCtrlBits; + SET_VAR_AT_COMPILE_TIME(int, numCtrlBits, NumCtrls, ctrlsAndTargs.size() - numTargs); + int numQubitBits = numCtrlBits + numTargs; // i01 = nth local index where ctrls are active, targ2=0 and targ1=1 - qindex i01 = insertBitsWithMaskedValues(n, ctrlsAndTargs, numQubitBits, ctrlsAndTargsMask); + qindex i01 = insertBitsWithMaskedValues(n, ctrlsAndTargs.data(), numQubitBits, ctrlsAndTargsMask); qindex i10 = flipTwoBits(i01, targ2, targ1); // swap amps @@ -147,15 +168,15 @@ __global__ void kernel_statevec_anyCtrlSwap_subA( template __global__ void kernel_statevec_anyCtrlSwap_subB( gpu_qcomp* amps, gpu_qcomp* buffer, qindex numThreads, - int* ctrls, int numCtrls, qindex ctrlStateMask + _GRID_CONST_OPT const List64 ctrls, qindex ctrlStateMask ) { GET_THREAD_IND(n, numThreads); // use template param to compile-time unroll loop in insertBits() - SET_VAR_AT_COMPILE_TIME(int, numCtrlBits, NumCtrls, numCtrls); + SET_VAR_AT_COMPILE_TIME(int, numCtrlBits, NumCtrls, ctrls.size()); // i = nth local index where ctrls are active - qindex i = insertBitsWithMaskedValues(n, ctrls, numCtrlBits, ctrlStateMask); + qindex i = insertBitsWithMaskedValues(n, ctrls.data(), numCtrlBits, ctrlStateMask); // caller offsets buffer if necessary amps[i] = buffer[n]; @@ -165,16 +186,19 @@ __global__ void kernel_statevec_anyCtrlSwap_subB( template __global__ void kernel_statevec_anyCtrlSwap_subC( gpu_qcomp* amps, gpu_qcomp* buffer, qindex numThreads, - int* ctrlsAndTarg, int numCtrls, qindex ctrlsAndTargMask + _GRID_CONST_OPT const List64 ctrlsAndTarg, qindex ctrlsAndTargMask ) { GET_THREAD_IND(n, numThreads); + // beware ctrlsAndTarg contains the single target + constexpr int numTargs = 1; + // use template param to compile-time unroll loop in insertBits() - SET_VAR_AT_COMPILE_TIME(int, numCtrlBits, NumCtrls, numCtrls); - int numQubitBits = numCtrlBits + 1; + SET_VAR_AT_COMPILE_TIME(int, numCtrlBits, NumCtrls, ctrlsAndTarg.size() - numTargs); + int numQubitBits = numCtrlBits + numTargs; // i = nth local index where ctrls and targ are in specified states - qindex i = insertBitsWithMaskedValues(n, ctrlsAndTarg, numQubitBits, ctrlsAndTargMask); + qindex i = insertBitsWithMaskedValues(n, ctrlsAndTarg.data(), numQubitBits, ctrlsAndTargMask); // caller offsets buffer if necessary amps[i] = buffer[n]; @@ -190,16 +214,21 @@ __global__ void kernel_statevec_anyCtrlSwap_subC( template __global__ void kernel_statevec_anyCtrlOneTargDenseMatr_subA( gpu_qcomp* amps, qindex numThreads, - int* ctrlsAndTarg, int numCtrls, qindex ctrlStateMask, int targ, + _GRID_CONST_OPT const List64 ctrlsAndTarg, + qindex ctrlStateMask, int targ, gpu_qcomp m00, gpu_qcomp m01, gpu_qcomp m10, gpu_qcomp m11 ) { GET_THREAD_IND(n, numThreads); + // beware ctrlsAndTarg contains the single target + constexpr int numTargs = 1; + // use template param to compile-time unroll loop in insertBits() - SET_VAR_AT_COMPILE_TIME(int, numCtrlBits, NumCtrls, numCtrls); + SET_VAR_AT_COMPILE_TIME(int, numCtrlBits, NumCtrls, ctrlsAndTarg.size() - numTargs); + int numQubitBits = numCtrlBits + numTargs; // i0 = nth local index where ctrls are active and targ is 0 - qindex i0 = insertBitsWithMaskedValues(n, ctrlsAndTarg, numCtrlBits + 1, ctrlStateMask); + qindex i0 = insertBitsWithMaskedValues(n, ctrlsAndTarg.data(), numQubitBits, ctrlStateMask); qindex i1 = flipBit(i0, targ); // note amps are strided by 2^targ @@ -214,16 +243,16 @@ __global__ void kernel_statevec_anyCtrlOneTargDenseMatr_subA( template __global__ void kernel_statevec_anyCtrlOneTargDenseMatr_subB( gpu_qcomp* amps, gpu_qcomp* buffer, qindex numThreads, - int* ctrls, int numCtrls, qindex ctrlStateMask, + _GRID_CONST_OPT const List64 ctrls, qindex ctrlStateMask, gpu_qcomp fac0, gpu_qcomp fac1 ) { GET_THREAD_IND(n, numThreads); // use template param to compile-time unroll loop in insertBits() - SET_VAR_AT_COMPILE_TIME(int, numCtrlBits, NumCtrls, numCtrls); + SET_VAR_AT_COMPILE_TIME(int, numCtrlBits, NumCtrls, ctrls.size()); // i = nth local index where ctrl bits are active - qindex i = insertBitsWithMaskedValues(n, ctrls, numCtrlBits, ctrlStateMask); + qindex i = insertBitsWithMaskedValues(n, ctrls.data(), numCtrlBits, ctrlStateMask); // caller offsets buffer by receive-index amps[i] = fac0*amps[i] + fac1*buffer[n]; @@ -239,7 +268,8 @@ __global__ void kernel_statevec_anyCtrlOneTargDenseMatr_subB( template __global__ void kernel_statevec_anyCtrlTwoTargDenseMatr_sub( gpu_qcomp* amps, qindex numThreads, - int* ctrlsAndTarg, int numCtrls, qindex ctrlStateMask, int targ1, int targ2, + _GRID_CONST_OPT const List64 ctrlsAndTargs, qindex ctrlStateMask, + int targ1, int targ2, gpu_qcomp m00, gpu_qcomp m01, gpu_qcomp m02, gpu_qcomp m03, gpu_qcomp m10, gpu_qcomp m11, gpu_qcomp m12, gpu_qcomp m13, gpu_qcomp m20, gpu_qcomp m21, gpu_qcomp m22, gpu_qcomp m23, @@ -247,11 +277,15 @@ __global__ void kernel_statevec_anyCtrlTwoTargDenseMatr_sub( ) { GET_THREAD_IND(n, numThreads); + // beware ctrlsAndTargs contains the two targets + constexpr int numTargs = 2; + // use template param to compile-time unroll loop in insertBits() - SET_VAR_AT_COMPILE_TIME(int, numCtrlBits, NumCtrls, numCtrls); + SET_VAR_AT_COMPILE_TIME(int, numCtrlBits, NumCtrls, ctrlsAndTargs.size() - numTargs); + int numQubitBits = numCtrlBits + numTargs; // i00 = nth local index where ctrls are active and both targs are 0 - qindex i00 = insertBitsWithMaskedValues(n, ctrlsAndTarg, numCtrlBits + 2, ctrlStateMask); + qindex i00 = insertBitsWithMaskedValues(n, ctrlsAndTargs.data(), numQubitBits, ctrlStateMask); qindex i01 = flipBit(i00, targ1); qindex i10 = flipBit(i00, targ2); qindex i11 = flipBit(i01, targ2); @@ -288,7 +322,7 @@ __forceinline__ __device__ qindex getThreadsNthGlobalArrInd(qindex n, qindex thr template __global__ void kernel_statevec_anyCtrlFewTargDenseMatr( gpu_qcomp* amps, qindex numThreads, - int* ctrlsAndTargs, int numCtrls, qindex ctrlsAndTargsMask, int* targs, + _GRID_CONST_OPT const List64 ctrlsAndTargs, qindex ctrlsAndTargsMask, _GRID_CONST_OPT const List64 targs, gpu_qcomp* flatMatrElems ) { GET_THREAD_IND(n, numThreads); @@ -309,18 +343,18 @@ __global__ void kernel_statevec_anyCtrlFewTargDenseMatr( REGISTER gpu_qcomp privateCache[1 << NumTargs]; // we know NumTargs <= 5, though NumCtrls is permitted anything (including -1) - SET_VAR_AT_COMPILE_TIME(int, numCtrlBits, NumCtrls, numCtrls); + SET_VAR_AT_COMPILE_TIME(int, numCtrlBits, NumCtrls, ctrlsAndTargs.size() - targs.size()); constexpr qindex numTargAmps = (1 << NumTargs); // explicit, in lieu of powerOf2 // i0 = nth local index where ctrls are active and targs are all zero - qindex i0 = insertBitsWithMaskedValues(n, ctrlsAndTargs, numCtrlBits + NumTargs, ctrlsAndTargsMask); // loop may be unrolled + qindex i0 = insertBitsWithMaskedValues(n, ctrlsAndTargs.data(), numCtrlBits + NumTargs, ctrlsAndTargsMask); // loop may be unrolled // populate cache (force unroll to ensure compile-time cache indices) #pragma unroll for (qindex k=0; k __global__ void kernel_statevec_anyCtrlManyTargDenseMatr( gpu_qcomp* globalCache, gpu_qcomp* amps, qindex numThreads, qindex numBatchesPerThread, - int* ctrlsAndTargs, int numCtrls, qindex ctrlsAndTargsMask, - int* targs, int numTargBits, qindex numTargAmps, - gpu_qcomp* flatMatrElems -) { + _GRID_CONST_OPT const List64 ctrlsAndTargs, qindex ctrlsAndTargsMask, + _GRID_CONST_OPT const List64 targs, qindex numTargAmps, + gpu_qcomp* flatMatrElems) { GET_THREAD_IND(t, numThreads); // NumCtrls might be compile-time known, but numTargBits>5 is always unknown/runtime - SET_VAR_AT_COMPILE_TIME(int, numCtrlBits, NumCtrls, numCtrls); + SET_VAR_AT_COMPILE_TIME(int, numCtrlBits, NumCtrls, ctrlsAndTargs.size() - targs.size()); // unlike all other kernels, each thread modifies multiple batches of amplitudes for (qindex b=0; b __global__ void kernel_statevec_anyCtrlOneTargDiagMatr_sub( - gpu_qcomp* amps, qindex numThreads, int rank, qindex logNumAmpsPerNode, - int* ctrls, int numCtrls, qindex ctrlStateMask, int targ, + gpu_qcomp* amps, qindex numThreads, int rank, qindex logNumAmpsPerNode, + _GRID_CONST_OPT const List64 ctrls, + qindex ctrlStateMask, int targ, gpu_qcomp m1, gpu_qcomp m2 ) { GET_THREAD_IND(n, numThreads); @@ -449,10 +483,10 @@ __global__ void kernel_statevec_anyCtrlOneTargDiagMatr_sub( /// We should verify this! // use template params to compile-time unroll loops in insertBits() - SET_VAR_AT_COMPILE_TIME(int, numCtrlBits, NumCtrls, numCtrls); + SET_VAR_AT_COMPILE_TIME(int, numCtrlBits, NumCtrls, ctrls.size()); // j = nth local index where ctrls are active (in the specified states) - qindex j = insertBitsWithMaskedValues(n, ctrls, numCtrlBits, ctrlStateMask); + qindex j = insertBitsWithMaskedValues(n, ctrls.data(), numCtrlBits, ctrlStateMask); // i = global index corresponding to j qindex i = concatenateBits(rank, j, logNumAmpsPerNode); @@ -470,8 +504,9 @@ __global__ void kernel_statevec_anyCtrlOneTargDiagMatr_sub( template __global__ void kernel_statevec_anyCtrlTwoTargDiagMatr_sub( - gpu_qcomp* amps, qindex numThreads, int rank, qindex logNumAmpsPerNode, - int* ctrls, int numCtrls, qindex ctrlStateMask, int targ1, int targ2, + gpu_qcomp* amps, qindex numThreads, int rank, qindex logNumAmpsPerNode, + _GRID_CONST_OPT const List64 ctrls, qindex ctrlStateMask, + int targ1, int targ2, gpu_qcomp m1, gpu_qcomp m2, gpu_qcomp m3, gpu_qcomp m4 ) { GET_THREAD_IND(n, numThreads); @@ -488,10 +523,10 @@ __global__ void kernel_statevec_anyCtrlTwoTargDiagMatr_sub( /// We should verify this! // use template params to compile-time unroll loops in insertBits() - SET_VAR_AT_COMPILE_TIME(int, numCtrlBits, NumCtrls, numCtrls); + SET_VAR_AT_COMPILE_TIME(int, numCtrlBits, NumCtrls, ctrls.size()); // j = nth local index where ctrls are active (in the specified states) - qindex j = insertBitsWithMaskedValues(n, ctrls, numCtrlBits, ctrlStateMask); + qindex j = insertBitsWithMaskedValues(n, ctrls.data(), numCtrlBits, ctrlStateMask); // i = global index corresponding to j qindex i = concatenateBits(rank, j, logNumAmpsPerNode); @@ -511,8 +546,9 @@ __global__ void kernel_statevec_anyCtrlTwoTargDiagMatr_sub( template __global__ void kernel_statevec_anyCtrlAnyTargDiagMatr_sub( - gpu_qcomp* amps, qindex numThreads, int rank, qindex logNumAmpsPerNode, - int* ctrls, int numCtrls, qindex ctrlStateMask, int* targs, int numTargs, + gpu_qcomp* amps, qindex numThreads, int rank, qindex logNumAmpsPerNode, + _GRID_CONST_OPT const List64 ctrls, qindex ctrlStateMask, + _GRID_CONST_OPT const List64 targs, gpu_qcomp* elems, gpu_qcomp exponent ) { GET_THREAD_IND(n, numThreads); @@ -529,17 +565,17 @@ __global__ void kernel_statevec_anyCtrlAnyTargDiagMatr_sub( /// We should verify this! // use template params to compile-time unroll loops in insertBits() and getValueOfBits() - SET_VAR_AT_COMPILE_TIME(int, numCtrlBits, NumCtrls, numCtrls); - SET_VAR_AT_COMPILE_TIME(int, numTargBits, NumTargs, numTargs); + SET_VAR_AT_COMPILE_TIME(int, numCtrlBits, NumCtrls, ctrls.size()); + SET_VAR_AT_COMPILE_TIME(int, numTargBits, NumTargs, targs.size()); // j = nth local index where ctrls are active (in the specified states) - qindex j = insertBitsWithMaskedValues(n, ctrls, numCtrlBits, ctrlStateMask); + qindex j = insertBitsWithMaskedValues(n, ctrls.data(), numCtrlBits, ctrlStateMask); // i = global index corresponding to j qindex i = concatenateBits(rank, j, logNumAmpsPerNode); // t = value of targeted bits, which may be in the prefix substate - qindex t = getValueOfBits(i, targs, numTargBits); + qindex t = getValueOfBits(i, targs.data(), numTargBits); gpu_qcomp elem = elems[t]; @@ -607,15 +643,15 @@ __global__ void kernel_densmatr_allTargDiagMatr_sub( template __global__ void kernel_statevector_anyCtrlPauliTensorOrGadget_subA( gpu_qcomp* amps, qindex numThreads, - int* ctrlsAndTargs, int numCtrls, qindex ctrlsAndTargsStateMask, - int* targsXY, int numXY, qindex maskXY, qindex maskYZ, + _GRID_CONST_OPT const List64 ctrlsAndTargs, qindex ctrlsAndTargsStateMask, + _GRID_CONST_OPT const List64 targsXY, qindex maskXY, qindex maskYZ, gpu_qcomp powI, gpu_qcomp ampFac, gpu_qcomp pairAmpFac ) { GET_THREAD_IND(t, numThreads); // use template params to compile-time unroll loops in insertBits() and setBits() - SET_VAR_AT_COMPILE_TIME(int, numCtrlBits, NumCtrls, numCtrls); - SET_VAR_AT_COMPILE_TIME(int, numTargBits, NumTargs, numXY); + SET_VAR_AT_COMPILE_TIME(int, numCtrlBits, NumCtrls, ctrlsAndTargs.size() - targsXY.size()); + SET_VAR_AT_COMPILE_TIME(int, numTargBits, NumTargs, targsXY.size()); // n = local index of amp sub-batch with common i0, v = value of target bits qindex numInnerIts = powerOf2(numTargBits) / 2; @@ -623,10 +659,10 @@ __global__ void kernel_statevector_anyCtrlPauliTensorOrGadget_subA( qindex v = t % numInnerIts; // i0 = nth local index where ctrls are active and targs are all zero (loop therein may be unrolled) - qindex i0 = insertBitsWithMaskedValues(n, ctrlsAndTargs, numCtrlBits + numTargBits, ctrlsAndTargsStateMask); + qindex i0 = insertBitsWithMaskedValues(n, ctrlsAndTargs.data(), numCtrlBits + numTargBits, ctrlsAndTargsStateMask); // iA = nth local index where targs have value v, iB = (last - nth) such index - qindex iA = setBits(i0, targsXY, numTargBits, v); // may be unrolled + qindex iA = setBits(i0, targsXY.data(), numTargBits, v); // may be unrolled qindex iB = flipBits(iA, maskXY); // determine whether to multiply amps by +-1 or +-i @@ -647,17 +683,17 @@ __global__ void kernel_statevector_anyCtrlPauliTensorOrGadget_subA( template __global__ void kernel_statevector_anyCtrlPauliTensorOrGadget_subB( gpu_qcomp* amps, gpu_qcomp* buffer, qindex numThreads, - int* ctrls, int numCtrls, qindex ctrlStateMask, + _GRID_CONST_OPT const List64 ctrls, qindex ctrlStateMask, qindex maskXY, qindex maskYZ, qindex bufferMaskXY, gpu_qcomp powI, gpu_qcomp thisAmpFac, gpu_qcomp otherAmpFac ) { GET_THREAD_IND(n, numThreads); // use template param to compile-time unroll loop in insertBits() - SET_VAR_AT_COMPILE_TIME(int, numCtrlBits, NumCtrls, numCtrls); + SET_VAR_AT_COMPILE_TIME(int, numCtrlBits, NumCtrls, ctrls.size()); // i = nth local index where ctrl bits are in specified states - qindex i = insertBitsWithMaskedValues(n, ctrls, numCtrlBits, ctrlStateMask); + qindex i = insertBitsWithMaskedValues(n, ctrls.data(), numCtrlBits, ctrlStateMask); // j = buffer index of amp to be mixed with i qindex j = flipBits(n, bufferMaskXY); @@ -682,16 +718,17 @@ __global__ void kernel_statevector_anyCtrlPauliTensorOrGadget_subB( template __global__ void kernel_statevector_anyCtrlAnyTargZOrPhaseGadget_sub( gpu_qcomp* amps, qindex numThreads, - int* ctrls, int numCtrls, qindex ctrlStateMask, qindex targMask, + _GRID_CONST_OPT const List64 ctrls, qindex ctrlStateMask, + qindex targMask, gpu_qcomp fac0, gpu_qcomp fac1 ) { GET_THREAD_IND(n, numThreads); // use template param to compile-time unroll loop in insertBits() - SET_VAR_AT_COMPILE_TIME(int, numCtrlBits, NumCtrls, numCtrls); + SET_VAR_AT_COMPILE_TIME(int, numCtrlBits, NumCtrls, ctrls.size()); // i = nth local index where ctrl bits are in specified states - qindex i = insertBitsWithMaskedValues(n, ctrls, numCtrlBits, ctrlStateMask); + qindex i = insertBitsWithMaskedValues(n, ctrls.data(), numCtrlBits, ctrlStateMask); // apply phase to amp depending on parity of targets in global index int p = cudaGetBitMaskParity(i & targMask); @@ -1130,12 +1167,14 @@ __global__ void kernel_densmatr_oneQubitDamping_subD( template __global__ void kernel_densmatr_partialTrace_sub( gpu_qcomp* ampsIn, gpu_qcomp* ampsOut, qindex numThreads, - int* ketTargs, int* pairTargs, int* allTargs, int numKetTargs + _GRID_CONST_OPT const List64 ketTargs, + _GRID_CONST_OPT const List64 pairTargs, + _GRID_CONST_OPT const List64 allTargs ) { GET_THREAD_IND(n, numThreads); // use template param to compile-time unroll below loops - SET_VAR_AT_COMPILE_TIME(int, numTargPairs, NumTargs, numKetTargs); + SET_VAR_AT_COMPILE_TIME(int, numTargPairs, NumTargs, ketTargs.size()); // may be inferred at compile-time int numAllTargs = 2 * numTargPairs; @@ -1147,7 +1186,7 @@ __global__ void kernel_densmatr_partialTrace_sub( /// should change the parallelisation axis in this scenario, or preclude it with validation! // k = nth local index of inQureg where all targs and pairs are zero - qindex k = insertBits(n, allTargs, numAllTargs, 0); // loop may be unrolled + qindex k = insertBits(n, allTargs.data(), numAllTargs, 0); // loop may be unrolled // each outQureg amp results from summing 2^targs inQureg amps gpu_qcomp outAmp = getGpuQcomp(0, 0); @@ -1157,8 +1196,8 @@ __global__ void kernel_densmatr_partialTrace_sub( // i = nth local index of inQureg where targs=j and pairTargs=j qindex i = k; - i = setBits(i, ketTargs, numTargPairs, j); // loops may be unrolled - i = setBits(i, pairTargs, numTargPairs, j); + i = setBits(i, ketTargs.data(), numTargPairs, j); // loops may be unrolled + i = setBits(i, pairTargs.data(), numTargPairs, j); outAmp += ampsIn[i]; } @@ -1177,7 +1216,7 @@ template __global__ void kernel_statevec_calcProbsOfAllMultiQubitOutcomes_sub( qreal* outProbs, gpu_qcomp* amps, qindex numThreads, int rank, qindex logNumAmpsPerNode, - int* qubits, int numQubits + _GRID_CONST_OPT const List64 qubits ) { GET_THREAD_IND(n, numThreads); @@ -1188,7 +1227,7 @@ __global__ void kernel_statevec_calcProbsOfAllMultiQubitOutcomes_sub( /// whether this is worthwhile and faster! // use template param to compile-time unroll below loops - SET_VAR_AT_COMPILE_TIME(int, numBits, NumQubits, numQubits); + SET_VAR_AT_COMPILE_TIME(int, numBits, NumQubits, qubits.size()); qreal prob = norm(amps[n]); @@ -1196,7 +1235,7 @@ __global__ void kernel_statevec_calcProbsOfAllMultiQubitOutcomes_sub( qindex i = concatenateBits(rank, n, logNumAmpsPerNode); // j = outcome index corresponding to prob - qindex j = getValueOfBits(i, qubits, numBits); // loop therein may be unrolled + qindex j = getValueOfBits(i, qubits.data(), numBits); // loop therein may be unrolled atomicAdd(&outProbs[j], prob); } @@ -1207,12 +1246,12 @@ __global__ void kernel_densmatr_calcProbsOfAllMultiQubitOutcomes_sub( qreal* outProbs, gpu_qcomp* amps, qindex numThreads, qindex firstDiagInd, qindex numAmpsPerCol, int rank, qindex logNumAmpsPerNode, - int* qubits, int numQubits + _GRID_CONST_OPT const List64 qubits ) { GET_THREAD_IND(n, numThreads); // use template param to compile-time unroll loop in insertBits() - SET_VAR_AT_COMPILE_TIME(int, numBits, NumQubits, numQubits); + SET_VAR_AT_COMPILE_TIME(int, numBits, NumQubits, qubits.size()); // i = index of nth local diagonal elem qindex i = fast_getQuregLocalIndexOfDiagonalAmp(n, firstDiagInd, numAmpsPerCol); @@ -1222,7 +1261,7 @@ __global__ void kernel_densmatr_calcProbsOfAllMultiQubitOutcomes_sub( qindex j = concatenateBits(rank, i, logNumAmpsPerNode); // k = outcome index corresponding to - qindex k = getValueOfBits(j, qubits, numBits); // loop therein may be unrolled + qindex k = getValueOfBits(j, qubits.data(), numBits); // loop therein may be unrolled atomicAdd(&outProbs[k], prob); } diff --git a/quest/src/gpu/gpu_subroutines.cpp b/quest/src/gpu/gpu_subroutines.cpp index 296fa6704..e9936c9c3 100644 --- a/quest/src/gpu/gpu_subroutines.cpp +++ b/quest/src/gpu/gpu_subroutines.cpp @@ -30,6 +30,7 @@ * mapped to HIP symbols by cuda_to_hip.h * * @author Tyson Jones + * @author James Richings (optimised away qubit-list allocs) */ // obtain preprocessors from config.h prior to validation @@ -144,12 +145,12 @@ qindex gpu_statevec_packAmpsIntoBuffer(Qureg qureg, ConstList64 qubits, ConstLis qindex numBlocks = getNumBlocks(numThreads, numThreadsPerBlock); qindex sendInd = getSubBufferSendInd(qureg); - devints sortedQubits = getDevInts(util_getSorted(qubits)); + List64 sortedQubits = util_getSorted(qubits); qindex qubitStateMask = util_getBitMask(qubits, qubitStates); kernel_statevec_packAmpsIntoBuffer <<>> ( getGpuQcompPtr(qureg.gpuAmps), getGpuQcompPtr(qureg.gpuCommBuffer) + sendInd, numThreads, - getPtr(sortedQubits), qubits.size(), qubitStateMask + sortedQubits, qubitStateMask ); // return the number of packed amps @@ -212,12 +213,12 @@ void gpu_statevec_anyCtrlSwap_subA(Qureg qureg, ConstList64 ctrls, ConstList64 c int numThreadsPerBlock = gpu_getNumThreadsPerBlock(); qindex numBlocks = getNumBlocks(numThreads, numThreadsPerBlock); - devints sortedQubits = getDevInts(util_getSorted(ctrls, {targ2, targ1})); + List64 sortedQubits = util_getSorted(ctrls, {targ2, targ1}); qindex qubitStateMask = util_getBitMask(ctrls, ctrlStates, {targ2, targ1}, {0, 1}); kernel_statevec_anyCtrlSwap_subA <<>> ( getGpuQcompPtr(qureg.gpuAmps), numThreads, - getPtr(sortedQubits), ctrls.size(), qubitStateMask, targ1, targ2 + sortedQubits, qubitStateMask, targ1, targ2 ); #else @@ -238,12 +239,12 @@ void gpu_statevec_anyCtrlSwap_subB(Qureg qureg, ConstList64 ctrls, ConstList64 c qindex numBlocks = getNumBlocks(numThreads, numThreadsPerBlock); qindex recvInd = getBufferRecvInd(); - devints sortedCtrls = getDevInts(util_getSorted(ctrls)); + List64 sortedCtrls = util_getSorted(ctrls); qindex ctrlStateMask = util_getBitMask(ctrls, ctrlStates); kernel_statevec_anyCtrlSwap_subB <<>> ( getGpuQcompPtr(qureg.gpuAmps), getGpuQcompPtr(qureg.gpuCommBuffer) + recvInd, numThreads, - getPtr(sortedCtrls), ctrls.size(), ctrlStateMask + sortedCtrls, ctrlStateMask ); #else @@ -264,12 +265,12 @@ void gpu_statevec_anyCtrlSwap_subC(Qureg qureg, ConstList64 ctrls, ConstList64 c qindex numBlocks = getNumBlocks(numThreads, numThreadsPerBlock); qindex recvInd = getBufferRecvInd(); - devints sortedQubits = getDevInts(util_getSorted(ctrls, {targ})); + List64 sortedQubits = util_getSorted(ctrls, {targ}); qindex qubitStateMask = util_getBitMask(ctrls, ctrlStates, {targ}, {targState}); kernel_statevec_anyCtrlSwap_subC <<>> ( getGpuQcompPtr(qureg.gpuAmps), getGpuQcompPtr(qureg.gpuCommBuffer) + recvInd, numThreads, - getPtr(sortedQubits), ctrls.size(), qubitStateMask + sortedQubits, qubitStateMask ); #else @@ -306,15 +307,15 @@ void gpu_statevec_anyCtrlOneTargDenseMatr_subA(Qureg qureg, ConstList64 ctrls, C qindex numThreads = qureg.numAmpsPerNode / powerOf2(ctrls.size() + 1); int numThreadsPerBlock = gpu_getNumThreadsPerBlock(); qindex numBlocks = getNumBlocks(numThreads, numThreadsPerBlock); - - devints sortedQubits = getDevInts(util_getSorted(ctrls, {targ})); + + List64 sortedQubits = util_getSorted(ctrls, {targ}); qindex qubitStateMask = util_getBitMask(ctrls, ctrlStates, {targ}, {0}); - auto [m00, m01, m10, m11] = getFlattenedGpuQcompMatrix<2>(matr.elems); // explicit template for MSVC, grr! + auto [m00, m01, m10, m11] = getFlattenedGpuQcompMatrix<2>(matr.elems); // explicit template for MSVC, grrr! kernel_statevec_anyCtrlOneTargDenseMatr_subA <<>> ( - getGpuQcompPtr(qureg.gpuAmps), numThreads, - getPtr(sortedQubits), ctrls.size(), qubitStateMask, targ, + getGpuQcompPtr(qureg.gpuAmps), numThreads, sortedQubits, + qubitStateMask, targ, m00, m01, m10, m11 ); @@ -336,12 +337,12 @@ void gpu_statevec_anyCtrlOneTargDenseMatr_subB(Qureg qureg, ConstList64 ctrls, C qindex numBlocks = getNumBlocks(numThreads, numThreadsPerBlock); qindex recvInd = getBufferRecvInd(); - devints sortedCtrls = getDevInts(util_getSorted(ctrls)); + List64 sortedCtrls = util_getSorted(ctrls); qindex ctrlStateMask = util_getBitMask(ctrls, ctrlStates); kernel_statevec_anyCtrlOneTargDenseMatr_subB <<>> ( getGpuQcompPtr(qureg.gpuAmps), getGpuQcompPtr(qureg.gpuCommBuffer) + recvInd, numThreads, - getPtr(sortedCtrls), ctrls.size(), ctrlStateMask, + sortedCtrls, ctrlStateMask, getGpuQcomp(fac0), getGpuQcomp(fac1) ); @@ -379,7 +380,7 @@ void gpu_statevec_anyCtrlTwoTargDenseMatr_sub(Qureg qureg, ConstList64 ctrls, Co int numThreadsPerBlock = gpu_getNumThreadsPerBlock(); qindex numBlocks = getNumBlocks(numThreads, numThreadsPerBlock); - devints sortedQubits = getDevInts(util_getSorted(ctrls, {targ1,targ2})); + List64 sortedQubits = util_getSorted(ctrls, {targ1,targ2}); qindex qubitStateMask = util_getBitMask(ctrls, ctrlStates, {targ1,targ2}, {0,0}); // unpack matrix elems which are more efficiently accessed by kernels as args than shared mem (... maybe...) @@ -387,7 +388,7 @@ void gpu_statevec_anyCtrlTwoTargDenseMatr_sub(Qureg qureg, ConstList64 ctrls, Co kernel_statevec_anyCtrlTwoTargDenseMatr_sub <<>> ( getGpuQcompPtr(qureg.gpuAmps), numThreads, - getPtr(sortedQubits), ctrls.size(), qubitStateMask, targ1, targ2, + sortedQubits, qubitStateMask, targ1, targ2, m[0], m[1], m[2], m[3], m[4], m[5], m[6], m[7], m[8], m[9], m[10], m[11], m[12], m[13], m[14], m[15] ); @@ -441,16 +442,12 @@ void gpu_statevec_anyCtrlAnyTargDenseMatr_sub(Qureg qureg, ConstList64 ctrls, Co // task each thread with processing more than a single batch qindex numBatches = qureg.numAmpsPerNode / powerOf2(ctrls.size() + targs.size()); - devints deviceTargs = getDevInts(targs); - devints deviceQubits = getDevInts(util_getSorted(ctrls, targs)); + List64 sortedQubits = util_getSorted(ctrls, targs); qindex qubitStateMask = util_getBitMask(ctrls, ctrlStates, targs, util_getConstantList(0,targs.size())); // unpacking args (to better distinguish below signatures) auto ampsPtr = getGpuQcompPtr(qureg.gpuAmps); auto matrPtr = getGpuQcompPtr(matr.gpuElemsFlat); - auto qubitsPtr = getPtr(deviceQubits); - auto targsPtr = getPtr(deviceTargs); - auto nCtrls = ctrls.size(); // this function updates amplitudes in batches of 2^NumTargs, where each is // determined by distinct mixtures of the existing 2^NumTargs values, which @@ -482,8 +479,8 @@ void gpu_statevec_anyCtrlAnyTargDenseMatr_sub(Qureg qureg, ConstList64 ctrls, Co <<>> ( ampsPtr, numThreads, - qubitsPtr, nCtrls, qubitStateMask, - targsPtr, matrPtr + sortedQubits, qubitStateMask, + targs, matrPtr ); } else { @@ -522,8 +519,8 @@ void gpu_statevec_anyCtrlAnyTargDenseMatr_sub(Qureg qureg, ConstList64 ctrls, Co <<>> ( getGpuQcompPtr(cache), ampsPtr, numThreads, numBatchesPerThread, - qubitsPtr, nCtrls, qubitStateMask, - targsPtr, targs.size(), powerOf2(targs.size()), matrPtr + sortedQubits, qubitStateMask, + targs, powerOf2(targs.size()), matrPtr ); } @@ -584,13 +581,13 @@ void gpu_statevec_anyCtrlOneTargDiagMatr_sub(Qureg qureg, ConstList64 ctrls, Con int numThreadsPerBlock = gpu_getNumThreadsPerBlock(); qindex numBlocks = getNumBlocks(numThreads, numThreadsPerBlock); - devints deviceCtrls = getDevInts(util_getSorted(ctrls)); + List64 sortedCtrls = util_getSorted(ctrls); qindex ctrlStateMask = util_getBitMask(ctrls, ctrlStates); auto elems = getGpuQcompArray<2>(matr.elems); // explicit template for MSVC, grr! kernel_statevec_anyCtrlOneTargDiagMatr_sub <<>> ( - getGpuQcompPtr(qureg.gpuAmps), numThreads, qureg.rank, qureg.logNumAmpsPerNode, - getPtr(deviceCtrls), ctrls.size(), ctrlStateMask, targ, elems[0], elems[1] + getGpuQcompPtr(qureg.gpuAmps), numThreads, qureg.rank, qureg.logNumAmpsPerNode, sortedCtrls, + ctrlStateMask, targ, elems[0], elems[1] ); // explicitly return to avoid runtime error below @@ -655,13 +652,13 @@ void gpu_statevec_anyCtrlTwoTargDiagMatr_sub(Qureg qureg, ConstList64 ctrls, Con int numThreadsPerBlock = gpu_getNumThreadsPerBlock(); qindex numBlocks = getNumBlocks(numThreads, numThreadsPerBlock); - devints deviceCtrls = getDevInts(util_getSorted(ctrls)); + List64 sortedCtrls = util_getSorted(ctrls); qindex ctrlStateMask = util_getBitMask(ctrls, ctrlStates); auto elems = getGpuQcompArray<4>(matr.elems); // explicit template for MSVC, grr! kernel_statevec_anyCtrlTwoTargDiagMatr_sub <<>> ( - getGpuQcompPtr(qureg.gpuAmps), numThreads, qureg.rank, qureg.logNumAmpsPerNode, - getPtr(deviceCtrls), ctrls.size(), ctrlStateMask, targ1, targ2, + getGpuQcompPtr(qureg.gpuAmps), numThreads, qureg.rank, qureg.logNumAmpsPerNode, sortedCtrls, + ctrlStateMask, targ1, targ2, elems[0], elems[1], elems[2], elems[3] ); @@ -724,13 +721,12 @@ void gpu_statevec_anyCtrlAnyTargDiagMatr_sub(Qureg qureg, ConstList64 ctrls, Con int numThreadsPerBlock = gpu_getNumThreadsPerBlock(); qindex numBlocks = getNumBlocks(numThreads, numThreadsPerBlock); - devints deviceTargs = getDevInts(targs); - devints deviceCtrls = getDevInts(util_getSorted(ctrls)); + List64 sortedCtrls = util_getSorted(ctrls); qindex ctrlStateMask = util_getBitMask(ctrls, ctrlStates); kernel_statevec_anyCtrlAnyTargDiagMatr_sub <<>> ( - getGpuQcompPtr(qureg.gpuAmps), numThreads, qureg.rank, qureg.logNumAmpsPerNode, - getPtr(deviceCtrls), ctrls.size(), ctrlStateMask, getPtr(deviceTargs), targs.size(), + getGpuQcompPtr(qureg.gpuAmps), numThreads, qureg.rank, qureg.logNumAmpsPerNode, sortedCtrls, + ctrlStateMask, targs, getGpuQcompPtr(util_getGpuMemPtr(matr)), getGpuQcomp(exponent) ); @@ -831,9 +827,8 @@ void gpu_statevector_anyCtrlPauliTensorOrGadget_subA(Qureg qureg, ConstList64 ct auto targsXY = util_getConcatenated(x, y); auto maskXY = util_getBitMask(targsXY); auto maskYZ = util_getBitMask(util_getConcatenated(y, z)); - - devints deviceTargs = getDevInts(targsXY); - devints deviceQubits = getDevInts(util_getSorted(ctrls, targsXY)); + + List64 sortedQubits = util_getSorted(ctrls, targsXY); qindex qubitStateMask = util_getBitMask(ctrls, ctrlStates, targsXY, util_getConstantList(0,targsXY.size())); // unlike the analogous cpu routine, this function has only a single parallelisation @@ -844,11 +839,12 @@ void gpu_statevector_anyCtrlPauliTensorOrGadget_subA(Qureg qureg, ConstList64 ct qindex numThreads = (qureg.numAmpsPerNode / powerOf2(ctrls.size())) / 2; // divides evenly int numThreadsPerBlock = gpu_getNumThreadsPerBlock(); qindex numBlocks = getNumBlocks(numThreads, numThreadsPerBlock); + kernel_statevector_anyCtrlPauliTensorOrGadget_subA <<>> ( getGpuQcompPtr(qureg.gpuAmps), numThreads, - getPtr(deviceQubits), ctrls.size(), qubitStateMask, - getPtr(deviceTargs), deviceTargs.size(), - maskXY, maskYZ, getGpuQcomp(powI), getGpuQcomp(ampFac), getGpuQcomp(pairAmpFac) + sortedQubits, qubitStateMask, + targsXY, maskXY, maskYZ, + getGpuQcomp(powI), getGpuQcomp(ampFac), getGpuQcomp(pairAmpFac) ); #else @@ -873,12 +869,12 @@ void gpu_statevector_anyCtrlPauliTensorOrGadget_subB(Qureg qureg, ConstList64 ct auto maskXY = util_getBitMask(util_getConcatenated(x, y)); auto maskYZ = util_getBitMask(util_getConcatenated(y, z)); - devints sortedCtrls = getDevInts(util_getSorted(ctrls)); + List64 sortedCtrls = util_getSorted(ctrls); qindex ctrlStateMask = util_getBitMask(ctrls, ctrlStates); kernel_statevector_anyCtrlPauliTensorOrGadget_subB <<>> ( getGpuQcompPtr(qureg.gpuAmps), getGpuQcompPtr(qureg.gpuCommBuffer) + recvInd, numThreads, - getPtr(sortedCtrls), ctrls.size(), ctrlStateMask, + sortedCtrls, ctrlStateMask, maskXY, maskYZ, bufferMaskXY, getGpuQcomp(powI), getGpuQcomp(ampFac), getGpuQcomp(pairAmpFac) ); @@ -910,13 +906,13 @@ void gpu_statevector_anyCtrlAnyTargZOrPhaseGadget_sub(Qureg qureg, ConstList64 c int numThreadsPerBlock = gpu_getNumThreadsPerBlock(); qindex numBlocks = getNumBlocks(numThreads, numThreadsPerBlock); - devints sortedCtrls = getDevInts(util_getSorted(ctrls)); + List64 sortedCtrls = util_getSorted(ctrls); qindex ctrlStateMask = util_getBitMask(ctrls, ctrlStates); qindex targMask = util_getBitMask(targs); kernel_statevector_anyCtrlAnyTargZOrPhaseGadget_sub <<>> ( getGpuQcompPtr(qureg.gpuAmps), numThreads, - getPtr(sortedCtrls), ctrls.size(), ctrlStateMask, targMask, + sortedCtrls, ctrlStateMask, targMask, getGpuQcomp(fac0), getGpuQcomp(fac1) ); @@ -950,7 +946,8 @@ void gpu_statevec_setQuregToWeightedSum_sub(Qureg outQureg, vector coeffs for (auto& qureg : inQuregs) ptrs.push_back(getGpuQcompPtr(qureg.gpuAmps)); - // copy coeff and qureg lists into GPU memory + // copy coeff and qureg lists into GPU memory, allocating new device memory + // which will be a visible overhead when the Qureg are small. But eh! devgpuqcompptrs devQuregAmps = ptrs; devcomps devCoeffs = coeffs; @@ -1484,13 +1481,11 @@ void gpu_densmatr_partialTrace_sub(Qureg inQureg, Qureg outQureg, ConstList64 ta int numThreadsPerBlock = gpu_getNumThreadsPerBlock(); qindex numBlocks = getNumBlocks(numThreads, numThreadsPerBlock); - devints devTargs = getDevInts(targs); - devints devPairTargs = getDevInts(pairTargs); - devints devAllTargs = getDevInts(util_getSorted(targs, pairTargs)); + List64 allTargs = util_getSorted(targs, pairTargs); kernel_densmatr_partialTrace_sub <<>> ( getGpuQcompPtr(inQureg.gpuAmps), getGpuQcompPtr(outQureg.gpuAmps), numThreads, - getPtr(devTargs), getPtr(devPairTargs), getPtr(devAllTargs), targs.size() + targs, pairTargs, allTargs ); #else @@ -1606,12 +1601,11 @@ void gpu_statevec_calcProbsOfAllMultiQubitOutcomes_sub(qreal* outProbs, Qureg qu qindex numBlocks = getNumBlocks(numThreads, numThreadsPerBlock); // allocate exponentially-big temporary memory (error if failed) - devints devQubits = getDevInts(qubits); devreals devProbs = getDeviceRealsVec(powerOf2(qubits.size())); // throws kernel_statevec_calcProbsOfAllMultiQubitOutcomes_sub <<>> ( - getPtr(devProbs), getGpuQcompPtr(qureg.gpuAmps), numThreads, - qureg.rank, qureg.logNumAmpsPerNode, getPtr(devQubits), devQubits.size() + getPtr(devProbs), getGpuQcompPtr(qureg.gpuAmps), numThreads, + qureg.rank, qureg.logNumAmpsPerNode, qubits ); // overwrite outProbs with GPU memory @@ -1644,14 +1638,13 @@ void gpu_densmatr_calcProbsOfAllMultiQubitOutcomes_sub(qreal* outProbs, Qureg qu qindex numAmpsPerCol = powerOf2(qureg.numQubits); // allocate exponentially-big temporary memory (error if failed) - devints devQubits = getDevInts(qubits); devreals devProbs = getDeviceRealsVec(powerOf2(qubits.size())); // throws kernel_densmatr_calcProbsOfAllMultiQubitOutcomes_sub <<>> ( getPtr(devProbs), getGpuQcompPtr(qureg.gpuAmps), numThreads, firstDiagInd, numAmpsPerCol, qureg.rank, qureg.logNumAmpsPerNode, - getPtr(devQubits), devQubits.size() + qubits ); // overwrite outProbs with GPU memory diff --git a/quest/src/gpu/gpu_thrust.cuh b/quest/src/gpu/gpu_thrust.cuh index 3cf33e52d..b6c113e73 100644 --- a/quest/src/gpu/gpu_thrust.cuh +++ b/quest/src/gpu/gpu_thrust.cuh @@ -66,28 +66,12 @@ * copy constructor (devicevec d_vec = hostvec). The pointer * to the data (d_vec.data()) can be cast into a raw pointer * and passed directly to CUDA kernels (though qcomp must be - * reinterpreted to gpu_qcomp). + * reinterpreted to gpu_qcomp). Note this induces a device + * allocation which can dominate costs for small Quregs; such + * costs have been eradicated for bounded-size qubit lists. */ -using devints = thrust::device_vector; - -devints getDevInts(ConstList64 h_list) { - - // DEBUG: this is a placeholder! James' GPU refactor should make it redundant, - // and we can pass List64 directly to a CUDA kernel, paying no heap allocs, - // nor CUDA memcpy costs - - devints d_list = std::vector(h_list.data(), h_list.data() + h_list.size()); - return d_list; -} - -int* getPtr(devints& qubits) { - - return thrust::raw_pointer_cast(qubits.data()); -} - - using devreals = thrust::device_vector; qreal* getPtr(devreals& reals) { @@ -1071,4 +1055,4 @@ void thrust_statevec_initUnnormalisedUniformlyRandomPureStateAmps_sub(Qureg qure -#endif // GPU_THRUST_HPP \ No newline at end of file +#endif // GPU_THRUST_HPP diff --git a/tests/unit/experimental.cpp b/tests/unit/experimental.cpp index 943645831..a8497eb8f 100644 --- a/tests/unit/experimental.cpp +++ b/tests/unit/experimental.cpp @@ -84,7 +84,7 @@ TEST_CASE( "setQuESTNumGpuThreadsPerBlock", TEST_CATEGORY ) { SECTION( "Exceeds device maximum" ) { - int badNumTPB = 999999; // exceeds expected 1024 max + int badNumTPB = 102400; // exceeds expected 1024 max // Cannot be tested (since validation not imposed) when GPU is not actively used if (getQuESTEnv().isGpuAccelerated)