From c4cbdc8b7d9f558d2d410de27e49f5c25b681a25 Mon Sep 17 00:00:00 2001 From: LwhJesse <256257451+LwhJesse@users.noreply.github.com> Date: Sun, 17 May 2026 15:41:33 +0800 Subject: [PATCH] Use cuSPARSE BSR SpMV for CUDA matrix-vector products --- Common/src/linear_algebra/CSysMatrixGPU.cu | 128 +++++++++++++++------ meson.build | 5 +- 2 files changed, 95 insertions(+), 38 deletions(-) diff --git a/Common/src/linear_algebra/CSysMatrixGPU.cu b/Common/src/linear_algebra/CSysMatrixGPU.cu index 7e0c81ca54f..a3f1a77ca40 100644 --- a/Common/src/linear_algebra/CSysMatrixGPU.cu +++ b/Common/src/linear_algebra/CSysMatrixGPU.cu @@ -28,32 +28,43 @@ #include "../../include/linear_algebra/CSysMatrix.hpp" #include "../../include/linear_algebra/GPUComms.cuh" -template -__global__ void GPUMatrixVectorProductAdd(matrixType* matrix, vectorType* vec, vectorType* prod, const unsigned long* d_row_ptr, const unsigned long* d_col_ind, unsigned long nPointDomain, unsigned long nVar, unsigned long nEqn) -{ - int row = (blockIdx.x * blockDim.x + threadIdx.x)/32; - int threadNo = threadIdx.x%32; - int activeThreads = nVar * (32/nVar); - - int blockRow = (threadNo/nVar)%nVar; - - if(row +#include +#include + +inline void cusparseAssert(cusparseStatus_t code, const char* file, int line, bool abort = true) { + if (code != CUSPARSE_STATUS_SUCCESS) { + fprintf(stderr, "cuSPARSEassert: %s %s %d\n", cusparseGetErrorString(code), file, line); + if (abort) exit(static_cast(code)); + } +} - for(int index = d_row_ptr[row] * nVar * nEqn + threadNo; index < d_row_ptr[row+1] * nVar * nEqn; index+=activeThreads) - { - int blockCol = index%nEqn; - int blockNo = index/(nVar * nEqn); - res += matrix[index] * vec[(d_col_ind[blockNo])*nVar + blockCol]; - } +#define cusparseErrChk(ans) \ + { \ + cusparseAssert((ans), __FILE__, __LINE__); \ + } + +inline cusparseIndexType_t GetCusparseIndexType() { + if constexpr (sizeof(unsigned long) == 4) { + return CUSPARSE_INDEX_32I; + } else if constexpr (sizeof(unsigned long) == 8) { + return CUSPARSE_INDEX_64I; + } else { + static_assert(sizeof(unsigned long) == 4 || sizeof(unsigned long) == 8, + "cuSPARSE BSR SpMV only supports 32-bit or 64-bit index arrays in this path."); + } +} - atomicAdd(&prod[row * nVar + blockRow], res); - } +template +constexpr cudaDataType GetCudaDataType() { + if constexpr (std::is_same::value) { + return CUDA_R_32F; + } else if constexpr (std::is_same::value) { + return CUDA_R_64F; + } else { + static_assert(std::is_same::value || std::is_same::value, + "cuSPARSE BSR SpMV only supports float and double in this path."); + } } template @@ -62,26 +73,69 @@ void CSysMatrix::HtDTransfer(bool trigger) const if(trigger) gpuErrChk(cudaMemcpy((void*)(d_matrix), (void*)&matrix[0], (sizeof(ScalarType)*nnz*nVar*nEqn), cudaMemcpyHostToDevice)); } -template +template void CSysMatrix::GPUMatrixVectorProduct(const CSysVector& vec, CSysVector& prod, - CGeometry* geometry, const CConfig* config) const - { + CGeometry* geometry, const CConfig* config) const { + if (nVar != nEqn) { + SU2_MPI::Error("CUDA CSysMatrix matvec with cuSPARSE BSR requires square blocks.", CURRENT_FUNCTION); + } - ScalarType* d_vec = vec.GetDevicePointer(); - ScalarType* d_prod = prod.GetDevicePointer(); + ScalarType* d_vec = vec.GetDevicePointer(); + ScalarType* d_prod = prod.GetDevicePointer(); - vec.HtDTransfer(); - prod.GPUSetVal(0.0); + vec.HtDTransfer(); - dim3 blockDim(KernelParameters::MVP_BLOCK_SIZE,1,1); - int gridx = KernelParameters::round_up_division(KernelParameters::MVP_WARP_SIZE, nPointDomain); - dim3 gridDim(gridx, 1, 1); + const auto indexType = GetCusparseIndexType(); + const auto valueType = GetCudaDataType(); - GPUMatrixVectorProductAdd<<>>(d_matrix, d_vec, d_prod, d_row_ptr, d_col_ind, nPointDomain, nVar, nEqn); - gpuErrChk( cudaPeekAtLastError() ); + const std::int64_t blockSize = static_cast(nVar); - prod.DtHTransfer(); + const std::int64_t brows = static_cast(nPointDomain); + const std::int64_t bcols = static_cast(nPoint); + const std::int64_t bnnz = static_cast(nnz); + + const std::int64_t xSize = static_cast(nPoint) * blockSize; + const std::int64_t ySize = static_cast(nPointDomain) * blockSize; + + const ScalarType alpha = 1.0; + const ScalarType beta = 0.0; + cusparseHandle_t handle = nullptr; + cusparseConstSpMatDescr_t matA = nullptr; + cusparseDnVecDescr_t vecX = nullptr; + cusparseDnVecDescr_t vecY = nullptr; + + cusparseErrChk(cusparseCreate(&handle)); + + cusparseErrChk(cusparseCreateConstBsr(&matA, brows, bcols, bnnz, blockSize, blockSize, d_row_ptr, d_col_ind, d_matrix, + indexType, indexType, CUSPARSE_INDEX_BASE_ZERO, valueType, CUSPARSE_ORDER_ROW)); + + cusparseErrChk(cusparseCreateDnVec(&vecX, xSize, d_vec, valueType)); + cusparseErrChk(cusparseCreateDnVec(&vecY, ySize, d_prod, valueType)); + + size_t bufferSize = 0; + void* dBuffer = nullptr; + + cusparseErrChk(cusparseSpMV_bufferSize(handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &alpha, matA, vecX, &beta, vecY, + valueType, CUSPARSE_SPMV_BSR_ALG1, &bufferSize)); + + if (bufferSize > 0) { + gpuErrChk(cudaMalloc(&dBuffer, bufferSize)); + } + + cusparseErrChk(cusparseSpMV(handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &alpha, matA, vecX, &beta, vecY, valueType, + CUSPARSE_SPMV_BSR_ALG1, dBuffer)); + + if (dBuffer != nullptr) { + gpuErrChk(cudaFree(dBuffer)); + } + + cusparseErrChk(cusparseDestroyDnVec(vecY)); + cusparseErrChk(cusparseDestroyDnVec(vecX)); + cusparseErrChk(cusparseDestroySpMat(matA)); + cusparseErrChk(cusparseDestroy(handle)); + + prod.DtHTransfer(); } template class CSysMatrix; //This is a temporary fix for invalid instantiations due to separating the member function from the header file the class is defined in. Will try to rectify it in coming commits. diff --git a/meson.build b/meson.build index 0620578a163..f4cf6c215d2 100644 --- a/meson.build +++ b/meson.build @@ -20,10 +20,13 @@ python = pymod.find_installation() if get_option('enable-cuda') add_languages('cuda') add_global_arguments('-arch=sm_86', language : 'cuda') + cuda_deps = [meson.get_compiler('cuda').find_library('cusparse', required : true)] +else + cuda_deps = [] endif su2_cpp_args = [] -su2_deps = [declare_dependency(include_directories: 'externals/CLI11')] +su2_deps = [declare_dependency(include_directories: 'externals/CLI11')] + cuda_deps default_warning_flags = [] if build_machine.system() != 'windows'