From c4cbdc8b7d9f558d2d410de27e49f5c25b681a25 Mon Sep 17 00:00:00 2001
From: LwhJesse <256257451+LwhJesse@users.noreply.github.com>
Date: Sun, 17 May 2026 15:41:33 +0800
Subject: [PATCH] Use cuSPARSE BSR SpMV for CUDA matrix-vector products

---
 Common/src/linear_algebra/CSysMatrixGPU.cu | 128 +++++++++++++++------
 meson.build                                |   5 +-
 2 files changed, 95 insertions(+), 38 deletions(-)
diff --git a/Common/src/linear_algebra/CSysMatrixGPU.cu b/Common/src/linear_algebra/CSysMatrixGPU.cu
index 7e0c81ca54f..a3f1a77ca40 100644
--- a/Common/src/linear_algebra/CSysMatrixGPU.cu
+++ b/Common/src/linear_algebra/CSysMatrixGPU.cu
@@ -28,32 +28,43 @@
 #include "../../include/linear_algebra/CSysMatrix.hpp"
 #include "../../include/linear_algebra/GPUComms.cuh"
 
-template<typename matrixType, typename vectorType>
-__global__ void GPUMatrixVectorProductAdd(matrixType* matrix, vectorType* vec, vectorType* prod, const unsigned long* d_row_ptr, const unsigned long* d_col_ind, unsigned long nPointDomain, unsigned long nVar, unsigned long nEqn)
-{
-   int row = (blockIdx.x * blockDim.x + threadIdx.x)/32;
-   int threadNo = threadIdx.x%32;
-   int activeThreads = nVar * (32/nVar);
-
-   int blockRow = (threadNo/nVar)%nVar;
-
-   if(row<nPointDomain && threadNo<activeThreads) prod[row * nVar + blockRow] = 0.0;
-
-   __syncthreads();
-
-   if(row<nPointDomain && threadNo<activeThreads)
-   {
-      vectorType res = 0.0;
+#include <cusparse.h>
+#include <cstdint>
+#include <type_traits>
+
+inline void cusparseAssert(cusparseStatus_t code, const char* file, int line, bool abort = true) {
+  if (code != CUSPARSE_STATUS_SUCCESS) {
+    fprintf(stderr, "cuSPARSEassert: %s %s %d\n", cusparseGetErrorString(code), file, line);
+    if (abort) exit(static_cast<int>(code));
+  }
+}
 
-      for(int index = d_row_ptr[row] * nVar * nEqn + threadNo; index < d_row_ptr[row+1] * nVar * nEqn; index+=activeThreads)
-      {
-         int blockCol = index%nEqn;
-         int blockNo = index/(nVar * nEqn);
-         res += matrix[index] * vec[(d_col_ind[blockNo])*nVar + blockCol];
-      }
+#define cusparseErrChk(ans)                    \
+  {                                            \
+    cusparseAssert((ans), __FILE__, __LINE__); \
+  }
+
+inline cusparseIndexType_t GetCusparseIndexType() {
+  if constexpr (sizeof(unsigned long) == 4) {
+    return CUSPARSE_INDEX_32I;
+  } else if constexpr (sizeof(unsigned long) == 8) {
+    return CUSPARSE_INDEX_64I;
+  } else {
+    static_assert(sizeof(unsigned long) == 4 || sizeof(unsigned long) == 8,
+                  "cuSPARSE BSR SpMV only supports 32-bit or 64-bit index arrays in this path.");
+  }
+}
 
-      atomicAdd(&prod[row * nVar + blockRow], res);
-   }
+template <class ScalarType>
+constexpr cudaDataType GetCudaDataType() {
+  if constexpr (std::is_same<ScalarType, float>::value) {
+    return CUDA_R_32F;
+  } else if constexpr (std::is_same<ScalarType, double>::value) {
+    return CUDA_R_64F;
+  } else {
+    static_assert(std::is_same<ScalarType, float>::value || std::is_same<ScalarType, double>::value,
+                  "cuSPARSE BSR SpMV only supports float and double in this path.");
+  }
 }
 
 template<class ScalarType>
@@ -62,26 +73,69 @@ void CSysMatrix<ScalarType>::HtDTransfer(bool trigger) const
    if(trigger) gpuErrChk(cudaMemcpy((void*)(d_matrix), (void*)&matrix[0], (sizeof(ScalarType)*nnz*nVar*nEqn), cudaMemcpyHostToDevice));
 }
 
-template<class ScalarType>
+template <class ScalarType>
 void CSysMatrix<ScalarType>::GPUMatrixVectorProduct(const CSysVector<ScalarType>& vec, CSysVector<ScalarType>& prod,
-                                                 CGeometry* geometry, const CConfig* config) const
-                                                 {
+                                                    CGeometry* geometry, const CConfig* config) const {
+  if (nVar != nEqn) {
+    SU2_MPI::Error("CUDA CSysMatrix matvec with cuSPARSE BSR requires square blocks.", CURRENT_FUNCTION);
+  }
 
-   ScalarType* d_vec = vec.GetDevicePointer();
-   ScalarType* d_prod = prod.GetDevicePointer();
+  ScalarType* d_vec = vec.GetDevicePointer();
+  ScalarType* d_prod = prod.GetDevicePointer();
 
-   vec.HtDTransfer();
-   prod.GPUSetVal(0.0);
+  vec.HtDTransfer();
 
-  dim3 blockDim(KernelParameters::MVP_BLOCK_SIZE,1,1);
-  int gridx = KernelParameters::round_up_division(KernelParameters::MVP_WARP_SIZE, nPointDomain);
-  dim3 gridDim(gridx, 1, 1);
+  const auto indexType = GetCusparseIndexType();
+  const auto valueType = GetCudaDataType<ScalarType>();
 
-  GPUMatrixVectorProductAdd<<<gridDim, blockDim>>>(d_matrix, d_vec, d_prod, d_row_ptr, d_col_ind, nPointDomain, nVar, nEqn);
-  gpuErrChk( cudaPeekAtLastError() );
+  const std::int64_t blockSize = static_cast<std::int64_t>(nVar);
 
-  prod.DtHTransfer();
+  const std::int64_t brows = static_cast<std::int64_t>(nPointDomain);
+  const std::int64_t bcols = static_cast<std::int64_t>(nPoint);
+  const std::int64_t bnnz = static_cast<std::int64_t>(nnz);
+
+  const std::int64_t xSize = static_cast<std::int64_t>(nPoint) * blockSize;
+  const std::int64_t ySize = static_cast<std::int64_t>(nPointDomain) * blockSize;
+
+  const ScalarType alpha = 1.0;
+  const ScalarType beta = 0.0;
 
+  cusparseHandle_t handle = nullptr;
+  cusparseConstSpMatDescr_t matA = nullptr;
+  cusparseDnVecDescr_t vecX = nullptr;
+  cusparseDnVecDescr_t vecY = nullptr;
+
+  cusparseErrChk(cusparseCreate(&handle));
+
+  cusparseErrChk(cusparseCreateConstBsr(&matA, brows, bcols, bnnz, blockSize, blockSize, d_row_ptr, d_col_ind, d_matrix,
+                                        indexType, indexType, CUSPARSE_INDEX_BASE_ZERO, valueType, CUSPARSE_ORDER_ROW));
+
+  cusparseErrChk(cusparseCreateDnVec(&vecX, xSize, d_vec, valueType));
+  cusparseErrChk(cusparseCreateDnVec(&vecY, ySize, d_prod, valueType));
+
+  size_t bufferSize = 0;
+  void* dBuffer = nullptr;
+
+  cusparseErrChk(cusparseSpMV_bufferSize(handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &alpha, matA, vecX, &beta, vecY,
+                                         valueType, CUSPARSE_SPMV_BSR_ALG1, &bufferSize));
+
+  if (bufferSize > 0) {
+    gpuErrChk(cudaMalloc(&dBuffer, bufferSize));
+  }
+
+  cusparseErrChk(cusparseSpMV(handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &alpha, matA, vecX, &beta, vecY, valueType,
+                              CUSPARSE_SPMV_BSR_ALG1, dBuffer));
+
+  if (dBuffer != nullptr) {
+    gpuErrChk(cudaFree(dBuffer));
+  }
+
+  cusparseErrChk(cusparseDestroyDnVec(vecY));
+  cusparseErrChk(cusparseDestroyDnVec(vecX));
+  cusparseErrChk(cusparseDestroySpMat(matA));
+  cusparseErrChk(cusparseDestroy(handle));
+
+  prod.DtHTransfer();
 }
 
 template class CSysMatrix<su2mixedfloat>; //This is a temporary fix for invalid instantiations due to separating the member function from the header file the class is defined in. Will try to rectify it in coming commits.
diff --git a/meson.build b/meson.build
index 0620578a163..f4cf6c215d2 100644
--- a/meson.build
+++ b/meson.build
@@ -20,10 +20,13 @@ python = pymod.find_installation()
 if get_option('enable-cuda')
   add_languages('cuda')
   add_global_arguments('-arch=sm_86', language : 'cuda')
+  cuda_deps = [meson.get_compiler('cuda').find_library('cusparse', required : true)]
+else
+  cuda_deps = []
 endif
 
 su2_cpp_args = []
-su2_deps     = [declare_dependency(include_directories: 'externals/CLI11')]
+su2_deps     = [declare_dependency(include_directories: 'externals/CLI11')] + cuda_deps
 
 default_warning_flags = []
 if build_machine.system() != 'windows'