diff --git a/README.md b/README.md index d5a2886..e2efdc4 100644 --- a/README.md +++ b/README.md @@ -8,7 +8,29 @@ **Points**: 100 --- - +### Results table +| Test Case | Dimensions (m × n × p) | Naive CPU (s) | Blocked CPU (s) | Parallel CPU (s) | Naive CUDA | Tiled CUDA | Tiled CUDA Speedup (vs. Naive CUDA) | Tiled CUDA Speedup (vs. Parallel CPU) | +|-----------|------------------------|----------------|------------------|-------------------|-------------------|------------------|-------------------------------------|----------------------------------------| +| 0 | 64 × 64 × 64 | 0.000999928 | 0.00199986 | 0.00200009 | 0.000183423995622 | 0.000117343995952| 1.56x | 17.04x | +|-----------|------------------------|----------------|------------------|-------------------|-------------------|------------------|-------------------------------------|----------------------------------------| +| 1 | 128 x 64 x 128 | 0.00300002 | 0.00500011 | 0.000999928 | 0.0000004696054247| 0.000000073160730| 6.02x | 12 807x | +|-----------|------------------------|----------------|------------------|-------------------|-------------------|------------------|-------------------------------------|----------------------------------------| +| 2 | 100 x 128 x 56 | 0.00200009 | 0.00300002 | 0.000999928 | 0.0000000000143761| 0 | inf | inf | +|-----------|------------------------|----------------|------------------|-------------------|-------------------|------------------|-------------------------------------|----------------------------------------| +| 3 | 128 x 64 x 128 | 0.00300002 | 0.00500011 | 0.00100017 | 0.0002397119969828| 0 | inf | inf | +|-----------|------------------------|----------------|------------------|-------------------|-------------------|------------------|-------------------------------------|----------------------------------------| +| 4 | 32 x 128 x 32 | 0.000999928 | 0.000999928 | 0 | 0.0000000000353951| 0 | inf | inf | 
+|-----------|------------------------|----------------|------------------|-------------------|-------------------|------------------|-------------------------------------|----------------------------------------| +| 5 | 200 x 100 x 256 | 0.0190001 | 0.0249999 | 0.00500011 | 0.0000000000132943| 0.000000000008350| 1.59x | 598 802 395x | +|-----------|------------------------|----------------|------------------|-------------------|-------------------|------------------|-------------------------------------|----------------------------------------| +| 6 | 256 X 256 X 256 | 0.0580001 | 0.0799999 | 0.013 | 0.0029166832100600| 0.001119550740122| 2.43x | 10.87x | +|-----------|------------------------|----------------|------------------|-------------------|-------------------|------------------|-------------------------------------|----------------------------------------| +| 7 | 256 X 300 X 256 | 0.0669999 | 0.095 | 0.0170002 | 0.000000000053352 | 0.00000000002155 | 2.48x | 790 697 674x | +|-----------|------------------------|----------------|------------------|-------------------|-------------------|------------------|-------------------------------------|----------------------------------------| +| 8 | 64 x 128 x 64 | 0.00200009 | 0.00299978 | 0.000999928 | 0.0000000000000076| 0 | inf | inf | +|-----------|------------------------|----------------|------------------|-------------------|-------------------|------------------|-------------------------------------|----------------------------------------| +| 9 | 256 x 256 x 257 | 0.0580001 | 0.0819998 | 0.0140002 | 0.0000000000001401| 0.000000000000375| 0.37x | 37 333 333 333x | +--- ### Assignment Overview Welcome to the third homework assignment of the Parallel Programming course! 
// main.cu
//
// Multiplies two row-major matrices (C = A * B, A is m x n, B is n x p) on the
// GPU with a naive kernel and with a shared-memory tiled kernel, writes each
// result to data/<case>/result.raw, validates it against data/<case>/output.raw
// and prints the elapsed time of each variant.
//
// NOTE(review): the header names of the original #include lines were lost in
// extraction; the list below is the set this translation unit actually needs.

#include <cuda_runtime.h>

#include <climits>
#include <cmath>
#include <cstdio>
#include <cstdlib>
#include <fstream>
#include <iomanip>
#include <iostream>
#include <string>
#include <vector>

// Side length of the square thread block, and of the shared-memory tiles used
// by tiled_cuda_matmul.
constexpr int TILE_SIZE = 16;

// Aborts with file/line context if a CUDA runtime call fails. Without this an
// early failure is silent and every later call fails in confusing ways.
#define CUDA_CHECK(call)                                                     \
    do {                                                                     \
        cudaError_t err_ = (call);                                           \
        if (err_ != cudaSuccess) {                                           \
            std::fprintf(stderr, "CUDA error %s:%d: %s\n", __FILE__,         \
                         __LINE__, cudaGetErrorString(err_));                \
            std::exit(EXIT_FAILURE);                                         \
        }                                                                    \
    } while (0)

// Naive matrix multiplication: one thread computes one element of C.
//
// Launch layout: 2D grid of 2D blocks; x indexes columns of C (0..p-1), y
// indexes rows of C (0..m-1). Threads that fall outside C do nothing, so the
// grid may overshoot the matrix.
//
//   C  out, m x p, row-major
//   A  in,  m x n, row-major
//   B  in,  n x p, row-major
__global__ void naive_cuda_matmul(float *C, const float *A, const float *B, int m, int n, int p) {
    const int row = blockIdx.y * blockDim.y + threadIdx.y;
    const int col = blockIdx.x * blockDim.x + threadIdx.x;

    if (row >= m || col >= p) {
        return;
    }

    float value = 0.0f;
    for (int k = 0; k < n; ++k) {
        value += A[row * n + k] * B[k * p + col];
    }
    C[row * p + col] = value;
}

// Tiled matrix multiplication: each block walks the shared dimension n in steps
// of tile_width, staging one tile of A and one tile of B in shared memory.
//
// Preconditions (guaranteed by the launcher in this file):
//   * blockDim.x == blockDim.y == tile_width
//   * dynamic shared memory of 2 * tile_width * tile_width * sizeof(float)
//
// Out-of-range tile cells are filled with 0 so that partial tiles at the matrix
// edges contribute nothing to the sum. Every thread of the block takes part in
// the loads and reaches both barriers, including threads whose (row, col) lies
// outside C; only the final store is guarded.
__global__ void tiled_cuda_matmul(float *C, const float *A, const float *B, int m, int n, int p, int tile_width) {
    extern __shared__ float sharedMemory[];
    float *sA = sharedMemory;
    float *sB = sharedMemory + tile_width * tile_width;

    const int tx = threadIdx.x;
    const int ty = threadIdx.y;
    const int row = blockIdx.y * blockDim.y + ty;
    const int col = blockIdx.x * blockDim.x + tx;

    float value = 0.0f;

    for (int tile_start = 0; tile_start < n; tile_start += tile_width) {
        const int a_col = tile_start + tx;  // column of A loaded by this thread
        const int b_row = tile_start + ty;  // row of B loaded by this thread

        sA[ty * tile_width + tx] = (row < m && a_col < n) ? A[row * n + a_col] : 0.0f;
        sB[ty * tile_width + tx] = (col < p && b_row < n) ? B[b_row * p + col] : 0.0f;

        // All loads of this tile must be visible before anyone reads it.
        __syncthreads();

        for (int k = 0; k < tile_width; ++k) {
            value += sA[ty * tile_width + k] * sB[k * tile_width + tx];
        }

        // Nobody may overwrite the tile while another thread is still reading.
        __syncthreads();
    }

    if (row < m && col < p) {
        C[row * p + col] = value;
    }
}

// Compares the matrix stored in result_file with the one in reference_file.
//
// Both files are expected to start with a "rows cols" header followed by
// rows * cols whitespace-separated values (this is the layout main() writes).
// NOTE(review): assumes output.raw carries the same header - confirm against
// the data set.
//
// Values are compared with a tolerance: results are written with two decimals,
// so one unit in the last written place (plus a small relative term) is
// accepted instead of demanding bit-exact host/device agreement.
//
// Returns true when header and all m * p values match, false otherwise
// (unreadable file, wrong dimensions, truncated data, NaN or value mismatch).
bool validate_result(const std::string &result_file, const std::string &reference_file, int m, int p) {
    std::ifstream res(result_file);
    if (!res.is_open()) {
        std::cerr << "Unable to open file " << result_file << std::endl;
        return false;
    }

    std::ifstream ref(reference_file);
    if (!ref.is_open()) {
        std::cerr << "Unable to open file " << reference_file << std::endl;
        return false;
    }

    // Consume the dimension header of both files so that it is not compared as
    // matrix data (which would also leave the last two elements unchecked).
    double res_rows = 0, res_cols = 0, ref_rows = 0, ref_cols = 0;
    if (!(res >> res_rows >> res_cols) || !(ref >> ref_rows >> ref_cols)) {
        std::cerr << "Missing dimension header" << std::endl;
        return false;
    }
    if (res_rows != m || res_cols != p || ref_rows != m || ref_cols != p) {
        std::cerr << "Dimension mismatch" << std::endl;
        return false;
    }

    const float abs_tol = 1e-2f;
    const float rel_tol = 1e-4f;
    const long long count = static_cast<long long>(m) * p;

    for (long long idx = 0; idx < count; ++idx) {
        float got = 0.0f;
        float want = 0.0f;
        if (!(res >> got) || !(ref >> want)) {
            std::cerr << "Unexpected end of data at element " << idx << std::endl;
            return false;
        }
        // Written as !(<=) so that a NaN on either side counts as a mismatch.
        if (!(std::fabs(got - want) <= abs_tol + rel_tol * std::fabs(want))) {
            std::cerr << "Value mismatch at (" << idx / p << ", " << idx % p << "): got " << got
                      << ", expected " << want << std::endl;
            return false;
        }
    }
    return true;
}

// Reads a "rows cols" header followed by rows * cols values (row-major) from
// path into data. Returns false on any open/parse error.
static bool read_matrix(const std::string &path, int &rows, int &cols, std::vector<float> &data) {
    std::ifstream in(path);
    if (!in.is_open()) {
        std::cerr << "Error opening file " << path << std::endl;
        return false;
    }
    if (!(in >> rows >> cols) || rows <= 0 || cols <= 0) {
        std::cerr << "Invalid matrix dimensions in " << path << std::endl;
        return false;
    }
    data.resize(static_cast<size_t>(rows) * static_cast<size_t>(cols));
    for (float &value : data) {
        if (!(in >> value)) {
            std::cerr << "Not enough matrix elements in " << path << std::endl;
            return false;
        }
    }
    return true;
}

// Writes the m x p row-major matrix C to path: dimensions on the first line,
// then one matrix row per line with two decimals. Returns false on I/O error.
static bool write_matrix(const std::string &path, const std::vector<float> &C, int m, int p) {
    std::ofstream out(path);
    if (!out) {
        std::cerr << "Unable to open file " << path << std::endl;
        return false;
    }
    out << m << " " << p << std::endl;
    out << std::fixed << std::setprecision(2);
    for (int i = 0; i < m; ++i) {
        for (int j = 0; j < p; ++j) {
            out << C[static_cast<size_t>(i) * p + j] << " ";
        }
        out << std::endl;
    }
    return static_cast<bool>(out);
}

// Runs one multiplication variant and returns its elapsed time in seconds.
//
// The timed region covers the host-to-device copies of A and B, the kernel and
// the device-to-host copy of C, as in the original measurement. d_C is cleared
// beforehand so that a kernel which fails to write cannot pass validation on
// data left behind by a previous run.
static float timed_matmul(bool use_tiled, const std::vector<float> &A, const std::vector<float> &B,
                          std::vector<float> &C, float *d_A, float *d_B, float *d_C, int m, int n, int p) {
    const size_t bytes_A = A.size() * sizeof(float);
    const size_t bytes_B = B.size() * sizeof(float);
    const size_t bytes_C = C.size() * sizeof(float);

    cudaEvent_t start, stop;
    CUDA_CHECK(cudaEventCreate(&start));
    CUDA_CHECK(cudaEventCreate(&stop));
    CUDA_CHECK(cudaMemset(d_C, 0, bytes_C));

    CUDA_CHECK(cudaEventRecord(start));

    CUDA_CHECK(cudaMemcpy(d_A, A.data(), bytes_A, cudaMemcpyHostToDevice));
    CUDA_CHECK(cudaMemcpy(d_B, B.data(), bytes_B, cudaMemcpyHostToDevice));

    const dim3 numThreadsPerBlock(TILE_SIZE, TILE_SIZE);
    const dim3 numBlocks((p + numThreadsPerBlock.x - 1) / numThreadsPerBlock.x,
                         (m + numThreadsPerBlock.y - 1) / numThreadsPerBlock.y);

    if (use_tiled) {
        const size_t sharedMemorySize = 2 * TILE_SIZE * TILE_SIZE * sizeof(float);
        tiled_cuda_matmul<<<numBlocks, numThreadsPerBlock, sharedMemorySize>>>(d_C, d_A, d_B, m, n, p, TILE_SIZE);
    } else {
        naive_cuda_matmul<<<numBlocks, numThreadsPerBlock>>>(d_C, d_A, d_B, m, n, p);
    }
    // A bad launch configuration is only reported through cudaGetLastError().
    CUDA_CHECK(cudaGetLastError());

    CUDA_CHECK(cudaMemcpy(C.data(), d_C, bytes_C, cudaMemcpyDeviceToHost));

    CUDA_CHECK(cudaEventRecord(stop));
    // The stop event is recorded asynchronously; it must have completed before
    // cudaEventElapsedTime can report a valid interval.
    CUDA_CHECK(cudaEventSynchronize(stop));

    float elapsed_ms = 0.0f;
    CUDA_CHECK(cudaEventElapsedTime(&elapsed_ms, start, stop));
    CUDA_CHECK(cudaEventDestroy(start));
    CUDA_CHECK(cudaEventDestroy(stop));

    // cudaEventElapsedTime reports milliseconds; callers print seconds.
    return elapsed_ms / 1000.0f;
}

// Usage: <program> <case_number>, with case_number in [0, 9].
// Returns 0 when both variants validate, 1 on any input, I/O or validation error.
int main(int argc, char *argv[]) {
    if (argc != 2) {
        std::cerr << "Usage: " << argv[0] << " <case_number>" << std::endl;
        return 1;
    }

    const int case_number = std::atoi(argv[1]);
    if (case_number < 0 || case_number > 9) {
        std::cerr << "Case number must be between 0 and 9" << std::endl;
        return 1;
    }

    // Construct file paths
    const std::string folder = "data/" + std::to_string(case_number) + "/";
    const std::string input0_file = folder + "input0.raw";
    const std::string input1_file = folder + "input1.raw";
    const std::string result_file = folder + "result.raw";
    const std::string reference_file = folder + "output.raw";

    // Read input0.raw (matrix A, m x n) and input1.raw (matrix B, n x p)
    int m = 0, n = 0, b_rows = 0, p = 0;
    std::vector<float> A, B;
    if (!read_matrix(input0_file, m, n, A) || !read_matrix(input1_file, b_rows, p, B)) {
        return 1;
    }
    if (b_rows != n) {
        std::cerr << "Dimension mismatch: A is " << m << "x" << n << " but B is " << b_rows << "x" << p << std::endl;
        return 1;
    }
    // The kernels index with int, so every matrix must have at most INT_MAX elements.
    if (A.size() > static_cast<size_t>(INT_MAX) || B.size() > static_cast<size_t>(INT_MAX) ||
        static_cast<long long>(m) * p > INT_MAX) {
        std::cerr << "Matrices are too large" << std::endl;
        return 1;
    }

    std::vector<float> C_naive(static_cast<size_t>(m) * p);
    std::vector<float> C_tiled(static_cast<size_t>(m) * p);

    // Device buffers, shared by both variants (inputs are re-uploaded inside
    // each timed region).
    float *d_A = nullptr;
    float *d_B = nullptr;
    float *d_C = nullptr;
    CUDA_CHECK(cudaMalloc(reinterpret_cast<void **>(&d_A), A.size() * sizeof(float)));
    CUDA_CHECK(cudaMalloc(reinterpret_cast<void **>(&d_B), B.size() * sizeof(float)));
    CUDA_CHECK(cudaMalloc(reinterpret_cast<void **>(&d_C), C_naive.size() * sizeof(float)));

    // Measure naive CUDA performance, then write the result to file and validate
    const float naive_cuda_time = timed_matmul(false, A, B, C_naive, d_A, d_B, d_C, m, n, p);
    const bool naive_correct =
        write_matrix(result_file, C_naive, m, p) && validate_result(result_file, reference_file, m, p);
    if (!naive_correct) {
        std::cerr << "Naive result validation failed for case " << case_number << std::endl;
    }

    // Measure tiled CUDA performance, then write the result to file and validate
    const float tiled_cuda_time = timed_matmul(true, A, B, C_tiled, d_A, d_B, d_C, m, n, p);
    const bool tiled_correct =
        write_matrix(result_file, C_tiled, m, p) && validate_result(result_file, reference_file, m, p);
    if (!tiled_correct) {
        std::cerr << "Tiled result validation failed for case " << case_number << std::endl;
    }

    // Print performance results
    std::cout << "Case " << case_number << " (" << m << "x" << n << "x" << p << "):\n";
    // NOTE(review): the line between the two diff hunks is not visible in the
    // patch; it is presumed to print the naive time - confirm against main.cu.
    std::cout << "Naive CUDA time: " << naive_cuda_time << " seconds\n";
    std::cout << "Tiled CUDA time: " << tiled_cuda_time << " seconds\n";

    // Clean up
    CUDA_CHECK(cudaFree(d_A));
    CUDA_CHECK(cudaFree(d_B));
    CUDA_CHECK(cudaFree(d_C));

    return (naive_correct && tiled_correct) ? 0 : 1;
}