diff --git a/.vscode/settings.json b/.vscode/settings.json new file mode 100644 index 0000000..ce48fa6 --- /dev/null +++ b/.vscode/settings.json @@ -0,0 +1,5 @@ +{ + "files.associations": { + "iosfwd": "cpp" + } +} \ No newline at end of file diff --git a/README.md b/README.md index d5a2886..984ee67 100644 --- a/README.md +++ b/README.md @@ -9,199 +9,29 @@ --- -### Assignment Overview - -Welcome to the third homework assignment of the Parallel Programming course! -In Assignment 2, you optimized matrix multiplication using cache-friendly blocked multiplication and OpenMP for CPU -parallelism. In this assignment, you will take matrix multiplication to the GPU using **CUDA**, NVIDIA’s parallel -computing platform. Your task is to implement matrix multiplication on the GPU, optimize it using CUDA-specific -techniques, and compare its performance with your CPU-based implementations from Assignment 2. - -You will implement: - -1. **Naive CUDA Matrix Multiplication**: A basic GPU implementation using CUDA kernels. -2. **Tiled CUDA Matrix Multiplication**: An optimized version using shared memory to improve memory access patterns. -3. **Performance Comparison**: Measure and compare the performance of both CUDA implementations against your Assignment - 2 implementations (naive, blocked, and parallel). - -This assignment introduces CUDA programming, including kernel launches, thread grids, blocks, and memory management, -while reinforcing the importance of data locality and parallelism. - ---- - -### Technical Requirements - -#### 1. Naive CUDA Matrix Multiplication - -**Why CUDA?** - -CUDA allows you to execute parallel computations on NVIDIA GPUs, which have thousands of cores designed for -data-parallel tasks. Matrix multiplication is an ideal workload for GPUs because it involves independent computations -for each element of the output matrix. - -In the naive CUDA implementation, each thread computes one element of the output matrix \( C \). 
The GPU organizes -threads into a grid of thread blocks, where each block contains a group of threads (e.g., 16x16 threads). - -**Naive CUDA Matrix Multiplication** - -Assume matrices \( A \) \( m x n \), \( B \) \( n x p \), and \( C \) \( m x p \) are stored in -row-major order in GPU global memory: - -```c -__global__ void naive_cuda_matmul(float *C, float *A, float *B, uint32_t m, uint32_t n, uint32_t p) { - -} -``` - -- **Grid and Block Configuration**: Launch a 2D grid of 2D thread blocks (e.g., 16x16 threads per block). -- **Memory**: Matrices are stored in GPU global memory. Use `cudaMalloc` and `cudaMemcpy` to allocate and transfer data - between host (CPU) and device (GPU). -- **Task**: Implement the `naive_cuda_matmul` kernel and its host code in the provided `main.cu`. Measure the wall clock - time, including data transfer times (host-to-device and device-to-host). - -#### 2. Tiled CUDA Matrix Multiplication - -**Why Tiling?** - -The naive CUDA implementation accesses global memory frequently, which is slow (hundreds of cycles per access). CUDA -GPUs have **shared memory**, a fast, on-chip memory shared by threads in a block. Tiled matrix multiplication divides -matrices into tiles (submatrices) that fit into shared memory, reducing global memory accesses and improving -performance. - -**Tiled CUDA Matrix Multiplication** - -Assume a tile size of `TILE_WIDTH` (e.g., 16 or 32): - -```c -__global__ void tiled_cuda_matmul(float *C, float *A, float *B, uint32_t m, uint32_t n, uint32_t p, uint32_t tile_width) { - -} -``` - -- **Shared Memory**: Each block loads tiles of \( A \) and \( B \) into shared memory, computes partial results, and - accumulates the sum. -- **Synchronization**: Use `__syncthreads()` to ensure all threads in a block have loaded data before computation. -- **Task**: Implement the `tiled_cuda_matmul` kernel and its host code in `main.cu`. Experiment with different tile - sizes (e.g., 16, 32) and report the best performance. - -#### 3. 
Performance Measurement - -For each test case (0 through 9, using the same `data` folder from Assignment 2): - -- Measure the wall clock time for: - - **Naive CUDA matrix multiplication** (`naive_cuda_matmul`), including data transfer times. - - **Tiled CUDA matrix multiplication** (`tiled_cuda_matmul`), including data transfer times. -- Compare with Assignment 2 results (naive, blocked, and parallel CPU implementations). -- Use `cudaEventRecord` and `cudaEventElapsedTime` for accurate GPU timing. -- Report the times in a table in your `README.md`, including: - - Test case number. - - Matrix dimensions (\( m \times n \times p \)). - - Wall clock time for naive CUDA, tiled CUDA, and Assignment 2 implementations (in seconds). - - Speedup of tiled CUDA over naive CUDA and over Assignment 2’s parallel implementation. - -**Example Table Format**: - | Test Case | Dimensions (\( m \times n \times p \)) | Naive CPU (s) | Blocked CPU (s) | Parallel CPU (s) | Naive CUDA (s) | Tiled CUDA (s) | Tiled CUDA Speedup (vs. Naive CUDA) | Tiled CUDA Speedup (vs. Parallel CPU) | |-----------|----------------------------------------|---------------|-----------------|------------------|----------------|----------------|-------------------------------------|---------------------------------------| -| | | | | | | | | | - ---- - -### Matrix Storage and Memory Management - -- Continue using row-major order for matrices. -- Use CUDA memory management (`cudaMalloc`, `cudaMemcpy`, `cudaFree`) for GPU data. -- Reuse the same input/output format as Assignment 2: - - Input files: `data//input0.raw` (matrix \( A \)) and `input1.raw` (matrix \( B \)). - - Output file: `data//result.raw` (matrix \( C \)). - - Reference file: `data//output.raw` for validation. - ---- - -### Build Instructions - -- Use the provided `CMakeLists.txt`, which includes CUDA support. -- **Requirements**: - - NVIDIA GPU with CUDA support. - - CUDA Toolkit installed (version 11.x or later recommended). 
- - CMake with CUDA language support. -- **Linux/Mac**: - - Run `cmake -DCMAKE_CUDA_COMPILER=nvcc .` to generate a Makefile, then `make`. -- **Windows**: - - Use Visual Studio with CUDA toolkit or MinGW with `cmake -G "MinGW Makefiles"`. -- Test with the same test cases (0–9) as Assignment 2. - ---- - -### Submission Requirements - -#### Fork and Clone the Repository - -- Fork the Assignment 3 repository (provided separately). -- Clone your fork: - ```bash - git clone https://github.com/parallelcomputingabo/Homework-3.git - cd Homework-3 - ``` - -#### Create a New Branch - -```bash -git checkout -b student-name -``` - -#### Implement Your Solution - -- Modify the provided `main.cu` to implement `naive_cuda_matmul` and `tiled_cuda_matmul`. -- Update `README.md` with your performance results table. - -#### Commit and Push - -```bash -git add . -git commit -m "student-name: Implemented CUDA matrix multiplication" -git push origin student-name -``` - -#### Submit a Pull Request (PR) - -- Create a pull request from your branch to the base repository’s `main` branch. -- Include a description of your CUDA optimizations and any challenges faced. - ---- - -### Grading (100 Points Total) - -| Subtask | Points | -|-----------------------------------------------|---------| -| Correct implementation of `naive_cuda_matmul` | 30 | -| Correct implementation of `tiled_cuda_matmul` | 30 | -| Accurate performance measurements | 20 | -| Performance results table in `README.md` | 10 | -| Code clarity, commenting, and organization | 10 | -| **Total** | **100** | - ---- - -### Tips for Success - -- **Naive CUDA**: - - Ensure correct grid and block dimensions (e.g., `dim3 threadsPerBlock(16, 16)`). - - Check for CUDA errors using `cudaGetLastError` and `cudaDeviceSynchronize`. -- **Tiled CUDA**: - - Experiment with tile sizes (e.g., 16, 32) to balance shared memory usage and thread divergence. - - Minimize shared memory bank conflicts by ensuring contiguous thread access. 
-- **Performance**: - - Include data transfer times in measurements, as they are significant for GPU workloads. - - Run multiple iterations per test case to reduce timing variability. -- **Debugging**: - - Validate CUDA results against `output.raw` to ensure correctness. - - Use small matrices for initial testing (e.g., 64x64). - - Check CUDA documentation for memory management and kernel launch syntax. - ---- - - +| 0 | 64x64x64 | 0 | 0 | 0 | 0.15 | 0.135 | 1.11 | 0 | +| 1 | 128x64x128 | 0.00199986 | 0.00200009 | 0.000999928 | 0.18 | 0.2 | 0.9 | 0 | +| 2 | 100x128x56 | 0.00199986 | 0.00200009 | 0.00100017 | 0.15 | 0.172 | 0.87 | 0 | +| 3 | 128x64x128 | 0.00300002 | 0.00199986 | 0.000999928 | 0.167 | 0.193 | 0.86 | 0 | +| 4 | 32x128x32 | 0 | 0 | 0 | 0.125 | 0.113 | 0.904 | 0 | +| 5 | 200x100x256 | 0.0120001 | 0.0109999 | 0.00399995 | 0.279 | 0.473 | 0 | 0 | +| 6 | 256x256x256 | 0.036 | 0.0419998 | 0.00600004 | 0.385 | 0.88 | 0.589 | 0 | +| 7 | 256x300x256 | 0.0409999 | 0.043 | 0.00999999 | 0.386 | 0.982 | 0.393 | 0 | +| 8 | 64x128x64 | 0.000999928 | 0.00200009 | 0.000999928 | 0.141 | 0.164 | 0.859 | 0 | +| 9 | 256x256x257 | 0.0319998 | 0.036 | 0.0079999 | 0.369 | 0.905 | 0.407 | 0 | + +--- +## Some extra explanation about the assignment +### Files +The submission contains the output of the calculation nodes as a text file, the executable, the CUDA source file, and the bash file for the job run. +### Tiled implementation +The implementation of the tiled multiplication has been borrowed from [here](https://medium.com/@dhanushg295/mastering-cuda-matrix-multiplication-an-introduction-to-shared-memory-tile-memory-coalescing-and-d7979499b9c5). Two values (16 and 32) have been tested for the TILE_WIDTH; however, 32 didn't work and was giving wrong answers. It may be too large for the shared memory. +### Transfer time +Transfer time has been measured separately and added to the calculation time. It includes the time to copy from host to device and vice versa. The separate times are shown in the text output.
+### Timings and the table +I don't know why but my naive is faster in almost all of the scenarios. Even though I checked the implementation with other resources. +The timings of assignment2, seem much better, I don't know what is the reason for that, it might be an issue with the timer in one of the versions but due to this difference, speed up of GPU was meaningless and close to zero for all of the cases. -Good luck, and enjoy accelerating matrix multiplication with CUDA! diff --git a/main b/main new file mode 100644 index 0000000..f420150 Binary files /dev/null and b/main differ diff --git a/main.cu b/main.cu index 50df6a1..2bba477 100644 --- a/main.cu +++ b/main.cu @@ -2,42 +2,210 @@ #include #include #include +#include +#include +#include +#define USECPSEC 1000000ULL +const int TILE_WIDTH=16; +using namespace std; __global__ void naive_cuda_matmul(float *C, float *A, float *B, uint32_t m, uint32_t n, uint32_t p) { // TODO: Implement naive CUDA matrix multiplication + int row=blockIdx.y*blockDim.y+threadIdx.y; + int col=blockIdx.x*blockDim.x+threadIdx.x; + if(row>temp1; + fptr_refer>>temp2; + // I used deficit because of precision issue with floating numbers + if(abs(temp1-temp2)>=0.5) { + cout<<"Failed comparison: "<" << std::endl; + return 1; + } - // TODO: Use cudaMalloc and cudaMemcpy for GPU memory + int case_number = std::atoi(argv[1]); + if (case_number < 0 || case_number > 9) { + std::cerr << "Case number must be between 0 and 9" << std::endl; + return 1; + } + // Construct file paths + std::string folder = "data/" + std::to_string(case_number) + "/"; + std::string input0_file = folder + "input0.raw"; + std::string input1_file = folder + "input1.raw"; + std::string result_file = folder + "result.raw"; + std::string reference_file = folder + "output.raw"; + int m, n, p; // A is m x n, B is n x p, C is m x p + + + + // TODO Read input0.raw (matrix A) + ifstream fptr_input0(input0_file); + fptr_input0>>m>>n; + + float* A = new float[m*n]; + for(int 
i=0;i>A[i*n+j]; + // TODO Read input1.raw (matrix B) + ifstream fptr_input1(input1_file); + fptr_input1>>n>>p; + float* B = new float[n*p]; + for(int i=0;i>B[i*p+j]; + // Allocate memory for result matrices + float *C_naive = new float[m * p]; + float *C_tiled = new float[m * p]; + + // TODO: Use cudaMalloc and cudaMemcpy for GPU memory + float *A_d,*B_d,*C_d; + cudaMalloc((void**)&A_d,m*n*sizeof(float)); + cudaMalloc((void**)&B_d,n*p*sizeof(float)); + cudaMalloc((void**)&C_d,m*p*sizeof(float)); + // + int host_to_device_transfer_time=myCPUTimer(); + cudaMemcpy(A_d,A,m*n*sizeof(float),cudaMemcpyHostToDevice); + cudaMemcpy(B_d,B,n*p*sizeof(float),cudaMemcpyHostToDevice); + host_to_device_transfer_time=myCPUTimer()-host_to_device_transfer_time; // Measure naive CUDA performance // TODO: Launch naive_cuda_matmul kernel - + dim3 numThreadsPerBlock(16, 16); + dim3 numBlocks((p + numThreadsPerBlock.x - 1)/numThreadsPerBlock.x, + (m + numThreadsPerBlock.y - 1)/numThreadsPerBlock.y+1000); + int start_time = myCPUTimer(); + naive_cuda_matmul <<< numBlocks, numThreadsPerBlock >>> + (C_d, A_d, B_d, m, n, p); + cudaError_t err=cudaDeviceSynchronize(); + int naive_cuda_time = myCPUTimer()-start_time; + error_check(err); // TODO: Write naive CUDA result to file and validate // Measure tiled CUDA performance - + + // Write multiplication result to the file + int device_to_host_transfer_time=myCPUTimer(); + cudaMemcpy(C_naive,C_d,m*p*sizeof(float),cudaMemcpyDeviceToHost); + device_to_host_transfer_time=myCPUTimer()-device_to_host_transfer_time; + ofstream fptr_result(result_file); + fptr_result.close(); + fptr_result.clear(); + fptr_result.open(result_file); + fptr_result << m<<" "<

>> + (C_d, A_d, B_d, m, n, p,TILE_WIDTH); + err=cudaDeviceSynchronize(); + int tiled_cuda_time=myCPUTimer()-start_time; + error_check(err); // TODO: Write tiled CUDA result to file and validate + cudaMemcpy(C_tiled,C_d,m*p*sizeof(float),cudaMemcpyDeviceToHost); + fptr_result.close(); + fptr_result.clear(); + fptr_result.open(result_file); + fptr_result << m<<" "<