diff --git a/README.md b/README.md index d5a2886..d756ced 100644 --- a/README.md +++ b/README.md @@ -205,3 +205,29 @@ git push origin student-name Good luck, and enjoy accelerating matrix multiplication with CUDA! +## Results + +| Test Case | Dimensions (m × n × p) | Naive CPU (s) | Blocked CPU (s) | Parallel CPU (s) | Naive CUDA (s) | Tiled CUDA (s) | Tiled CUDA Speedup (vs. Naive CUDA) | Tiled CUDA Speedup (vs. Parallel CPU) | +|-----------|------------------------|---------------|-----------------|------------------|----------------|----------------|-------------------------------------|---------------------------------------| +| 1 | 64 × 64 × 64 | 0.000771433 | 0.000670545 | 0.000350771 | 0.000048128 | 0.00001856 | 2.59× | 18.899× | +| 2 | 64 × 64 × 64 | 0.0006139 | 0.000563352 | 0.00023961 | 0.000048128 | 0.00001856 | 2.59× | 12.91× | +| 3 | 128 × 64 × 128 | 0.0022845 | 0.00227195 | 0.000763971 | 0.000050176 | 0.000017824 | 2.815× | 42.86× | +| 4 | 128 × 64 × 128 | 0.00295904 | 0.00237161 | 0.0010892 | 0.000050176 | 0.000017824 | 2.815× | 60.755× | +| 5 | 100 × 128 × 56 | 0.00227332 | 0.00243213 | 0.000535479 | 0.000050176 | 0.000019168 | 2.6177× | 27.936× | +| 6 | 100 × 128 × 56 | 0.00155518 | 0.00154541 | 0.000579957 | 0.000050176 | 0.000019168 | 2.6177× | 28.067× | +| 7 | 128 × 64 × 128 | 0.00235209 | 0.00226329 | 0.00071353 | 0.000054272 | 0.0000184 | 2.95× | 38.7788× | +| 8 | 128 × 64 × 128 | 0.00305947 | 0.00229758 | 0.000697379 | 0.000054272 | 0.0000184 | 2.95× | 37.9× | +| 9 | 32 × 128 × 32 | 0.000327104 | 0.000290786 | 0.000148961 | 0.000050176 | 0.000018688 | 2.685× | 7.97× | +| 10 | 32 × 128 × 32 | 0.00027893 | 0.000284768 | 0.000204402 | 0.000050176 | 0.000018688 | 2.685× | 10.9376× | +| 11 | 200 × 100 × 256 | 0.011683 | 0.0114896 | 0.00344001 | 0.000051104 | 0.0000224 | 2.28× | 151.79× | +| 12 | 200 × 100 × 256 | 0.0117548 | 0.0110553 | 0.00319123 | 0.000051104 | 0.0000224 | 2.28× | 142.4656× | +| 13 | 256 × 256 × 256 | 0.0383598 | 0.0370295 | 0.00940384 | 0.000066912 | 0.000033504 | 1.997× | 280.678× | +| 14 | 256 × 256 × 256 | 0.0380587 | 0.0346864 | 0.0095684 | 0.000066912 | 0.000033504 | 1.997× | 285.59× | +| 15 | 256 × 300 × 256 | 0.0430276 | 0.0413115 | 0.0111705 | 0.000068608 | 0.000038176 | 1.79× | 292.605× | +| 16 | 256 × 300 × 256 | 0.0419544 | 0.0414222 | 0.0112128 | 0.000068608 | 0.000038176 | 1.79× | 293.713× | +| 17 | 64 × 128 × 64 | 0.00114947 | 0.00112288 | 0.000451304 | 0.000050144 | 0.00001872 | 2.6786× | 24.108× | +| 18 | 64 × 128 × 64 | 0.00127682 | 0.00113251 | 0.000446731 | 0.000050144 | 0.00001872 | 2.6786× | 23.864× | +| 19 | 256 × 256 × 257 | 0.0366115 | 0.0334168 | 0.00918976 | 0.000071360 | 0.000034112 | 2.092× | 269.399× | +| 20 | 256 × 256 × 257 | 0.0407559 | 0.0336556 | 0.00928626 | 0.000071360 | 0.000034112 | 2.092× | 272.228× | + + diff --git a/main.cu b/main.cu index 50df6a1..4e310f1 100644 --- a/main.cu +++ b/main.cu @@ -3,34 +3,265 @@ #include #include +#define BLOCK_SIZE 16 + __global__ void naive_cuda_matmul(float *C, float *A, float *B, uint32_t m, uint32_t n, uint32_t p) { - // TODO: Implement naive CUDA matrix multiplication + // Implement naive CUDA matrix multiplication + uint32_t row = blockIdx.y * blockDim.y + threadIdx.y; + uint32_t col = blockIdx.x * blockDim.x + threadIdx.x; + + if (row < m && col < p) { + float sum = 0.00f; + + for (uint32_t k = 0; k < n; ++k) { + sum += A[row * n + k] * B[k * p + col]; + } + + C[row * p + col] = sum; + } } __global__ void tiled_cuda_matmul(float *C, float *A, float *B, uint32_t m, uint32_t n, uint32_t p, uint32_t tile_width) { - // TODO: Implement tiled CUDA matrix multiplication + // Implement tiled CUDA matrix multiplication + __shared__ float a_shared[BLOCK_SIZE*BLOCK_SIZE]; + __shared__ float b_shared[BLOCK_SIZE*BLOCK_SIZE]; + + uint32_t row = blockIdx.y*BLOCK_SIZE + threadIdx.y; + uint32_t col = blockIdx.x*BLOCK_SIZE + threadIdx.x; + + float temp = 0.00f; + + for (uint32_t i = 0; i < (BLOCK_SIZE + n - 1)/BLOCK_SIZE; i++) { + + if (i * BLOCK_SIZE + threadIdx.x < n && row < m){ + a_shared[threadIdx.y * BLOCK_SIZE + threadIdx.x] = A[(row * n) + (i * BLOCK_SIZE) + threadIdx.x]; + } else { + a_shared[threadIdx.y * BLOCK_SIZE + threadIdx.x] = 0.00f; + } + + if (i * BLOCK_SIZE + threadIdx.y < n && col < p){ + b_shared[threadIdx.y * BLOCK_SIZE + threadIdx.x] = B[(i*BLOCK_SIZE + threadIdx.y)*p + col]; + } else{ + b_shared[threadIdx.y * BLOCK_SIZE + threadIdx.x] = 0.00f; + } + + __syncthreads(); + + for (uint32_t j = 0; j < BLOCK_SIZE; ++j){ + temp += a_shared[threadIdx.y * BLOCK_SIZE + j] * b_shared[j * BLOCK_SIZE + threadIdx.x]; + } + __syncthreads(); + } + + if (row < m && col < p) { + C[row * p + col] = temp; + } +} + +// Writing the files values into the matrix +void read_file(std::ifstream &input, float *matrix, uint32_t x, uint32_t y) { + for (uint32_t j = 0; j < x; j++) { + for (uint32_t k = 0; k < y; k++) { + input >> matrix[j * y + k]; + } + } } bool validate_result(const std::string &result_file, const std::string &reference_file) { - // TODO: Implement result validation (same as Assignment 2) + // Implement result validation (same as Assignment 2) + uint32_t x, y; + float *R, *O; + + // Implement result validation + std::ifstream result(result_file); + if (result.is_open()) { + result >> x >> y; + R = new float[x * y]; + + read_file(result, R, x, y); + } else { + exit(1); + } + + std::ifstream output(reference_file); + if (output.is_open()) { + output >> x >> y; + O = new float[x * y]; + + read_file(output, O, x, y); + } else { + exit(1); + } + + const float EPSILON = 1e-2f; + + for (uint32_t j = 0; j < x; j++) { + for (uint32_t k = 0; k < y; k++) { + // Checking that floating point numbers are the same + if (std::fabs(R[j * y + k] - O[j * y + k]) > EPSILON) { + std::cout << R[j * y + k] << " " << O[j * y + k] << std::endl; + delete[] R; + delete[] O; + return false; + } + } + } + + result.close(); + output.close(); + delete[] R; + delete[] O; + return true; } + + int main(int argc, char *argv[]) { + if (argc != 2) { + std::cerr << "Usage: " << argv[0] << " " << std::endl; + return 1; + } + int case_number = std::atoi(argv[1]); + if (case_number < 0 || case_number > 9) { + std::cerr << "Case number must be between 0 and 9" << std::endl; + return 1; + } - // TODO: Read input0.raw (matrix A) and input1.raw (matrix B) + // Construct file paths + std::string folder = "data/" + std::to_string(case_number) + "/"; + std::string input0_file = folder + "input0.raw"; + std::string input1_file = folder + "input1.raw"; + std::string result_file_naive = folder + "result_naive.raw"; + std::string result_file_tiled = folder + "result_tiled.raw"; + std::string reference_file = folder + "output.raw"; + + // Read input0.raw (matrix A) and input1.raw (matrix B) + float *A, *B, *C_naive, *C_tiled; + uint32_t m, n, p; + + std::ifstream input0(input0_file); + if (input0.is_open()) { + input0 >> m >> n; + A = new float[m * n]; + + read_file(input0, A, m, n); + } else { + exit(1); + } + + input0.close(); + + std::ifstream input1(input1_file); + if (input1.is_open()) { + input1 >> n >> p; + B = new float[n * p]; + + read_file(input1, B, n, p); + } else { + exit(1); + } + + input1.close(); + + C_naive = new float[m * p]; + C_tiled = new float[m * p]; // TODO: Use cudaMalloc and cudaMemcpy for GPU memory + float *d_A, *d_B, *d_C_naive, *d_C_tiled; + cudaMalloc(&d_A, sizeof(float)*m*n); + cudaMalloc(&d_B, sizeof(float)*n*p); + cudaMalloc(&d_C_naive, sizeof(float)*m*p); + cudaMalloc(&d_C_tiled, sizeof(float)*m*p); + cudaMemcpy(d_A, A, sizeof(float)*m*n, cudaMemcpyHostToDevice); + cudaMemcpy(d_B, B, sizeof(float)*n*p, cudaMemcpyHostToDevice); // Measure naive CUDA performance + cudaEvent_t start_naive, stop_naive; + cudaEventCreate(&start_naive); + cudaEventCreate(&stop_naive); + // TODO: Launch naive_cuda_matmul kernel + dim3 blockDim(BLOCK_SIZE, BLOCK_SIZE); + dim3 gridDim((p + BLOCK_SIZE - 1) / BLOCK_SIZE, (m + BLOCK_SIZE - 1) / BLOCK_SIZE); + + cudaEventRecord(start_naive); + naive_cuda_matmul<<>>(d_C_naive, d_A, d_B, m, n, p); + cudaDeviceSynchronize(); + cudaEventRecord(stop_naive); + + cudaMemcpy(C_naive, d_C_naive, sizeof(float) * m * p, cudaMemcpyDeviceToHost); + + cudaEventSynchronize(stop_naive); + + float naive_cuda_time = 0; + cudaEventElapsedTime(&naive_cuda_time, start_naive, stop_naive); + naive_cuda_time = naive_cuda_time / 1000; // TODO: Write naive CUDA result to file and validate + std::ofstream result_naive(result_file_naive); + + result_naive << m << " " << p << "\n"; + for (uint32_t j = 0; j < m; j++) { + for (uint32_t k = 0; k < p; k++) { + result_naive << C_naive[j * p + k]; + + if (k != p - 1) { + result_naive << " "; + } + } + if (j != m - 1) { + result_naive << "\n"; + } + } + result_naive.close(); + + // Validate naive result + bool naive_correct = validate_result(result_file_naive, reference_file); + if (!naive_correct) { + std::cerr << "Naive result validation failed for case " << case_number << std::endl; + } + // Measure tiled CUDA performance + cudaEvent_t start_tiled, stop_tiled; + cudaEventCreate(&start_tiled); + cudaEventCreate(&stop_tiled); // TODO: Launch tiled_cuda_matmul kernel + cudaEventRecord(start_tiled); + tiled_cuda_matmul<<>>(d_C_tiled, d_A, d_B, m, n, p, 16); + cudaDeviceSynchronize(); + cudaEventRecord(stop_tiled); + cudaMemcpy(C_tiled, d_C_tiled, sizeof(float) * m * p, cudaMemcpyDeviceToHost); + cudaEventSynchronize(stop_tiled); + + float tiled_cuda_time = 0; + cudaEventElapsedTime(&tiled_cuda_time, start_tiled, stop_tiled); + tiled_cuda_time = tiled_cuda_time / 1000; // TODO: Write tiled CUDA result to file and validate + std::ofstream result_tiled(result_file_tiled); + + result_tiled << m << " " << p << "\n"; + for (uint32_t j = 0; j < m; j++) { + for (uint32_t k = 0; k < p; k++) { + result_tiled << C_tiled[j * p + k]; + + if (k != p - 1) { + result_tiled << " "; + } + } + if (j != m - 1) { + result_tiled << "\n"; + } + } + result_tiled.close(); + + // Validate naive result + bool tiled_correct = validate_result(result_file_tiled, reference_file); + if (!tiled_correct) { + std::cerr << "Tiled result validation failed for case " << case_number << std::endl; + } // Print performance results std::cout << "Case " << case_number << " (" << m << "x" << n << "x" << p << "):\n"; @@ -38,6 +269,15 @@ int main(int argc, char *argv[]) { std::cout << "Tiled CUDA time: " << tiled_cuda_time << " seconds\n"; // Clean up + delete[] A; + delete[] B; + delete[] C_naive; + delete[] C_tiled; + + cudaFree(d_A); + cudaFree(d_B); + cudaFree(d_C_naive); + cudaFree(d_C_tiled); return 0; } \ No newline at end of file