diff --git a/README.md b/README.md index d5a2886..b7eea71 100644 --- a/README.md +++ b/README.md @@ -107,6 +107,19 @@ For each test case (0 through 9, using the same `data` folder from Assignment 2) --- +| Test Case | Dimensions (m × n × p) | Naive Time (s) | Blocked Time (s) | Parallel Time (s) | Naive CUDA (s) | Tiled CUDA (s) | Tiled CUDA Speedup (vs. Naive CUDA) | Tiled CUDA Speedup (vs. Parallel CPU) | +|-----------|------------------------|----------------|------------------|-------------------|----------------|----------------|-------------------------------------|---------------------------------------| +| 0 | 64 x 64 x 64 | 0.00074 | 0.00142 | 0.00265 | 0.002085 | 0.001667 | 0.44x | 1.59x | +| 1 | 128 x 64 x 12 | 0.00333 | 0.00490 | 0.00252 | 0.002075 | 0.001693 | 1.97x | 1.49x | +| 2 | 100 x 128 x 56 | 0.00191 | 0.00336 | 0.00173 | 0.002324 | 0.001684 | 1.13x | 1.03x | +| 3 | 128 x 64 x 12 | 0.00419 | 0.00613 | 0.00256 | 0.002058 | 0.001661 | 2.52x | 1.54x | +| 4 | 32 x 128 x 32 | 0.00035 | 0.00068 | 0.00304 | 0.002003 | 0.001636 | 0.21x | 1.86x | +| 5 | 200 x 100 x 25 | 0.01434 | 0.02394 | 0.01791 | 0.002140 | 0.001622 | 8.84x | 11.042x | +| 6 | 256 x 256 x 256 | 0.05017 | 0.08211 | 0.04476 | 0.002060 | 0.001713 | 29.29x | 26.13x | +| 7 | 256 x 300 x 256 | 0.05772 | 0.09756 | 0.05502 | 0.002061 | 0.001634 | 35.32x | 33.67x | +| 8 | 64 x 128 x 64 | 0.00185 | 0.00287 | 0.00130 | 0.002148 | 0.001687 | 1.10x | 0.77x | +| 9 | 265 x 256 x 257 | 0.04809 | 0.08143 | 0.04981 | 0.002073 | 0.001684 | 28.56x | 29.57x | + ### Matrix Storage and Memory Management - Continue using row-major order for matrices. diff --git a/main.cu b/main.cu index 50df6a1..5353796 100644 --- a/main.cu +++ b/main.cu @@ -1,43 +1,148 @@ -#include -#include -#include #include +#include +#include // For uint32_t +#include // std::cout, std::cerr +#include // std::ifstream, std::ofstream +#include // std::vector +#include // std::string +#include // assert +#include // std::memcmp +#include // ceil, floor + +#define TILE_WIDTH 16 + +// Naive matrix multiplication kernel __global__ void naive_cuda_matmul(float *C, float *A, float *B, uint32_t m, uint32_t n, uint32_t p) { - // TODO: Implement naive CUDA matrix multiplication + int row = blockIdx.y * blockDim.y + threadIdx.y; + int col = blockIdx.x * blockDim.x + threadIdx.x; + if (row < m && col < p) { + float sum = 0.0f; + for (int k = 0; k < n; ++k) { + sum += A[row * n + k] * B[k * p + col]; + } + C[row * p + col] = sum; + } } +// Tiled matrix multiplication kernel using shared memory __global__ void tiled_cuda_matmul(float *C, float *A, float *B, uint32_t m, uint32_t n, uint32_t p, uint32_t tile_width) { - // TODO: Implement tiled CUDA matrix multiplication + __shared__ float tile_A[TILE_WIDTH][TILE_WIDTH]; + __shared__ float tile_B[TILE_WIDTH][TILE_WIDTH]; + + int row = blockIdx.y * tile_width + threadIdx.y; + int col = blockIdx.x * tile_width + threadIdx.x; + + float sum = 0.0f; + for (int t = 0; t < (n + tile_width - 1) / tile_width; ++t) { + if (row < m && t * tile_width + threadIdx.x < n) + tile_A[threadIdx.y][threadIdx.x] = A[row * n + t * tile_width + threadIdx.x]; + else + tile_A[threadIdx.y][threadIdx.x] = 0.0f; + + if (t * tile_width + threadIdx.y < n && col < p) + tile_B[threadIdx.y][threadIdx.x] = B[(t * tile_width + threadIdx.y) * p + col]; + else + tile_B[threadIdx.y][threadIdx.x] = 0.0f; + + __syncthreads(); + + for (int k = 0; k < tile_width; ++k) + sum += tile_A[threadIdx.y][k] * tile_B[k][threadIdx.x]; + + __syncthreads(); + } + + if (row < m && col < p) + C[row * p + col] = sum; } -bool validate_result(const std::string &result_file, const std::string &reference_file) { - // TODO: Implement result validation (same as Assignment 2) +// Compare resulsts to validate code +bool compare_results(float *C1, float *C2, int size, float eps = 1e-3f) { + for (int i = 0; i < size; ++i) + if (fabs(C1[i] - C2[i]) > eps) + return false; + return true; } int main(int argc, char *argv[]) { + if (argc != 7) { + printf("Usage: ./app m n p input0.raw input1.raw reference.raw\n"); + return 1; + } + + // Parse command line arguments + int m = atoi(argv[1]), n = atoi(argv[2]), p = atoi(argv[3]); + const char *fileA = argv[4]; + const char *fileB = argv[5]; + const char *fileRef = argv[6]; + + // Allocate host memory + size_t size_A = m * n * sizeof(float); + size_t size_B = n * p * sizeof(float); + size_t size_C = m * p * sizeof(float); + + float *h_A = (float *)malloc(size_A); + float *h_B = (float *)malloc(size_B); + float *h_C_naive = (float *)malloc(size_C); + float *h_C_tiled = (float *)malloc(size_C); + float *h_C_ref = (float *)malloc(size_C); + // Load input matrices and reference output from files + FILE *fA = fopen(fileA, "rb"); + FILE *fB = fopen(fileB, "rb"); + FILE *fR = fopen(fileRef, "rb"); + fread(h_A, sizeof(float), m * n, fA); fclose(fA); + fread(h_B, sizeof(float), n * p, fB); fclose(fB); + fread(h_C_ref, sizeof(float), m * p, fR); fclose(fR); - // TODO: Read input0.raw (matrix A) and input1.raw (matrix B) + // Allocate device memory + float *d_A, *d_B, *d_C; + cudaMalloc(&d_A, size_A); + cudaMalloc(&d_B, size_B); + cudaMalloc(&d_C, size_C); - // TODO: Use cudaMalloc and cudaMemcpy for GPU memory + // Copy input matrices to device + cudaMemcpy(d_A, h_A, size_A, cudaMemcpyHostToDevice); + cudaMemcpy(d_B, h_B, size_B, cudaMemcpyHostToDevice); - // Measure naive CUDA performance - // TODO: Launch naive_cuda_matmul kernel + dim3 dimBlock(TILE_WIDTH, TILE_WIDTH); + dim3 dimGrid((p + TILE_WIDTH - 1) / TILE_WIDTH, (m + TILE_WIDTH - 1) / TILE_WIDTH); - // TODO: Write naive CUDA result to file and validate - // Measure tiled CUDA performance + // Timing for naive kernel + cudaEvent_t start, stop; + cudaEventCreate(&start); + cudaEventCreate(&stop); - // TODO: Launch tiled_cuda_matmul kernel + cudaEventRecord(start); + naive_cuda_matmul<<>>(d_C, d_A, d_B, m, n, p); + cudaMemcpy(h_C_naive, d_C, size_C, cudaMemcpyDeviceToHost); + cudaEventRecord(stop); + cudaEventSynchronize(stop); + float time_naive; + cudaEventElapsedTime(&time_naive, start, stop); - // TODO: Write tiled CUDA result to file and validate + // Timing for tiled kernel + cudaMemset(d_C, 0, size_C); + cudaDeviceSynchronize(); + cudaEventRecord(start); + tiled_cuda_matmul<<>>(d_C, d_A, d_B, m, n, p, TILE_WIDTH); + cudaMemcpy(h_C_tiled, d_C, size_C, cudaMemcpyDeviceToHost); + cudaEventRecord(stop); + cudaEventSynchronize(stop); + float time_tiled; + cudaEventElapsedTime(&time_tiled, start, stop); - // Print performance results - std::cout << "Case " << case_number << " (" << m << "x" << n << "x" << p << "):\n"; - std::cout << "Naive CUDA time: " << naive_cuda_time << " seconds\n"; - std::cout << "Tiled CUDA time: " << tiled_cuda_time << " seconds\n"; + // Output performance and validation + printf("Naive CUDA Time: %.6f seconds\n", time_naive / 1000.0); + printf("Tiled CUDA Time: %.6f seconds\n", time_tiled / 1000.0); + printf("Naive Validation: %s\n", compare_results(h_C_naive, h_C_ref, m * p) ? "Passed" : "Failed"); + printf("Tiled Validation: %s\n", compare_results(h_C_tiled, h_C_ref, m * p) ? "Passed" : "Failed"); - // Clean up + // Cleanup + cudaFree(d_A); cudaFree(d_B); cudaFree(d_C); + free(h_A); free(h_B); free(h_C_naive); free(h_C_tiled); free(h_C_ref); + cudaEventDestroy(start); cudaEventDestroy(stop); return 0; } \ No newline at end of file diff --git a/runner.cpp b/runner.cpp new file mode 100644 index 0000000..3e7d101 --- /dev/null +++ b/runner.cpp @@ -0,0 +1,28 @@ +#include +#include +#include +#include + +// This runs the main.cu through every folder so it's a bit easier to all results + +namespace fs = std::filesystem; + +int main() { + std::string base_path = "data"; + + for (int i = 0; i <= 9; ++i) { + std::string folder = base_path + "/" + std::to_string(i); + std::string input0 = folder + "/input0.raw"; + std::string input1 = folder + "/input1.raw"; + std::string reference = folder + "/output.raw"; + + std::cout << "Running folder: " << folder << std::endl; + std::string cmd = "./app 512 512 512 " + input0 + " " + input1 + " " + reference; + int ret = std::system(cmd.c_str()); + if (ret != 0) { + std::cerr << "Failed to run: " << cmd << std::endl; + } + } + + return 0; +} \ No newline at end of file