Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
13 changes: 13 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -107,6 +107,19 @@ For each test case (0 through 9, using the same `data` folder from Assignment 2)

---

| Test Case | Dimensions (m × n × p) | Naive Time (s) | Blocked Time (s) | Parallel Time (s) | Naive CUDA (s) | Tiled CUDA (s) | Tiled CUDA Speedup (vs. Naive CUDA) | Tiled CUDA Speedup (vs. Parallel CPU) |
|-----------|------------------------|----------------|------------------|-------------------|----------------|----------------|-------------------------------------|---------------------------------------|
| 0 | 64 x 64 x 64 | 0.00074 | 0.00142 | 0.00265 | 0.002085 | 0.001667 | 0.44x | 1.59x |
| 1 | 128 x 64 x 12 | 0.00333 | 0.00490 | 0.00252 | 0.002075 | 0.001693 | 1.97x | 1.49x |
| 2 | 100 x 128 x 56 | 0.00191 | 0.00336 | 0.00173 | 0.002324 | 0.001684 | 1.13x | 1.03x |
| 3 | 128 x 64 x 12 | 0.00419 | 0.00613 | 0.00256 | 0.002058 | 0.001661 | 2.52x | 1.54x |
| 4 | 32 x 128 x 32 | 0.00035 | 0.00068 | 0.00304 | 0.002003 | 0.001636 | 0.21x | 1.86x |
| 5 | 200 x 100 x 25 | 0.01434 | 0.02394 | 0.01791 | 0.002140 | 0.001622 | 8.84x | 11.042x |
| 6 | 256 x 256 x 256 | 0.05017 | 0.08211 | 0.04476 | 0.002060 | 0.001713 | 29.29x | 26.13x |
| 7 | 256 x 300 x 256 | 0.05772 | 0.09756 | 0.05502 | 0.002061 | 0.001634 | 35.32x | 33.67x |
| 8 | 64 x 128 x 64 | 0.00185 | 0.00287 | 0.00130 | 0.002148 | 0.001687 | 1.10x | 0.77x |
| 9 | 265 x 256 x 257 | 0.04809 | 0.08143 | 0.04981 | 0.002073 | 0.001684 | 28.56x | 29.57x |

### Matrix Storage and Memory Management

- Continue using row-major order for matrices.
Expand Down
145 changes: 125 additions & 20 deletions main.cu
Original file line number Diff line number Diff line change
@@ -1,43 +1,148 @@
#include <iostream>
#include <fstream>
#include <string>
#include <cuda_runtime.h>
#include <device_launch_parameters.h>

#include <stdint.h> // For uint32_t
#include <iostream> // std::cout, std::cerr
#include <fstream> // std::ifstream, std::ofstream
#include <vector> // std::vector
#include <string> // std::string
#include <cassert> // assert
#include <cstring> // std::memcmp
#include <cmath> // ceil, floor

#define TILE_WIDTH 16

// Naive matrix multiplication kernel
__global__ void naive_cuda_matmul(float *C, float *A, float *B, uint32_t m, uint32_t n, uint32_t p) {
// TODO: Implement naive CUDA matrix multiplication
int row = blockIdx.y * blockDim.y + threadIdx.y;
int col = blockIdx.x * blockDim.x + threadIdx.x;
if (row < m && col < p) {
float sum = 0.0f;
for (int k = 0; k < n; ++k) {
sum += A[row * n + k] * B[k * p + col];
}
C[row * p + col] = sum;
}
}

// Tiled matrix multiplication kernel using shared memory
__global__ void tiled_cuda_matmul(float *C, float *A, float *B, uint32_t m, uint32_t n, uint32_t p, uint32_t tile_width) {
// TODO: Implement tiled CUDA matrix multiplication
__shared__ float tile_A[TILE_WIDTH][TILE_WIDTH];
__shared__ float tile_B[TILE_WIDTH][TILE_WIDTH];

int row = blockIdx.y * tile_width + threadIdx.y;
int col = blockIdx.x * tile_width + threadIdx.x;

float sum = 0.0f;
for (int t = 0; t < (n + tile_width - 1) / tile_width; ++t) {
if (row < m && t * tile_width + threadIdx.x < n)
tile_A[threadIdx.y][threadIdx.x] = A[row * n + t * tile_width + threadIdx.x];
else
tile_A[threadIdx.y][threadIdx.x] = 0.0f;

if (t * tile_width + threadIdx.y < n && col < p)
tile_B[threadIdx.y][threadIdx.x] = B[(t * tile_width + threadIdx.y) * p + col];
else
tile_B[threadIdx.y][threadIdx.x] = 0.0f;

__syncthreads();

for (int k = 0; k < tile_width; ++k)
sum += tile_A[threadIdx.y][k] * tile_B[k][threadIdx.x];

__syncthreads();
}

if (row < m && col < p)
C[row * p + col] = sum;
}

bool validate_result(const std::string &result_file, const std::string &reference_file) {
// TODO: Implement result validation (same as Assignment 2)
// Compare resulsts to validate code
bool compare_results(float *C1, float *C2, int size, float eps = 1e-3f) {
for (int i = 0; i < size; ++i)
if (fabs(C1[i] - C2[i]) > eps)
return false;
return true;
}

int main(int argc, char *argv[]) {
if (argc != 7) {
printf("Usage: ./app m n p input0.raw input1.raw reference.raw\n");
return 1;
}

// Parse command line arguments
int m = atoi(argv[1]), n = atoi(argv[2]), p = atoi(argv[3]);
const char *fileA = argv[4];
const char *fileB = argv[5];
const char *fileRef = argv[6];

// Allocate host memory
size_t size_A = m * n * sizeof(float);
size_t size_B = n * p * sizeof(float);
size_t size_C = m * p * sizeof(float);

float *h_A = (float *)malloc(size_A);
float *h_B = (float *)malloc(size_B);
float *h_C_naive = (float *)malloc(size_C);
float *h_C_tiled = (float *)malloc(size_C);
float *h_C_ref = (float *)malloc(size_C);

// Load input matrices and reference output from files
FILE *fA = fopen(fileA, "rb");
FILE *fB = fopen(fileB, "rb");
FILE *fR = fopen(fileRef, "rb");
fread(h_A, sizeof(float), m * n, fA); fclose(fA);
fread(h_B, sizeof(float), n * p, fB); fclose(fB);
fread(h_C_ref, sizeof(float), m * p, fR); fclose(fR);

// TODO: Read input0.raw (matrix A) and input1.raw (matrix B)
// Allocate device memory
float *d_A, *d_B, *d_C;
cudaMalloc(&d_A, size_A);
cudaMalloc(&d_B, size_B);
cudaMalloc(&d_C, size_C);

// TODO: Use cudaMalloc and cudaMemcpy for GPU memory
// Copy input matrices to device
cudaMemcpy(d_A, h_A, size_A, cudaMemcpyHostToDevice);
cudaMemcpy(d_B, h_B, size_B, cudaMemcpyHostToDevice);

// Measure naive CUDA performance
// TODO: Launch naive_cuda_matmul kernel
dim3 dimBlock(TILE_WIDTH, TILE_WIDTH);
dim3 dimGrid((p + TILE_WIDTH - 1) / TILE_WIDTH, (m + TILE_WIDTH - 1) / TILE_WIDTH);

// TODO: Write naive CUDA result to file and validate
// Measure tiled CUDA performance
// Timing for naive kernel
cudaEvent_t start, stop;
cudaEventCreate(&start);
cudaEventCreate(&stop);

// TODO: Launch tiled_cuda_matmul kernel
cudaEventRecord(start);
naive_cuda_matmul<<<dimGrid, dimBlock>>>(d_C, d_A, d_B, m, n, p);
cudaMemcpy(h_C_naive, d_C, size_C, cudaMemcpyDeviceToHost);
cudaEventRecord(stop);
cudaEventSynchronize(stop);
float time_naive;
cudaEventElapsedTime(&time_naive, start, stop);

// TODO: Write tiled CUDA result to file and validate
// Timing for tiled kernel
cudaMemset(d_C, 0, size_C);
cudaDeviceSynchronize();
cudaEventRecord(start);
tiled_cuda_matmul<<<dimGrid, dimBlock>>>(d_C, d_A, d_B, m, n, p, TILE_WIDTH);
cudaMemcpy(h_C_tiled, d_C, size_C, cudaMemcpyDeviceToHost);
cudaEventRecord(stop);
cudaEventSynchronize(stop);
float time_tiled;
cudaEventElapsedTime(&time_tiled, start, stop);

// Print performance results
std::cout << "Case " << case_number << " (" << m << "x" << n << "x" << p << "):\n";
std::cout << "Naive CUDA time: " << naive_cuda_time << " seconds\n";
std::cout << "Tiled CUDA time: " << tiled_cuda_time << " seconds\n";
// Output performance and validation
printf("Naive CUDA Time: %.6f seconds\n", time_naive / 1000.0);
printf("Tiled CUDA Time: %.6f seconds\n", time_tiled / 1000.0);
printf("Naive Validation: %s\n", compare_results(h_C_naive, h_C_ref, m * p) ? "Passed" : "Failed");
printf("Tiled Validation: %s\n", compare_results(h_C_tiled, h_C_ref, m * p) ? "Passed" : "Failed");

// Clean up
// Cleanup
cudaFree(d_A); cudaFree(d_B); cudaFree(d_C);
free(h_A); free(h_B); free(h_C_naive); free(h_C_tiled); free(h_C_ref);
cudaEventDestroy(start); cudaEventDestroy(stop);

return 0;
}
28 changes: 28 additions & 0 deletions runner.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
#include <iostream>
#include <filesystem>
#include <cstdlib>
#include <string>

// This runs the main.cu through every folder so it's a bit easier to all results

namespace fs = std::filesystem;

int main() {
std::string base_path = "data";

for (int i = 0; i <= 9; ++i) {
std::string folder = base_path + "/" + std::to_string(i);
std::string input0 = folder + "/input0.raw";
std::string input1 = folder + "/input1.raw";
std::string reference = folder + "/output.raw";

std::cout << "Running folder: " << folder << std::endl;
std::string cmd = "./app 512 512 512 " + input0 + " " + input1 + " " + reference;
int ret = std::system(cmd.c_str());
if (ret != 0) {
std::cerr << "Failed to run: " << cmd << std::endl;
}
}

return 0;
}