Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
26 changes: 26 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -205,3 +205,29 @@ git push origin student-name

Good luck, and enjoy accelerating matrix multiplication with CUDA!

## Results

| Test Case | Dimensions (m × n × p) | Naive CPU (s) | Blocked CPU (s) | Parallel CPU (s) | Naive CUDA (s) | Tiled CUDA (s) | Tiled CUDA Speedup (vs. Naive CUDA) | Tiled CUDA Speedup (vs. Parallel CPU) |
|-----------|------------------------|---------------|-----------------|------------------|----------------|----------------|-------------------------------------|---------------------------------------|
| 1 | 64 × 64 × 64 | 0.000771433 | 0.000670545 | 0.000350771 | 0.000048128 | 0.00001856 | 2.59× | 18.899× |
| 2 | 64 × 64 × 64 | 0.0006139 | 0.000563352 | 0.00023961 | 0.000048128 | 0.00001856 | 2.59× | 12.91× |
| 3 | 128 × 64 × 128 | 0.0022845 | 0.00227195 | 0.000763971 | 0.000050176 | 0.000017824 | 2.815× | 42.86× |
| 4 | 128 × 64 × 128 | 0.00295904 | 0.00237161 | 0.0010892 | 0.000050176 | 0.000017824 | 2.815× | 60.755× |
| 5 | 100 × 128 × 56 | 0.00227332 | 0.00243213 | 0.000535479 | 0.000050176 | 0.000019168 | 2.6177× | 27.936× |
| 6 | 100 × 128 × 56 | 0.00155518 | 0.00154541 | 0.000579957 | 0.000050176 | 0.000019168 | 2.6177× | 28.067× |
| 7 | 128 × 64 × 128 | 0.00235209 | 0.00226329 | 0.00071353 | 0.000054272 | 0.0000184 | 2.95× | 38.7788× |
| 8 | 128 × 64 × 128 | 0.00305947 | 0.00229758 | 0.000697379 | 0.000054272 | 0.0000184 | 2.95× | 37.9× |
| 9 | 32 × 128 × 32 | 0.000327104 | 0.000290786 | 0.000148961 | 0.000050176 | 0.000018688 | 2.685× | 7.97× |
| 10 | 32 × 128 × 32 | 0.00027893 | 0.000284768 | 0.000204402 | 0.000050176 | 0.000018688 | 2.685× | 10.9376× |
| 11 | 200 × 100 × 256 | 0.011683 | 0.0114896 | 0.00344001 | 0.000051104 | 0.0000224 | 2.28× | 151.79× |
| 12 | 200 × 100 × 256 | 0.0117548 | 0.0110553 | 0.00319123 | 0.000051104 | 0.0000224 | 2.28× | 142.4656× |
| 13 | 256 × 256 × 256 | 0.0383598 | 0.0370295 | 0.00940384 | 0.000066912 | 0.000033504 | 1.997× | 280.678× |
| 14 | 256 × 256 × 256 | 0.0380587 | 0.0346864 | 0.0095684 | 0.000066912 | 0.000033504 | 1.997× | 285.59× |
| 15 | 256 × 300 × 256 | 0.0430276 | 0.0413115 | 0.0111705 | 0.000068608 | 0.000038176 | 1.79× | 292.605× |
| 16 | 256 × 300 × 256 | 0.0419544 | 0.0414222 | 0.0112128 | 0.000068608 | 0.000038176 | 1.79× | 293.713× |
| 17 | 64 × 128 × 64 | 0.00114947 | 0.00112288 | 0.000451304 | 0.000050144 | 0.00001872 | 2.6786× | 24.108× |
| 18 | 64 × 128 × 64 | 0.00127682 | 0.00113251 | 0.000446731 | 0.000050144 | 0.00001872 | 2.6786× | 23.864× |
| 19 | 256 × 256 × 257 | 0.0366115 | 0.0334168 | 0.00918976 | 0.000071360 | 0.000034112 | 2.092× | 269.399× |
| 20 | 256 × 256 × 257 | 0.0407559 | 0.0336556 | 0.00928626 | 0.000071360 | 0.000034112 | 2.092× | 272.228× |


248 changes: 244 additions & 4 deletions main.cu
Original file line number Diff line number Diff line change
Expand Up @@ -3,41 +3,281 @@
#include <string>
#include <cuda_runtime.h>

#define BLOCK_SIZE 16

__global__ void naive_cuda_matmul(float *C, float *A, float *B, uint32_t m, uint32_t n, uint32_t p) {
// TODO: Implement naive CUDA matrix multiplication
// Implement naive CUDA matrix multiplication
uint32_t row = blockIdx.y * blockDim.y + threadIdx.y;
uint32_t col = blockIdx.x * blockDim.x + threadIdx.x;

if (row < m && col < p) {
float sum = 0.00f;

for (uint32_t k = 0; k < n; ++k) {
sum += A[row * n + k] * B[k * p + col];
}

C[row * p + col] = sum;
}
}

__global__ void tiled_cuda_matmul(float *C, float *A, float *B, uint32_t m, uint32_t n, uint32_t p, uint32_t tile_width) {
// TODO: Implement tiled CUDA matrix multiplication
// Implement tiled CUDA matrix multiplication
__shared__ float a_shared[BLOCK_SIZE*BLOCK_SIZE];
__shared__ float b_shared[BLOCK_SIZE*BLOCK_SIZE];

uint32_t row = blockIdx.y*BLOCK_SIZE + threadIdx.y;
uint32_t col = blockIdx.x*BLOCK_SIZE + threadIdx.x;

float temp = 0.00f;

for (uint32_t i = 0; i < (BLOCK_SIZE + n - 1)/BLOCK_SIZE; i++) {

if (i * BLOCK_SIZE + threadIdx.x < n && row < m){
a_shared[threadIdx.y * BLOCK_SIZE + threadIdx.x] = A[(row * n) + (i * BLOCK_SIZE) + threadIdx.x];
} else {
a_shared[threadIdx.y * BLOCK_SIZE + threadIdx.x] = 0.00f;
}

if (i * BLOCK_SIZE + threadIdx.y < n && col < p){
b_shared[threadIdx.y * BLOCK_SIZE + threadIdx.x] = B[(i*BLOCK_SIZE + threadIdx.y)*p + col];
} else{
b_shared[threadIdx.y * BLOCK_SIZE + threadIdx.x] = 0.00f;
}

__syncthreads();

for (uint32_t j = 0; j < BLOCK_SIZE; ++j){
temp += a_shared[threadIdx.y * BLOCK_SIZE + j] * b_shared[j * BLOCK_SIZE + threadIdx.x];
}
__syncthreads();
}

if (row < m && col < p) {
C[row * p + col] = temp;
}
}

// Writing the files values into the matrix
void read_file(std::ifstream &input, float *matrix, uint32_t x, uint32_t y) {
for (uint32_t j = 0; j < x; j++) {
for (uint32_t k = 0; k < y; k++) {
input >> matrix[j * y + k];
}
}
}

bool validate_result(const std::string &result_file, const std::string &reference_file) {
// TODO: Implement result validation (same as Assignment 2)
// Implement result validation (same as Assignment 2)
uint32_t x, y;
float *R, *O;

// Implement result validation
std::ifstream result(result_file);
if (result.is_open()) {
result >> x >> y;
R = new float[x * y];

read_file(result, R, x, y);
} else {
exit(1);
}

std::ifstream output(reference_file);
if (output.is_open()) {
output >> x >> y;
O = new float[x * y];

read_file(output, O, x, y);
} else {
exit(1);
}

const float EPSILON = 1e-2f;

for (uint32_t j = 0; j < x; j++) {
for (uint32_t k = 0; k < y; k++) {
// Checking that floating point numbers are the same
if (std::fabs(R[j * y + k] - O[j * y + k]) > EPSILON) {
std::cout << R[j * y + k] << " " << O[j * y + k] << std::endl;
delete[] R;
delete[] O;
return false;
}
}
}

result.close();
output.close();
delete[] R;
delete[] O;
return true;
}



int main(int argc, char *argv[]) {
if (argc != 2) {
std::cerr << "Usage: " << argv[0] << " <case_number>" << std::endl;
return 1;
}

int case_number = std::atoi(argv[1]);
if (case_number < 0 || case_number > 9) {
std::cerr << "Case number must be between 0 and 9" << std::endl;
return 1;
}

// TODO: Read input0.raw (matrix A) and input1.raw (matrix B)
// Construct file paths
std::string folder = "data/" + std::to_string(case_number) + "/";
std::string input0_file = folder + "input0.raw";
std::string input1_file = folder + "input1.raw";
std::string result_file_naive = folder + "result_naive.raw";
std::string result_file_tiled = folder + "result_tiled.raw";
std::string reference_file = folder + "output.raw";

// Read input0.raw (matrix A) and input1.raw (matrix B)
float *A, *B, *C_naive, *C_tiled;
uint32_t m, n, p;

std::ifstream input0(input0_file);
if (input0.is_open()) {
input0 >> m >> n;
A = new float[m * n];

read_file(input0, A, m, n);
} else {
exit(1);
}

input0.close();

std::ifstream input1(input1_file);
if (input1.is_open()) {
input1 >> n >> p;
B = new float[n * p];

read_file(input1, B, n, p);
} else {
exit(1);
}

input1.close();

C_naive = new float[m * p];
C_tiled = new float[m * p];

// TODO: Use cudaMalloc and cudaMemcpy for GPU memory
float *d_A, *d_B, *d_C_naive, *d_C_tiled;
cudaMalloc(&d_A, sizeof(float)*m*n);
cudaMalloc(&d_B, sizeof(float)*n*p);
cudaMalloc(&d_C_naive, sizeof(float)*m*p);
cudaMalloc(&d_C_tiled, sizeof(float)*m*p);
cudaMemcpy(d_A, A, sizeof(float)*m*n, cudaMemcpyHostToDevice);
cudaMemcpy(d_B, B, sizeof(float)*n*p, cudaMemcpyHostToDevice);

// Measure naive CUDA performance
cudaEvent_t start_naive, stop_naive;
cudaEventCreate(&start_naive);
cudaEventCreate(&stop_naive);

// TODO: Launch naive_cuda_matmul kernel
dim3 blockDim(BLOCK_SIZE, BLOCK_SIZE);
dim3 gridDim((p + BLOCK_SIZE - 1) / BLOCK_SIZE, (m + BLOCK_SIZE - 1) / BLOCK_SIZE);

cudaEventRecord(start_naive);
naive_cuda_matmul<<<gridDim, blockDim>>>(d_C_naive, d_A, d_B, m, n, p);
cudaDeviceSynchronize();
cudaEventRecord(stop_naive);

cudaMemcpy(C_naive, d_C_naive, sizeof(float) * m * p, cudaMemcpyDeviceToHost);

cudaEventSynchronize(stop_naive);

float naive_cuda_time = 0;
cudaEventElapsedTime(&naive_cuda_time, start_naive, stop_naive);
naive_cuda_time = naive_cuda_time / 1000;

// TODO: Write naive CUDA result to file and validate
std::ofstream result_naive(result_file_naive);

result_naive << m << " " << p << "\n";
for (uint32_t j = 0; j < m; j++) {
for (uint32_t k = 0; k < p; k++) {
result_naive << C_naive[j * p + k];

if (k != p - 1) {
result_naive << " ";
}
}
if (j != m - 1) {
result_naive << "\n";
}
}
result_naive.close();

// Validate naive result
bool naive_correct = validate_result(result_file_naive, reference_file);
if (!naive_correct) {
std::cerr << "Naive result validation failed for case " << case_number << std::endl;
}

// Measure tiled CUDA performance
cudaEvent_t start_tiled, stop_tiled;
cudaEventCreate(&start_tiled);
cudaEventCreate(&stop_tiled);

// TODO: Launch tiled_cuda_matmul kernel
cudaEventRecord(start_tiled);
tiled_cuda_matmul<<<gridDim, blockDim>>>(d_C_tiled, d_A, d_B, m, n, p, 16);
cudaDeviceSynchronize();
cudaEventRecord(stop_tiled);

cudaMemcpy(C_tiled, d_C_tiled, sizeof(float) * m * p, cudaMemcpyDeviceToHost);
cudaEventSynchronize(stop_tiled);

float tiled_cuda_time = 0;
cudaEventElapsedTime(&tiled_cuda_time, start_tiled, stop_tiled);
tiled_cuda_time = tiled_cuda_time / 1000;
// TODO: Write tiled CUDA result to file and validate
std::ofstream result_tiled(result_file_tiled);

result_tiled << m << " " << p << "\n";
for (uint32_t j = 0; j < m; j++) {
for (uint32_t k = 0; k < p; k++) {
result_tiled << C_tiled[j * p + k];

if (k != p - 1) {
result_tiled << " ";
}
}
if (j != m - 1) {
result_tiled << "\n";
}
}
result_tiled.close();

// Validate naive result
bool tiled_correct = validate_result(result_file_tiled, reference_file);
if (!tiled_correct) {
std::cerr << "Tiled result validation failed for case " << case_number << std::endl;
}

// Print performance results
std::cout << "Case " << case_number << " (" << m << "x" << n << "x" << p << "):\n";
std::cout << "Naive CUDA time: " << naive_cuda_time << " seconds\n";
std::cout << "Tiled CUDA time: " << tiled_cuda_time << " seconds\n";

// Clean up
delete[] A;
delete[] B;
delete[] C_naive;
delete[] C_tiled;

cudaFree(d_A);
cudaFree(d_B);
cudaFree(d_C_naive);
cudaFree(d_C_tiled);

return 0;
}