Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
24 changes: 23 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,29 @@
**Points**: 100

---

### Results table
| Test Case | Dimensions (m × n × p) | Naive CPU (s) | Blocked CPU (s) | Parallel CPU (s) | Naive CUDA | Tiled CUDA | Tiled CUDA Speedup (vs. Naive CUDA) | Tiled CUDA Speedupt (vs. Parallel CPU) |
|-----------|------------------------|----------------|------------------|-------------------|-------------------|------------------|-------------------------------------|----------------------------------------|
| 0 | 64 × 64 × 64 | 0.000999928 | 0.00199986 | 0.00200009 | 0.000183423995622 | 0.000117343995952| 1.56x | 17.04x |
|-----------|------------------------|----------------|------------------|-------------------|-------------------|------------------|-------------------------------------|----------------------------------------|
| 1 | 128 x 64 x 128 | 0.00300002 | 0.00500011 | 0.000999928 | 0.0000004696054247| 0.000000073160730| 6.02x | 12 807x |
|-----------|------------------------|----------------|------------------|-------------------|-------------------|------------------|-------------------------------------|----------------------------------------|
| 2 | 100 x 128 x 56 | 0.00200009 | 0.00300002 | 0.000999928 | 0.0000000000143761| 0 | inf | inf |
|-----------|------------------------|----------------|------------------|-------------------|-------------------|------------------|-------------------------------------|----------------------------------------|
| 3 | 128 x 64 x 128 | 0.00300002 | 0.00500011 | 0.00100017 | 0.0002397119969828| 0 | inf | inf |
|-----------|------------------------|----------------|------------------|-------------------|-------------------|------------------|-------------------------------------|----------------------------------------|
| 4 | 32 x 128 x 32 | 0.000999928 | 0.000999928 | 0 | 0.0000000000353951| 0 | inf | inf |
|-----------|------------------------|----------------|------------------|-------------------|-------------------|------------------|-------------------------------------|----------------------------------------|
| 5 | 200 x 100 x 256 | 0.0190001 | 0.0249999 | 0.00500011 | 0.0000000000132943| 0.000000000008350| 1.59x | 598 802 395x |
|-----------|------------------------|----------------|------------------|-------------------|-------------------|------------------|-------------------------------------|----------------------------------------|
| 6 | 256 X 256 X 256 | 0.0580001 | 0.0799999 | 0.013 | 0.0029166832100600| 0.001119550740122| 2.43x | 10.87x |
|-----------|------------------------|----------------|------------------|-------------------|-------------------|------------------|-------------------------------------|----------------------------------------|
| 7 | 256 X 300 256 | 0.0669999 | 0.095 | 0.0170002 | 0.000000000053352 | 0.00000000002155 | 2.48x | 790 697 674x |
|-----------|------------------------|----------------|------------------|-------------------|-------------------|------------------|-------------------------------------|----------------------------------------|
| 8 | 64 x 128 x 64 | 0.00200009 | 0.00299978 | 0.000999928 | 0.0000000000000076| 0 | inf | inf |
|-----------|------------------------|----------------|------------------|-------------------|-------------------|------------------|-------------------------------------|----------------------------------------|
| 9 | 256 x 256 x 257 | 0.0580001 | 0.0819998 | 0.0140002 | 0.0000000000001401| 0.000000000000375| 0.37x | 37 333 333 333x |
---
### Assignment Overview

Welcome to the third homework assignment of the Parallel Programming course!
Expand Down
308 changes: 295 additions & 13 deletions main.cu
Original file line number Diff line number Diff line change
Expand Up @@ -2,42 +2,324 @@
#include <fstream>
#include <string>
#include <cuda_runtime.h>
#include <omp.h>
#include <iomanip>

__global__ void naive_cuda_matmul(float *C, float *A, float *B, uint32_t m, uint32_t n, uint32_t p) {
// TODO: Implement naive CUDA matrix multiplication

__global__ void naive_cuda_matmul(float *C, float *A, float *B, int m, int n, int p) {
// Implement naive CUDA matrix multiplication
// Define row and col size
int row = blockIdx.y * blockDim.y + threadIdx.y;
int col = blockIdx.x * blockDim.x + threadIdx.x;

// Iterate block
if (row < m && col < p) {
float value = 0.0f;
for (int i = 0; i < n; ++i) {
// Calculate value of C by multiplying cells from A and B
value += A[row * n + i] * B[i * p + col];
}
// Set calculated value to C
C[row * p + col] = value;
}
}

__global__ void tiled_cuda_matmul(float *C, float *A, float *B, uint32_t m, uint32_t n, uint32_t p, uint32_t tile_width) {
// TODO: Implement tiled CUDA matrix multiplication
__global__ void tiled_cuda_matmul(float *C, float *A, float *B, int m, int n, int p, int tile_width) {
// Implement tiled CUDA matrix multiplication
extern __shared__ float sharedMemory[];
float *sA = sharedMemory;
float *sB = &sharedMemory[tile_width * tile_width];

// Define row and col size
int row = blockIdx.y * blockDim.y + threadIdx.y;
int col = blockIdx.x * blockDim.x + threadIdx.x;

float value = 0.0f;

for (int i = 0; i < n; i += tile_width) {
// Load tiles into shared memory for faster calculations
if (row < m && (threadIdx.x + i) < n) {
sA[threadIdx.y * tile_width + threadIdx.x] = A[row * n + (threadIdx.x + i)];
} else {
sA[threadIdx.y * tile_width + threadIdx.x] = 0.0f;
}
// Same with matrix B
if (col < p && (threadIdx.y + i) < n) {
sB[threadIdx.y * tile_width + threadIdx.x] = B[(threadIdx.y + i) * p + col];
} else {
sB[threadIdx.y * tile_width + threadIdx.x] = 0.0f;
}

// Sync to ensure all threads in a block have loaded data before computation
__syncthreads();

for (int i = 0; i < tile_width; ++i) {
// Calculate value of C by multiplying tiles from shared memory A and B
value += sA[threadIdx.y * tile_width + i] * sB[i * tile_width + threadIdx.x];
}

__syncthreads();
}
// Set calculated value to C
if (row < m && col < p) {
C[row * p + col] = value;
}
}

bool validate_result(const std::string &result_file, const std::string &reference_file) {
// TODO: Implement result validation (same as Assignment 2)
bool validate_result(const std::string &result_file, const std::string &reference_file, int m, int p) {
// Implement result validation (same as Assignment 2)
std::ifstream comparison(reference_file);
if (!comparison.is_open()) {
// Validate that file opened correctly
std::cerr << "Unable to open file";
exit(1);
}

std::ifstream res(result_file);
if (!res.is_open()) {
// Validate that file opened correctly
std::cerr << "Unable to open file";
exit(1);
}

float Comp, ResValue;

// Iterate using the dimensions of C.
for (int i = 0; i < m; ++i) {
for (int j = 0; j < p; ++j) {
// Get element from both matrix by index, compare values and throw error if values don't match
res >> ResValue;
comparison >> Comp;
if (ResValue != Comp) {
std::cerr << "Value mismatch";
exit(1);
}
}
}

// Close both files once comparison is done
comparison.close();
res.close();
return true;
}

int main(int argc, char *argv[]) {
// Read input0.raw (matrix A) and input1.raw (matrix B)
int m, n, p;

if (argc != 2) {
std::cerr << "Usage: " << argv[0] << " <case_number>" << std::endl;
return 1;
}

int case_number = std::atoi(argv[1]);
if (case_number < 0 || case_number > 9) {
std::cerr << "Case number must be between 0 and 9" << std::endl;
return 1;
}

// Construct file paths
std::string folder = "data/" + std::to_string(case_number) + "/";
std::string input0_file = folder + "input0.raw";
std::string input1_file = folder + "input1.raw";
std::string result_file = folder + "result.raw";
std::string reference_file = folder + "output.raw";

// Read input0.raw (matrix A)
std::ifstream FileA(input0_file);
// Validate that file is opened correctly
if (!FileA.is_open()) {
std::cerr << "Error opening file";
return 1;
}

// Read input1.raw (matrix B)
std::ifstream FileB(input1_file);
// Validate that file is opened correctly
if (!FileB.is_open()) {
std::cerr << "Error opening file";
return 1;
}

// Get matrix dimensions
FileA >> m >> n;
FileB >> n >> p;

// TODO: Read input0.raw (matrix A) and input1.raw (matrix B)

// Allocate memory for matrices A and B, to read data from files
float* A = (float*)malloc(m * n * sizeof(float));
// Validate that memory is allocated correctly
if (A == NULL) {
std::cerr << "Memory allocation failed";
return 1;
}

// TODO: Use cudaMalloc and cudaMemcpy for GPU memory
float* B = (float*)malloc(n * p * sizeof(float));
if (B == NULL) {
std::cerr << "Memory allocation failed";
return 1;
}

//Read matrix elements into A and B (row-major order), close file after reading
for (int i = 0; i < m; ++i) {
for (int j = 0; j < n; ++j) {
FileA >> A[i * n + j];
}
}
FileA.close();

for (int i = 0; i < n; ++i) {
for (int j = 0; j < p; ++j) {
FileB >> B[i * p + j];
}
}
FileB.close();

// Allocate memory for result matrices
float *C_naive = new float[m * p];
float *C_tiled = new float[m * p];

// Use cudaMalloc and cudaMemcpy for GPU memory
// Allocate device memory for both naive and tiled
float* cu_naive_A;
float* cu_tiled_A;
float* cu_naive_B;
float* cu_tiled_B;
float* cu_naive_C;
float* cu_tiled_C;

cudaMalloc((void**)&cu_naive_A, m * n * sizeof(float));
cudaMalloc((void**)&cu_tiled_A, m * n * sizeof(float));
cudaMalloc((void**)&cu_naive_B, n * p * sizeof(float));
cudaMalloc((void**)&cu_tiled_B, n * p * sizeof(float));
cudaMalloc((void**)&cu_naive_C, m * p * sizeof(float));
cudaMalloc((void**)&cu_tiled_C, m * p * sizeof(float));

// Measure naive CUDA performance
// TODO: Launch naive_cuda_matmul kernel
// Create cuda events and start timing
cudaEvent_t start, stop;
cudaEventCreate(&start);
cudaEventCreate(&stop);
cudaEventRecord(start);

// Copy data from initial matrices
cudaMemcpy(cu_naive_A, A, m * n * sizeof(float), cudaMemcpyHostToDevice);
cudaMemcpy(cu_naive_B, B, n * p * sizeof(float), cudaMemcpyHostToDevice);

// Set tile size,
int TILE_SIZE = 16;
// block size and number of blocks.
dim3 numThreadsPerBlock(TILE_SIZE, TILE_SIZE);
dim3 numBlocks((p + numThreadsPerBlock.x - 1) / numThreadsPerBlock.x, (m + numThreadsPerBlock.y - 1) / numThreadsPerBlock.y);

// Launch naive_cuda_matmul kernel
naive_cuda_matmul<<< numBlocks, numThreadsPerBlock >>> (cu_naive_C, cu_naive_A, cu_naive_B, m, n, p);

// Copy results back from device to host
cudaMemcpy(C_naive, cu_naive_C, m * p * sizeof(float), cudaMemcpyDeviceToHost);

// Stop timing, write down result and destroy events
cudaEventRecord(stop);
float naive_cuda_time = 0;
cudaEventElapsedTime(&naive_cuda_time, start, stop);
cudaEventDestroy(start);
cudaEventDestroy(stop);

// Write naive CUDA result to file and validate
// Write dimensions and elements to result.raw
std::ofstream result(result_file);
// Validate that file is created correctly
if (!result) {
std::cerr << "Unable to open file";
exit(1);
}

// Write the dimensions of C on the first line
result << m << " " << p << std::endl;
result << std::fixed << std::setprecision(2);
// Iterate C and write each element to result.raw
for (int i = 0; i < m; ++i) {
for (int j = 0; j < p; ++j) {
result << C_naive[i * p + j] << " ";
}
result << std::endl;
}
// Close file after writing
result.close();

// Validate naive CUDA results
bool naive_correct = validate_result(result_file, reference_file, m, p);
if (!naive_correct) {
std::cerr << "Naive result validation failed for case " << case_number << std::endl;
}

// TODO: Write naive CUDA result to file and validate
// Measure tiled CUDA performance
// Create cuda events and start timing
cudaEventCreate(&start);
cudaEventCreate(&stop);
cudaEventRecord(start);

// Copy data from initial matrices
cudaMemcpy(cu_tiled_A, A, m * n * sizeof(float), cudaMemcpyHostToDevice);
cudaMemcpy(cu_tiled_B, B, n * p * sizeof(float), cudaMemcpyHostToDevice);
size_t sharedMemorySize = 2 * TILE_SIZE * TILE_SIZE * sizeof(float);

// TODO: Launch tiled_cuda_matmul kernel
// Launch tiled_cuda_matmul kernel
tiled_cuda_matmul<<< numBlocks, numThreadsPerBlock, sharedMemorySize >>> (cu_tiled_C, cu_tiled_A, cu_tiled_B, m, n, p, TILE_SIZE);

// Copy results back from device to host
cudaMemcpy(C_tiled, cu_tiled_C, m * p * sizeof(float), cudaMemcpyDeviceToHost);
// Stop timing, write down results and destroy events
cudaEventRecord(stop);
float tiled_cuda_time = 0;
cudaEventElapsedTime(&tiled_cuda_time, start, stop);
cudaEventDestroy(start);
cudaEventDestroy(stop);

// TODO: Write tiled CUDA result to file and validate
// Free memory of initial matrices, they are no longer needed at this point
free(A);
free(B);

// Write tiled CUDA result to file and validate
std::ofstream result_tiled(result_file);
// Validate that file is created correctly
if (!result_tiled) {
std::cerr << "Unable to open file";
exit(1);
}

// Write the dimensions of C on the first line
result_tiled << m << " " << p << std::endl;
result_tiled << std::fixed << std::setprecision(2);
// Iterate C and write each element to result.raw
for (int i = 0; i < m; ++i) {
for (int j = 0; j < p; ++j) {
result_tiled << C_tiled[i * p + j] << " ";
}
result_tiled << std::endl;
}
// Close file after writing
result_tiled.close();

// Validate tiled CUDA results
bool tiled_correct = validate_result(result_file, reference_file, m, p);
if (!tiled_correct) {
std::cerr << "Tiled result validation failed for case " << case_number << std::endl;
}

// Print performance results
std::cout << "Case " << case_number << " (" << m << "x" << n << "x" << p << "):\n";
std::cout << "Naive CUDA time: " << naive_cuda_time << " seconds\n";
std::cout << "Tiled CUDA time: " << tiled_cuda_time << " seconds\n";

// Clean up

cudaFree(cu_naive_A);
cudaFree(cu_tiled_A);
cudaFree(cu_naive_B);
cudaFree(cu_tiled_B);
cudaFree(cu_naive_C);
cudaFree(cu_tiled_C);
delete[] C_naive;
delete[] C_tiled;
return 0;
}