Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
16 changes: 16 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -205,3 +205,19 @@ git push origin student-name

Good luck, and enjoy accelerating matrix multiplication with CUDA!

### Results

| Test Case | Dimensions (\( m \times n \times p \)) | Naive CPU (s) | Blocked CPU (s) | Parallel CPU (s) | Naive CUDA (s) | Tiled CUDA (s) | Tiled CUDA Speedup (vs. Naive CUDA) | Tiled CUDA Speedup (vs. Parallel CPU) |
|-----------|------------------------|----------------|------------------|-------------------|-----------------|------------------|------------------|------------------|
| 0 | 64 x 64 x 64| 0.00286956 | 0.0039845 | 0.0039845 | 0.104896 | 0.083584 | 1.255x | 0.048x |
| 1 | 128 x 64 x 128 | 0.00819703 | 0.0107341 | 0.00161662 | 0.141888 | 0.094272 | 1.505x | 0.017x |
| 2 | 100 x 128 x 56 | 0.00591884 | 0.00705264 | 0.00257082 | 0.121024 | 0.080576 | 1.502x | 0.032x |
| 3 | 128 x 64 x 128 | 0.00855164 | 0.0104604 | 0.00158808 | 0.139712 | 0.095424 | 1.464x | 0.017x |
| 4 | 32 x 128 x 32 | 0.00101901 | 0.00142778 | 0.000622604 | 0.122368 | 0.063424 | 1.93x | 0.01x |
| 5 | 200 x 100 x 256 | 0.030972 | 0.0239394 | 0.00744395 | 0.216736 | 0.175936 | 1.23x | 0.042x |
| 6 | 256 x 256 x 256 | 0.0574685 | 0.0826263 | 0.0242001 | 0.310656 | 0.243968 | 1.273x | 0.099x |
| 7 | 256 x 300 x 256 | 0.0637025 | 0.0922899 | 0.0178825 | 0.298624 | 0.246816 | 1.21x | 0.072x |
| 8 | 64 x 128 x 64 | 0.00378908 | 0.00581044 | 0.001216 | 0.124512 | 0.076128 | 1.64x | 0.016x |
| 9 | 256 x 256 x 257 | 0.0591941 | 0.0762655 | 0.0228895 | 0.29376 | 0.248832 | 1.18x | 0.092x |

The two GPU programs were executed on the CSC Mahti supercomputer, accessed via VSCode Remote-SSH. In every test case, the tiled CUDA implementation outperformed the naive CUDA version. However, when tiled CUDA is compared with the parallel CPU implementation, the parallel CPU shows better performance. This may be due to the overhead of data transfers between the CPU and the GPU. Furthermore, I expect that tiled CUDA would perform comparatively better on very large matrices.
1 change: 1 addition & 0 deletions build/.cmake/api/v1/query/client-vscode/query.json
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
{"requests":[{"kind":"cache","version":2},{"kind":"codemodel","version":2},{"kind":"toolchains","version":1},{"kind":"cmakeFiles","version":1}]}
65 changes: 65 additions & 0 deletions build/CMakeCache.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,65 @@
# This is the CMakeCache file.
# For build in directory: /home/kaveesha/Desktop/Uthpala_Abo/Parallel_Programming/Homework_3/Homework-3/build
# It was generated by CMake: /usr/bin/cmake
# You can edit this file to change values found and used by cmake.
# If you do not want to change any of the values, simply exit the editor.
# If you do want to change a value, simply edit, save, and exit the editor.
# The syntax for the file is as follows:
# KEY:TYPE=VALUE
# KEY is the name of a variable in the cache.
# TYPE is a hint to GUIs for the type of VALUE, DO NOT EDIT TYPE!.
# VALUE is the current value for the KEY.

########################
# EXTERNAL cache entries
########################

//No help, variable specified on the command line.
CMAKE_BUILD_TYPE:STRING=Debug

//No help, variable specified on the command line.
CMAKE_CXX_COMPILER:FILEPATH=/usr/bin/g++

//No help, variable specified on the command line.
CMAKE_C_COMPILER:FILEPATH=/usr/bin/gcc

//No help, variable specified on the command line.
CMAKE_EXPORT_COMPILE_COMMANDS:BOOL=TRUE


########################
# INTERNAL cache entries
########################

//This is the directory where this CMakeCache.txt was created
CMAKE_CACHEFILE_DIR:INTERNAL=/home/kaveesha/Desktop/Uthpala_Abo/Parallel_Programming/Homework_3/Homework-3/build
//Major version of cmake used to create the current loaded cache
CMAKE_CACHE_MAJOR_VERSION:INTERNAL=3
//Minor version of cmake used to create the current loaded cache
CMAKE_CACHE_MINOR_VERSION:INTERNAL=16
//Patch version of cmake used to create the current loaded cache
CMAKE_CACHE_PATCH_VERSION:INTERNAL=3
//Path to CMake executable.
CMAKE_COMMAND:INTERNAL=/usr/bin/cmake
//Path to cpack program executable.
CMAKE_CPACK_COMMAND:INTERNAL=/usr/bin/cpack
//Path to ctest program executable.
CMAKE_CTEST_COMMAND:INTERNAL=/usr/bin/ctest
//Name of external makefile project generator.
CMAKE_EXTRA_GENERATOR:INTERNAL=
//Name of generator.
CMAKE_GENERATOR:INTERNAL=Unix Makefiles
//Generator instance identifier.
CMAKE_GENERATOR_INSTANCE:INTERNAL=
//Name of generator platform.
CMAKE_GENERATOR_PLATFORM:INTERNAL=
//Name of generator toolset.
CMAKE_GENERATOR_TOOLSET:INTERNAL=
//Source directory with the top level CMakeLists.txt file for this
// project
CMAKE_HOME_DIRECTORY:INTERNAL=/home/kaveesha/Desktop/Uthpala_Abo/Parallel_Programming/Homework_3/Homework-3
//number of local generators
CMAKE_NUMBER_OF_MAKEFILES:INTERNAL=1
//Path to CMake installation.
CMAKE_ROOT:INTERNAL=/usr/share/cmake-3.16

1 change: 1 addition & 0 deletions build/CMakeFiles/cmake.check_cache
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
# This file is generated by cmake for dependency checking of the CMakeCache.txt file
213 changes: 202 additions & 11 deletions main.cu
Original file line number Diff line number Diff line change
Expand Up @@ -3,41 +3,232 @@
#include <cmath>
#include <cstddef>
#include <cstdint>
#include <cstdlib>
#include <stdexcept>
#include <string>
#include <cuda_runtime.h>

/**
 * Reads a dense matrix from a whitespace-separated text file.
 *
 * The file must start with two integers (row count, then column count)
 * followed by rows * cols floating-point values. The values are stored in the
 * returned buffer in the order in which they appear in the file.
 *
 * @param filename Path of the file to read.
 * @param rows     Receives the row count read from the file header.
 * @param cols     Receives the column count read from the file header.
 * @return Buffer of rows * cols floats allocated with new[]; the caller owns
 *         it and must release it with delete[].
 * @throws std::runtime_error if the file cannot be opened, the header is
 *         missing or negative, or fewer than rows * cols values can be read.
 */
float *read_matrix(const std::string &filename, int &rows, int &cols)
{
    std::ifstream file(filename);
    if (!file)
    {
        throw std::runtime_error("Cannot open file: " + filename);
    }

    // Reject a missing/garbled header instead of allocating from garbage sizes.
    if (!(file >> rows >> cols) || rows < 0 || cols < 0)
    {
        throw std::runtime_error("Invalid matrix header in file: " + filename);
    }

    // Compute the element count in size_t so large matrices do not overflow int.
    const std::size_t count = static_cast<std::size_t>(rows) * static_cast<std::size_t>(cols);
    float *mat = new float[count];

    for (std::size_t i = 0; i < count; ++i)
    {
        if (!(file >> mat[i]))
        {
            // Free the partially filled buffer before reporting the short read.
            delete[] mat;
            throw std::runtime_error("Unexpected end of data in file: " + filename);
        }
    }

    return mat;
}

/**
 * Writes a matrix to a text file.
 *
 * The first line holds "<rows> <cols>". The rows * cols values of mat follow
 * in storage order, each followed by a single space, with a line break after
 * every cols-th value.
 *
 * @param filename Path of the file to create or overwrite.
 * @param mat      Buffer holding at least rows * cols values.
 * @param rows     Number of rows written to the header.
 * @param cols     Number of columns; also the number of values per line.
 * @throws std::runtime_error if the file cannot be opened for writing.
 */
void write_matrix(const std::string &filename, float *mat, int rows, int cols)
{
    std::ofstream out(filename);
    if (!out)
    {
        throw std::runtime_error("Cannot open file for writing: " + filename);
    }

    out << rows << " " << cols << "\n";

    const int total = rows * cols;
    for (int idx = 0; idx < total; ++idx)
    {
        // Every value is followed by a space; the last value of a row also ends the line.
        const bool ends_row = ((idx + 1) % cols == 0);
        out << mat[idx] << " ";
        if (ends_row)
        {
            out << "\n";
        }
    }

    out.close();
}

/**
 * Naive matrix multiplication kernel: each thread computes one element of C.
 *
 * A is indexed as an m x n row-major matrix, B as n x p and C as m x p.
 * Expects a 2D launch in which the x dimension covers the p columns of C and
 * the y dimension covers its m rows; threads that fall outside C do nothing.
 * Uses no shared memory.
 *
 * @param C Device buffer receiving the m x p product.
 * @param A Device buffer holding the m x n left operand.
 * @param B Device buffer holding the n x p right operand.
 * @param m Number of rows of A and C.
 * @param n Number of columns of A and rows of B.
 * @param p Number of columns of B and C.
 */
__global__ void naive_cuda_matmul(float *C, float *A, float *B, uint32_t m, uint32_t n, uint32_t p) {
    const uint32_t row = blockIdx.y * blockDim.y + threadIdx.y;
    const uint32_t col = blockIdx.x * blockDim.x + threadIdx.x;

    // The grid rarely divides the matrix evenly; surplus threads exit here.
    if (row >= m || col >= p) {
        return;
    }

    // Flat offsets are computed in size_t so that matrices with more than
    // 2^32 elements do not wrap around in 32-bit arithmetic.
    const size_t a_row_offset = static_cast<size_t>(row) * n;

    float sum = 0.0f;
    for (uint32_t k = 0; k < n; ++k) {
        sum += A[a_row_offset + k] * B[static_cast<size_t>(k) * p + col];
    }
    C[static_cast<size_t>(row) * p + col] = sum;
}

/**
 * Tiled matrix multiplication kernel: C = A * B using shared-memory tiles.
 *
 * A is indexed as an m x n row-major matrix, B as n x p and C as m x p.
 *
 * Launch requirements:
 *  - blockDim must be (tile_width, tile_width);
 *  - tile_width must be between 1 and 16, because the statically sized shared
 *    tiles are 16 x 16 floats each (2 KiB of static shared memory per block);
 *  - gridDim.x must cover the p columns and gridDim.y the m rows of C in
 *    steps of tile_width.
 * If these requirements are violated, every thread of the block returns
 * without touching C rather than indexing shared memory out of bounds.
 *
 * @param C          Device buffer receiving the m x p product.
 * @param A          Device buffer holding the m x n left operand.
 * @param B          Device buffer holding the n x p right operand.
 * @param m          Number of rows of A and C.
 * @param n          Number of columns of A and rows of B.
 * @param p          Number of columns of B and C.
 * @param tile_width Edge length of the square tile processed per step.
 */
__global__ void tiled_cuda_matmul(float *C, float *A, float *B, uint32_t m, uint32_t n, uint32_t p, uint32_t tile_width) {
    constexpr uint32_t kMaxTile = 16;
    __shared__ float tile_A[kMaxTile][kMaxTile];
    __shared__ float tile_B[kMaxTile][kMaxTile];

    // The condition is uniform across the block, so either all threads leave
    // here or none do; the __syncthreads() calls below stay non-divergent.
    if (tile_width == 0 || tile_width > kMaxTile ||
        blockDim.x != tile_width || blockDim.y != tile_width) {
        return;
    }

    const uint32_t tx = threadIdx.x;
    const uint32_t ty = threadIdx.y;
    const uint32_t row = blockIdx.y * tile_width + ty;
    const uint32_t col = blockIdx.x * tile_width + tx;

    // Ceiling division written so that it cannot overflow for large n.
    const uint32_t num_tiles = n / tile_width + (n % tile_width != 0 ? 1u : 0u);

    float sum = 0.0f;

    for (uint32_t t = 0; t < num_tiles; ++t) {
        const uint32_t a_col = t * tile_width + tx;
        const uint32_t b_row = t * tile_width + ty;

        // Out-of-range positions are filled with zeros so that they do not
        // contribute to the partial dot product.
        tile_A[ty][tx] = (row < m && a_col < n)
                             ? A[static_cast<size_t>(row) * n + a_col]
                             : 0.0f;
        tile_B[ty][tx] = (col < p && b_row < n)
                             ? B[static_cast<size_t>(b_row) * p + col]
                             : 0.0f;

        // Both tiles must be fully loaded before any thread reads them.
        __syncthreads();

        for (uint32_t i = 0; i < tile_width; ++i) {
            sum += tile_A[ty][i] * tile_B[i][tx];
        }

        // Nobody may overwrite the tiles while another thread still reads them.
        __syncthreads();
    }

    if (row < m && col < p) {
        C[static_cast<size_t>(row) * p + col] = sum;
    }
}

/**
 * Compares a result matrix file against a reference matrix file.
 *
 * Both files are loaded with read_matrix. The matrices match when they have
 * the same dimensions and every pair of corresponding elements differs by at
 * most 1e-4 in absolute value. A NaN on either side counts as a mismatch.
 *
 * @param result_file    Path of the computed matrix.
 * @param reference_file Path of the expected matrix.
 * @return true if the matrices match, false otherwise.
 * @throws std::runtime_error propagated from read_matrix.
 */
bool validate_result(const std::string &result_file, const std::string &reference_file) {
    int result_rows = 0, result_cols = 0, ref_rows = 0, ref_cols = 0;

    float *result = read_matrix(result_file, result_rows, result_cols);
    float *reference = nullptr;
    try {
        reference = read_matrix(reference_file, ref_rows, ref_cols);
    } catch (...) {
        // Do not leak the first matrix when the second one cannot be read.
        delete[] result;
        throw;
    }

    const float EPSILON = 1e-4f;

    // Differently shaped matrices can never match; checking this first also
    // keeps the element loop from reading past the end of the reference.
    bool matches = (result_rows == ref_rows) && (result_cols == ref_cols);

    if (matches) {
        const std::size_t count =
            static_cast<std::size_t>(result_rows) * static_cast<std::size_t>(result_cols);
        for (std::size_t i = 0; i < count; ++i) {
            // Written as !(diff <= eps) so that a NaN difference is a mismatch.
            if (!(std::abs(result[i] - reference[i]) <= EPSILON)) {
                matches = false;
                break;
            }
        }
    }

    delete[] result;
    delete[] reference;
    return matches;
}

// Throws std::runtime_error naming `what` when a CUDA runtime call did not succeed.
static void check_cuda(cudaError_t status, const char *what)
{
    if (status != cudaSuccess)
    {
        throw std::runtime_error(std::string(what) + " failed: " + cudaGetErrorString(status));
    }
}

// Uploads A and B, runs one kernel variant, downloads C into h_C and returns
// the elapsed time of that whole sequence (transfers included) in seconds.
static float run_matmul_on_gpu(bool use_tiled, float *h_C, const float *h_A, const float *h_B,
                               float *d_C, float *d_A, float *d_B, int m, int n, int p)
{
    // Must not exceed the 16 x 16 shared tiles declared in tiled_cuda_matmul.
    const unsigned int tile_width = 16;
    const dim3 block(tile_width, tile_width);
    const dim3 grid((p + tile_width - 1) / tile_width, (m + tile_width - 1) / tile_width);

    const std::size_t bytes_A = sizeof(float) * static_cast<std::size_t>(m) * n;
    const std::size_t bytes_B = sizeof(float) * static_cast<std::size_t>(n) * p;
    const std::size_t bytes_C = sizeof(float) * static_cast<std::size_t>(m) * p;

    cudaEvent_t start, stop;
    check_cuda(cudaEventCreate(&start), "cudaEventCreate");
    check_cuda(cudaEventCreate(&stop), "cudaEventCreate");
    check_cuda(cudaEventRecord(start), "cudaEventRecord");

    check_cuda(cudaMemcpy(d_A, h_A, bytes_A, cudaMemcpyHostToDevice), "cudaMemcpy (A to device)");
    check_cuda(cudaMemcpy(d_B, h_B, bytes_B, cudaMemcpyHostToDevice), "cudaMemcpy (B to device)");
    check_cuda(cudaMemset(d_C, 0, bytes_C), "cudaMemset (C)");

    if (use_tiled)
    {
        tiled_cuda_matmul<<<grid, block>>>(d_C, d_A, d_B, m, n, p, tile_width);
    }
    else
    {
        naive_cuda_matmul<<<grid, block>>>(d_C, d_A, d_B, m, n, p);
    }
    // Launch-configuration errors show up immediately, execution errors at the sync.
    check_cuda(cudaGetLastError(), "CUDA kernel launch");
    check_cuda(cudaDeviceSynchronize(), "CUDA kernel execution");

    check_cuda(cudaMemcpy(h_C, d_C, bytes_C, cudaMemcpyDeviceToHost), "cudaMemcpy (C to host)");

    check_cuda(cudaEventRecord(stop), "cudaEventRecord");
    check_cuda(cudaEventSynchronize(stop), "cudaEventSynchronize");

    float elapsed_ms = 0.0f;
    check_cuda(cudaEventElapsedTime(&elapsed_ms, start, stop), "cudaEventElapsedTime");

    cudaEventDestroy(start);
    cudaEventDestroy(stop);

    // cudaEventElapsedTime reports milliseconds; callers print seconds.
    return elapsed_ms / 1000.0f;
}

/**
 * Entry point: multiplies the two input matrices of one test case on the GPU
 * with the naive and the tiled kernel, validates both results against the
 * reference output and prints the time each variant took.
 *
 * Usage: <program> <case_number>, where case_number is an integer in [0, 9]
 * selecting the folder data/<case_number>/.
 *
 * @return 0 when both results match the reference; 1 on bad arguments, on any
 *         I/O or CUDA error, or when a result does not match the reference.
 */
int main(int argc, char *argv[]) {
    if (argc != 2) {
        std::cerr << "Usage: " << argv[0] << " <case_number>" << std::endl;
        return 1;
    }

    // strtol with an end-pointer check rejects non-numeric input, which atoi
    // would silently turn into case 0.
    char *parse_end = nullptr;
    const long parsed_case = std::strtol(argv[1], &parse_end, 10);
    if (parse_end == argv[1] || *parse_end != '\0' || parsed_case < 0 || parsed_case > 9) {
        std::cerr << "Case number must be between 0 and 9" << std::endl;
        return 1;
    }
    const int case_number = static_cast<int>(parsed_case);

    // Construct file paths
    std::string folder = "data/" + std::to_string(case_number) + "/";
    std::string input0_file = folder + "input0.raw";
    std::string input1_file = folder + "input1.raw";
    std::string result_file = folder + "result.raw";
    std::string reference_file = folder + "output.raw";

    float *A = nullptr, *B = nullptr, *C_naive = nullptr, *C_tiled = nullptr;
    float *d_A = nullptr, *d_B = nullptr, *d_C = nullptr;
    int exit_code = 0;

    try {
        int m = 0, n = 0, b_rows = 0, p = 0;
        A = read_matrix(input0_file, m, n);
        B = read_matrix(input1_file, b_rows, p);

        if (n != b_rows) {
            throw std::runtime_error("Matrix dimensions do not match: A has " + std::to_string(n) +
                                     " columns but B has " + std::to_string(b_rows) + " rows");
        }
        if (m <= 0 || n <= 0 || p <= 0) {
            throw std::runtime_error("Matrix dimensions must be positive");
        }

        const std::size_t count_C = static_cast<std::size_t>(m) * p;
        C_naive = new float[count_C];
        C_tiled = new float[count_C];

        check_cuda(cudaMalloc(&d_A, sizeof(float) * static_cast<std::size_t>(m) * n), "cudaMalloc (A)");
        check_cuda(cudaMalloc(&d_B, sizeof(float) * static_cast<std::size_t>(n) * p), "cudaMalloc (B)");
        check_cuda(cudaMalloc(&d_C, sizeof(float) * count_C), "cudaMalloc (C)");

        // Naive kernel: run, store the result and validate it.
        const float naive_cuda_time = run_matmul_on_gpu(false, C_naive, A, B, d_C, d_A, d_B, m, n, p);
        write_matrix(result_file, C_naive, m, p);
        if (!validate_result(result_file, reference_file)) {
            std::cerr << "Naive CUDA result does not match the reference output" << std::endl;
            exit_code = 1;
        }

        // Tiled kernel: run, store the result (same file) and validate it.
        const float tiled_cuda_time = run_matmul_on_gpu(true, C_tiled, A, B, d_C, d_A, d_B, m, n, p);
        write_matrix(result_file, C_tiled, m, p);
        if (!validate_result(result_file, reference_file)) {
            std::cerr << "Tiled CUDA result does not match the reference output" << std::endl;
            exit_code = 1;
        }

        // Print performance results
        std::cout << "Case " << case_number << " (" << m << "x" << n << "x" << p << "):\n";
        std::cout << "Naive CUDA time: " << naive_cuda_time << " seconds\n";
        std::cout << "Tiled CUDA time: " << tiled_cuda_time << " seconds\n";
    } catch (const std::exception &error) {
        std::cerr << "Error: " << error.what() << std::endl;
        exit_code = 1;
    }

    // Clean up; delete[] and cudaFree both accept null pointers.
    delete[] A;
    delete[] B;
    delete[] C_naive;
    delete[] C_tiled;
    cudaFree(d_A);
    cudaFree(d_B);
    cudaFree(d_C);

    return exit_code;
}
12 changes: 12 additions & 0 deletions run_hw3.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
#!/bin/bash
# Slurm batch script that runs the hw3 executable for one test case.
# Usage: sbatch run_hw3.sh [case_number]   (case_number defaults to 2)
#SBATCH --job-name=hw3
#SBATCH --account=project_2014289
#SBATCH --partition=gpusmall
#SBATCH --time=00:30:00
#SBATCH --ntasks=1
#SBATCH --cpus-per-task=4
#SBATCH --mem-per-cpu=8G
# hw3 only uses the default CUDA device, so a single GPU is sufficient.
#SBATCH --gres=gpu:a100:1
#SBATCH --output=hw3.txt

srun hw3 "${1:-2}"