Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
23 changes: 9 additions & 14 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -1,25 +1,20 @@
cmake_minimum_required(VERSION 3.10)

project(MatrixMultiplication)

set(CMAKE_CXX_STANDARD 11)

set(CMAKE_CXX_STANDARD_REQUIRED ON)

find_package(OpenMP REQUIRED)

if(OpenMP_CXX_FOUND)
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${OpenMP_CXX_FLAGS}")
endif()

# Custom OpenMP setup for macOS
if(APPLE)
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -D_Alignof=alignof")
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Xpreprocessor -fopenmp -D_Alignof=alignof")
include_directories("/opt/homebrew/include")
link_directories("/opt/homebrew/lib")
else()
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fopenmp")
endif()

add_executable(matmul main.cpp)

add_executable(matmul main_ans.cpp)


if(OpenMP_CXX_FOUND)
target_link_libraries(matmul PUBLIC OpenMP::OpenMP_CXX)
if(APPLE)
target_link_libraries(matmul "-L/opt/homebrew/lib" "-lomp")
endif()
319 changes: 232 additions & 87 deletions main.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -3,112 +3,257 @@
#include <string>
#include <omp.h>
#include <cmath>
#include <algorithm>

void naive_matmul(float *C, float *A, float *B, uint32_t m, uint32_t n, uint32_t p) {
//TODO : Implement naive matrix multiplication
// Initialize C to zero
for (uint32_t i = 0; i < m; i++) {
for (uint32_t j = 0; j < p; j++) {
C[i * p + j] = 0.0f;
}
}

// Perform naive matrix multiplication using triple nested loop
for (uint32_t i = 0; i < m; i++) {
for (uint32_t j = 0; j < p; j++) {
for (uint32_t k = 0; k < n; k++) {
C[i * p + j] += A[i * n + k] * B[k * p + j];
}
}
}
}

void blocked_matmul(float *C, float *A, float *B, uint32_t m, uint32_t n, uint32_t p, uint32_t block_size) {
// TODO: Implement blocked matrix multiplication
// A is m x n, B is n x p, C is m x p
// Use block_size to divide matrices into submatrices
// Initialize C to zero
for (uint32_t i = 0; i < m; i++) {
for (uint32_t j = 0; j < p; j++) {
C[i * p + j] = 0.0f;
}
}

// Perform blocked matrix multiplication
for (uint32_t ii = 0; ii < m; ii += block_size) {
for (uint32_t jj = 0; jj < p; jj += block_size) {
for (uint32_t kk = 0; kk < n; kk += block_size) {
// Process block
// C[ii:ii+block_size, jj:jj+block_size] += A[ii:ii+block_size, kk:kk+block_size] * B[kk:kk+block_size, jj:jj+block_size]
for (uint32_t i = ii; i < std::min(ii + block_size, m); i++) {
for (uint32_t j = jj; j < std::min(jj + block_size, p); j++) {
for (uint32_t k = kk; k < std::min(kk + block_size, n); k++) {
C[i * p + j] += A[i * n + k] * B[k * p + j];
}
}
}
}
}
}
}

void parallel_matmul(float *C, float *A, float *B, uint32_t m, uint32_t n, uint32_t p) {
// TODO: Implement parallel matrix multiplication using OpenMP
// A is m x n, B is n x p, C is m x p
// Initialize C to zero
#pragma omp parallel for
for (uint32_t i = 0; i < m; i++) {
for (uint32_t j = 0; j < p; j++) {
C[i * p + j] = 0.0f;
}
}

// Parallelize the outer loop with OpenMP
#pragma omp parallel for
for (uint32_t i = 0; i < m; i++) {
for (uint32_t j = 0; j < p; j++) {
float sum = 0.0f;
for (uint32_t k = 0; k < n; k++) {
sum += A[i * n + k] * B[k * p + j];
}
C[i * p + j] = sum;
}
}
}

bool validate_result(const std::string &result_file, const std::string &reference_file) {
//TODO : Implement result validation
std::ifstream result(result_file, std::ios::binary);
std::ifstream reference(reference_file, std::ios::binary);

if (!result || !reference) {
std::cerr << "Error opening result or reference file" << std::endl;
return false;
}

// Determine file sizes
result.seekg(0, std::ios::end);
reference.seekg(0, std::ios::end);
size_t result_size = result.tellg();
size_t reference_size = reference.tellg();
result.seekg(0, std::ios::beg);
reference.seekg(0, std::ios::beg);

// Check if file sizes match
if (result_size != reference_size) {
std::cerr << "Result and reference file sizes do not match" << std::endl;
return false;
}

// Read and compare file contents
const size_t buffer_size = 1024;
float result_buffer[buffer_size];
float reference_buffer[buffer_size];

size_t total_elements = result_size / sizeof(float);
size_t elements_compared = 0;

const float epsilon = 1e-5f; // Tolerance for floating-point comparison

while (elements_compared < total_elements) {
size_t elements_to_read = std::min(buffer_size, total_elements - elements_compared);

result.read(reinterpret_cast<char*>(result_buffer), elements_to_read * sizeof(float));
reference.read(reinterpret_cast<char*>(reference_buffer), elements_to_read * sizeof(float));

for (size_t i = 0; i < elements_to_read; i++) {
if (std::abs(result_buffer[i] - reference_buffer[i]) > epsilon) {
std::cerr << "Result differs from reference at element " << (elements_compared + i) << std::endl;
std::cerr << "Result: " << result_buffer[i] << ", Reference: " << reference_buffer[i] << std::endl;
return false;
}
}

elements_compared += elements_to_read;
}

return true;
}

int main(int argc, char *argv[]) {
if (argc != 2) {
std::cerr << "Usage: " << argv[0] << " <case_number>" << std::endl;
return 1;
}
if (argc != 3) {
std::cerr << "Usage: " << argv[0] << " <case_number> <naive|blocked|parallel>" << std::endl;
return 1;
}

int case_number = std::atoi(argv[1]);
if (case_number < 0 || case_number > 9) {
std::cerr << "Case number must be between 0 and 9" << std::endl;
return 1;
}
int case_number = std::atoi(argv[1]);
std::string algorithm = argv[2];

// Validate algorithm choice
if (algorithm != "naive" && algorithm != "blocked" && algorithm != "parallel") {
std::cerr << "Invalid algorithm: " << algorithm << std::endl;
std::cerr << "Valid algorithms: naive, blocked, parallel" << std::endl;
return 1;
}

if (case_number < 0 || case_number > 9) {
std::cerr << "Case number must be between 0 and 9" << std::endl;
return 1;
}

// Construct file paths
std::string folder = "data/" + std::to_string(case_number) + "/";
std::string input0_file = folder + "input0.raw";
std::string input1_file = folder + "input1.raw";
std::string result_file = folder + "result.raw";
std::string reference_file = folder + "output.raw";
// Construct file paths
std::string folder = "data/" + std::to_string(case_number) + "/";
std::string input0_file = folder + "input0.raw";
std::string input1_file = folder + "input1.raw";
std::string result_file = "result.raw"; // Write to current directory
std::string reference_file = folder + "output.raw";

// TODO Read input0.raw (matrix A)
// Read input0.raw (matrix A)
std::ifstream input0(input0_file, std::ios::binary);
if (!input0) {
std::cerr << "Error opening input0.raw" << std::endl;
return 1;
}

// Read dimensions from input0.raw
uint32_t m, n;
input0.read(reinterpret_cast<char*>(&m), sizeof(uint32_t));
input0.read(reinterpret_cast<char*>(&n), sizeof(uint32_t));

// Allocate memory for A
float *A = new float[m * n];
input0.read(reinterpret_cast<char*>(A), m * n * sizeof(float));
input0.close();

// Read input1.raw (matrix B)
std::ifstream input1(input1_file, std::ios::binary);
if (!input1) {
std::cerr << "Error opening input1.raw" << std::endl;
delete[] A;
return 1;
}

// Read dimensions from input1.raw
uint32_t n_check, p;
input1.read(reinterpret_cast<char*>(&n_check), sizeof(uint32_t));
input1.read(reinterpret_cast<char*>(&p), sizeof(uint32_t));

// Check if dimensions are compatible
if (n != n_check) {
std::cerr << "Incompatible matrix dimensions" << std::endl;
delete[] A;
return 1;
}

// Allocate memory for B
float *B = new float[n * p];
input1.read(reinterpret_cast<char*>(B), n * p * sizeof(float));
input1.close();

// Allocate memory for result matrix C
float *C = new float[m * p];

// TODO Read input1.raw (matrix B)
std::cout << "Matrix dimensions: A(" << m << "x" << n << "), B(" << n << "x" << p << ")" << std::endl;

if (algorithm == "naive") {
// Time naive matrix multiplication
double t0 = omp_get_wtime();
naive_matmul(C, A, B, m, n, p);
double t1 = omp_get_wtime();
std::cout << "Naive matrix multiplication time: " << (t1 - t0) * 1000 << " ms" << std::endl;
}
else if (algorithm == "blocked") {
// Time blocked matrix multiplication
double t0 = omp_get_wtime();
blocked_matmul(C, A, B, m, n, p, 32); // Block size of 32
double t1 = omp_get_wtime();
std::cout << "Blocked matrix multiplication time: " << (t1 - t0) * 1000 << " ms" << std::endl;
}
else if (algorithm == "parallel") {
// Time parallel matrix multiplication
double t0 = omp_get_wtime();
parallel_matmul(C, A, B, m, n, p);
double t1 = omp_get_wtime();
std::cout << "Parallel matrix multiplication time: " << (t1 - t0) * 1000 << " ms" << std::endl;
}

// Write result to file
std::ofstream output(result_file, std::ios::binary);
if (!output) {
std::cerr << "Error opening result file" << std::endl;
delete[] A;
delete[] B;
delete[] C;
return 1;
}

// Write dimensions to result file
output.write(reinterpret_cast<char*>(&m), sizeof(uint32_t));
output.write(reinterpret_cast<char*>(&p), sizeof(uint32_t));

// Write result matrix to file
output.write(reinterpret_cast<char*>(C), m * p * sizeof(float));
output.close();

// Validate result
if (validate_result(result_file, reference_file)) {
std::cout << "Result validation successful!" << std::endl;
} else {
std::cerr << "Result validation failed!" << std::endl;
delete[] A;
delete[] B;
delete[] C;
return 1;
}

// Allocate memory for result matrices
float *C_naive = new float[m * p];
float *C_blocked = new float[m * p];
float *C_parallel = new float[m * p];

// Measure performance of naive_matmul
double start_time = omp_get_wtime();
naive_matmul(C_naive, A, B, m, n, p);
double naive_time = omp_get_wtime() - start_time;

// TODO Write naive result to file


// Validate naive result
bool naive_correct = validate_result(result_file, reference_file);
if (!naive_correct) {
std::cerr << "Naive result validation failed for case " << case_number << std::endl;
}

// Measure performance of blocked_matmul (use block_size = 32 as default)
start_time = omp_get_wtime();
blocked_matmul(C_blocked, A, B, m, n, p, 32);
double blocked_time = omp_get_wtime() - start_time;

// TODO Write blocked result to file


// Validate blocked result
bool blocked_correct = validate_result(result_file, reference_file);
if (!blocked_correct) {
std::cerr << "Blocked result validation failed for case " << case_number << std::endl;
}

// Measure performance of parallel_matmul
start_time = omp_get_wtime();
parallel_matmul(C_parallel, A, B, m, n, p);
double parallel_time = omp_get_wtime() - start_time;

// TODO Write parallel result to file


// Validate parallel result
bool parallel_correct = validate_result(result_file, reference_file);
if (!parallel_correct) {
std::cerr << "Parallel result validation failed for case " << case_number << std::endl;
}

// Print performance results
std::cout << "Case " << case_number << " (" << m << "x" << n << "x" << p << "):\n";
std::cout << "Naive time: " << naive_time << " seconds\n";
std::cout << "Blocked time: " << blocked_time << " seconds\n";
std::cout << "Parallel time: " << parallel_time << " seconds\n";
std::cout << "Blocked speedup: " << (naive_time / blocked_time) << "x\n";
std::cout << "Parallel speedup: " << (naive_time / parallel_time) << "x\n";

// Clean up
delete[] A;
delete[] B;
delete[] C_naive;
delete[] C_blocked;
delete[] C_parallel;

return 0;
// Free memory
delete[] A;
delete[] B;
delete[] C;

return 0;
}