From c354ced7e6966d8272d003b5ff5c65d909476cda Mon Sep 17 00:00:00 2001 From: Rasmus Lagerqvist Date: Fri, 2 May 2025 20:34:29 +0300 Subject: [PATCH] rasmus-lagerqvist: Implemented optimized matrix multiplication --- CMakeLists.txt | 23 ++-- main.cpp | 319 +++++++++++++++++++++++++++++++++++-------------- 2 files changed, 241 insertions(+), 101 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 9b04fd0..ee606e9 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -1,25 +1,20 @@ cmake_minimum_required(VERSION 3.10) - project(MatrixMultiplication) set(CMAKE_CXX_STANDARD 11) - set(CMAKE_CXX_STANDARD_REQUIRED ON) -find_package(OpenMP REQUIRED) - -if(OpenMP_CXX_FOUND) - set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${OpenMP_CXX_FLAGS}") -endif() - +# Custom OpenMP setup for macOS if(APPLE) - set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -D_Alignof=alignof") + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Xpreprocessor -fopenmp -D_Alignof=alignof") + include_directories("/opt/homebrew/include") + link_directories("/opt/homebrew/lib") +else() + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fopenmp") endif() +add_executable(matmul main.cpp) -add_executable(matmul main_ans.cpp) - - -if(OpenMP_CXX_FOUND) - target_link_libraries(matmul PUBLIC OpenMP::OpenMP_CXX) +if(APPLE) + target_link_libraries(matmul "-L/opt/homebrew/lib" "-lomp") endif() \ No newline at end of file diff --git a/main.cpp b/main.cpp index 65bf108..3c2a38d 100644 --- a/main.cpp +++ b/main.cpp @@ -3,112 +3,257 @@ #include #include #include +#include void naive_matmul(float *C, float *A, float *B, uint32_t m, uint32_t n, uint32_t p) { - //TODO : Implement naive matrix multiplication + // Initialize C to zero + for (uint32_t i = 0; i < m; i++) { + for (uint32_t j = 0; j < p; j++) { + C[i * p + j] = 0.0f; + } + } + + // Perform naive matrix multiplication using triple nested loop + for (uint32_t i = 0; i < m; i++) { + for (uint32_t j = 0; j < p; j++) { + for (uint32_t k = 0; k < n; k++) { + C[i * p + j] += A[i * n + k] * B[k * p + j]; + } + } + } } void blocked_matmul(float *C, float *A, float *B, uint32_t m, uint32_t n, uint32_t p, uint32_t block_size) { - // TODO: Implement blocked matrix multiplication - // A is m x n, B is n x p, C is m x p - // Use block_size to divide matrices into submatrices + // Initialize C to zero + for (uint32_t i = 0; i < m; i++) { + for (uint32_t j = 0; j < p; j++) { + C[i * p + j] = 0.0f; + } + } + + // Perform blocked matrix multiplication + for (uint32_t ii = 0; ii < m; ii += block_size) { + for (uint32_t jj = 0; jj < p; jj += block_size) { + for (uint32_t kk = 0; kk < n; kk += block_size) { + // Process block + // C[ii:ii+block_size, jj:jj+block_size] += A[ii:ii+block_size, kk:kk+block_size] * B[kk:kk+block_size, jj:jj+block_size] + for (uint32_t i = ii; i < std::min(ii + block_size, m); i++) { + for (uint32_t j = jj; j < std::min(jj + block_size, p); j++) { + for (uint32_t k = kk; k < std::min(kk + block_size, n); k++) { + C[i * p + j] += A[i * n + k] * B[k * p + j]; + } + } + } + } + } + } } void parallel_matmul(float *C, float *A, float *B, uint32_t m, uint32_t n, uint32_t p) { - // TODO: Implement parallel matrix multiplication using OpenMP - // A is m x n, B is n x p, C is m x p + // Initialize C to zero + #pragma omp parallel for + for (uint32_t i = 0; i < m; i++) { + for (uint32_t j = 0; j < p; j++) { + C[i * p + j] = 0.0f; + } + } + + // Parallelize the outer loop with OpenMP + #pragma omp parallel for + for (uint32_t i = 0; i < m; i++) { + for (uint32_t j = 0; j < p; j++) { + float sum = 0.0f; + for (uint32_t k = 0; k < n; k++) { + sum += A[i * n + k] * B[k * p + j]; + } + C[i * p + j] = sum; + } + } } bool validate_result(const std::string &result_file, const std::string &reference_file) { - //TODO : Implement result validation + std::ifstream result(result_file, std::ios::binary); + std::ifstream reference(reference_file, std::ios::binary); + + if (!result || !reference) { + std::cerr << "Error opening result or reference file" << std::endl; + return false; + } + + // Determine file sizes + result.seekg(0, std::ios::end); + reference.seekg(0, std::ios::end); + size_t result_size = result.tellg(); + size_t reference_size = reference.tellg(); + result.seekg(0, std::ios::beg); + reference.seekg(0, std::ios::beg); + + // Check if file sizes match + if (result_size != reference_size) { + std::cerr << "Result and reference file sizes do not match" << std::endl; + return false; + } + + // Read and compare file contents + const size_t buffer_size = 1024; + float result_buffer[buffer_size]; + float reference_buffer[buffer_size]; + + size_t total_elements = result_size / sizeof(float); + size_t elements_compared = 0; + + const float epsilon = 1e-5f; // Tolerance for floating-point comparison + + while (elements_compared < total_elements) { + size_t elements_to_read = std::min(buffer_size, total_elements - elements_compared); + + result.read(reinterpret_cast(result_buffer), elements_to_read * sizeof(float)); + reference.read(reinterpret_cast(reference_buffer), elements_to_read * sizeof(float)); + + for (size_t i = 0; i < elements_to_read; i++) { + if (std::abs(result_buffer[i] - reference_buffer[i]) > epsilon) { + std::cerr << "Result differs from reference at element " << (elements_compared + i) << std::endl; + std::cerr << "Result: " << result_buffer[i] << ", Reference: " << reference_buffer[i] << std::endl; + return false; + } + } + + elements_compared += elements_to_read; + } + + return true; } int main(int argc, char *argv[]) { - if (argc != 2) { - std::cerr << "Usage: " << argv[0] << " " << std::endl; - return 1; - } + if (argc != 3) { + std::cerr << "Usage: " << argv[0] << " " << std::endl; + return 1; + } - int case_number = std::atoi(argv[1]); - if (case_number < 0 || case_number > 9) { - std::cerr << "Case number must be between 0 and 9" << std::endl; - return 1; - } + int case_number = std::atoi(argv[1]); + std::string algorithm = argv[2]; + + // Validate algorithm choice + if (algorithm != "naive" && algorithm != "blocked" && algorithm != "parallel") { + std::cerr << "Invalid algorithm: " << algorithm << std::endl; + std::cerr << "Valid algorithms: naive, blocked, parallel" << std::endl; + return 1; + } + + if (case_number < 0 || case_number > 9) { + std::cerr << "Case number must be between 0 and 9" << std::endl; + return 1; + } - // Construct file paths - std::string folder = "data/" + std::to_string(case_number) + "/"; - std::string input0_file = folder + "input0.raw"; - std::string input1_file = folder + "input1.raw"; - std::string result_file = folder + "result.raw"; - std::string reference_file = folder + "output.raw"; + // Construct file paths + std::string folder = "data/" + std::to_string(case_number) + "/"; + std::string input0_file = folder + "input0.raw"; + std::string input1_file = folder + "input1.raw"; + std::string result_file = "result.raw"; // Write to current directory + std::string reference_file = folder + "output.raw"; - // TODO Read input0.raw (matrix A) + // Read input0.raw (matrix A) + std::ifstream input0(input0_file, std::ios::binary); + if (!input0) { + std::cerr << "Error opening input0.raw" << std::endl; + return 1; + } + + // Read dimensions from input0.raw + uint32_t m, n; + input0.read(reinterpret_cast(&m), sizeof(uint32_t)); + input0.read(reinterpret_cast(&n), sizeof(uint32_t)); + + // Allocate memory for A + float *A = new float[m * n]; + input0.read(reinterpret_cast(A), m * n * sizeof(float)); + input0.close(); + + // Read input1.raw (matrix B) + std::ifstream input1(input1_file, std::ios::binary); + if (!input1) { + std::cerr << "Error opening input1.raw" << std::endl; + delete[] A; + return 1; + } + + // Read dimensions from input1.raw + uint32_t n_check, p; + input1.read(reinterpret_cast(&n_check), sizeof(uint32_t)); + input1.read(reinterpret_cast(&p), sizeof(uint32_t)); + + // Check if dimensions are compatible + if (n != n_check) { + std::cerr << "Incompatible matrix dimensions" << std::endl; + delete[] A; + return 1; + } + + // Allocate memory for B + float *B = new float[n * p]; + input1.read(reinterpret_cast(B), n * p * sizeof(float)); + input1.close(); + // Allocate memory for result matrix C + float *C = new float[m * p]; - // TODO Read input1.raw (matrix B) + std::cout << "Matrix dimensions: A(" << m << "x" << n << "), B(" << n << "x" << p << ")" << std::endl; + + if (algorithm == "naive") { + // Time naive matrix multiplication + double t0 = omp_get_wtime(); + naive_matmul(C, A, B, m, n, p); + double t1 = omp_get_wtime(); + std::cout << "Naive matrix multiplication time: " << (t1 - t0) * 1000 << " ms" << std::endl; + } + else if (algorithm == "blocked") { + // Time blocked matrix multiplication + double t0 = omp_get_wtime(); + blocked_matmul(C, A, B, m, n, p, 32); // Block size of 32 + double t1 = omp_get_wtime(); + std::cout << "Blocked matrix multiplication time: " << (t1 - t0) * 1000 << " ms" << std::endl; + } + else if (algorithm == "parallel") { + // Time parallel matrix multiplication + double t0 = omp_get_wtime(); + parallel_matmul(C, A, B, m, n, p); + double t1 = omp_get_wtime(); + std::cout << "Parallel matrix multiplication time: " << (t1 - t0) * 1000 << " ms" << std::endl; + } + // Write result to file + std::ofstream output(result_file, std::ios::binary); + if (!output) { + std::cerr << "Error opening result file" << std::endl; + delete[] A; + delete[] B; + delete[] C; + return 1; + } + + // Write dimensions to result file + output.write(reinterpret_cast(&m), sizeof(uint32_t)); + output.write(reinterpret_cast(&p), sizeof(uint32_t)); + + // Write result matrix to file + output.write(reinterpret_cast(C), m * p * sizeof(float)); + output.close(); + + // Validate result + if (validate_result(result_file, reference_file)) { + std::cout << "Result validation successful!" << std::endl; + } else { + std::cerr << "Result validation failed!" << std::endl; + delete[] A; + delete[] B; + delete[] C; + return 1; + } - // Allocate memory for result matrices - float *C_naive = new float[m * p]; - float *C_blocked = new float[m * p]; - float *C_parallel = new float[m * p]; - - // Measure performance of naive_matmul - double start_time = omp_get_wtime(); - naive_matmul(C_naive, A, B, m, n, p); - double naive_time = omp_get_wtime() - start_time; - - // TODO Write naive result to file - - - // Validate naive result - bool naive_correct = validate_result(result_file, reference_file); - if (!naive_correct) { - std::cerr << "Naive result validation failed for case " << case_number << std::endl; - } - - // Measure performance of blocked_matmul (use block_size = 32 as default) - start_time = omp_get_wtime(); - blocked_matmul(C_blocked, A, B, m, n, p, 32); - double blocked_time = omp_get_wtime() - start_time; - - // TODO Write blocked result to file - - - // Validate blocked result - bool blocked_correct = validate_result(result_file, reference_file); - if (!blocked_correct) { - std::cerr << "Blocked result validation failed for case " << case_number << std::endl; - } - - // Measure performance of parallel_matmul - start_time = omp_get_wtime(); - parallel_matmul(C_parallel, A, B, m, n, p); - double parallel_time = omp_get_wtime() - start_time; - - // TODO Write parallel result to file - - - // Validate parallel result - bool parallel_correct = validate_result(result_file, reference_file); - if (!parallel_correct) { - std::cerr << "Parallel result validation failed for case " << case_number << std::endl; - } - - // Print performance results - std::cout << "Case " << case_number << " (" << m << "x" << n << "x" << p << "):\n"; - std::cout << "Naive time: " << naive_time << " seconds\n"; - std::cout << "Blocked time: " << blocked_time << " seconds\n"; - std::cout << "Parallel time: " << parallel_time << " seconds\n"; - std::cout << "Blocked speedup: " << (naive_time / blocked_time) << "x\n"; - std::cout << "Parallel speedup: " << (naive_time / parallel_time) << "x\n"; - - // Clean up - delete[] A; - delete[] B; - delete[] C_naive; - delete[] C_blocked; - delete[] C_parallel; - - return 0; + // Free memory + delete[] A; + delete[] B; + delete[] C; + + return 0; } \ No newline at end of file