From ad58e44852c2aa1927914042eb79c8003e997020 Mon Sep 17 00:00:00 2001 From: rafatanzir <39431259+rafatanzir@users.noreply.github.com> Date: Thu, 8 May 2025 14:56:55 +0300 Subject: [PATCH] mdrafatanzir: Implemented optimized matrix multiplication --- CMakeLists.txt | 8 +- README.md | 261 ++++++++++++------------------------------------- main.cpp | 160 ++++++++++++++++++++---------- 3 files changed, 173 insertions(+), 256 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 9b04fd0..b560516 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -3,7 +3,6 @@ cmake_minimum_required(VERSION 3.10) project(MatrixMultiplication) set(CMAKE_CXX_STANDARD 11) - set(CMAKE_CXX_STANDARD_REQUIRED ON) find_package(OpenMP REQUIRED) @@ -16,10 +15,9 @@ if(APPLE) set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -D_Alignof=alignof") endif() - -add_executable(matmul main_ans.cpp) - +# Updated source file name here +add_executable(matmul main.cpp) if(OpenMP_CXX_FOUND) target_link_libraries(matmul PUBLIC OpenMP::OpenMP_CXX) -endif() \ No newline at end of file +endif() diff --git a/README.md b/README.md index 0f91d63..adac8f6 100644 --- a/README.md +++ b/README.md @@ -1,238 +1,103 @@ -# Parallel Programming +# Matrix Multiplication Assignment (Parallel Programming) -**Åbo Akademi University, Information Technology Department** +This project implements and benchmarks three matrix multiplication methods: +- Naive Matrix Multiplication +- Blocked Matrix Multiplication +- Parallel Matrix Multiplication using OpenMP -**Instructor: Alireza Olama** - -## Homework Assignment 2: Optimizing Matrix Multiplication in C++ - -**Due Date**: 08/05/2025 - -**Points**: 100 +# Objective +Evaluate and compare the performance of different matrix multiplication strategies on various test cases. The input matrices are read from files and results are validated against reference outputs. --- -### Assignment Overview +# Project Structure -Welcome to the second homework assignment of the Parallel Programming course! 
In Assignment 1, you implemented a naive -matrix multiplication using a triple nested loop. In this assignment, you will optimize the performance of your naive -implementation using two techniques: - -1. **Cache Optimization via Blocked Matrix Multiplication**: Improve data locality to reduce cache misses. -2. **Parallel Matrix Multiplication using `OpenMP`**: Parallelize the computation across multiple threads. - -Your task is to implement both optimizations in the provided C++ `main.cpp` file, measure their performance, and compare the -wall clock time of the naive, cache-optimized, and parallel implementations for each test case. This assignment builds -on your Assignment 1 code, so ensure your naive implementation is correct before starting. +``` +Assignment2/ +├── build/ # Build directory (e.g., CMake or Visual Studio output) +├── data/ # Contains test case folders (0 to 9) +│ ├── 0/ +│ │ ├── input0.raw # Matrix A +│ │ ├── input1.raw # Matrix B +│ │ ├── output.raw # Reference result +│ │ └── result.raw # Generated output +├── main.cpp # Main implementation +├── CMakeLists.txt # CMake build file (optional) +└── README.md # This file +``` --- -### Technical Requirements - -#### 1. Cache Optimization (Blocked Matrix Multiplication) - -**Why Cache Optimization?** - -Modern CPUs rely on cache memory to reduce the latency of accessing data from main memory. Cache memory is faster but -smaller, organized in cache lines (typically 64 bytes). When a CPU accesses a memory location, it fetches an entire -cache line. Matrix multiplication can suffer from poor performance if memory accesses are not cache-friendly, leading to -frequent cache misses. - -The naive matrix multiplication (with triple nested loops) accesses memory in a way that may not exploit spatial and -temporal locality: - -- **Spatial Locality**: Accessing consecutive memory locations (e.g., elements in the same cache line). 
-- **Temporal Locality**: Reusing the same data multiple times while it’s still in the cache. +# How to Build -Blocked matrix multiplication divides the matrices into smaller submatrices (blocks) that fit into the cache. By -performing computations on these blocks, you ensure that data is reused while it resides in the cache, reducing cache -misses and improving performance. - -**Blocked Matrix Multiplication Pseudocode** - -Assume matrices \( A \) (m × n), \( B \) (n × p), and \( C \) (m × p) are stored in row-major order. The blocked matrix -multiplication processes submatrices of size \( block_size × block_size \): - -```cpp -// C = A * B -for (ii = 0; ii < m; ii += block_size) - for (jj = 0; jj < p; jj += block_size) - for (kk = 0; kk < n; kk += block_size) - // Process block: C[ii:ii+block_size, jj:jj+block_size] += A[ii:ii+block_size, kk:kk+block_size] * B[kk:kk+block_size, jj:jj+block_size] - for (i = ii; i < min(ii + block_size, m); i++) - for (j = jj; j < min(jj + block_size, p); j++) - for (k = kk; k < min(kk + block_size, n); k++) - C[i * p + j] += A[i * n + k] * B[k * p + j] +### 🧱 Using g++ (MinGW or Linux): +```bash +g++ -fopenmp -O2 main.cpp -o matmul ``` -- **block_size**: Chosen to ensure the block fits in the cache (e.g., 32, 64, or 128, depending on the system). -- **Outer loops (ii, jj, kk)**: Iterate over blocks. -- **Inner loops (i, j, k)**: Compute within a block, reusing data in the cache. - -**Task**: Implement the `blocked_matmul` function in the provided `main.cpp`. Experiment with different block sizes (e.g., -16, 32, 64) and report the best performance. +### On Windows (Visual Studio): +- Open project in Visual Studio. +- Enable OpenMP: + `Project Properties → C/C++ → Language → OpenMP Support → Yes (/openmp)` +- Build the solution. --- -#### 2. Parallel Matrix Multiplication with OpenMP - -**Why OpenMP?** +## 🚀 How to Run -`OpenMP` is a portable API for parallel programming in shared-memory systems. 
It allows you to parallelize loops with -minimal code changes, distributing iterations across multiple threads. In matrix multiplication, the outer loop(s) can -be parallelized, as each element of the output matrix \( C \) can be computed independently. - -**Parallelizing with OpenMP** - -Use OpenMP to parallelize the outer loop(s) of the naive matrix multiplication. For example, parallelize the loop over -rows of \( C \): - -```cpp -#pragma omp parallel for -for (i = 0; i < m; i++) - for (j = 0; j < p; j++) - for (k = 0; k < n; k++) - C[i * p + j] += A[i * n + k] * B[k * p + j]; +```bash +./matmul <case_number> ``` -- The `#pragma omp parallel for` directive tells `OpenMP` to distribute iterations of the loop across available threads. -- Ensure thread safety: Since each iteration writes to a distinct element of \( C \), this loop is safe to parallelize - without locks. -- Use `omp_get_wtime()` to measure wall clock time for accurate performance comparisons. +Example: +```bash +./matmul 3 +``` -**Task**: Implement the `parallel_matmul` function in the provided `main.cpp` using `OpenMP`. Test with different numbers of -threads (e.g., 2, 4, 8) by setting the environment variable `OMP_NUM_THREADS`. +Valid case numbers: `0` to `9` --- -#### 3. Performance Measurement +# Output -For each test case (0 through 9 in the `data` folder): +For each case, the program prints: +- Matrix dimensions +- Execution time for each method +- Speedups over the naive method +- Validation status for result correctness -- Measure the **wall clock time** for: - - Naive matrix multiplication (`naive_matmul`). - - Cache-optimized matrix multiplication (`blocked_matmul`). - - Parallel matrix multiplication (`parallel_matmul`). -- Use `omp_get_wtime()` for timing, as it provides high-resolution wall clock time. -- Report the times in a table in your submission README.md, including: - - Test case number. - - Matrix dimensions (m × n × p). - - Wall clock time for each implementation (in seconds). 
- - Speedup of blocked and parallel implementations over the naive implementation. - -Example table format: - -| Test Case | Dimensions (m × n × p) | Naive Time (s) | Blocked Time (s) | Parallel Time (s) | Blocked Speedup | Parallel Speedup | -|-----------|------------------------|----------------|------------------|-------------------|-----------------|------------------| -| 0 | 512 × 512 × 512 | 2.345 | 0.987 | 0.543 | 2.38× | 4.32× | +Example: +``` +Case 3 (128x64x128): +Naive time: 0.003121 seconds +Blocked time: 0.006114 seconds +Parallel time: 0.002136 seconds +Blocked speedup: 0.51x +Parallel speedup: 1.46x +``` --- -#### Matrix Storage and Memory Management +# Validation -- Continue using row-major order for all matrices, as in Assignment 1. -- Use C-style arrays with manual memory management (`malloc` or `new`, `free` or `delete`). -- Do not use STL containers or smart pointers. +The program compares computed `result.raw` with the reference `output.raw` and reports any mismatches. Modify `validate_result()` for custom error tolerances if needed. --- -#### Input/Output and Validation - -- Use the same input/output format as Assignment 1: - - Input files: `data//input0.raw` (matrix \( A \)) and `input1.raw` (matrix \( B \)). - - Output file: `data//result.raw` (matrix \( C \)). - - Reference file: `data//output.raw` for validation. -- The executable accepts a case number (0–9) as a command-line argument. -- Validate correctness by comparing `result.raw` with `output.raw` for each implementation. - ---- +# Notes -### Build Instructions - -- Use the provided `CMakeLists.txt` to build the project. -- **Additional Requirements**: - - Ensure OpenMP is enabled in your compiler (e.g., `-fopenmp` for GCC). - - The provided CMake file includes OpenMP support. -- **Windows Users**: - - Use CLion or Visual Studio with CMake. - - Alternatively, use MinGW with `cmake -G "MinGW Makefiles"` and `make`. 
-- **Linux/Mac Users**: - - Make sure gcc compiler is installed (`brew install gcc` on Mac). - - Configure cmake to use the correct compiler: - ```bash - cmake -DCMAKE_C_COMPILER=gcc -DCMAKE_CXX_COMPILER=g++ . - ``` - - Run `cmake .` to generate a Makefile, then `make`. -- **Testing OpenMP**: - - Set the number of threads using the environment variable `OMP_NUM_THREADS` (e.g., `export OMP_NUM_THREADS=4` on - Linux/Mac, or `set OMP_NUM_THREADS=4` on Windows). - - Test with different thread counts to find the best performance. +- Input and output matrices are in plain-text format. +- Block size for blocked multiplication is set to 32 by default. +- You can experiment with larger matrices and different block sizes for optimization. --- -### Submission Requirements - -#### Fork and Clone the Repository - -- Fork the Assignment 2 repository (provided separately). -- Clone your fork: - ```bash - git clone https://github.com/parallelcomputingabo/Homework-2.git - cd Homework-2 - ``` +# Author -#### Create a New Branch - -```bash -git checkout -b student-name -``` - -#### Implement Your Solution - -- Modify the provided `main.cpp` to implement `blocked_matmul` and `parallel_matmul`. -- Update `README.md` with your performance results table. - -#### Commit and Push - -```bash -git add . -git commit -m "student-name: Implemented optimized matrix multiplication" -git push origin student-name -``` - -#### Submit a Pull Request (PR) - -- Create a pull request from your branch to the base repository’s `main` branch. -- Include a description of your optimizations and any challenges faced. +This implementation is part of a Parallel Programming course assignment at Åbo Akademi University. 
--- -### Grading (100 Points Total) - -| Subtask | Points | -|---------------------------------------------|--------| -| Correct implementation of `blocked_matmul` | 30 | -| Correct implementation of `parallel_matmul` | 30 | -| Accurate performance measurements | 20 | -| Performance results table in README.md | 10 | -| Code clarity, commenting, and organization | 10 | -| **Total** | 100 | - ---- +# License -### Tips for Success - -- **Cache Optimization**: - - Experiment with different block sizes. Start with powers of 2 (e.g., 16, 32, 64). - - Use a block size that balances cache usage without excessive overhead. -- **OpenMP**: - - Test with different thread counts to find the optimal number for your system. - - Be cautious of false sharing (when threads access nearby memory locations, causing cache coherence issues). -- **Performance Measurement**: - - Run multiple iterations for each test case and report the average time to reduce variability. - - Ensure no other heavy processes are running during measurements. -- **Debugging**: - - Validate each implementation against `output.raw` to ensure correctness before optimizing. - - Use small test cases to debug your blocked and parallel implementations. - -Good luck, and enjoy optimizing your matrix multiplication! \ No newline at end of file +This project is for educational purposes. 
// main.cpp -- benchmarks naive, cache-blocked and OpenMP-parallel matrix
// multiplication on one test case and validates every result against the
// reference output of that case.
//
// All matrices are dense, row-major float arrays. Raw new[]/delete[] buffers
// are used on purpose: the assignment forbids STL containers and smart
// pointers for matrix storage.
#include <algorithm>
#include <cmath>
#include <cstddef>
#include <cstdint>
#include <cstdlib>
#include <fstream>
#include <iomanip>
#include <iostream>
#include <limits>
#include <string>

#include <omp.h>

/// Computes C = A * B with the textbook triple loop.
///
/// @param C  output, m x p, row-major; every element is overwritten
/// @param A  left operand, m x n, row-major
/// @param B  right operand, n x p, row-major
void naive_matmul(float *C, float *A, float *B, uint32_t m, uint32_t n, uint32_t p) {
    // Index in size_t so i * n + k cannot wrap at 2^32 for large matrices.
    const std::size_t rows = m;
    const std::size_t inner = n;
    const std::size_t cols = p;

    for (std::size_t i = 0; i < rows; ++i) {
        for (std::size_t j = 0; j < cols; ++j) {
            float sum = 0.0f;
            for (std::size_t k = 0; k < inner; ++k) {
                sum += A[i * inner + k] * B[k * cols + j];
            }
            C[i * cols + j] = sum;
        }
    }
}

/// Computes C = A * B tile by tile to improve cache reuse.
///
/// @param C           output, m x p, row-major; every element is overwritten
/// @param A           left operand, m x n, row-major
/// @param B           right operand, n x p, row-major
/// @param block_size  tile edge length; 0 is treated as 1
void blocked_matmul(float *C, float *A, float *B, uint32_t m, uint32_t n, uint32_t p, uint32_t block_size) {
    const std::size_t rows = m;
    const std::size_t inner = n;
    const std::size_t cols = p;
    // A zero tile size would never advance the tile loops (infinite loop).
    const std::size_t tile = (block_size == 0) ? 1 : block_size;

    // The tile loops accumulate into C, so start from a known-zero output
    // instead of relying on the caller to have cleared the buffer.
    std::fill(C, C + rows * cols, 0.0f);

    for (std::size_t ii = 0; ii < rows; ii += tile) {
        const std::size_t i_end = std::min(ii + tile, rows);
        for (std::size_t jj = 0; jj < cols; jj += tile) {
            const std::size_t j_end = std::min(jj + tile, cols);
            for (std::size_t kk = 0; kk < inner; kk += tile) {
                const std::size_t k_end = std::min(kk + tile, inner);
                // i-k-j order: the innermost loop walks one row of B and one
                // row of C contiguously. For each C[i][j] the products are
                // still added in ascending k, the same order as naive_matmul.
                for (std::size_t i = ii; i < i_end; ++i) {
                    for (std::size_t k = kk; k < k_end; ++k) {
                        const float a = A[i * inner + k];
                        for (std::size_t j = jj; j < j_end; ++j) {
                            C[i * cols + j] += a * B[k * cols + j];
                        }
                    }
                }
            }
        }
    }
}

/// Computes C = A * B, distributing the (i, j) output elements over OpenMP
/// threads. Each iteration writes one distinct element of C, so no locking
/// is needed.
///
/// @param C  output, m x p, row-major; every element is overwritten
/// @param A  left operand, m x n, row-major
/// @param B  right operand, n x p, row-major
void parallel_matmul(float *C, float *A, float *B, uint32_t m, uint32_t n, uint32_t p) {
    // Signed 64-bit loop bounds: signed for OpenMP implementations that
    // require it, and wide enough to hold every uint32_t dimension.
    const std::int64_t rows = m;
    const std::int64_t cols = p;
    const std::size_t inner = n;
    const std::size_t stride = p;

    #pragma omp parallel for collapse(2)
    for (std::int64_t i = 0; i < rows; ++i) {
        for (std::int64_t j = 0; j < cols; ++j) {
            const std::size_t row = static_cast<std::size_t>(i);
            const std::size_t col = static_cast<std::size_t>(j);
            float sum = 0.0f;
            for (std::size_t k = 0; k < inner; ++k) {
                sum += A[row * inner + k] * B[k * stride + col];
            }
            C[row * stride + col] = sum;
        }
    }
}

/// Compares two matrix text files ("rows cols" header followed by the values).
///
/// @return true only if both files open, the headers match, every value can
///         be read from both files, and each pair differs by at most 1e-3.
bool validate_result(const std::string &result_file, const std::string &reference_file) {
    std::ifstream result(result_file);
    std::ifstream reference(reference_file);
    if (!result.is_open() || !reference.is_open()) return false;

    uint32_t m_res = 0;
    uint32_t p_res = 0;
    uint32_t m_ref = 0;
    uint32_t p_ref = 0;
    if (!(result >> m_res >> p_res) || !(reference >> m_ref >> p_ref)) return false;
    if (m_res != m_ref || p_res != p_ref) return false;

    const double tolerance = 1e-3;
    const std::size_t count = static_cast<std::size_t>(m_res) * p_res;
    for (std::size_t i = 0; i < count; ++i) {
        float val_res = 0.0f;
        float val_ref = 0.0f;
        // A truncated or malformed file is a mismatch, not a silent pass.
        if (!(result >> val_res) || !(reference >> val_ref)) return false;
        // Written as !(diff <= tol) so that a NaN difference fails as well.
        if (!(std::fabs(val_res - val_ref) <= tolerance)) return false;
    }
    return true;
}

/// Reads a "rows cols" header and rows * cols values from a text file.
///
/// @return a new[]-allocated buffer the caller must delete[], or nullptr
///         (after printing a message to stderr) if the file cannot be read.
static float *read_matrix(const std::string &path, uint32_t &rows, uint32_t &cols) {
    std::ifstream in(path);
    if (!in.is_open()) {
        std::cerr << "Cannot open " << path << std::endl;
        return nullptr;
    }
    if (!(in >> rows >> cols)) {
        std::cerr << "Cannot read matrix dimensions from " << path << std::endl;
        return nullptr;
    }

    const std::size_t count = static_cast<std::size_t>(rows) * cols;
    float *data = new float[count];
    for (std::size_t i = 0; i < count; ++i) {
        if (!(in >> data[i])) {
            std::cerr << "Unexpected end of matrix data in " << path << std::endl;
            delete[] data;
            return nullptr;
        }
    }
    return data;
}

/// Writes a matrix as a "rows cols" header line followed by all values,
/// each followed by a space, on one line.
///
/// @return false if the file cannot be opened or a write fails.
static bool write_matrix(const std::string &path, const float *data, uint32_t rows, uint32_t cols) {
    std::ofstream out(path);
    if (!out.is_open()) return false;

    // The default 6 significant digits would round large values by more than
    // the validation tolerance; max_digits10 round-trips every float exactly.
    out << std::setprecision(std::numeric_limits<float>::max_digits10);
    out << rows << " " << cols << "\n";
    const std::size_t count = static_cast<std::size_t>(rows) * cols;
    for (std::size_t i = 0; i < count; ++i) out << data[i] << " ";
    out << "\n";
    return static_cast<bool>(out);
}

/// Writes one result matrix to result_file and validates it against
/// reference_file, reporting any failure on stderr under the given label.
static bool store_and_validate(const char *label, const float *C, uint32_t m, uint32_t p,
                               const std::string &result_file, const std::string &reference_file,
                               long case_number) {
    if (!write_matrix(result_file, C, m, p)) {
        std::cerr << "Cannot write " << result_file << std::endl;
        return false;
    }
    if (!validate_result(result_file, reference_file)) {
        std::cerr << label << " result validation failed for case " << case_number << std::endl;
        return false;
    }
    return true;
}

/// Entry point: `matmul <case_number>` with case_number in 0..9. Reads
/// data/<case_number>/input0.raw and input1.raw relative to the working
/// directory, times the three implementations and prints the speedups.
int main(int argc, char *argv[]) {
    if (argc != 2) {
        std::cerr << "Usage: " << argv[0] << " <case_number>" << std::endl;
        return 1;
    }

    // strtol with an end-pointer check rejects input such as "abc" or "3x",
    // which atoi would silently accept as case 0 or 3.
    char *parse_end = nullptr;
    const long case_number = std::strtol(argv[1], &parse_end, 10);
    if (parse_end == argv[1] || *parse_end != '\0' || case_number < 0 || case_number > 9) {
        std::cerr << "Case number must be between 0 and 9" << std::endl;
        return 1;
    }

    // Relative path, so the program works from any checkout of the project.
    const std::string folder = "data/" + std::to_string(case_number) + "/";
    const std::string input0_file = folder + "input0.raw";
    const std::string input1_file = folder + "input1.raw";
    const std::string result_file = folder + "result.raw";
    const std::string reference_file = folder + "output.raw";

    uint32_t m = 0;
    uint32_t n = 0;
    uint32_t n_check = 0;
    uint32_t p = 0;
    float *A = read_matrix(input0_file, m, n);
    if (A == nullptr) return 1;
    float *B = read_matrix(input1_file, n_check, p);
    if (B == nullptr) {
        delete[] A;
        return 1;
    }
    if (n != n_check) {
        std::cerr << "Matrix dimension mismatch." << std::endl;
        delete[] A;
        delete[] B;
        return 1;
    }

    const std::size_t out_count = static_cast<std::size_t>(m) * p;
    float *C_naive = new float[out_count]();
    float *C_blocked = new float[out_count]();
    float *C_parallel = new float[out_count]();

    const uint32_t default_block_size = 32;

    double start_time = omp_get_wtime();
    naive_matmul(C_naive, A, B, m, n, p);
    const double naive_time = omp_get_wtime() - start_time;
    store_and_validate("Naive", C_naive, m, p, result_file, reference_file, case_number);

    start_time = omp_get_wtime();
    blocked_matmul(C_blocked, A, B, m, n, p, default_block_size);
    const double blocked_time = omp_get_wtime() - start_time;
    store_and_validate("Blocked", C_blocked, m, p, result_file, reference_file, case_number);

    start_time = omp_get_wtime();
    parallel_matmul(C_parallel, A, B, m, n, p);
    const double parallel_time = omp_get_wtime() - start_time;
    store_and_validate("Parallel", C_parallel, m, p, result_file, reference_file, case_number);

    std::cout << "Case " << case_number << " (" << m << "x" << n << "x" << p << "):\n";
    std::cout << "Naive time: " << naive_time << " seconds\n";
    std::cout << "Blocked time: " << blocked_time << " seconds\n";
    std::cout << "Parallel time: " << parallel_time << " seconds\n";
    std::cout << "Blocked speedup: " << (naive_time / blocked_time) << "x\n";
    std::cout << "Parallel speedup: " << (naive_time / parallel_time) << "x\n";

    delete[] A;
    delete[] B;
    delete[] C_naive;
    delete[] C_blocked;
    delete[] C_parallel;

    return 0;
}