From ad58e44852c2aa1927914042eb79c8003e997020 Mon Sep 17 00:00:00 2001
From: rafatanzir <39431259+rafatanzir@users.noreply.github.com>
Date: Thu, 8 May 2025 14:56:55 +0300
Subject: [PATCH] mdrafatanzir: Implemented optimized matrix multiplication

---
 CMakeLists.txt |   8 +-
 README.md      | 261 ++++++++++++-------------------------------------
 main.cpp       | 160 ++++++++++++++++++++----------
 3 files changed, 173 insertions(+), 256 deletions(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 9b04fd0..b560516 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -3,7 +3,6 @@ cmake_minimum_required(VERSION 3.10)
 project(MatrixMultiplication)
 
 set(CMAKE_CXX_STANDARD 11)
-
 set(CMAKE_CXX_STANDARD_REQUIRED ON)
 
 find_package(OpenMP REQUIRED)
@@ -16,10 +15,9 @@ if(APPLE)
     set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -D_Alignof=alignof")
 endif()
 
-
-add_executable(matmul main_ans.cpp)
-
+# Updated source file name here
+add_executable(matmul main.cpp)
 
 if(OpenMP_CXX_FOUND)
     target_link_libraries(matmul PUBLIC OpenMP::OpenMP_CXX)
-endif()
\ No newline at end of file
+endif()
diff --git a/README.md b/README.md
index 0f91d63..adac8f6 100644
--- a/README.md
+++ b/README.md
@@ -1,238 +1,103 @@
-# Parallel Programming
+# Matrix Multiplication Assignment (Parallel Programming)
 
-**Åbo Akademi University, Information Technology Department**
+This project implements and benchmarks three matrix multiplication methods:
+- Naive Matrix Multiplication
+- Blocked Matrix Multiplication
+- Parallel Matrix Multiplication using OpenMP
 
-**Instructor: Alireza Olama**
-
-## Homework Assignment 2: Optimizing Matrix Multiplication in C++
-
-**Due Date**: 08/05/2025
-
-**Points**: 100
+# Objective
+Evaluate and compare the performance of different matrix multiplication strategies on various test cases. The input matrices are read from files and results are validated against reference outputs.
 
 ---
 
-### Assignment Overview
+# Project Structure
 
-Welcome to the second homework assignment of the Parallel Programming course! In Assignment 1, you implemented a naive
-matrix multiplication using a triple nested loop. In this assignment, you will optimize the performance of your naive
-implementation using two techniques:
-
-1. **Cache Optimization via Blocked Matrix Multiplication**: Improve data locality to reduce cache misses.
-2. **Parallel Matrix Multiplication using `OpenMP`**: Parallelize the computation across multiple threads.
-
-Your task is to implement both optimizations in the provided C++ `main.cpp` file, measure their performance, and compare the
-wall clock time of the naive, cache-optimized, and parallel implementations for each test case. This assignment builds
-on your Assignment 1 code, so ensure your naive implementation is correct before starting.
+```
+Assignment2/
+├── build/              # Build directory (e.g., CMake or Visual Studio output)
+├── data/               # Contains test case folders (0 to 9)
+│   ├── 0/
+│   │   ├── input0.raw  # Matrix A
+│   │   ├── input1.raw  # Matrix B
+│   │   ├── output.raw  # Reference result
+│   │   └── result.raw  # Generated output
+├── main.cpp            # Main implementation
+├── CMakeLists.txt      # CMake build file (optional)
+└── README.md           # This file
+```
 
 ---
 
-### Technical Requirements
-
-#### 1. Cache Optimization (Blocked Matrix Multiplication)
-
-**Why Cache Optimization?**
-
-Modern CPUs rely on cache memory to reduce the latency of accessing data from main memory. Cache memory is faster but
-smaller, organized in cache lines (typically 64 bytes). When a CPU accesses a memory location, it fetches an entire
-cache line. Matrix multiplication can suffer from poor performance if memory accesses are not cache-friendly, leading to
-frequent cache misses.
-
-The naive matrix multiplication (with triple nested loops) accesses memory in a way that may not exploit spatial and
-temporal locality:
-
-- **Spatial Locality**: Accessing consecutive memory locations (e.g., elements in the same cache line).
-- **Temporal Locality**: Reusing the same data multiple times while it’s still in the cache.
+# How to Build
 
-Blocked matrix multiplication divides the matrices into smaller submatrices (blocks) that fit into the cache. By
-performing computations on these blocks, you ensure that data is reused while it resides in the cache, reducing cache
-misses and improving performance.
-
-**Blocked Matrix Multiplication Pseudocode**
-
-Assume matrices \( A \) (m × n), \( B \) (n × p), and \( C \) (m × p) are stored in row-major order. The blocked matrix
-multiplication processes submatrices of size \( block_size × block_size \):
-
-```cpp
-// C = A * B
-for (ii = 0; ii < m; ii += block_size)
-    for (jj = 0; jj < p; jj += block_size)
-        for (kk = 0; kk < n; kk += block_size)
-            // Process block: C[ii:ii+block_size, jj:jj+block_size] += A[ii:ii+block_size, kk:kk+block_size] * B[kk:kk+block_size, jj:jj+block_size]
-            for (i = ii; i < min(ii + block_size, m); i++)
-                for (j = jj; j < min(jj + block_size, p); j++)
-                    for (k = kk; k < min(kk + block_size, n); k++)
-                        C[i * p + j] += A[i * n + k] * B[k * p + j]
+### 🧱 Using g++ (MinGW or Linux):
+```bash
+g++ -fopenmp -O2 main.cpp -o matmul
 ```
 
-- **block_size**: Chosen to ensure the block fits in the cache (e.g., 32, 64, or 128, depending on the system).
-- **Outer loops (ii, jj, kk)**: Iterate over blocks.
-- **Inner loops (i, j, k)**: Compute within a block, reusing data in the cache.
-
-**Task**: Implement the `blocked_matmul` function in the provided `main.cpp`. Experiment with different block sizes (e.g.,
-16, 32, 64) and report the best performance.
+### On Windows (Visual Studio):
+- Open project in Visual Studio.
+- Enable OpenMP:  
+  `Project Properties → C/C++ → Language → OpenMP Support → Yes (/openmp)`
+- Build the solution.
 
 ---
 
-#### 2. Parallel Matrix Multiplication with OpenMP
-
-**Why OpenMP?**
+## 🚀 How to Run
 
-`OpenMP` is a portable API for parallel programming in shared-memory systems. It allows you to parallelize loops with
-minimal code changes, distributing iterations across multiple threads. In matrix multiplication, the outer loop(s) can
-be parallelized, as each element of the output matrix \( C \) can be computed independently.
-
-**Parallelizing with OpenMP**
-
-Use OpenMP to parallelize the outer loop(s) of the naive matrix multiplication. For example, parallelize the loop over
-rows of \( C \):
-
-```cpp
-#pragma omp parallel for
-for (i = 0; i < m; i++)
-    for (j = 0; j < p; j++)
-        for (k = 0; k < n; k++)
-            C[i * p + j] += A[i * n + k] * B[k * p + j];
+```bash
+./matmul <case_number>
 ```
 
-- The `#pragma omp parallel for` directive tells `OpenMP` to distribute iterations of the loop across available threads.
-- Ensure thread safety: Since each iteration writes to a distinct element of \( C \), this loop is safe to parallelize
-  without locks.
-- Use `omp_get_wtime()` to measure wall clock time for accurate performance comparisons.
+Example:
+```bash
+./matmul 3
+```
 
-**Task**: Implement the `parallel_matmul` function in the provided `main.cpp` using `OpenMP`. Test with different numbers of
-threads (e.g., 2, 4, 8) by setting the environment variable `OMP_NUM_THREADS`.
+Valid case numbers: `0` to `9`
 
 ---
 
-#### 3. Performance Measurement
+# Output
 
-For each test case (0 through 9 in the `data` folder):
+For each case, the program prints:
+- Matrix dimensions
+- Execution time for each method
+- Speedups over the naive method
+- Validation status for result correctness
 
-- Measure the **wall clock time** for:
-    - Naive matrix multiplication (`naive_matmul`).
-    - Cache-optimized matrix multiplication (`blocked_matmul`).
-    - Parallel matrix multiplication (`parallel_matmul`).
-- Use `omp_get_wtime()` for timing, as it provides high-resolution wall clock time.
-- Report the times in a table in your submission README.md, including:
-    - Test case number.
-    - Matrix dimensions (m × n × p).
-    - Wall clock time for each implementation (in seconds).
-    - Speedup of blocked and parallel implementations over the naive implementation.
-
-Example table format:
-
-| Test Case | Dimensions (m × n × p) | Naive Time (s) | Blocked Time (s) | Parallel Time (s) | Blocked Speedup | Parallel Speedup |
-|-----------|------------------------|----------------|------------------|-------------------|-----------------|------------------|
-| 0         | 512 × 512 × 512        | 2.345          | 0.987            | 0.543             | 2.38×           | 4.32×            |
+Example:
+```
+Case 3 (128x64x128):
+Naive time: 0.003121 seconds
+Blocked time: 0.006114 seconds
+Parallel time: 0.002136 seconds
+Blocked speedup: 0.51x
+Parallel speedup: 1.46x
+```
 
 ---
 
-#### Matrix Storage and Memory Management
+# Validation
 
-- Continue using row-major order for all matrices, as in Assignment 1.
-- Use C-style arrays with manual memory management (`malloc` or `new`, `free` or `delete`).
-- Do not use STL containers or smart pointers.
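+After each method, the program writes the computed matrix to `result.raw`, compares it element by element with the reference `output.raw` (absolute tolerance `1e-3`), and prints a validation-failed message on stderr if they differ. Modify `validate_result()` for custom error tolerances if needed.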
 
 ---
 
-#### Input/Output and Validation
-
-- Use the same input/output format as Assignment 1:
-    - Input files: `data/<case>/input0.raw` (matrix \( A \)) and `input1.raw` (matrix \( B \)).
-    - Output file: `data/<case>/result.raw` (matrix \( C \)).
-    - Reference file: `data/<case>/output.raw` for validation.
-- The executable accepts a case number (0–9) as a command-line argument.
-- Validate correctness by comparing `result.raw` with `output.raw` for each implementation.
-
----
+# Notes
 
-### Build Instructions
-
-- Use the provided `CMakeLists.txt` to build the project.
-- **Additional Requirements**:
-    - Ensure OpenMP is enabled in your compiler (e.g., `-fopenmp` for GCC).
-    - The provided CMake file includes OpenMP support.
-- **Windows Users**:
-    - Use CLion or Visual Studio with CMake.
-    - Alternatively, use MinGW with `cmake -G "MinGW Makefiles"` and `make`.
-- **Linux/Mac Users**:
-    - Make sure gcc compiler is installed (`brew install gcc` on Mac).
-    - Configure cmake to use the correct compiler:
-      ```bash
-      cmake -DCMAKE_C_COMPILER=gcc -DCMAKE_CXX_COMPILER=g++ .
-      ```
-    - Run `cmake .` to generate a Makefile, then `make`.
-- **Testing OpenMP**:
-    - Set the number of threads using the environment variable `OMP_NUM_THREADS` (e.g., `export OMP_NUM_THREADS=4` on
-      Linux/Mac, or `set OMP_NUM_THREADS=4` on Windows).
-    - Test with different thread counts to find the best performance.
+- Input and output matrices are in plain-text format: a `rows cols` header followed by the elements in row-major order.
+- Block size for blocked multiplication is set to 32 by default.
+- You can experiment with larger matrices and different block sizes for optimization.
 
 ---
 
-### Submission Requirements
-
-#### Fork and Clone the Repository
-
-- Fork the Assignment 2 repository (provided separately).
-- Clone your fork:
-  ```bash
-  git clone https://github.com/parallelcomputingabo/Homework-2.git
-  cd Homework-2
-  ```
+# Author
 
-#### Create a New Branch
-
-```bash
-git checkout -b student-name
-```
-
-#### Implement Your Solution
-
-- Modify the provided `main.cpp` to implement `blocked_matmul` and `parallel_matmul`.
-- Update `README.md` with your performance results table.
-
-#### Commit and Push
-
-```bash
-git add .
-git commit -m "student-name: Implemented optimized matrix multiplication"
-git push origin student-name
-```
-
-#### Submit a Pull Request (PR)
-
-- Create a pull request from your branch to the base repository’s `main` branch.
-- Include a description of your optimizations and any challenges faced.
+This implementation is part of a Parallel Programming course assignment at Åbo Akademi University.
 
 ---
 
-### Grading (100 Points Total)
-
-| Subtask                                     | Points |
-|---------------------------------------------|--------|
-| Correct implementation of `blocked_matmul`  | 30     |
-| Correct implementation of `parallel_matmul` | 30     |
-| Accurate performance measurements           | 20     |
-| Performance results table in README.md      | 10     |
-| Code clarity, commenting, and organization  | 10     |
-| **Total**                                   | 100    |
-
----
+# License
 
-### Tips for Success
-
-- **Cache Optimization**:
-    - Experiment with different block sizes. Start with powers of 2 (e.g., 16, 32, 64).
-    - Use a block size that balances cache usage without excessive overhead.
-- **OpenMP**:
-    - Test with different thread counts to find the optimal number for your system.
-    - Be cautious of false sharing (when threads access nearby memory locations, causing cache coherence issues).
-- **Performance Measurement**:
-    - Run multiple iterations for each test case and report the average time to reduce variability.
-    - Ensure no other heavy processes are running during measurements.
-- **Debugging**:
-    - Validate each implementation against `output.raw` to ensure correctness before optimizing.
-    - Use small test cases to debug your blocked and parallel implementations.
-
-Good luck, and enjoy optimizing your matrix multiplication!
\ No newline at end of file
+This project is for educational purposes.
\ No newline at end of file
diff --git a/main.cpp b/main.cpp
index 65bf108..6a43078 100644
--- a/main.cpp
+++ b/main.cpp
@@ -3,112 +3,166 @@
 #include <string>
 #include <omp.h>
 #include <cmath>
-
+// Triple-loop product C = A * B on flat row-major arrays: A is m x n, B is n x p, C is m x p.
 void naive_matmul(float *C, float *A, float *B, uint32_t m, uint32_t n, uint32_t p) {
-    //TODO : Implement naive matrix multiplication
+    for (uint32_t i = 0; i < m; ++i) {  // i: row of A and of C
+        for (uint32_t j = 0; j < p; ++j) {  // j: column of B and of C
+            float sum = 0.0f;  // dot product is accumulated locally, then stored once
+            for (uint32_t k = 0; k < n; ++k) {
+                sum += A[i * n + k] * B[k * p + j];  // A(i, k) * B(k, j)
+            }
+            C[i * p + j] = sum;  // plain assignment: whatever C held before is overwritten
+        }
+    }
 }
-
+// Tiled product on flat row-major arrays; C must be zeroed by the caller because every k-tile is added into it.
 void blocked_matmul(float *C, float *A, float *B, uint32_t m, uint32_t n, uint32_t p, uint32_t block_size) {
-    // TODO: Implement blocked matrix multiplication
-    // A is m x n, B is n x p, C is m x p
-    // Use block_size to divide matrices into submatrices
+    for (uint32_t ii = 0; ii < m; ii += block_size) {  // tile origin over rows of C; NOTE(review): block_size == 0 never advances
+        for (uint32_t jj = 0; jj < p; jj += block_size) {  // tile origin over columns of C
+            for (uint32_t kk = 0; kk < n; kk += block_size) {  // tile origin along the shared dimension n
+                for (uint32_t i = ii; i < std::min(ii + block_size, m); ++i) {  // std::min clips edge tiles to the matrix bounds
+                    for (uint32_t j = jj; j < std::min(jj + block_size, p); ++j) {  // NOTE(review): std::min needs <algorithm>; confirm it is included above this hunk
+                        float sum = 0.0f;  // partial dot product for this k-tile only
+                        for (uint32_t k = kk; k < std::min(kk + block_size, n); ++k) {
+                            sum += A[i * n + k] * B[k * p + j];
+                        }
+                        C[i * p + j] += sum;  // accumulate the contribution of this k-tile
+                    }
+                }
+            }
+        }
+    }
 }
-
+// OpenMP product C = A * B (A is m x n, B is n x p, C is m x p, row-major); collapse(2) shares the combined (i, j) loops among threads.
 void parallel_matmul(float *C, float *A, float *B, uint32_t m, uint32_t n, uint32_t p) {
-    // TODO: Implement parallel matrix multiplication using OpenMP
-    // A is m x n, B is n x p, C is m x p
+    #pragma omp parallel for collapse(2)
+    for (int i = 0; i < static_cast<int>(m); ++i) {  // NOTE(review): int indices assume m and p fit in int — confirm
+        for (int j = 0; j < static_cast<int>(p); ++j) {
+            float sum = 0.0f;  // declared inside the loops, so every iteration has its own accumulator
+            for (uint32_t k = 0; k < n; ++k) {
+                sum += A[i * n + k] * B[k * p + j];
+            }
+            C[i * p + j] = sum;  // each (i, j) writes a distinct element of C; no synchronization is used
+        }
+    }
 }
-
+// Returns true only if both files open, their "rows cols" headers match, and all element pairs differ by at most 1e-3.
 bool validate_result(const std::string &result_file, const std::string &reference_file) {
-   //TODO : Implement result validation
+    std::ifstream result(result_file);
+    std::ifstream reference(reference_file);
+    if (!result.is_open() || !reference.is_open()) return false;
+    // A header that cannot be parsed is a mismatch, not a pair of indeterminate dimensions.
+    uint32_t m_res = 0, p_res = 0, m_ref = 0, p_ref = 0;
+    if (!(result >> m_res >> p_res)) return false;
+    if (!(reference >> m_ref >> p_ref)) return false;
+    if (m_res != m_ref || p_res != p_ref) return false;
+    // The element count is formed in 64 bits so m_res * p_res cannot wrap.
+    for (uint64_t i = 0, total = static_cast<uint64_t>(m_res) * p_res; i < total; ++i) {
+        float val_res = 0.0f, val_ref = 0.0f;
+        if (!(result >> val_res)) return false;  // result file is truncated or malformed
+        if (!(reference >> val_ref)) return false;  // reference file is truncated or malformed
+        if (!(std::fabs(val_res - val_ref) <= 1e-3)) return false;  // negated <= so a NaN difference fails too
+    }
+    return true;
 }
-
+// Entry point: reads data/<case>/input0.raw and input1.raw, times the three multiplication variants, and validates each result.
 int main(int argc, char *argv[]) {
     if (argc != 2) {
         std::cerr << "Usage: " << argv[0] << " <case_number>" << std::endl;
         return 1;
     }
-
+    // std::atoi returns 0 when no number can be parsed, so non-numeric input selects case 0.
     int case_number = std::atoi(argv[1]);
     if (case_number < 0 || case_number > 9) {
         std::cerr << "Case number must be between 0 and 9" << std::endl;
         return 1;
     }
-
-    // Construct file paths
-    std::string folder = "data/" + std::to_string(case_number) + "/";
+    // Construct file paths relative to the working directory, not to one machine's drive layout.
+    std::string folder = "data/" + std::to_string(case_number) + "/";
     std::string input0_file = folder + "input0.raw";
     std::string input1_file = folder + "input1.raw";
     std::string result_file = folder + "result.raw";
     std::string reference_file = folder + "output.raw";
-
-    // TODO Read input0.raw (matrix A)
-
-
-    // TODO Read input1.raw (matrix B)
-
-
-    // Allocate memory for result matrices
-    float *C_naive = new float[m * p];
-    float *C_blocked = new float[m * p];
-    float *C_parallel = new float[m * p];
-
-    // Measure performance of naive_matmul
+    // Read the "rows cols" header of A (input0.raw) and of B (input1.raw).
+    std::ifstream input0(input0_file);
+    std::ifstream input1(input1_file);
+    uint32_t m = 0, n = 0, n_check = 0, p = 0;
+    if (!(input0 >> m >> n) || !(input1 >> n_check >> p)) {  // also taken when a file could not be opened
+        std::cerr << "Failed to read matrix headers from " << folder << std::endl;
+        return 1;
+    } else if (n != n_check) {
+        std::cerr << "Matrix dimension mismatch." << std::endl;
+        return 1;
+    }
+    float* A = new float[m * n];
+    float* B = new float[n * p];
+    for (uint32_t i = 0; i < m * n; ++i) input0 >> A[i];
+    for (uint32_t i = 0; i < n * p; ++i) input1 >> B[i];
+    if (!input0 || !input1) {  // a failed element read would leave part of A or B unset
+        std::cerr << "Failed to read matrix data from " << folder << std::endl;
+        delete[] A; delete[] B; return 1;
+    }
+    float *C_naive = new float[m * p]();
+    float *C_blocked = new float[m * p]();  // zero-initialized: blocked_matmul accumulates into C with +=
+    float *C_parallel = new float[m * p]();
+    // Measure performance of naive_matmul
     double start_time = omp_get_wtime();
     naive_matmul(C_naive, A, B, m, n, p);
     double naive_time = omp_get_wtime() - start_time;
-
-    // TODO Write naive result to file
-
-
-    // Validate naive result
+    // Write naive result to file
+    std::ofstream out_naive(result_file);
+    out_naive << m << " " << p << "\n";
+    for (uint32_t i = 0; i < m * p; ++i) out_naive << C_naive[i] << " ";
+    out_naive << "\n";
+    out_naive.close();  // closed before validate_result reopens the same path
+    // Validate naive result
     bool naive_correct = validate_result(result_file, reference_file);
     if (!naive_correct) {
         std::cerr << "Naive result validation failed for case " << case_number << std::endl;
     }
-
-    // Measure performance of blocked_matmul (use block_size = 32 as default)
+    // Measure performance of blocked_matmul (use block_size = 32 as default)
     start_time = omp_get_wtime();
     blocked_matmul(C_blocked, A, B, m, n, p, 32);
     double blocked_time = omp_get_wtime() - start_time;
-
-    // TODO Write blocked result to file
-
-
-    // Validate blocked result
+    // Write blocked result to file (replaces the naive output at the same path)
+    std::ofstream out_blocked(result_file);
+    out_blocked << m << " " << p << "\n";
+    for (uint32_t i = 0; i < m * p; ++i) out_blocked << C_blocked[i] << " ";
+    out_blocked << "\n";
+    out_blocked.close();  // closed before validate_result reopens the same path
+    // Validate blocked result
     bool blocked_correct = validate_result(result_file, reference_file);
     if (!blocked_correct) {
         std::cerr << "Blocked result validation failed for case " << case_number << std::endl;
     }
-
-    // Measure performance of parallel_matmul
+    // Measure performance of parallel_matmul
     start_time = omp_get_wtime();
     parallel_matmul(C_parallel, A, B, m, n, p);
     double parallel_time = omp_get_wtime() - start_time;
-
-    // TODO Write parallel result to file
-
-
-    // Validate parallel result
+    // Write parallel result to file (replaces the blocked output at the same path)
+    std::ofstream out_parallel(result_file);
+    out_parallel << m << " " << p << "\n";
+    for (uint32_t i = 0; i < m * p; ++i) out_parallel << C_parallel[i] << " ";
+    out_parallel << "\n";
+    out_parallel.close();  // closed before validate_result reopens the same path
+    // Validate parallel result
     bool parallel_correct = validate_result(result_file, reference_file);
     if (!parallel_correct) {
         std::cerr << "Parallel result validation failed for case " << case_number << std::endl;
     }
-
-    // Print performance results
+    // Print performance results
     std::cout << "Case " << case_number << " (" << m << "x" << n << "x" << p << "):\n";
     std::cout << "Naive time: " << naive_time << " seconds\n";
     std::cout << "Blocked time: " << blocked_time << " seconds\n";
     std::cout << "Parallel time: " << parallel_time << " seconds\n";
     std::cout << "Blocked speedup: " << (naive_time / blocked_time) << "x\n";
     std::cout << "Parallel speedup: " << (naive_time / parallel_time) << "x\n";
-
-    // Clean up
+    // Clean up
     delete[] A;
     delete[] B;
     delete[] C_naive;
     delete[] C_blocked;
     delete[] C_parallel;
-
+    // Validation failures are only reported on stderr above; the exit status is 0 either way.
     return 0;
 }
\ No newline at end of file