From c354ced7e6966d8272d003b5ff5c65d909476cda Mon Sep 17 00:00:00 2001
From: Rasmus Lagerqvist <rasmus.o.lagerqvist@gmail.com>
Date: Fri, 2 May 2025 20:34:29 +0300
Subject: [PATCH] rasmus-lagerqvist: Implemented optimized matrix
 multiplication

---
 CMakeLists.txt |  23 ++--
 main.cpp       | 319 +++++++++++++++++++++++++++++++++++--------------
 2 files changed, 241 insertions(+), 101 deletions(-)
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 9b04fd0..ee606e9 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -1,25 +1,20 @@
 cmake_minimum_required(VERSION 3.10)
-
 project(MatrixMultiplication)
 
 set(CMAKE_CXX_STANDARD 11)
-
 set(CMAKE_CXX_STANDARD_REQUIRED ON)
 
-find_package(OpenMP REQUIRED)
-
-if(OpenMP_CXX_FOUND)
-    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${OpenMP_CXX_FLAGS}")
-endif()
-
+# Custom OpenMP setup for macOS
 if(APPLE)
-    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -D_Alignof=alignof")
+	set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Xpreprocessor -fopenmp -D_Alignof=alignof")
+	include_directories("/opt/homebrew/include")
+	link_directories("/opt/homebrew/lib")
+else()
+	set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fopenmp")
 endif()
 
+add_executable(matmul main.cpp)
 
-add_executable(matmul main_ans.cpp)
-
-
-if(OpenMP_CXX_FOUND)
-    target_link_libraries(matmul PUBLIC OpenMP::OpenMP_CXX)
+if(APPLE)
+	target_link_libraries(matmul "-L/opt/homebrew/lib" "-lomp")
 endif()
\ No newline at end of file
diff --git a/main.cpp b/main.cpp
index 65bf108..3c2a38d 100644
--- a/main.cpp
+++ b/main.cpp
@@ -3,112 +3,257 @@
 #include <string>
 #include <omp.h>
 #include <cmath>
+#include <algorithm>
 
 void naive_matmul(float *C, float *A, float *B, uint32_t m, uint32_t n, uint32_t p) {
-    //TODO : Implement naive matrix multiplication
+	// Initialize C to zero
+	for (uint32_t i = 0; i < m; i++) {
+		for (uint32_t j = 0; j < p; j++) {
+			C[i * p + j] = 0.0f;
+		}
+	}
+	
+	// Perform naive matrix multiplication using triple nested loop
+	for (uint32_t i = 0; i < m; i++) {
+		for (uint32_t j = 0; j < p; j++) {
+			for (uint32_t k = 0; k < n; k++) {
+				C[i * p + j] += A[i * n + k] * B[k * p + j];
+			}
+		}
+	}
 }
 
 void blocked_matmul(float *C, float *A, float *B, uint32_t m, uint32_t n, uint32_t p, uint32_t block_size) {
-    // TODO: Implement blocked matrix multiplication
-    // A is m x n, B is n x p, C is m x p
-    // Use block_size to divide matrices into submatrices
+	// Initialize C to zero
+	for (uint32_t i = 0; i < m; i++) {
+		for (uint32_t j = 0; j < p; j++) {
+			C[i * p + j] = 0.0f;
+		}
+	}
+	
+	// Perform blocked matrix multiplication
+	for (uint32_t ii = 0; ii < m; ii += block_size) {
+		for (uint32_t jj = 0; jj < p; jj += block_size) {
+			for (uint32_t kk = 0; kk < n; kk += block_size) {
+				// Process block
+				// C[ii:ii+block_size, jj:jj+block_size] += A[ii:ii+block_size, kk:kk+block_size] * B[kk:kk+block_size, jj:jj+block_size]
+				for (uint32_t i = ii; i < std::min(ii + block_size, m); i++) {
+					for (uint32_t j = jj; j < std::min(jj + block_size, p); j++) {
+						for (uint32_t k = kk; k < std::min(kk + block_size, n); k++) {
+							C[i * p + j] += A[i * n + k] * B[k * p + j];
+						}
+					}
+				}
+			}
+		}
+	}
 }
 
 void parallel_matmul(float *C, float *A, float *B, uint32_t m, uint32_t n, uint32_t p) {
-    // TODO: Implement parallel matrix multiplication using OpenMP
-    // A is m x n, B is n x p, C is m x p
+	// Initialize C to zero
+	#pragma omp parallel for
+	for (uint32_t i = 0; i < m; i++) {
+		for (uint32_t j = 0; j < p; j++) {
+			C[i * p + j] = 0.0f;
+		}
+	}
+	
+	// Parallelize the outer loop with OpenMP
+	#pragma omp parallel for
+	for (uint32_t i = 0; i < m; i++) {
+		for (uint32_t j = 0; j < p; j++) {
+			float sum = 0.0f;
+			for (uint32_t k = 0; k < n; k++) {
+				sum += A[i * n + k] * B[k * p + j];
+			}
+			C[i * p + j] = sum;
+		}
+	}
 }
 
 bool validate_result(const std::string &result_file, const std::string &reference_file) {
-   //TODO : Implement result validation
+	std::ifstream result(result_file, std::ios::binary);
+	std::ifstream reference(reference_file, std::ios::binary);
+	
+	if (!result || !reference) {
+		std::cerr << "Error opening result or reference file" << std::endl;
+		return false;
+	}
+	
+	// Determine file sizes
+	result.seekg(0, std::ios::end);
+	reference.seekg(0, std::ios::end);
+	size_t result_size = result.tellg();
+	size_t reference_size = reference.tellg();
+	result.seekg(0, std::ios::beg);
+	reference.seekg(0, std::ios::beg);
+	
+	// Check if file sizes match
+	if (result_size != reference_size) {
+		std::cerr << "Result and reference file sizes do not match" << std::endl;
+		return false;
+	}
+	
+	// Read and compare file contents
+	const size_t buffer_size = 1024;
+	float result_buffer[buffer_size];
+	float reference_buffer[buffer_size];
+	
+	size_t total_elements = result_size / sizeof(float);
+	size_t elements_compared = 0;
+	
+	const float epsilon = 1e-5f; // Tolerance for floating-point comparison
+	
+	while (elements_compared < total_elements) {
+		size_t elements_to_read = std::min(buffer_size, total_elements - elements_compared);
+		
+		result.read(reinterpret_cast<char*>(result_buffer), elements_to_read * sizeof(float));
+		reference.read(reinterpret_cast<char*>(reference_buffer), elements_to_read * sizeof(float));
+		
+		for (size_t i = 0; i < elements_to_read; i++) {
+			if (std::abs(result_buffer[i] - reference_buffer[i]) > epsilon) {
+				std::cerr << "Result differs from reference at element " << (elements_compared + i) << std::endl;
+				std::cerr << "Result: " << result_buffer[i] << ", Reference: " << reference_buffer[i] << std::endl;
+				return false;
+			}
+		}
+		
+		elements_compared += elements_to_read;
+	}
+	
+	return true;
 }
 
 int main(int argc, char *argv[]) {
-    if (argc != 2) {
-        std::cerr << "Usage: " << argv[0] << " <case_number>" << std::endl;
-        return 1;
-    }
+	if (argc != 3) {
+		std::cerr << "Usage: " << argv[0] << " <case_number> <naive|blocked|parallel>" << std::endl;
+		return 1;
+	}
 
-    int case_number = std::atoi(argv[1]);
-    if (case_number < 0 || case_number > 9) {
-        std::cerr << "Case number must be between 0 and 9" << std::endl;
-        return 1;
-    }
+	int case_number = std::atoi(argv[1]);
+	std::string algorithm = argv[2];
+	
+	// Validate algorithm choice
+	if (algorithm != "naive" && algorithm != "blocked" && algorithm != "parallel") {
+		std::cerr << "Invalid algorithm: " << algorithm << std::endl;
+		std::cerr << "Valid algorithms: naive, blocked, parallel" << std::endl;
+		return 1;
+	}
+	
+	if (case_number < 0 || case_number > 9) {
+		std::cerr << "Case number must be between 0 and 9" << std::endl;
+		return 1;
+	}
 
-    // Construct file paths
-    std::string folder = "data/" + std::to_string(case_number) + "/";
-    std::string input0_file = folder + "input0.raw";
-    std::string input1_file = folder + "input1.raw";
-    std::string result_file = folder + "result.raw";
-    std::string reference_file = folder + "output.raw";
+	// Construct file paths
+	std::string folder = "data/" + std::to_string(case_number) + "/";
+	std::string input0_file = folder + "input0.raw";
+	std::string input1_file = folder + "input1.raw";
+	std::string result_file = "result.raw";  // Write to current directory
+	std::string reference_file = folder + "output.raw";
 
-    // TODO Read input0.raw (matrix A)
+	// Read input0.raw (matrix A)
+	std::ifstream input0(input0_file, std::ios::binary);
+	if (!input0) {
+		std::cerr << "Error opening input0.raw" << std::endl;
+		return 1;
+	}
+	
+	// Read dimensions from input0.raw
+	uint32_t m, n;
+	input0.read(reinterpret_cast<char*>(&m), sizeof(uint32_t));
+	input0.read(reinterpret_cast<char*>(&n), sizeof(uint32_t));
+	
+	// Allocate memory for A
+	float *A = new float[m * n];
+	input0.read(reinterpret_cast<char*>(A), m * n * sizeof(float));
+	input0.close();
+	
+	// Read input1.raw (matrix B)
+	std::ifstream input1(input1_file, std::ios::binary);
+	if (!input1) {
+		std::cerr << "Error opening input1.raw" << std::endl;
+		delete[] A;
+		return 1;
+	}
+	
+	// Read dimensions from input1.raw
+	uint32_t n_check, p;
+	input1.read(reinterpret_cast<char*>(&n_check), sizeof(uint32_t));
+	input1.read(reinterpret_cast<char*>(&p), sizeof(uint32_t));
+	
+	// Check if dimensions are compatible
+	if (n != n_check) {
+		std::cerr << "Incompatible matrix dimensions" << std::endl;
+		delete[] A;
+		return 1;
+	}
+	
+	// Allocate memory for B
+	float *B = new float[n * p];
+	input1.read(reinterpret_cast<char*>(B), n * p * sizeof(float));
+	input1.close();
 
+	// Allocate memory for result matrix C
+	float *C = new float[m * p];
 
-    // TODO Read input1.raw (matrix B)
+	std::cout << "Matrix dimensions: A(" << m << "x" << n << "), B(" << n << "x" << p << ")" << std::endl;
+	
+	if (algorithm == "naive") {
+		// Time naive matrix multiplication
+		double t0 = omp_get_wtime();
+		naive_matmul(C, A, B, m, n, p);
+		double t1 = omp_get_wtime();
+		std::cout << "Naive matrix multiplication time: " << (t1 - t0) * 1000 << " ms" << std::endl;
+	} 
+	else if (algorithm == "blocked") {
+		// Time blocked matrix multiplication
+		double t0 = omp_get_wtime();
+		blocked_matmul(C, A, B, m, n, p, 32);  // Block size of 32
+		double t1 = omp_get_wtime();
+		std::cout << "Blocked matrix multiplication time: " << (t1 - t0) * 1000 << " ms" << std::endl;
+	} 
+	else if (algorithm == "parallel") {
+		// Time parallel matrix multiplication
+		double t0 = omp_get_wtime();
+		parallel_matmul(C, A, B, m, n, p);
+		double t1 = omp_get_wtime();
+		std::cout << "Parallel matrix multiplication time: " << (t1 - t0) * 1000 << " ms" << std::endl;
+	}
 
+	// Write result to file
+	std::ofstream output(result_file, std::ios::binary);
+	if (!output) {
+		std::cerr << "Error opening result file" << std::endl;
+		delete[] A;
+		delete[] B;
+		delete[] C;
+		return 1;
+	}
+	
+	// Write dimensions to result file
+	output.write(reinterpret_cast<char*>(&m), sizeof(uint32_t));
+	output.write(reinterpret_cast<char*>(&p), sizeof(uint32_t));
+	
+	// Write result matrix to file
+	output.write(reinterpret_cast<char*>(C), m * p * sizeof(float));
+	output.close();
+	
+	// Validate result
+	if (validate_result(result_file, reference_file)) {
+		std::cout << "Result validation successful!" << std::endl;
+	} else {
+		std::cerr << "Result validation failed!" << std::endl;
+		delete[] A;
+		delete[] B;
+		delete[] C;
+		return 1;
+	}
 
-    // Allocate memory for result matrices
-    float *C_naive = new float[m * p];
-    float *C_blocked = new float[m * p];
-    float *C_parallel = new float[m * p];
-
-    // Measure performance of naive_matmul
-    double start_time = omp_get_wtime();
-    naive_matmul(C_naive, A, B, m, n, p);
-    double naive_time = omp_get_wtime() - start_time;
-
-    // TODO Write naive result to file
-
-
-    // Validate naive result
-    bool naive_correct = validate_result(result_file, reference_file);
-    if (!naive_correct) {
-        std::cerr << "Naive result validation failed for case " << case_number << std::endl;
-    }
-
-    // Measure performance of blocked_matmul (use block_size = 32 as default)
-    start_time = omp_get_wtime();
-    blocked_matmul(C_blocked, A, B, m, n, p, 32);
-    double blocked_time = omp_get_wtime() - start_time;
-
-    // TODO Write blocked result to file
-
-
-    // Validate blocked result
-    bool blocked_correct = validate_result(result_file, reference_file);
-    if (!blocked_correct) {
-        std::cerr << "Blocked result validation failed for case " << case_number << std::endl;
-    }
-
-    // Measure performance of parallel_matmul
-    start_time = omp_get_wtime();
-    parallel_matmul(C_parallel, A, B, m, n, p);
-    double parallel_time = omp_get_wtime() - start_time;
-
-    // TODO Write parallel result to file
-
-
-    // Validate parallel result
-    bool parallel_correct = validate_result(result_file, reference_file);
-    if (!parallel_correct) {
-        std::cerr << "Parallel result validation failed for case " << case_number << std::endl;
-    }
-
-    // Print performance results
-    std::cout << "Case " << case_number << " (" << m << "x" << n << "x" << p << "):\n";
-    std::cout << "Naive time: " << naive_time << " seconds\n";
-    std::cout << "Blocked time: " << blocked_time << " seconds\n";
-    std::cout << "Parallel time: " << parallel_time << " seconds\n";
-    std::cout << "Blocked speedup: " << (naive_time / blocked_time) << "x\n";
-    std::cout << "Parallel speedup: " << (naive_time / parallel_time) << "x\n";
-
-    // Clean up
-    delete[] A;
-    delete[] B;
-    delete[] C_naive;
-    delete[] C_blocked;
-    delete[] C_parallel;
-
-    return 0;
+	// Free memory
+	delete[] A;
+	delete[] B;
+	delete[] C;
+	
+	return 0;
 }
\ No newline at end of file