From e8c0dbb42c0a48b6557a387a202d1cf159b03b3b Mon Sep 17 00:00:00 2001
From: khanhhado1208
Date: Thu, 28 May 2026 23:59:03 +0300
Subject: [PATCH] Implement blocked matrix multiplication and validation logic

---
Reviewer notes (not part of the commit message):
- The header name on the "#include" context line of the first main.cpp hunk
  was already missing from the copy under review; restore it before applying.
- The main.cpp blob id after ".." on its "index" line was computed before the
  review edits below and no longer matches the patched file.

 CMakeLists.txt |   2 +-
 main.cpp       | 159 +++++++++++++++++++++++++++++++++++++++++++++----
 2 files changed, 147 insertions(+), 14 deletions(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 9b04fd0..abcfc6d 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -17,7 +17,7 @@ if(APPLE)
 endif()
 
 
-add_executable(matmul main_ans.cpp)
+add_executable(matmul main.cpp)
 
 
 if(OpenMP_CXX_FOUND)
diff --git a/main.cpp b/main.cpp
index 65bf108..2f2f17b 100644
--- a/main.cpp
+++ b/main.cpp
@@ -5,22 +5,122 @@
 #include
 
+// Reference implementation: C (m x p) = A (m x n) * B (n x p), all row-major.
 void naive_matmul(float *C, float *A, float *B, uint32_t m, uint32_t n, uint32_t p) {
-    //TODO : Implement naive matrix multiplication
+    for (uint32_t i = 0; i < m; i++) {
+        for (uint32_t j = 0; j < p; j++) {
+            float sum = 0.0f;
+            for (uint32_t k = 0; k < n; k++) {
+                sum += A[i * n + k] * B[k * p + j];
+            }
+            C[i * p + j] = sum;
+        }
+    }
 }
 
+// Cache-blocked product: C (m x p) = A (m x n) * B (n x p), all row-major.
+// The loops walk block_size-sided tiles; edge tiles are clipped to the matrix.
+// A block_size of 0 is treated as 1 so the tile loops always advance.
 void blocked_matmul(float *C, float *A, float *B, uint32_t m, uint32_t n, uint32_t p, uint32_t block_size) {
-    // TODO: Implement blocked matrix multiplication
-    // A is m x n, B is n x p, C is m x p
-    // Use block_size to divide matrices into submatrices
+    if (block_size == 0) {
+        block_size = 1;
+    }
+
+    // Each k-tile adds into C, so C has to start at zero.
+    for (uint32_t i = 0; i < m * p; i++) {
+        C[i] = 0.0f;
+    }
+
+    for (uint32_t ii = 0; ii < m; ii += block_size) {
+        uint32_t i_end = std::min(ii + block_size, m);
+        for (uint32_t jj = 0; jj < p; jj += block_size) {
+            uint32_t j_end = std::min(jj + block_size, p);
+            for (uint32_t kk = 0; kk < n; kk += block_size) {
+                uint32_t k_end = std::min(kk + block_size, n);
+
+                for (uint32_t i = ii; i < i_end; i++) {
+                    for (uint32_t j = jj; j < j_end; j++) {
+                        float sum = C[i * p + j];
+                        for (uint32_t k = kk; k < k_end; k++) {
+                            sum += A[i * n + k] * B[k * p + j];
+                        }
+                        C[i * p + j] = sum;
+                    }
+                }
+            }
+        }
+    }
 }
 
+// Same product as naive_matmul, with the rows of C divided among OpenMP threads.
 void parallel_matmul(float *C, float *A, float *B, uint32_t m, uint32_t n, uint32_t p) {
-    // TODO: Implement parallel matrix multiplication using OpenMP
-    // A is m x n, B is n x p, C is m x p
+    // Rows of C are independent, so the outer loop needs no synchronization.
+    // The index is signed because OpenMP versions before 3.0 require it.
+    #pragma omp parallel for schedule(static)
+    for (long long i = 0; i < static_cast<long long>(m); i++) {
+        for (uint32_t j = 0; j < p; j++) {
+            float sum = 0.0f;
+            for (uint32_t k = 0; k < n; k++) {
+                sum += A[i * n + k] * B[k * p + j];
+            }
+            C[i * p + j] = sum;
+        }
+    }
 }
 
+// Compares two matrix text files, each a "rows cols" header followed by values.
+// Returns true only when both files parse completely, the shapes agree and
+// every pair of values differs by at most 1e-3.
 bool validate_result(const std::string &result_file, const std::string &reference_file) {
-    //TODO : Implement result validation
+    std::ifstream result(result_file);
+    std::ifstream ref(reference_file);
+
+    if (!result.is_open() || !ref.is_open()) {
+        return false;
+    }
+
+    uint32_t rm, rp, fm, fp;
+    // A header that fails to parse must not be compared at all.
+    if (!(result >> rm >> rp) || !(ref >> fm >> fp)) {
+        return false;
+    }
+
+    if (rm != fm || rp != fp) {
+        return false;
+    }
+
+    for (uint32_t i = 0; i < rm * rp; i++) {
+        float a, b;
+        // A short or malformed file is a failure, not a run of zeros.
+        if (!(result >> a) || !(ref >> b)) {
+            return false;
+        }
+        // Written as !(<=) so that a NaN difference is rejected.
+        if (!(std::fabs(a - b) <= 1e-3f)) {
+            return false;
+        }
+    }
+
+    return true;
+}
+
+// Writes C (m x p) as text: an "m p" header line, then one matrix row per line.
+void write_result(const std::string &result_file, float *C, uint32_t m, uint32_t p) {
+    std::ofstream out(result_file);
+    if (!out.is_open()) {
+        std::cerr << "Failed to open " << result_file << std::endl;
+        return;
+    }
+    // 9 significant digits let every float survive the text round trip.
+    out.precision(9);
+    out << m << " " << p << "\n";
+    for (uint32_t i = 0; i < m; i++) {
+        for (uint32_t j = 0; j < p; j++) {
+            out << C[i * p + j];
+            if (j + 1 < p) out << " ";
+        }
+        out << "\n";
+    }
+    out.close();
 }
 
 int main(int argc, char *argv[]) {
@@ -42,10 +142,43 @@ int main(int argc, char *argv[]) {
     std::string result_file = folder + "result.raw";
     std::string reference_file = folder + "output.raw";
 
-    // TODO Read input0.raw (matrix A)
+    std::ifstream input0(input0_file);
+    if (!input0.is_open()) {
+        std::cerr << "Failed to open " << input0_file << std::endl;
+        return 1;
+    }
+
+    uint32_t m, n;
+    input0 >> m >> n;
+    float *A = new float[m * n];
+    for (uint32_t i = 0; i < m * n; i++) {
+        input0 >> A[i];
+    }
+    input0.close();
 
 
-    // TODO Read input1.raw (matrix B)
+
+    std::ifstream input1(input1_file);
+    if (!input1.is_open()) {
+        std::cerr << "Failed to open " << input1_file << std::endl;
+        delete[] A;
+        return 1;
+    }
+
+    uint32_t n2, p;
+    input1 >> n2 >> p;
+
+    if (n != n2) {
+        std::cerr << "Matrix dimension mismatch: input0 columns != input1 rows" << std::endl;
+        delete[] A;
+        return 1;
+    }
+
+    float *B = new float[n * p];
+    for (uint32_t i = 0; i < n * p; i++) {
+        input1 >> B[i];
+    }
+    input1.close();
 
 
     // Allocate memory for result matrices
@@ -58,7 +191,7 @@ int main(int argc, char *argv[]) {
     naive_matmul(C_naive, A, B, m, n, p);
     double naive_time = omp_get_wtime() - start_time;
 
-    // TODO Write naive result to file
+    write_result(result_file, C_naive, m, p);
 
 
     // Validate naive result
@@ -69,10 +202,10 @@ int main(int argc, char *argv[]) {
 
     // Measure performance of blocked_matmul (use block_size = 32 as default)
     start_time = omp_get_wtime();
-    blocked_matmul(C_blocked, A, B, m, n, p, 32);
+    blocked_matmul(C_blocked, A, B, m, n, p, 64);
     double blocked_time = omp_get_wtime() - start_time;
 
-    // TODO Write blocked result to file
+    write_result(result_file, C_blocked, m, p);
 
 
     // Validate blocked result
@@ -86,7 +219,7 @@ int main(int argc, char *argv[]) {
     parallel_matmul(C_parallel, A, B, m, n, p);
     double parallel_time = omp_get_wtime() - start_time;
 
-    // TODO Write parallel result to file
+    write_result(result_file, C_parallel, m, p);
 
 
     // Validate parallel result