diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..f797853 --- /dev/null +++ b/.gitignore @@ -0,0 +1,5 @@ +.vscode/ +result.raw +matmul +matmul_p +performance.md diff --git a/CMakeLists.txt b/CMakeLists.txt deleted file mode 100644 index 9b04fd0..0000000 --- a/CMakeLists.txt +++ /dev/null @@ -1,25 +0,0 @@ -cmake_minimum_required(VERSION 3.10) - -project(MatrixMultiplication) - -set(CMAKE_CXX_STANDARD 11) - -set(CMAKE_CXX_STANDARD_REQUIRED ON) - -find_package(OpenMP REQUIRED) - -if(OpenMP_CXX_FOUND) - set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${OpenMP_CXX_FLAGS}") -endif() - -if(APPLE) - set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -D_Alignof=alignof") -endif() - - -add_executable(matmul main_ans.cpp) - - -if(OpenMP_CXX_FOUND) - target_link_libraries(matmul PUBLIC OpenMP::OpenMP_CXX) -endif() \ No newline at end of file diff --git a/Makefile b/Makefile new file mode 100644 index 0000000..22748a8 --- /dev/null +++ b/Makefile @@ -0,0 +1,12 @@ +.PHONY: clean + +override CFLAGS := -Wall -Wextra -Wpedantic -Werror -fopenmp -O3 $(CFLAGS) + +matmul: main.cpp + g++ $(CFLAGS) -o matmul main.cpp + +matmul_p: main.cpp + g++ $(CFLAGS) -DPERFORMANCE_MD -o matmul_p main.cpp + +clean: + rm -f ./matmul ./matmul_p diff --git a/README.md b/README.md index 0f91d63..77f2b43 100644 --- a/README.md +++ b/README.md @@ -235,4 +235,26 @@ git push origin student-name - Validate each implementation against `output.raw` to ensure correctness before optimizing. - Use small test cases to debug your blocked and parallel implementations. -Good luck, and enjoy optimizing your matrix multiplication! \ No newline at end of file +Good luck, and enjoy optimizing your matrix multiplication! 
+ +--- + +### Results + +The best results can be seen in the table below (block size 32, 12 threads): + +| Test Case | Dimensions (m x n x p) | Naive Time (s) | Blocked Time (s) | Parallel Time (s) | Blocked Speedup | Parallel Speedup | +|-----------|------------------------|----------------|------------------|-------------------|-----------------|------------------| +| 0 | 64 x 64 x 64 | 0.000140 | 0.000127 | 0.000260 | 1.101x | 0.538x | +| 1 | 128 x 64 x 128 | 0.000570 | 0.000511 | 0.000350 | 1.115x | 1.627x | +| 2 | 100 x 128 x 56 | 0.000531 | 0.000389 | 0.008613 | 1.365x | 0.062x | +| 3 | 128 x 64 x 128 | 0.000563 | 0.000571 | 0.000323 | 0.987x | 1.742x | +| 4 | 32 x 128 x 32 | 0.000103 | 0.000079 | 0.000261 | 1.304x | 0.396x | +| 5 | 200 x 100 x 256 | 0.005079 | 0.002296 | 0.001107 | 2.212x | 4.587x | +| 6 | 256 x 256 x 256 | 0.017202 | 0.007565 | 0.003063 | 2.274x | 5.616x | +| 7 | 256 x 300 x 256 | 0.020208 | 0.009192 | 0.003385 | 2.199x | 5.970x | +| 8 | 64 x 128 x 64 | 0.000376 | 0.000306 | 0.000302 | 1.231x | 1.246x | +| 9 | 256 x 256 x 257 | 0.011176 | 0.007139 | 0.001828 | 1.565x | 6.115x | + +In most cases the parallel speedup was significantly higher than the blocked speedup. However, for cases 0, 2, and 4 +the parallel implementation ended up being much slower than both the blocked and naive implementations, most likely because thread start-up overhead outweighs the work for such small matrices. 
#include <algorithm>
#include <cstddef>
#include <cstdint>
#include <cstdio>
#include <cstdlib>
#include <string>

/**
 * Reference matrix multiplication: C = A * B using the plain triple loop.
 *
 * All matrices are dense, row-major arrays.
 *
 * @param C output, m x p (every element is overwritten)
 * @param A left operand, m x n
 * @param B right operand, n x p
 * @param m number of rows of A and C
 * @param n number of columns of A / rows of B
 * @param p number of columns of B and C
 */
void naive_matmul(float *C, float *A, float *B, uint32_t m, uint32_t n, uint32_t p)
{
    // Index arithmetic is done in size_t so that i * n + k cannot wrap around in 32 bits.
    const std::size_t rows = m, inner = n, cols = p;

    for (std::size_t i = 0; i < rows; i++)
    {
        for (std::size_t j = 0; j < cols; j++)
        {
            float sum = 0.0f;
            for (std::size_t k = 0; k < inner; k++)
            {
                sum += A[i * inner + k] * B[k * cols + j];
            }
            C[i * cols + j] = sum;
        }
    }
}

/**
 * Cache-blocked (tiled) matrix multiplication: C = A * B.
 *
 * The iteration space is cut into tiles of at most block_size x block_size.
 * Each output element receives one partial dot product per k-tile, so C is
 * zeroed first and every tile ADDS its contribution. Products are still added
 * in ascending k order, i.e. in the same order as naive_matmul.
 *
 * @param C          output, m x p (every element is overwritten)
 * @param A          left operand, m x n
 * @param B          right operand, n x p
 * @param m          number of rows of A and C
 * @param n          number of columns of A / rows of B
 * @param p          number of columns of B and C
 * @param block_size tile edge length; 0 is treated as 1 so the tile loops always advance
 */
void blocked_matmul(float *C, float *A, float *B, uint32_t m, uint32_t n, uint32_t p, uint32_t block_size)
{
    const std::size_t rows = m, inner = n, cols = p;
    // A zero step would never advance ii/jj/kk and loop forever.
    const std::size_t tile = (block_size == 0) ? 1 : block_size;

    // Partial sums of successive k-tiles are accumulated into C, so it has to start at zero.
    std::fill(C, C + rows * cols, 0.0f);

    for (std::size_t ii = 0; ii < rows; ii += tile)
    {
        const std::size_t i_end = std::min(ii + tile, rows);
        for (std::size_t jj = 0; jj < cols; jj += tile)
        {
            const std::size_t j_end = std::min(jj + tile, cols);
            for (std::size_t kk = 0; kk < inner; kk += tile)
            {
                const std::size_t k_end = std::min(kk + tile, inner);

                // C[ii:i_end, jj:j_end] += A[ii:i_end, kk:k_end] * B[kk:k_end, jj:j_end]
                for (std::size_t i = ii; i < i_end; i++)
                {
                    for (std::size_t j = jj; j < j_end; j++)
                    {
                        // Continue from the partial sum left by the previous k-tiles; the local
                        // accumulator avoids re-reading and re-writing C inside the k loop.
                        float sum = C[i * cols + j];
                        for (std::size_t k = kk; k < k_end; k++)
                        {
                            sum += A[i * inner + k] * B[k * cols + j];
                        }
                        C[i * cols + j] = sum;
                    }
                }
            }
        }
    }
}

/**
 * Matrix multiplication C = A * B with the row loop annotated for OpenMP.
 *
 * Each iteration of the outer loop writes only its own row of C, so the
 * iterations are independent. Without OpenMP enabled the pragma is ignored
 * and the function runs serially.
 *
 * @param C output, m x p (every element is overwritten)
 * @param A left operand, m x n
 * @param B right operand, n x p
 * @param m number of rows of A and C
 * @param n number of columns of A / rows of B
 * @param p number of columns of B and C
 */
void parallel_matmul(float *C, float *A, float *B, uint32_t m, uint32_t n, uint32_t p)
{
    const std::size_t inner = n, cols = p;

#pragma omp parallel for
    for (uint32_t i = 0; i < m; i++)
    {
        const std::size_t row = i;
        for (std::size_t j = 0; j < cols; j++)
        {
            float sum = 0.0f; // local accumulator instead of repeatedly reading and writing C
            for (std::size_t k = 0; k < inner; k++)
            {
                sum += A[row * inner + k] * B[k * cols + j];
            }
            C[row * cols + j] = sum;
        }
    }
}

/**
 * Compares two files byte for byte.
 *
 * Both streams are read in lockstep until they differ or both reach the end,
 * so files of different length compare unequal without a separate size query.
 *
 * @param result_file    path of the file produced by this program
 * @param reference_file path of the expected output
 * @return true when both files have identical contents, false otherwise
 *
 * If either file cannot be opened, an error naming that file is printed to
 * stderr and the process exits with status 1.
 */
bool validate_result(const std::string &result_file, const std::string &reference_file)
{
    FILE *result = fopen(result_file.c_str(), "r");
    if (result == nullptr)
    {
        fprintf(stderr, "Error opening %s\n", result_file.c_str());
        exit(1);
    }

    FILE *reference = fopen(reference_file.c_str(), "r");
    if (reference == nullptr)
    {
        fprintf(stderr, "Error opening %s\n", reference_file.c_str());
        fclose(result);
        exit(1);
    }

    bool identical = true;
    for (;;)
    {
        const int from_result = getc(result);
        const int from_reference = getc(reference);
        if (from_result != from_reference)
        {
            // Covers differing bytes as well as one file ending before the other.
            identical = false;
            break;
        }
        if (from_result == EOF)
            break; // both streams ended at the same position
    }

    fclose(result);
    fclose(reference);
    return identical;
}
+ */ +void format_properly(float f, char *buf, size_t buf_size) +{ + int len = snprintf(buf, buf_size, "%.2f", f); + char *ptr = buf + len - 1; + while (*ptr == '0' && ptr >= buf) + { + *ptr-- = '\x00'; + } +} + +void write_results(FILE *result, int m, int p, float *C) +{ + fprintf(result, "%d %d\n", m, p); + char buf[16] = {0}; + for (int i = 0; i < m; i++) + { + for (int j = 0; j < p; j++) + { + if (j > 0) + fputc(' ', result); + format_properly(C[i * p + j], buf, sizeof(buf)); + fprintf(result, "%s", buf); + } + if (i < m - 1) + fputc('\n', result); + } +} + +int main(int argc, char *argv[]) +{ + if (argc != 2) + { std::cerr << "Usage: " << argv[0] << " " << std::endl; - return 1; + return 2; } int case_number = std::atoi(argv[1]); - if (case_number < 0 || case_number > 9) { + if (case_number < 0 || case_number > 9) + { std::cerr << "Case number must be between 0 and 9" << std::endl; - return 1; + return 2; } // Construct file paths @@ -42,56 +144,113 @@ int main(int argc, char *argv[]) { std::string result_file = folder + "result.raw"; std::string reference_file = folder + "output.raw"; - // TODO Read input0.raw (matrix A) + FILE *input0 = fopen(input0_file.c_str(), "r"); + if (input0 == NULL) + { + fprintf(stderr, "Error opening %s\n", input0_file.c_str()); + return 1; + } + FILE *input1 = fopen(input1_file.c_str(), "r"); + if (input1 == NULL) + { + fprintf(stderr, "Error opening %s\n", input1_file.c_str()); + return 1; + } + + int m, n, p; + fscanf(input0, "%d %d", &m, &n); + fscanf(input1, "%d %d", &n, &p); + float *A = new float[m * n]; + float *B = new float[n * p]; - // TODO Read input1.raw (matrix B) + float f = 0.0; + for (int i = 0; i < m; i++) + { + for (int j = 0; j < n; j++) + { + fscanf(input0, "%f", &f); + A[i * n + j] = f; + } + } + for (int i = 0; i < n; i++) + { + for (int j = 0; j < p; j++) + { + fscanf(input1, "%f", &f); + B[i * p + j] = f; + } + } // Allocate memory for result matrices float *C_naive = new float[m * p]; float *C_blocked = 
new float[m * p]; float *C_parallel = new float[m * p]; + // NAIVE START // Measure performance of naive_matmul double start_time = omp_get_wtime(); naive_matmul(C_naive, A, B, m, n, p); double naive_time = omp_get_wtime() - start_time; - // TODO Write naive result to file - + FILE *result = fopen(result_file.c_str(), "w"); + if (result == NULL) + { + fprintf(stderr, "Error opening %s\n", result_file.c_str()); + return 1; + } + write_results(result, m, p, C_naive); + fclose(result); // Validate naive result bool naive_correct = validate_result(result_file, reference_file); - if (!naive_correct) { + if (!naive_correct) + { std::cerr << "Naive result validation failed for case " << case_number << std::endl; } + // BLOCKED START // Measure performance of blocked_matmul (use block_size = 32 as default) start_time = omp_get_wtime(); blocked_matmul(C_blocked, A, B, m, n, p, 32); double blocked_time = omp_get_wtime() - start_time; - // TODO Write blocked result to file - + result = fopen(result_file.c_str(), "w"); + if (result == NULL) + { + fprintf(stderr, "Error opening %s\n", result_file.c_str()); + return 1; + } + write_results(result, m, p, C_blocked); + fclose(result); // Validate blocked result bool blocked_correct = validate_result(result_file, reference_file); - if (!blocked_correct) { + if (!blocked_correct) + { std::cerr << "Blocked result validation failed for case " << case_number << std::endl; } + // PARALLEL START // Measure performance of parallel_matmul start_time = omp_get_wtime(); parallel_matmul(C_parallel, A, B, m, n, p); double parallel_time = omp_get_wtime() - start_time; - // TODO Write parallel result to file - + result = fopen(result_file.c_str(), "w"); + if (result == NULL) + { + fprintf(stderr, "Error opening %s\n", result_file.c_str()); + return 1; + } + write_results(result, m, p, C_parallel); + fclose(result); // Validate parallel result bool parallel_correct = validate_result(result_file, reference_file); - if (!parallel_correct) { + if 
(!parallel_correct) + { std::cerr << "Parallel result validation failed for case " << case_number << std::endl; } @@ -103,6 +262,35 @@ int main(int argc, char *argv[]) { std::cout << "Blocked speedup: " << (naive_time / blocked_time) << "x\n"; std::cout << "Parallel speedup: " << (naive_time / parallel_time) << "x\n"; +#ifdef PERFORMANCE_MD + FILE *out; + out = fopen("performance.md", "a"); + if (out != NULL) + { + if (ftell(out) == 0) + { + fprintf(out, + "| Test Case | Dimensions (m x n x p) | Naive Time (s) | Blocked Time (s) | Parallel Time (s) | Blocked Speedup | Parallel Speedup |\n" + "|-----------|------------------------|----------------|------------------|-------------------|-----------------|------------------|\n"); + } + fprintf( + out, + "| %-9d | %-3d x %-3d x %-10d | %-14f | %-16f | %-17f | %.3fx%-9s | %.3fx%-10s |\n", + case_number, + m, n, p, + naive_time, + blocked_time, + parallel_time, + (naive_time / blocked_time), "", + (naive_time / parallel_time), ""); + fclose(out); + } + else + { + fprintf(stderr, "Error opening performance.md, skipped\n"); + } +#endif + // Clean up delete[] A; delete[] B; @@ -111,4 +299,4 @@ int main(int argc, char *argv[]) { delete[] C_parallel; return 0; -} \ No newline at end of file +} diff --git a/performance_i7-2640m_2.80GHz.md b/performance_i7-2640m_2.80GHz.md new file mode 100644 index 0000000..2585d73 --- /dev/null +++ b/performance_i7-2640m_2.80GHz.md @@ -0,0 +1,32 @@ +| Test Case | Dimensions (m x n x p) | Naive Time (s) | Blocked Time (s) | Parallel Time (s) | Blocked Speedup | Parallel Speedup | +|-----------|------------------------|----------------|------------------|-------------------|-----------------|------------------| +| 0 | 64 x 64 x 64 | 0.000476 | 0.000535 | 0.002130 | 0.890x | 0.224x | +| 0 | 64 x 64 x 64 | 0.000343 | 0.000334 | 0.003825 | 1.028x | 0.090x | +| 0 | 64 x 64 x 64 | 0.000306 | 0.000287 | 0.002904 | 1.069x | 0.106x | +| 1 | 128 x 64 x 128 | 0.001485 | 0.001315 | 0.002541 | 1.129x | 
0.585x | +| 1 | 128 x 64 x 128 | 0.001343 | 0.001968 | 0.000823 | 0.682x | 1.632x | +| 1 | 128 x 64 x 128 | 0.001315 | 0.001949 | 0.006872 | 0.674x | 0.191x | +| 2 | 100 x 128 x 56 | 0.000823 | 0.000740 | 0.004221 | 1.112x | 0.195x | +| 2 | 100 x 128 x 56 | 0.000857 | 0.000749 | 0.000768 | 1.145x | 1.117x | +| 2 | 100 x 128 x 56 | 0.001343 | 0.001336 | 0.003337 | 1.006x | 0.403x | +| 3 | 128 x 64 x 128 | 0.001323 | 0.001971 | 0.000885 | 0.671x | 1.494x | +| 3 | 128 x 64 x 128 | 0.001348 | 0.001126 | 0.002626 | 1.197x | 0.513x | +| 3 | 128 x 64 x 128 | 0.001858 | 0.001351 | 0.000844 | 1.376x | 2.202x | +| 4 | 32 x 128 x 32 | 0.000226 | 0.000183 | 0.003853 | 1.237x | 0.059x | +| 4 | 32 x 128 x 32 | 0.000146 | 0.000133 | 0.000252 | 1.101x | 0.581x | +| 4 | 32 x 128 x 32 | 0.000146 | 0.000134 | 0.003343 | 1.096x | 0.044x | +| 5 | 200 x 100 x 256 | 0.006846 | 0.011000 | 0.009009 | 0.622x | 0.760x | +| 5 | 200 x 100 x 256 | 0.007464 | 0.005513 | 0.003677 | 1.354x | 2.030x | +| 5 | 200 x 100 x 256 | 0.007161 | 0.006429 | 0.006429 | 1.114x | 1.114x | +| 6 | 256 x 256 x 256 | 0.028310 | 0.019140 | 0.026673 | 1.479x | 1.061x | +| 6 | 256 x 256 x 256 | 0.028416 | 0.018362 | 0.015405 | 1.548x | 1.845x | +| 6 | 256 x 256 x 256 | 0.028912 | 0.018607 | 0.020357 | 1.554x | 1.420x | +| 7 | 256 x 300 x 256 | 0.035351 | 0.025475 | 0.026895 | 1.388x | 1.314x | +| 7 | 256 x 300 x 256 | 0.034877 | 0.021123 | 0.018867 | 1.651x | 1.849x | +| 7 | 256 x 300 x 256 | 0.034435 | 0.021094 | 0.018698 | 1.632x | 1.842x | +| 8 | 64 x 128 x 64 | 0.000611 | 0.000588 | 0.002378 | 1.039x | 0.257x | +| 8 | 64 x 128 x 64 | 0.001242 | 0.000982 | 0.002868 | 1.265x | 0.433x | +| 8 | 64 x 128 x 64 | 0.000803 | 0.000542 | 0.004964 | 1.483x | 0.162x | +| 9 | 256 x 256 x 257 | 0.021527 | 0.021547 | 0.018358 | 0.999x | 1.173x | +| 9 | 256 x 256 x 257 | 0.021785 | 0.017791 | 0.010855 | 1.225x | 2.007x | +| 9 | 256 x 256 x 257 | 0.021559 | 0.017826 | 0.010466 | 1.209x | 2.060x | diff --git 
a/performance_ryzen-5-5600x_3.7GHz.md b/performance_ryzen-5-5600x_3.7GHz.md new file mode 100644 index 0000000..d16207b --- /dev/null +++ b/performance_ryzen-5-5600x_3.7GHz.md @@ -0,0 +1,32 @@ +| Test Case | Dimensions (m x n x p) | Naive Time (s) | Blocked Time (s) | Parallel Time (s) | Blocked Speedup | Parallel Speedup | +|-----------|------------------------|----------------|------------------|-------------------|-----------------|------------------| +| 0 | 64 x 64 x 64 | 0.000140 | 0.000127 | 0.000266 | 1.103x | 0.526x | +| 0 | 64 x 64 x 64 | 0.000140 | 0.000127 | 0.000260 | 1.101x | 0.538x | +| 0 | 64 x 64 x 64 | 0.000141 | 0.000139 | 0.000308 | 1.016x | 0.457x | +| 1 | 128 x 64 x 128 | 0.000585 | 0.000616 | 0.001149 | 0.950x | 0.509x | +| 1 | 128 x 64 x 128 | 0.000570 | 0.000511 | 0.000350 | 1.115x | 1.627x | +| 1 | 128 x 64 x 128 | 0.000561 | 0.000470 | 0.000360 | 1.193x | 1.558x | +| 2 | 100 x 128 x 56 | 0.000466 | 0.000339 | 0.008811 | 1.377x | 0.053x | +| 2 | 100 x 128 x 56 | 0.000531 | 0.000389 | 0.008613 | 1.365x | 0.062x | +| 2 | 100 x 128 x 56 | 0.000460 | 0.000343 | 0.009490 | 1.340x | 0.048x | +| 3 | 128 x 64 x 128 | 0.000557 | 0.000489 | 0.000357 | 1.140x | 1.561x | +| 3 | 128 x 64 x 128 | 0.000563 | 0.000571 | 0.000323 | 0.987x | 1.742x | +| 3 | 128 x 64 x 128 | 0.000558 | 0.000476 | 0.000424 | 1.172x | 1.317x | +| 4 | 32 x 128 x 32 | 0.000091 | 0.000069 | 0.002518 | 1.327x | 0.036x | +| 4 | 32 x 128 x 32 | 0.000083 | 0.000069 | 0.000262 | 1.195x | 0.315x | +| 4 | 32 x 128 x 32 | 0.000103 | 0.000079 | 0.000261 | 1.304x | 0.396x | +| 5 | 200 x 100 x 256 | 0.005067 | 0.002468 | 0.001149 | 2.053x | 4.411x | +| 5 | 200 x 100 x 256 | 0.005037 | 0.002748 | 0.001177 | 1.833x | 4.279x | +| 5 | 200 x 100 x 256 | 0.005079 | 0.002296 | 0.001107 | 2.212x | 4.587x | +| 6 | 256 x 256 x 256 | 0.016594 | 0.008135 | 0.003038 | 2.040x | 5.462x | +| 6 | 256 x 256 x 256 | 0.017552 | 0.007665 | 0.003464 | 2.290x | 5.067x | +| 6 | 256 x 256 x 256 | 0.017202 | 0.007565 
| 0.003063 | 2.274x | 5.616x | +| 7 | 256 x 300 x 256 | 0.019288 | 0.008852 | 0.003956 | 2.179x | 4.875x | +| 7 | 256 x 300 x 256 | 0.019299 | 0.008837 | 0.004133 | 2.184x | 4.669x | +| 7 | 256 x 300 x 256 | 0.020208 | 0.009192 | 0.003385 | 2.199x | 5.970x | +| 8 | 64 x 128 x 64 | 0.000346 | 0.000245 | 0.010065 | 1.415x | 0.034x | +| 8 | 64 x 128 x 64 | 0.000376 | 0.000306 | 0.000302 | 1.231x | 1.246x | +| 8 | 64 x 128 x 64 | 0.000346 | 0.000252 | 0.009449 | 1.373x | 0.037x | +| 9 | 256 x 256 x 257 | 0.010446 | 0.007139 | 0.001865 | 1.463x | 5.602x | +| 9 | 256 x 256 x 257 | 0.012096 | 0.007262 | 0.009335 | 1.666x | 1.296x | +| 9 | 256 x 256 x 257 | 0.011176 | 0.007139 | 0.001828 | 1.565x | 6.115x |