Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 5 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
.vscode/
result.raw
matmul
matmul_p
performance.md
25 changes: 0 additions & 25 deletions CMakeLists.txt

This file was deleted.

12 changes: 12 additions & 0 deletions Makefile
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
# Build rules for the matrix-multiplication benchmark in main.cpp.

# `clean` does not produce a file named "clean", so it is declared phony.
.PHONY: clean

# Project-mandated flags come first; anything the user passes as CFLAGS on the
# make command line is appended after them (`override` is what allows this
# assignment to take effect over a command-line value).
# -Werror turns every warning into a build failure; -fopenmp is required for
# the `#pragma omp` directive and omp_get_wtime() used by main.cpp.
override CFLAGS := -Wall -Wextra -Wpedantic -Werror -fopenmp -O3 $(CFLAGS)

# Regular benchmark binary.
matmul: main.cpp
g++ $(CFLAGS) -o matmul main.cpp

# Same program compiled with PERFORMANCE_MD defined, which enables the code in
# main.cpp that appends a result row to performance.md.
matmul_p: main.cpp
g++ $(CFLAGS) -DPERFORMANCE_MD -o matmul_p main.cpp

# Remove both binaries; -f keeps this quiet when they do not exist.
clean:
rm -f ./matmul ./matmul_p
24 changes: 23 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -235,4 +235,26 @@ git push origin student-name
- Validate each implementation against `output.raw` to ensure correctness before optimizing.
- Use small test cases to debug your blocked and parallel implementations.

Good luck, and enjoy optimizing your matrix multiplication!
Good luck, and enjoy optimizing your matrix multiplication!

---

### Results

The best results can be seen in the table below (block size 32, 12 threads):

| Test Case | Dimensions (m x n x p) | Naive Time (s) | Blocked Time (s) | Parallel Time (s) | Blocked Speedup | Parallel Speedup |
|-----------|------------------------|----------------|------------------|-------------------|-----------------|------------------|
| 0 | 64 x 64 x 64 | 0.000140 | 0.000127 | 0.000260 | 1.101x | 0.538x |
| 1 | 128 x 64 x 128 | 0.000570 | 0.000511 | 0.000350 | 1.115x | 1.627x |
| 2 | 100 x 128 x 56 | 0.000531 | 0.000389 | 0.008613 | 1.365x | 0.062x |
| 3 | 128 x 64 x 128 | 0.000563 | 0.000571 | 0.000323 | 0.987x | 1.742x |
| 4 | 32 x 128 x 32 | 0.000103 | 0.000079 | 0.000261 | 1.304x | 0.396x |
| 5 | 200 x 100 x 256 | 0.005079 | 0.002296 | 0.001107 | 2.212x | 4.587x |
| 6 | 256 x 256 x 256 | 0.017202 | 0.007565 | 0.003063 | 2.274x | 5.616x |
| 7 | 256 x 300 x 256 | 0.020208 | 0.009192 | 0.003385 | 2.199x | 5.970x |
| 8 | 64 x 128 x 64 | 0.000376 | 0.000306 | 0.000302 | 1.231x | 1.246x |
| 9 | 256 x 256 x 257 | 0.011176 | 0.007139 | 0.001828 | 1.565x | 6.115x |

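In most cases the parallel speedup was significantly higher than the blocked speedup; however, for cases 0, 2, and 4
the parallel implementation ended up being much slower than both the blocked and naive implementations. These are among the smallest test cases, where the overhead of starting and synchronizing threads likely outweighs the gain from parallelism.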
240 changes: 214 additions & 26 deletions main.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -3,36 +3,138 @@
#include <string>
#include <omp.h>
#include <cmath>
#include <cstdint>

void naive_matmul(float *C, float *A, float *B, uint32_t m, uint32_t n, uint32_t p) {
//TODO : Implement naive matrix multiplication
/**
 * Reference matrix product C = A * B using the textbook triple loop.
 *
 * All three matrices are dense float arrays indexed row by row:
 * A is m x n, B is n x p and C is m x p. Every element of C is overwritten.
 */
void naive_matmul(float *C, float *A, float *B, uint32_t m, uint32_t n, uint32_t p)
{
    for (uint32_t row = 0; row < m; ++row)
    {
        // Offsets of the current row inside A and C.
        const uint32_t a_base = row * n;
        const uint32_t c_base = row * p;

        for (uint32_t col = 0; col < p; ++col)
        {
            // Accumulate the dot product in a local and store it once.
            float acc = 0;
            uint32_t inner = 0;
            while (inner < n)
            {
                acc += A[a_base + inner] * B[inner * p + col];
                ++inner;
            }
            C[c_base + col] = acc;
        }
    }
}

void blocked_matmul(float *C, float *A, float *B, uint32_t m, uint32_t n, uint32_t p, uint32_t block_size) {
// TODO: Implement blocked matrix multiplication
/**
 * Cache-blocked matrix product C = A * B.
 *
 * A is m x n, B is n x p and C is m x p, all dense float arrays indexed row by
 * row. The iteration space is cut into tiles of at most block_size elements
 * per dimension; tiles at the matrix edges are simply smaller.
 *
 * Every element of C is overwritten; C does not need to be initialised.
 * A block_size of 0 is treated as 1 so the tile loops always make progress.
 */
void blocked_matmul(float *C, float *A, float *B, uint32_t m, uint32_t n, uint32_t p, uint32_t block_size)
{
    if (block_size == 0)
        block_size = 1;

    // With an empty inner dimension the k-tile loop below never runs, so the
    // (all-zero) result has to be written explicitly.
    if (n == 0)
    {
        const size_t total = static_cast<size_t>(m) * p;
        for (size_t idx = 0; idx < total; ++idx)
            C[idx] = 0;
        return;
    }

    // Stepping by min(block_size, remaining) keeps the tile bounds from
    // overflowing 32 bits even for very large block sizes.
    for (uint32_t ii = 0; ii < m; ii += std::min(block_size, m - ii))
    {
        const uint32_t i_end = ii + std::min(block_size, m - ii);
        for (uint32_t jj = 0; jj < p; jj += std::min(block_size, p - jj))
        {
            const uint32_t j_end = jj + std::min(block_size, p - jj);
            for (uint32_t kk = 0; kk < n; kk += std::min(block_size, n - kk))
            {
                const uint32_t k_end = kk + std::min(block_size, n - kk);

                // C[ii:i_end, jj:j_end] (+)= A[ii:i_end, kk:k_end] * B[kk:k_end, jj:j_end]
                for (uint32_t i = ii; i < i_end; i++)
                {
                    const size_t a_row = static_cast<size_t>(i) * n;
                    const size_t c_row = static_cast<size_t>(i) * p;
                    for (uint32_t j = jj; j < j_end; j++)
                    {
                        // Accumulating in a local is significantly faster than
                        // reading and writing C for every k.
                        float sum = 0;
                        for (uint32_t k = kk; k < k_end; k++)
                            sum += A[a_row + k] * B[static_cast<size_t>(k) * p + j];

                        // The first k-tile initialises the element; every later
                        // tile must add its partial sum instead of replacing it.
                        if (kk == 0)
                            C[c_row + j] = sum;
                        else
                            C[c_row + j] += sum;
                    }
                }
            }
        }
    }
}

void parallel_matmul(float *C, float *A, float *B, uint32_t m, uint32_t n, uint32_t p) {
// TODO: Implement parallel matrix multiplication using OpenMP
// A is m x n, B is n x p, C is m x p
/**
 * OpenMP-parallel matrix product C = A * B.
 *
 * A is m x n, B is n x p and C is m x p, all dense float arrays indexed row by
 * row. Every element of C is overwritten.
 */
void parallel_matmul(float *C, float *A, float *B, uint32_t m, uint32_t n, uint32_t p)
{
    // Each outer iteration writes only its own row of C, so the rows are
    // distributed across the OpenMP threads.
#pragma omp parallel for
    for (uint32_t row = 0; row < m; ++row)
    {
        const uint32_t a_base = row * n;
        const uint32_t c_base = row * p;

        for (uint32_t col = 0; col < p; ++col)
        {
            // A local accumulator avoids repeatedly reading and writing C.
            float acc = 0;
            uint32_t inner = 0;
            while (inner < n)
            {
                acc += A[a_base + inner] * B[inner * p + col];
                ++inner;
            }
            C[c_base + col] = acc;
        }
    }
}

bool validate_result(const std::string &result_file, const std::string &reference_file) {
//TODO : Implement result validation
/**
 * Checks whether two files have exactly the same contents.
 *
 * @param result_file     path of the file produced by this program
 * @param reference_file  path of the expected output
 * @return true when both files contain the same sequence of characters and
 *         have the same length, false otherwise
 *
 * If either file cannot be opened, an error naming that file is printed to
 * stderr and the process exits with status 1.
 */
bool validate_result(const std::string &result_file, const std::string &reference_file)
{
    FILE *result = fopen(result_file.c_str(), "r");
    if (result == nullptr)
    {
        fprintf(stderr, "Error opening %s\n", result_file.c_str());
        exit(1);
    }

    FILE *reference = fopen(reference_file.c_str(), "r");
    if (reference == nullptr)
    {
        fprintf(stderr, "Error opening %s\n", reference_file.c_str());
        fclose(result);
        exit(1);
    }

    // Compare the two streams character by character. A length mismatch shows
    // up as one stream returning EOF while the other still yields data, so no
    // separate fseek/ftell size check is needed.
    bool identical = true;
    for (;;)
    {
        const int from_result = getc(result);
        const int from_reference = getc(reference);
        if (from_result != from_reference)
        {
            identical = false;
            break;
        }
        if (from_result == EOF)
            break; // both ended at the same position
    }

    fclose(result);
    fclose(reference);
    return identical;
}

int main(int argc, char *argv[]) {
if (argc != 2) {
/**
 * Formats a floating point number into `buf` with two-decimal precision and
 * then removes trailing '0' characters.
 *
 * The decimal point itself is never removed, so 2.0f is written as "2." and
 * 1.5f as "1.5".
 *
 * @param f         value to format
 * @param buf       destination buffer; left untouched when it is null or
 *                  buf_size is 0
 * @param buf_size  capacity of buf in bytes, including the terminating NUL
 *
 * If the formatted text does not fit, buf keeps the truncated, NUL-terminated
 * snprintf output and no characters are stripped. If snprintf reports an
 * error, buf is set to the empty string.
 */
void format_properly(float f, char *buf, size_t buf_size)
{
    if (buf == nullptr || buf_size == 0)
        return;

    const int len = snprintf(buf, buf_size, "%.2f", f);
    if (len < 0)
    {
        buf[0] = '\x00';
        return;
    }

    // snprintf returns the length the full text would have had. When that does
    // not fit, the end of the text is not inside buf, so there is nothing that
    // can safely be trimmed.
    if (static_cast<size_t>(len) >= buf_size)
        return;

    // Walk back from the last character; the bound is checked before the
    // character is read so the scan can never leave the buffer.
    size_t end = static_cast<size_t>(len);
    while (end > 0 && buf[end - 1] == '0')
    {
        buf[--end] = '\x00';
    }
}

/**
 * Writes an m x p matrix to an already opened stream.
 *
 * Output layout: a header line "m p", then one line per matrix row with the
 * values separated by single spaces. Rows are separated by '\n' and no newline
 * follows the last row. Each value is rendered by format_properly().
 *
 * @param result  destination stream
 * @param m       number of rows
 * @param p       number of columns
 * @param C       matrix values, row after row
 */
void write_results(FILE *result, int m, int p, float *C)
{
    fprintf(result, "%d %d\n", m, p);

    char text[16] = {0};
    for (int row = 0; row < m; ++row)
    {
        // A newline goes between rows, never after the final one.
        if (row != 0)
            fputc('\n', result);

        for (int col = 0; col < p; ++col)
        {
            // A single space goes between values, never before the first.
            if (col != 0)
                fputc(' ', result);

            format_properly(C[row * p + col], text, sizeof(text));
            fputs(text, result);
        }
    }
}

int main(int argc, char *argv[])
{
if (argc != 2)
{
std::cerr << "Usage: " << argv[0] << " <case_number>" << std::endl;
return 1;
return 2;
}

int case_number = std::atoi(argv[1]);
if (case_number < 0 || case_number > 9) {
if (case_number < 0 || case_number > 9)
{
std::cerr << "Case number must be between 0 and 9" << std::endl;
return 1;
return 2;
}

// Construct file paths
Expand All @@ -42,56 +144,113 @@ int main(int argc, char *argv[]) {
std::string result_file = folder + "result.raw";
std::string reference_file = folder + "output.raw";

// TODO Read input0.raw (matrix A)
FILE *input0 = fopen(input0_file.c_str(), "r");
if (input0 == NULL)
{
fprintf(stderr, "Error opening %s\n", input0_file.c_str());
return 1;
}
FILE *input1 = fopen(input1_file.c_str(), "r");
if (input1 == NULL)
{
fprintf(stderr, "Error opening %s\n", input1_file.c_str());
return 1;
}

int m, n, p;
fscanf(input0, "%d %d", &m, &n);
fscanf(input1, "%d %d", &n, &p);

float *A = new float[m * n];
float *B = new float[n * p];

// TODO Read input1.raw (matrix B)
float f = 0.0;
for (int i = 0; i < m; i++)
{
for (int j = 0; j < n; j++)
{
fscanf(input0, "%f", &f);
A[i * n + j] = f;
}
}

for (int i = 0; i < n; i++)
{
for (int j = 0; j < p; j++)
{
fscanf(input1, "%f", &f);
B[i * p + j] = f;
}
}

// Allocate memory for result matrices
float *C_naive = new float[m * p];
float *C_blocked = new float[m * p];
float *C_parallel = new float[m * p];

// NAIVE START
// Measure performance of naive_matmul
double start_time = omp_get_wtime();
naive_matmul(C_naive, A, B, m, n, p);
double naive_time = omp_get_wtime() - start_time;

// TODO Write naive result to file

FILE *result = fopen(result_file.c_str(), "w");
if (result == NULL)
{
fprintf(stderr, "Error opening %s\n", result_file.c_str());
return 1;
}
write_results(result, m, p, C_naive);
fclose(result);

// Validate naive result
bool naive_correct = validate_result(result_file, reference_file);
if (!naive_correct) {
if (!naive_correct)
{
std::cerr << "Naive result validation failed for case " << case_number << std::endl;
}

// BLOCKED START
// Measure performance of blocked_matmul (use block_size = 32 as default)
start_time = omp_get_wtime();
blocked_matmul(C_blocked, A, B, m, n, p, 32);
double blocked_time = omp_get_wtime() - start_time;

// TODO Write blocked result to file

result = fopen(result_file.c_str(), "w");
if (result == NULL)
{
fprintf(stderr, "Error opening %s\n", result_file.c_str());
return 1;
}
write_results(result, m, p, C_blocked);
fclose(result);

// Validate blocked result
bool blocked_correct = validate_result(result_file, reference_file);
if (!blocked_correct) {
if (!blocked_correct)
{
std::cerr << "Blocked result validation failed for case " << case_number << std::endl;
}

// PARALLEL START
// Measure performance of parallel_matmul
start_time = omp_get_wtime();
parallel_matmul(C_parallel, A, B, m, n, p);
double parallel_time = omp_get_wtime() - start_time;

// TODO Write parallel result to file

result = fopen(result_file.c_str(), "w");
if (result == NULL)
{
fprintf(stderr, "Error opening %s\n", result_file.c_str());
return 1;
}
write_results(result, m, p, C_parallel);
fclose(result);

// Validate parallel result
bool parallel_correct = validate_result(result_file, reference_file);
if (!parallel_correct) {
if (!parallel_correct)
{
std::cerr << "Parallel result validation failed for case " << case_number << std::endl;
}

Expand All @@ -103,6 +262,35 @@ int main(int argc, char *argv[]) {
std::cout << "Blocked speedup: " << (naive_time / blocked_time) << "x\n";
std::cout << "Parallel speedup: " << (naive_time / parallel_time) << "x\n";

#ifdef PERFORMANCE_MD
FILE *out;
out = fopen("performance.md", "a");
if (out != NULL)
{
if (ftell(out) == 0)
{
fprintf(out,
"| Test Case | Dimensions (m x n x p) | Naive Time (s) | Blocked Time (s) | Parallel Time (s) | Blocked Speedup | Parallel Speedup |\n"
"|-----------|------------------------|----------------|------------------|-------------------|-----------------|------------------|\n");
}
fprintf(
out,
"| %-9d | %-3d x %-3d x %-10d | %-14f | %-16f | %-17f | %.3fx%-9s | %.3fx%-10s |\n",
case_number,
m, n, p,
naive_time,
blocked_time,
parallel_time,
(naive_time / blocked_time), "",
(naive_time / parallel_time), "");
fclose(out);
}
else
{
fprintf(stderr, "Error opening performance.md, skipped\n");
}
#endif

// Clean up
delete[] A;
delete[] B;
Expand All @@ -111,4 +299,4 @@ int main(int argc, char *argv[]) {
delete[] C_parallel;

return 0;
}
}
Loading