diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..f797853
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,5 @@
+.vscode/
+result.raw
+matmul
+matmul_p
+performance.md
diff --git a/CMakeLists.txt b/CMakeLists.txt
deleted file mode 100644
index 9b04fd0..0000000
--- a/CMakeLists.txt
+++ /dev/null
@@ -1,25 +0,0 @@
-cmake_minimum_required(VERSION 3.10)
-
-project(MatrixMultiplication)
-
-set(CMAKE_CXX_STANDARD 11)
-
-set(CMAKE_CXX_STANDARD_REQUIRED ON)
-
-find_package(OpenMP REQUIRED)
-
-if(OpenMP_CXX_FOUND)
-    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${OpenMP_CXX_FLAGS}")
-endif()
-
-if(APPLE)
-    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -D_Alignof=alignof")
-endif()
-
-
-add_executable(matmul main_ans.cpp)
-
-
-if(OpenMP_CXX_FOUND)
-    target_link_libraries(matmul PUBLIC OpenMP::OpenMP_CXX)
-endif()
\ No newline at end of file
diff --git a/Makefile b/Makefile
new file mode 100644
index 0000000..22748a8
--- /dev/null
+++ b/Makefile
@@ -0,0 +1,12 @@
+.PHONY: clean
+# Warnings are errors; any caller-supplied CFLAGS are appended after the defaults.
+override CFLAGS := -Wall -Wextra -Wpedantic -Werror -fopenmp -O3 $(CFLAGS)
+# Default goal: the benchmark binary. CXX defaults to g++ in GNU make and may be overridden.
+matmul: main.cpp
+	$(CXX) $(CFLAGS) -o $@ $<
+# Same program built with -DPERFORMANCE_MD, which also appends a result row to performance.md.
+matmul_p: main.cpp
+	$(CXX) $(CFLAGS) -DPERFORMANCE_MD -o $@ $<
+
+clean:
+	rm -f ./matmul ./matmul_p
diff --git a/README.md b/README.md
index 0f91d63..77f2b43 100644
--- a/README.md
+++ b/README.md
@@ -235,4 +235,26 @@ git push origin student-name
     - Validate each implementation against `output.raw` to ensure correctness before optimizing.
     - Use small test cases to debug your blocked and parallel implementations.
 
-Good luck, and enjoy optimizing your matrix multiplication!
\ No newline at end of file
+Good luck, and enjoy optimizing your matrix multiplication!
+
+---
+
+### Results
+
+The best results can be seen in the table below (block size 32, 12 threads):
+
+| Test Case | Dimensions (m x n x p) | Naive Time (s) | Blocked Time (s) | Parallel Time (s) | Blocked Speedup | Parallel Speedup |
+|-----------|------------------------|----------------|------------------|-------------------|-----------------|------------------|
+| 0         | 64  x 64  x 64         | 0.000140       | 0.000127         | 0.000260          | 1.101x          | 0.538x           |
+| 1         | 128 x 64  x 128        | 0.000570       | 0.000511         | 0.000350          | 1.115x          | 1.627x           |
+| 2         | 100 x 128 x 56         | 0.000531       | 0.000389         | 0.008613          | 1.365x          | 0.062x           |
+| 3         | 128 x 64  x 128        | 0.000563       | 0.000571         | 0.000323          | 0.987x          | 1.742x           |
+| 4         | 32  x 128 x 32         | 0.000103       | 0.000079         | 0.000261          | 1.304x          | 0.396x           |
+| 5         | 200 x 100 x 256        | 0.005079       | 0.002296         | 0.001107          | 2.212x          | 4.587x           |
+| 6         | 256 x 256 x 256        | 0.017202       | 0.007565         | 0.003063          | 2.274x          | 5.616x           |
+| 7         | 256 x 300 x 256        | 0.020208       | 0.009192         | 0.003385          | 2.199x          | 5.970x           |
+| 8         | 64  x 128 x 64         | 0.000376       | 0.000306         | 0.000302          | 1.231x          | 1.246x           |
+| 9         | 256 x 256 x 257        | 0.011176       | 0.007139         | 0.001828          | 1.565x          | 6.115x           |
+
+In most cases the parallel speedup was significantly higher than the blocked speedup; however, for cases 0, 2, and 4
+the parallel implementation ended up being much slower than both the blocked and naive implementations.
diff --git a/main.cpp b/main.cpp
index 65bf108..ddf2bf2 100644
--- a/main.cpp
+++ b/main.cpp
@@ -3,36 +3,138 @@
 #include <string>
 #include <omp.h>
 #include <cmath>
+#include <cstdint>
 
-void naive_matmul(float *C, float *A, float *B, uint32_t m, uint32_t n, uint32_t p) {
-    //TODO : Implement naive matrix multiplication
+void naive_matmul(float *C, float *A, float *B, uint32_t m, uint32_t n, uint32_t p)
+{
+    // Reference triple loop: C (m x p) = A (m x n) * B (n x p), all indexed row-major.
+    for (uint32_t row = 0; row < m; ++row)
+    {
+        const float *a_row = A + row * n; // first element of this row of A
+        for (uint32_t col = 0; col < p; ++col)
+        {
+            float acc = 0; // dot product of A's row with B's column, summed in increasing k
+            for (uint32_t k = 0; k < n; ++k)
+                acc += a_row[k] * B[k * p + col];
+            C[row * p + col] = acc;
+        }
+    }
 }
 
-void blocked_matmul(float *C, float *A, float *B, uint32_t m, uint32_t n, uint32_t p, uint32_t block_size) {
-    // TODO: Implement blocked matrix multiplication
+void blocked_matmul(float *C, float *A, float *B, uint32_t m, uint32_t n, uint32_t p, uint32_t block_size)
+{
     // A is m x n, B is n x p, C is m x p
     // Use block_size to divide matrices into submatrices
+    // Tiles accumulate over kk: the first k-tile starts each sum at 0, later ones resume from C. Requires block_size > 0.
+    for (uint32_t ii = 0; ii < m; ii += block_size)
+        for (uint32_t jj = 0; jj < p; jj += block_size)
+            for (uint32_t kk = 0; kk < n; kk += block_size)
+                for (uint32_t i = ii; i < std::min(ii + block_size, m); i++)
+                    for (uint32_t j = jj; j < std::min(jj + block_size, p); j++)
+                    {
+                        float sum = (kk == 0) ? 0.0f : C[i * p + j]; // local accumulator avoids re-reading C per k
+                        for (uint32_t k = kk; k < std::min(kk + block_size, n); k++)
+                            sum += A[i * n + k] * B[k * p + j];
+                        C[i * p + j] = sum; // running total so far; final once the last k-tile is done
+                    }
 }
 
-void parallel_matmul(float *C, float *A, float *B, uint32_t m, uint32_t n, uint32_t p) {
-    // TODO: Implement parallel matrix multiplication using OpenMP
-    // A is m x n, B is n x p, C is m x p
+void parallel_matmul(float *C, float *A, float *B, uint32_t m, uint32_t n, uint32_t p)
+{
+    // C (m x p) = A (m x n) * B (n x p); iterations of the outer row loop are shared among OpenMP threads.
+#pragma omp parallel for
+    for (uint32_t row = 0; row < m; ++row)
+        for (uint32_t col = 0; col < p; ++col)
+        {
+            float acc = 0; // local accumulator, so C is written once per element
+            for (uint32_t k = 0; k < n; ++k)
+                acc += A[row * n + k] * B[k * p + col];
+            C[row * p + col] = acc;
+        }
 }
 
-bool validate_result(const std::string &result_file, const std::string &reference_file) {
-   //TODO : Implement result validation
+bool validate_result(const std::string &result_file, const std::string &reference_file)
+{
+    // Returns true only when the two files are byte-for-byte identical.
+    // Exits the process with status 1 if either file cannot be opened.
+    FILE *a = fopen(result_file.c_str(), "r");
+    if (a == NULL)
+    {
+        fprintf(stderr, "Error opening %s\n", result_file.c_str());
+        exit(1);
+    }
+    FILE *b = fopen(reference_file.c_str(), "r");
+    if (b == NULL)
+    {
+        fprintf(stderr, "Error opening %s\n", reference_file.c_str());
+        fclose(a);
+        exit(1);
+    }
+
+    // Walk both streams in lockstep. Comparing the EOF sentinel too means a length
+    // mismatch is caught here, so no separate fseek/ftell size probe is needed.
+    bool same = true;
+    for (;;)
+    {
+        int ca = getc(a);
+        int cb = getc(b);
+        if (ca != cb)
+        {
+            same = false;
+            break;
+        }
+        if (ca == EOF)
+            break;
+    }
+    fclose(a);
+    fclose(b);
+    return same;
 }
 
-int main(int argc, char *argv[]) {
-    if (argc != 2) {
+/** Formats `f` into `buf` as "%.2f" with trailing '0' characters stripped (the '.' is kept).
+ *  If snprintf fails or the text does not fit in `buf_size` bytes, the buffer is left exactly
+ *  as snprintf wrote it: trimming a truncated number would change its digits. */
+void format_properly(float f, char *buf, size_t buf_size)
+{
+    int len = snprintf(buf, buf_size, "%.2f", f);
+    if (len <= 0 || (size_t)len >= buf_size)
+        return; // error or truncated output: there is no complete fraction to trim
+    size_t end = (size_t)len; // one past the last character stored
+    while (end > 0 && buf[end - 1] == '0')
+        buf[--end] = '\x00';
+}
+
+void write_results(FILE *result, int m, int p, float *C)
+{
+    // Layout: an "m p" header line, then one line per row with space-separated values.
+    // Rows are separated by '\n'; nothing is written after the last row.
+    char text[16] = {0};
+    fprintf(result, "%d %d\n", m, p);
+    for (int row = 0; row < m; ++row)
+    {
+        if (row > 0)
+            fputc('\n', result);
+        for (int col = 0; col < p; ++col)
+        {
+            format_properly(C[row * p + col], text, sizeof(text));
+            fprintf(result, "%s%s", col > 0 ? " " : "", text);
+        }
+    }
+}
+
+int main(int argc, char *argv[])
+{
+    if (argc != 2)
+    {
         std::cerr << "Usage: " << argv[0] << " <case_number>" << std::endl;
-        return 1;
+        return 2;
     }
 
     int case_number = std::atoi(argv[1]);
-    if (case_number < 0 || case_number > 9) {
+    if (case_number < 0 || case_number > 9)
+    {
         std::cerr << "Case number must be between 0 and 9" << std::endl;
-        return 1;
+        return 2;
     }
 
     // Construct file paths
@@ -42,56 +144,113 @@ int main(int argc, char *argv[]) {
     std::string result_file = folder + "result.raw";
     std::string reference_file = folder + "output.raw";
 
-    // TODO Read input0.raw (matrix A)
+    FILE *input0 = fopen(input0_file.c_str(), "r");
+    FILE *input1 = fopen(input1_file.c_str(), "r");
+    if (input0 == NULL || input1 == NULL)
+    {
+        fprintf(stderr, "Error opening %s\n", (input0 == NULL ? input0_file : input1_file).c_str());
+        return 1;
+    }
+
+    // Each file starts with "<rows> <cols>"; A is m x n, so B's row count (n_b) must equal n.
+    int m = 0, n = 0, p = 0, n_b = 0;
+    if (fscanf(input0, "%d %d", &m, &n) != 2 || fscanf(input1, "%d %d", &n_b, &p) != 2 ||
+        m <= 0 || n <= 0 || p <= 0 || n_b != n)
+    {
+        fprintf(stderr, "Invalid or mismatched matrix dimensions for case %d\n", case_number);
+        return 1;
+    }
 
+    float *A = new float[m * n];
+    float *B = new float[n * p];
 
-    // TODO Read input1.raw (matrix B)
+    float f = 0.0;
+    for (int i = 0; i < m; i++)
+    {
+        for (int j = 0; j < n; j++)
+        {
+            fscanf(input0, "%f", &f);
+            A[i * n + j] = f;
+        }
+    }
 
+    for (int i = 0; i < n; i++)
+    {
+        for (int j = 0; j < p; j++)
+        {
+            fscanf(input1, "%f", &f);
+            B[i * p + j] = f;
+        }
+    }
 
     // Allocate memory for result matrices
     float *C_naive = new float[m * p];
     float *C_blocked = new float[m * p];
     float *C_parallel = new float[m * p];
 
+    // NAIVE START
     // Measure performance of naive_matmul
     double start_time = omp_get_wtime();
     naive_matmul(C_naive, A, B, m, n, p);
     double naive_time = omp_get_wtime() - start_time;
 
-    // TODO Write naive result to file
-
+    FILE *result = fopen(result_file.c_str(), "w");
+    if (result == NULL)
+    {
+        fprintf(stderr, "Error opening %s\n", result_file.c_str());
+        return 1;
+    }
+    write_results(result, m, p, C_naive);
+    fclose(result);
 
     // Validate naive result
     bool naive_correct = validate_result(result_file, reference_file);
-    if (!naive_correct) {
+    if (!naive_correct)
+    {
         std::cerr << "Naive result validation failed for case " << case_number << std::endl;
     }
 
+    // BLOCKED START
     // Measure performance of blocked_matmul (use block_size = 32 as default)
     start_time = omp_get_wtime();
     blocked_matmul(C_blocked, A, B, m, n, p, 32);
     double blocked_time = omp_get_wtime() - start_time;
 
-    // TODO Write blocked result to file
-
+    result = fopen(result_file.c_str(), "w");
+    if (result == NULL)
+    {
+        fprintf(stderr, "Error opening %s\n", result_file.c_str());
+        return 1;
+    }
+    write_results(result, m, p, C_blocked);
+    fclose(result);
 
     // Validate blocked result
     bool blocked_correct = validate_result(result_file, reference_file);
-    if (!blocked_correct) {
+    if (!blocked_correct)
+    {
         std::cerr << "Blocked result validation failed for case " << case_number << std::endl;
     }
 
+    // PARALLEL START
     // Measure performance of parallel_matmul
     start_time = omp_get_wtime();
     parallel_matmul(C_parallel, A, B, m, n, p);
     double parallel_time = omp_get_wtime() - start_time;
 
-    // TODO Write parallel result to file
-
+    result = fopen(result_file.c_str(), "w");
+    if (result == NULL)
+    {
+        fprintf(stderr, "Error opening %s\n", result_file.c_str());
+        return 1;
+    }
+    write_results(result, m, p, C_parallel);
+    fclose(result);
 
     // Validate parallel result
     bool parallel_correct = validate_result(result_file, reference_file);
-    if (!parallel_correct) {
+    if (!parallel_correct)
+    {
         std::cerr << "Parallel result validation failed for case " << case_number << std::endl;
     }
 
@@ -103,6 +262,35 @@ int main(int argc, char *argv[]) {
     std::cout << "Blocked speedup: " << (naive_time / blocked_time) << "x\n";
     std::cout << "Parallel speedup: " << (naive_time / parallel_time) << "x\n";
 
+#ifdef PERFORMANCE_MD
+    // Append one markdown table row for this run; the header is written only when the file is empty.
+    FILE *out = fopen("performance.md", "a");
+    if (out != NULL)
+    {
+        // Seek to the end before ftell: an append stream's initial position is implementation-defined.
+        if (fseek(out, 0, SEEK_END) == 0 && ftell(out) == 0)
+        {
+            fprintf(out,
+                    "| Test Case | Dimensions (m x n x p) | Naive Time (s) | Blocked Time (s) | Parallel Time (s) | Blocked Speedup | Parallel Speedup |\n"
+                    "|-----------|------------------------|----------------|------------------|-------------------|-----------------|------------------|\n");
+        }
+        fprintf(out,
+            "| %-9d | %-3d x %-3d x %-10d | %-14f | %-16f | %-17f | %.3fx%-9s | %.3fx%-10s |\n",
+            case_number,
+            m, n, p,
+            naive_time,
+            blocked_time,
+            parallel_time,
+            (naive_time / blocked_time), "",
+            (naive_time / parallel_time), "");
+        fclose(out);
+    }
+    else
+    {
+        fprintf(stderr, "Error opening performance.md, skipped\n");
+    }
+#endif
+
     // Clean up
     delete[] A;
     delete[] B;
@@ -111,4 +299,4 @@ int main(int argc, char *argv[]) {
     delete[] C_parallel;
 
     return 0;
-}
\ No newline at end of file
+}
diff --git a/performance_i7-2640m_2.80GHz.md b/performance_i7-2640m_2.80GHz.md
new file mode 100644
index 0000000..2585d73
--- /dev/null
+++ b/performance_i7-2640m_2.80GHz.md
@@ -0,0 +1,32 @@
+| Test Case | Dimensions (m x n x p) | Naive Time (s) | Blocked Time (s) | Parallel Time (s) | Blocked Speedup | Parallel Speedup |
+|-----------|------------------------|----------------|------------------|-------------------|-----------------|------------------|
+| 0         | 64  x 64  x 64         | 0.000476       | 0.000535         | 0.002130          | 0.890x          | 0.224x           |
+| 0         | 64  x 64  x 64         | 0.000343       | 0.000334         | 0.003825          | 1.028x          | 0.090x           |
+| 0         | 64  x 64  x 64         | 0.000306       | 0.000287         | 0.002904          | 1.069x          | 0.106x           |
+| 1         | 128 x 64  x 128        | 0.001485       | 0.001315         | 0.002541          | 1.129x          | 0.585x           |
+| 1         | 128 x 64  x 128        | 0.001343       | 0.001968         | 0.000823          | 0.682x          | 1.632x           |
+| 1         | 128 x 64  x 128        | 0.001315       | 0.001949         | 0.006872          | 0.674x          | 0.191x           |
+| 2         | 100 x 128 x 56         | 0.000823       | 0.000740         | 0.004221          | 1.112x          | 0.195x           |
+| 2         | 100 x 128 x 56         | 0.000857       | 0.000749         | 0.000768          | 1.145x          | 1.117x           |
+| 2         | 100 x 128 x 56         | 0.001343       | 0.001336         | 0.003337          | 1.006x          | 0.403x           |
+| 3         | 128 x 64  x 128        | 0.001323       | 0.001971         | 0.000885          | 0.671x          | 1.494x           |
+| 3         | 128 x 64  x 128        | 0.001348       | 0.001126         | 0.002626          | 1.197x          | 0.513x           |
+| 3         | 128 x 64  x 128        | 0.001858       | 0.001351         | 0.000844          | 1.376x          | 2.202x           |
+| 4         | 32  x 128 x 32         | 0.000226       | 0.000183         | 0.003853          | 1.237x          | 0.059x           |
+| 4         | 32  x 128 x 32         | 0.000146       | 0.000133         | 0.000252          | 1.101x          | 0.581x           |
+| 4         | 32  x 128 x 32         | 0.000146       | 0.000134         | 0.003343          | 1.096x          | 0.044x           |
+| 5         | 200 x 100 x 256        | 0.006846       | 0.011000         | 0.009009          | 0.622x          | 0.760x           |
+| 5         | 200 x 100 x 256        | 0.007464       | 0.005513         | 0.003677          | 1.354x          | 2.030x           |
+| 5         | 200 x 100 x 256        | 0.007161       | 0.006429         | 0.006429          | 1.114x          | 1.114x           |
+| 6         | 256 x 256 x 256        | 0.028310       | 0.019140         | 0.026673          | 1.479x          | 1.061x           |
+| 6         | 256 x 256 x 256        | 0.028416       | 0.018362         | 0.015405          | 1.548x          | 1.845x           |
+| 6         | 256 x 256 x 256        | 0.028912       | 0.018607         | 0.020357          | 1.554x          | 1.420x           |
+| 7         | 256 x 300 x 256        | 0.035351       | 0.025475         | 0.026895          | 1.388x          | 1.314x           |
+| 7         | 256 x 300 x 256        | 0.034877       | 0.021123         | 0.018867          | 1.651x          | 1.849x           |
+| 7         | 256 x 300 x 256        | 0.034435       | 0.021094         | 0.018698          | 1.632x          | 1.842x           |
+| 8         | 64  x 128 x 64         | 0.000611       | 0.000588         | 0.002378          | 1.039x          | 0.257x           |
+| 8         | 64  x 128 x 64         | 0.001242       | 0.000982         | 0.002868          | 1.265x          | 0.433x           |
+| 8         | 64  x 128 x 64         | 0.000803       | 0.000542         | 0.004964          | 1.483x          | 0.162x           |
+| 9         | 256 x 256 x 257        | 0.021527       | 0.021547         | 0.018358          | 0.999x          | 1.173x           |
+| 9         | 256 x 256 x 257        | 0.021785       | 0.017791         | 0.010855          | 1.225x          | 2.007x           |
+| 9         | 256 x 256 x 257        | 0.021559       | 0.017826         | 0.010466          | 1.209x          | 2.060x           |
diff --git a/performance_ryzen-5-5600x_3.7GHz.md b/performance_ryzen-5-5600x_3.7GHz.md
new file mode 100644
index 0000000..d16207b
--- /dev/null
+++ b/performance_ryzen-5-5600x_3.7GHz.md
@@ -0,0 +1,32 @@
+| Test Case | Dimensions (m x n x p) | Naive Time (s) | Blocked Time (s) | Parallel Time (s) | Blocked Speedup | Parallel Speedup |
+|-----------|------------------------|----------------|------------------|-------------------|-----------------|------------------|
+| 0         | 64  x 64  x 64         | 0.000140       | 0.000127         | 0.000266          | 1.103x          | 0.526x           |
+| 0         | 64  x 64  x 64         | 0.000140       | 0.000127         | 0.000260          | 1.101x          | 0.538x           |
+| 0         | 64  x 64  x 64         | 0.000141       | 0.000139         | 0.000308          | 1.016x          | 0.457x           |
+| 1         | 128 x 64  x 128        | 0.000585       | 0.000616         | 0.001149          | 0.950x          | 0.509x           |
+| 1         | 128 x 64  x 128        | 0.000570       | 0.000511         | 0.000350          | 1.115x          | 1.627x           |
+| 1         | 128 x 64  x 128        | 0.000561       | 0.000470         | 0.000360          | 1.193x          | 1.558x           |
+| 2         | 100 x 128 x 56         | 0.000466       | 0.000339         | 0.008811          | 1.377x          | 0.053x           |
+| 2         | 100 x 128 x 56         | 0.000531       | 0.000389         | 0.008613          | 1.365x          | 0.062x           |
+| 2         | 100 x 128 x 56         | 0.000460       | 0.000343         | 0.009490          | 1.340x          | 0.048x           |
+| 3         | 128 x 64  x 128        | 0.000557       | 0.000489         | 0.000357          | 1.140x          | 1.561x           |
+| 3         | 128 x 64  x 128        | 0.000563       | 0.000571         | 0.000323          | 0.987x          | 1.742x           |
+| 3         | 128 x 64  x 128        | 0.000558       | 0.000476         | 0.000424          | 1.172x          | 1.317x           |
+| 4         | 32  x 128 x 32         | 0.000091       | 0.000069         | 0.002518          | 1.327x          | 0.036x           |
+| 4         | 32  x 128 x 32         | 0.000083       | 0.000069         | 0.000262          | 1.195x          | 0.315x           |
+| 4         | 32  x 128 x 32         | 0.000103       | 0.000079         | 0.000261          | 1.304x          | 0.396x           |
+| 5         | 200 x 100 x 256        | 0.005067       | 0.002468         | 0.001149          | 2.053x          | 4.411x           |
+| 5         | 200 x 100 x 256        | 0.005037       | 0.002748         | 0.001177          | 1.833x          | 4.279x           |
+| 5         | 200 x 100 x 256        | 0.005079       | 0.002296         | 0.001107          | 2.212x          | 4.587x           |
+| 6         | 256 x 256 x 256        | 0.016594       | 0.008135         | 0.003038          | 2.040x          | 5.462x           |
+| 6         | 256 x 256 x 256        | 0.017552       | 0.007665         | 0.003464          | 2.290x          | 5.067x           |
+| 6         | 256 x 256 x 256        | 0.017202       | 0.007565         | 0.003063          | 2.274x          | 5.616x           |
+| 7         | 256 x 300 x 256        | 0.019288       | 0.008852         | 0.003956          | 2.179x          | 4.875x           |
+| 7         | 256 x 300 x 256        | 0.019299       | 0.008837         | 0.004133          | 2.184x          | 4.669x           |
+| 7         | 256 x 300 x 256        | 0.020208       | 0.009192         | 0.003385          | 2.199x          | 5.970x           |
+| 8         | 64  x 128 x 64         | 0.000346       | 0.000245         | 0.010065          | 1.415x          | 0.034x           |
+| 8         | 64  x 128 x 64         | 0.000376       | 0.000306         | 0.000302          | 1.231x          | 1.246x           |
+| 8         | 64  x 128 x 64         | 0.000346       | 0.000252         | 0.009449          | 1.373x          | 0.037x           |
+| 9         | 256 x 256 x 257        | 0.010446       | 0.007139         | 0.001865          | 1.463x          | 5.602x           |
+| 9         | 256 x 256 x 257        | 0.012096       | 0.007262         | 0.009335          | 1.666x          | 1.296x           |
+| 9         | 256 x 256 x 257        | 0.011176       | 0.007139         | 0.001828          | 1.565x          | 6.115x           |