diff --git a/README.md b/README.md
index d5a2886..b4bb7f2 100644
--- a/README.md
+++ b/README.md
@@ -205,3 +205,19 @@ git push origin student-name
 
 Good luck, and enjoy accelerating matrix multiplication with CUDA!
 
+### Results
+
+| Test Case | Dimensions (\( m \times n \times p \)) | Naive CPU (s) | Blocked CPU (s) | Parallel CPU (s) | Naive CUDA (s) | Tiled CUDA (s) | Tiled CUDA Speedup (vs. Naive CUDA) | Tiled CUDA Speedup (vs. Parallel CPU) |
+|-----------|------------------------|----------------|------------------|-------------------|-----------------|------------------|
+| 0 | 64 x 64 x 64| 0.00286956 | 0.0039845 | 0.0039845 | 0.104896 | 0.083584 | 1.255x |  0.048x |
+| 1 | 128 x 64 x 128 | 0.00819703  | 0.0107341 | 0.00161662 | 0.141888 | 0.094272 | 1.505x | 0.017x |
+| 2 | 100 x 128 x 56 | 0.00591884 | 0.00705264 | 0.00257082 | 0.121024 | 0.080576 | 1.502x | 0.032x | 
+| 3 | 128 x 64 x 128 | 0.00855164 | 0.0104604 | 0.00158808 | 0.139712 | 0.095424 | 1.464x | 0.017x |
+| 4 | 32 x 128 x 32 | 0.00101901 | 0.00142778 | 0.000622604 | 0.122368 | 0.063424 | 1.93x | 0.01x |
+| 5 | 200 x 100 x 256 | 0.030972 | 0.0239394 | 0.00744395 | 0.216736 | 0.175936 | 1.23x | 0.042x |
+| 6 | 256 x 256 x 256 | 0.0574685 | 0.0826263 | 0.0242001 | 0.310656 | 0.243968 | 1.273x | 0.099x |
+| 7 | 256  x 300 x 256 | 0.0637025 | 0.0922899 | 0.0178825 | 0.298624 | 0.246816 | 1.21x | 0.072x |
+| 8 | 64 x 128 x 64 | 0.00378908 | 0.00581044 | 0.001216 | 0.124512 | 0.076128 | 1.64x | 0.016x |
+| 9 | 256 x 256 x 257 | 0.0591941 | 0.0762655 | 0.0228895 | 0.29376 | 0.248832 | 1.18x | 0.092x |
+
+The two GPU programs were executed on the CSC Mahti supercomputer, accessed via VSCode Remote-SSH. Based on the results, the tiled CUDA implementation outperformed the naive CUDA version. However, when comparing tiled CUDA with parallel CPU computations, the parallel CPU showed better performance. This can be due to the overhead in  data transfers between CPU and GPU. Further, I assume tiled CUDA might perform better if we test it with very large matrices.
\ No newline at end of file
diff --git a/build/.cmake/api/v1/query/client-vscode/query.json b/build/.cmake/api/v1/query/client-vscode/query.json
new file mode 100644
index 0000000..82bb964
--- /dev/null
+++ b/build/.cmake/api/v1/query/client-vscode/query.json
@@ -0,0 +1 @@
+{"requests":[{"kind":"cache","version":2},{"kind":"codemodel","version":2},{"kind":"toolchains","version":1},{"kind":"cmakeFiles","version":1}]}
\ No newline at end of file
diff --git a/build/CMakeCache.txt b/build/CMakeCache.txt
new file mode 100644
index 0000000..2f42ed5
--- /dev/null
+++ b/build/CMakeCache.txt
@@ -0,0 +1,65 @@
+# This is the CMakeCache file.
+# For build in directory: /home/kaveesha/Desktop/Uthpala_Abo/Parallel_Programming/Homework_3/Homework-3/build
+# It was generated by CMake: /usr/bin/cmake
+# You can edit this file to change values found and used by cmake.
+# If you do not want to change any of the values, simply exit the editor.
+# If you do want to change a value, simply edit, save, and exit the editor.
+# The syntax for the file is as follows:
+# KEY:TYPE=VALUE
+# KEY is the name of a variable in the cache.
+# TYPE is a hint to GUIs for the type of VALUE, DO NOT EDIT TYPE!.
+# VALUE is the current value for the KEY.
+
+########################
+# EXTERNAL cache entries
+########################
+
+//No help, variable specified on the command line.
+CMAKE_BUILD_TYPE:STRING=Debug
+
+//No help, variable specified on the command line.
+CMAKE_CXX_COMPILER:FILEPATH=/usr/bin/g++
+
+//No help, variable specified on the command line.
+CMAKE_C_COMPILER:FILEPATH=/usr/bin/gcc
+
+//No help, variable specified on the command line.
+CMAKE_EXPORT_COMPILE_COMMANDS:BOOL=TRUE
+
+
+########################
+# INTERNAL cache entries
+########################
+
+//This is the directory where this CMakeCache.txt was created
+CMAKE_CACHEFILE_DIR:INTERNAL=/home/kaveesha/Desktop/Uthpala_Abo/Parallel_Programming/Homework_3/Homework-3/build
+//Major version of cmake used to create the current loaded cache
+CMAKE_CACHE_MAJOR_VERSION:INTERNAL=3
+//Minor version of cmake used to create the current loaded cache
+CMAKE_CACHE_MINOR_VERSION:INTERNAL=16
+//Patch version of cmake used to create the current loaded cache
+CMAKE_CACHE_PATCH_VERSION:INTERNAL=3
+//Path to CMake executable.
+CMAKE_COMMAND:INTERNAL=/usr/bin/cmake
+//Path to cpack program executable.
+CMAKE_CPACK_COMMAND:INTERNAL=/usr/bin/cpack
+//Path to ctest program executable.
+CMAKE_CTEST_COMMAND:INTERNAL=/usr/bin/ctest
+//Name of external makefile project generator.
+CMAKE_EXTRA_GENERATOR:INTERNAL=
+//Name of generator.
+CMAKE_GENERATOR:INTERNAL=Unix Makefiles
+//Generator instance identifier.
+CMAKE_GENERATOR_INSTANCE:INTERNAL=
+//Name of generator platform.
+CMAKE_GENERATOR_PLATFORM:INTERNAL=
+//Name of generator toolset.
+CMAKE_GENERATOR_TOOLSET:INTERNAL=
+//Source directory with the top level CMakeLists.txt file for this
+// project
+CMAKE_HOME_DIRECTORY:INTERNAL=/home/kaveesha/Desktop/Uthpala_Abo/Parallel_Programming/Homework_3/Homework-3
+//number of local generators
+CMAKE_NUMBER_OF_MAKEFILES:INTERNAL=1
+//Path to CMake installation.
+CMAKE_ROOT:INTERNAL=/usr/share/cmake-3.16
+
diff --git a/build/CMakeFiles/cmake.check_cache b/build/CMakeFiles/cmake.check_cache
new file mode 100644
index 0000000..3dccd73
--- /dev/null
+++ b/build/CMakeFiles/cmake.check_cache
@@ -0,0 +1 @@
+# This file is generated by cmake for dependency checking of the CMakeCache.txt file
diff --git a/main.cu b/main.cu
index 50df6a1..a216b90 100644
--- a/main.cu
+++ b/main.cu
@@ -3,34 +3,218 @@
 #include <string>
 #include <cuda_runtime.h>
 
+float *read_matrix(const std::string &filename, int &rows, int &cols)
+{
+    std::ifstream file(filename);
+    if (!file)
+    {
+        throw std::runtime_error("Cannot open file: " + filename);
+    }
+    file >> rows >> cols;
+    float *mat = new float[rows * cols];
+
+    for (int i = 0; i < rows * cols; ++i)
+    {
+        file >> mat[i];
+    }
+
+    file.close();
+    return mat;
+}
+
+void write_matrix(const std::string &filename, float *mat, int rows, int cols)
+{
+    std::ofstream file(filename);
+    if (!file)
+    {
+        throw std::runtime_error("Cannot open file for writing: " + filename);
+    }
+    file << rows << " " << cols << "\n";
+    for (int i = 0; i < rows * cols; ++i)
+    {
+        file << mat[i] << " ";
+        if ((i + 1) % cols == 0)
+        {
+            file << "\n";
+        }
+    }
+
+    file.close();
+}
+
 __global__ void naive_cuda_matmul(float *C, float *A, float *B, uint32_t m, uint32_t n, uint32_t p) {
-    // TODO: Implement naive CUDA matrix multiplication
+    int row = blockIdx.y * blockDim.y + threadIdx.y;
+    int col = blockIdx.x * blockDim.x + threadIdx.x;
+
+
+    float sum = 0.0f;
+    if (row < m && col < p) {
+        for (int k = 0; k < n; ++k) {
+            sum += A[row * n + k] * B[k * p + col];
+        }
+        C[row * p + col] = sum;
+    }
 }
 
 __global__ void tiled_cuda_matmul(float *C, float *A, float *B, uint32_t m, uint32_t n, uint32_t p, uint32_t tile_width) {
-    // TODO: Implement tiled CUDA matrix multiplication
+    __shared__ float tile_A[16][16];
+    __shared__ float tile_B[16][16];
+
+    int row = blockIdx.y * tile_width + threadIdx.y;
+    int col = blockIdx.x * tile_width + threadIdx.x;
+
+    float sum = 0.0f;
+
+    for (int t = 0; t < (n + tile_width - 1) / tile_width; ++t) {
+        if (row < m && (t * tile_width + threadIdx.x) < n)
+            tile_A[threadIdx.y][threadIdx.x] = A[row * n + t * tile_width + threadIdx.x];
+        else
+            tile_A[threadIdx.y][threadIdx.x] = 0.0f;
+
+        if (col < p && (t * tile_width + threadIdx.y) < n)
+            tile_B[threadIdx.y][threadIdx.x] = B[(t * tile_width + threadIdx.y) * p + col];
+        else
+            tile_B[threadIdx.y][threadIdx.x] = 0.0f;
+
+        __syncthreads();
+
+        for (int i = 0; i < tile_width; ++i) {
+            sum += tile_A[threadIdx.y][i] * tile_B[i][threadIdx.x];
+        }
+
+        __syncthreads();
+    }
+
+    if (row < m && col < p) {
+        C[row * p + col] = sum;
+    }
+
+
 }
 
 bool validate_result(const std::string &result_file, const std::string &reference_file) {
-    // TODO: Implement result validation (same as Assignment 2)
+    int result_rows, result_cols, ref_rows, ref_cols;
+    float *result = read_matrix(result_file, result_rows, result_cols);
+    float *reference = read_matrix(reference_file, ref_rows, ref_cols);
+
+    const float EPSILON = 1e-4f;
+
+    for (int i = 0; i < result_rows * result_cols; ++i)
+    {
+        if (std::abs(result[i] - reference[i]) > EPSILON)
+        {
+            return false;
+        }
+    }
+
+    return true;
+
+
 }
 
 int main(int argc, char *argv[]) {
+    if (argc != 2) {
+        std::cerr << "Usage: " << argv[0] << " <case_number>" << std::endl;
+        return 1;
+    }
+
+    int case_number = std::atoi(argv[1]);
+    if (case_number < 0 || case_number > 9) {
+        std::cerr << "Case number must be between 0 and 9" << std::endl;
+        return 1;
+    }
+
+    // Construct file paths
+    std::string folder = "data/" + std::to_string(case_number) + "/";
+    std::string input0_file = folder + "input0.raw";
+    std::string input1_file = folder + "input1.raw";
+    std::string result_file = folder + "result.raw";
+    std::string reference_file = folder + "output.raw";
+
+    int m, n, p;
+    float *A = read_matrix(input0_file, m, n);
+    float *B = read_matrix(input1_file, n, p);
+    float *C_naive = new float[m * p];
+    float *C_tiled = new float[m * p];
+
+    float *d_A, *d_B, *d_C;
+    cudaMalloc(&d_A, sizeof(float) * m * n);
+    cudaMalloc(&d_B, sizeof(float) * n * p);
+    cudaMalloc(&d_C, sizeof(float) * m * p);
+
+    cudaEvent_t start_naive, stop_naive;
+    cudaEventCreate(&start_naive);
+    cudaEventCreate(&stop_naive);
+    cudaEventRecord(start_naive);
+
+    cudaMemcpy(d_A, A, sizeof(float) * m * n, cudaMemcpyHostToDevice);
+    cudaMemcpy(d_B, B, sizeof(float) * n * p, cudaMemcpyHostToDevice);
+
+    dim3 blockDim(16, 16);
+    dim3 gridDim((p + 15) / 16, (m + 15) / 16);
+
+    // Launch naive kernel
+    cudaMemset(d_C, 0, sizeof(float) * m * p);
+    
+    naive_cuda_matmul<<<gridDim, blockDim>>>(d_C, d_A, d_B, m, n, p);
+    cudaError_t err = cudaGetLastError();
+    if (err != cudaSuccess) {
+        printf("CUDA kernel launch error: %s\n", cudaGetErrorString(err));
+    }
+    cudaDeviceSynchronize(); 
+
+    err = cudaGetLastError();
+    if (err != cudaSuccess) {
+        printf("CUDA kernel execution error: %s\n", cudaGetErrorString(err));
+    }
+
+    cudaMemcpy(C_naive, d_C, sizeof(float) * m * p, cudaMemcpyDeviceToHost);
+    
+    cudaEventRecord(stop_naive);
+    cudaEventSynchronize(stop_naive);
+    float naive_cuda_time;
+    cudaEventElapsedTime(&naive_cuda_time, start_naive, stop_naive);
+    write_matrix(result_file, C_naive, m, p);
+    validate_result(result_file, reference_file);
+
+    //Launch tiled cuda kernel
+    int tile_width = 16;
+
+    dim3 tileBlock(tile_width, tile_width);
+    dim3 tileGrid((p + tile_width - 1) / tile_width, (m + tile_width - 1) / tile_width);
 
+    cudaEvent_t start_tiled, stop_tiled;
+    cudaEventCreate(&start_tiled);
+    cudaEventCreate(&stop_tiled);
 
-    // TODO: Read input0.raw (matrix A) and input1.raw (matrix B)
+    cudaEventRecord(start_tiled);
 
-    // TODO: Use cudaMalloc and cudaMemcpy for GPU memory
+    cudaMemcpy(d_A, A, sizeof(float) * m * n, cudaMemcpyHostToDevice);
+    cudaMemcpy(d_B, B, sizeof(float) * n * p, cudaMemcpyHostToDevice);
 
-    // Measure naive CUDA performance
-    // TODO: Launch naive_cuda_matmul kernel
+    cudaMemset(d_C, 0, sizeof(float) * m * p);
+    tiled_cuda_matmul<<<tileGrid, tileBlock>>>(d_C, d_A, d_B, m, n, p, tile_width);
+    err = cudaGetLastError();
+    if (err != cudaSuccess) {
+        printf("CUDA kernel launch error: %s\n", cudaGetErrorString(err));
+    }
+    cudaDeviceSynchronize();
 
-    // TODO: Write naive CUDA result to file and validate
-    // Measure tiled CUDA performance
+    err = cudaGetLastError();
+    if (err != cudaSuccess) {
+        printf("CUDA kernel execution error: %s\n", cudaGetErrorString(err));
+    }
+    cudaMemcpy(C_tiled, d_C, sizeof(float) * m * p, cudaMemcpyDeviceToHost);
 
-    // TODO: Launch tiled_cuda_matmul kernel
+    cudaEventRecord(stop_tiled);
+    cudaEventSynchronize(stop_tiled);
+    
+    float tiled_cuda_time;
+    cudaEventElapsedTime(&tiled_cuda_time, start_tiled, stop_tiled);
+    write_matrix(result_file, C_tiled, m, p);
+    validate_result(result_file, reference_file);
+    
 
-    // TODO: Write tiled CUDA result to file and validate
 
     // Print performance results
     std::cout << "Case " << case_number << " (" << m << "x" << n << "x" << p << "):\n";
@@ -38,6 +222,13 @@ int main(int argc, char *argv[]) {
     std::cout << "Tiled CUDA time: " << tiled_cuda_time << " seconds\n";
 
     // Clean up
+    delete[] A;
+    delete[] B;
+    delete[] C_naive;
+    delete[] C_tiled;
+    cudaFree(d_A);
+    cudaFree(d_B);
+    cudaFree(d_C);
 
     return 0;
 }
\ No newline at end of file
diff --git a/run_hw3.sh b/run_hw3.sh
new file mode 100644
index 0000000..f7c7a9e
--- /dev/null
+++ b/run_hw3.sh
@@ -0,0 +1,12 @@
+#!/bin/bash
+#SBATCH --job-name=hw3
+#SBATCH --account=project_2014289
+#SBATCH --partition=gpusmall
+#SBATCH --time=00:30:00
+#SBATCH --ntasks=1
+#SBATCH --cpus-per-task=4
+#SBATCH --mem-per-cpu=8G
+#SBATCH --gres=gpu:a100:2
+#SBATCH --output=hw3.txt
+
+srun hw3 2