Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions cuda-mma/.gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
build/
output/
57 changes: 57 additions & 0 deletions cuda-mma/Makefile
Original file line number Diff line number Diff line change
@@ -0,0 +1,57 @@
# ---------------------------------------------------------------------------
# Build system for cuda-mma
# ---------------------------------------------------------------------------

NVCC := nvcc
CXX_FLAGS := -std=c++17 -O3

# sm_80 = Ampere (A100 / RTX 30xx) sm_86 = RTX 30xx consumer
# sm_70 = Volta (V100) sm_75 = Turing (RTX 20xx / T4)
# sm_89 = Ada (RTX 40xx)
# Override on the command line: make ARCH=sm_75
ARCH ?= sm_80

# -rdc=true: relocatable device code, so device symbols may span .cu files.
# --generate-line-info: embeds source-line info for Nsight profiling.
NVCC_FLAGS := $(CXX_FLAGS) -arch=$(ARCH) \
-rdc=true \
--generate-line-info \
-Xcompiler -Wall \
-I. -Iinclude

BIN_DIR := build/bin
TARGET := $(BIN_DIR)/cuda_mma
TEST := $(BIN_DIR)/test_correctness
# NOTE(review): each binary recompiles all of its sources whenever any
# source or header changes — fine at this size; move to per-object rules
# if build times grow.
SRCS := main.cu src/kernels.cu src/utils.cu src/benchmarks.cu
TEST_SRCS := test/test_correctness.cu src/kernels.cu src/utils.cu
HDRS := include/cuda_check.cuh include/kernels.cuh \
include/timer.h include/utils.h include/benchmarks.h

.PHONY: all clean run test profile

all: $(TARGET) $(TEST)

# Order-only prerequisite target (after the | below): the directory is
# created once and its timestamp never retriggers a relink.
$(BIN_DIR):
mkdir -p $(BIN_DIR)

$(TARGET): $(SRCS) $(HDRS) | $(BIN_DIR)
$(NVCC) $(NVCC_FLAGS) -o $@ $(SRCS)

$(TEST): $(TEST_SRCS) $(HDRS) | $(BIN_DIR)
$(NVCC) $(NVCC_FLAGS) -o $@ $(TEST_SRCS)

run: $(TARGET)
./$(TARGET)

test: $(TEST)
./$(TEST)

# Requires Nsight Compute (ncu).
# Metrics: global load bytes, global store bytes, achieved occupancy,
# and predicated-on FFMA instruction count.
profile: $(TARGET)
ncu --metrics \
l1tex__t_bytes_pipe_lsu_mem_global_op_ld.sum,\
l1tex__t_bytes_pipe_lsu_mem_global_op_st.sum,\
sm__warps_active.avg.pct_of_peak_sustained_active,\
smsp__sass_thread_inst_executed_op_ffma_pred_on.sum \
./$(TARGET)

# NOTE(review): removes build/ only; generated CSV/PNG files in output/
# are intentionally(?) kept — confirm.
clean:
rm -rf build
5 changes: 5 additions & 0 deletions cuda-mma/include/benchmarks.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
#pragma once

// Host-side benchmark drivers, one per SGEMM kernel variant (defined in
// src/benchmarks.cu). Each takes a square problem size S and host input
// matrices h_A, h_B (each at least S*S floats) and returns a time in
// milliseconds — main.cu interprets the return value as the average
// kernel time over its ITERS runs.
float benchmark_naive(int S, const float* h_A, const float* h_B);
float benchmark_tiled(int S, const float* h_A, const float* h_B);
float benchmark_coalesced(int S, const float* h_A, const float* h_B);
40 changes: 40 additions & 0 deletions cuda-mma/include/cuda_check.cuh
Original file line number Diff line number Diff line change
@@ -0,0 +1,40 @@
// cuda_check.cuh
//
// Error-checking macros for CUDA host code. All three print the failing
// location and error string to stderr and terminate the process — this
// project treats any CUDA error as fatal.
#pragma once

#include <cuda_runtime.h>
#include <cstdio>
#include <cstdlib>

// ── Host-side API checks ──────────────────────────────────────────────
// Wrap every runtime API call:  CUDA_CHECK(cudaMalloc(&p, bytes));
// Prints the stringized call site plus cudaGetErrorString on failure.
#define CUDA_CHECK(call) \
do { \
cudaError_t _e = (call); \
if (_e != cudaSuccess) { \
fprintf(stderr, "[CUDA ERROR] %s:%d %s\n → %s\n", \
__FILE__, __LINE__, #call, \
cudaGetErrorString(_e)); \
std::exit(EXIT_FAILURE); \
} \
} while (0)

// ── Kernel launch checks ──────────────────────────────────────────────
// Call immediately after a <<<...>>> launch to catch configuration
// errors (invalid grid/block dims, excessive shared memory). Note that
// cudaGetLastError both reads AND clears the sticky error state.
#define CHECK_LAST_ERROR() \
do { \
cudaError_t _e = cudaGetLastError(); \
if (_e != cudaSuccess) { \
fprintf(stderr, "[KERNEL LAUNCH ERROR] %s:%d → %s\n", \
__FILE__, __LINE__, cudaGetErrorString(_e)); \
std::exit(EXIT_FAILURE); \
} \
} while (0)

// ── Post-kernel execution checks ──────────────────────────────────────
// Blocks until all previously issued device work completes, surfacing
// asynchronous execution errors (e.g. illegal address). The sync is
// expensive — use at coarse boundaries, not inside hot loops.
#define CHECK_SYNC() \
do { \
cudaError_t _e = cudaDeviceSynchronize(); \
if (_e != cudaSuccess) { \
fprintf(stderr, "[KERNEL EXEC ERROR] %s:%d → %s\n", \
__FILE__, __LINE__, cudaGetErrorString(_e)); \
std::exit(EXIT_FAILURE); \
} \
} while (0)
38 changes: 38 additions & 0 deletions cuda-mma/include/kernels.cuh
Original file line number Diff line number Diff line change
@@ -0,0 +1,38 @@
#pragma once

#include <cuda_runtime.h>

// SGEMM kernel declarations; definitions live in src/kernels.cu.
// All kernels compute C = alpha * A * B + beta * C over row-major data.
//
// NOTE(review): the templated kernels are declared but not defined here.
// With -rdc=true they link only if src/kernels.cu explicitly instantiates
// every BLOCKSIZE / TILE_SIZE value the callers use — confirm those
// instantiations exist.

// One thread per output element; no shared-memory reuse.
__global__ void sgemm_naive(
size_t M, // Number of rows in A and C
size_t N, // Number of columns in B and C
size_t K, // Number of columns in A and rows in B
float alpha, // Scaling factor for the product of A and B
const float *A, // [M x K] row-major
const float *B, // [K x N] row-major
float beta, // Scaling factor for C
float *C); // [M x N] row-major (in-out: C = alpha*A*B + beta*C)


// Variant with thread-to-element mapping arranged for coalesced global
// loads/stores; BLOCKSIZE is a compile-time block dimension.
template <int BLOCKSIZE>
__global__ void sgemm_coalesced(
size_t M, // Number of rows in A and C
size_t N, // Number of columns in B and C
size_t K, // Number of columns in A and rows in B
float alpha, // Scaling factor for the product of A and B
const float *A, // [M x K] row-major
const float *B, // [K x N] row-major
float beta, // Scaling factor for C
float *C); // [M x N] row-major (in-out: C = alpha*A*B + beta*C)


// Shared-memory tiled variant; TILE_SIZE is the compile-time tile edge.
template <int TILE_SIZE>
__global__ void sgemm_tiled(
size_t M, // Number of rows in A and C
size_t N, // Number of columns in B and C
size_t K, // Number of columns in A and rows in B
float alpha, // Scaling factor for the product of A and B
const float *A, // [M x K] row-major
const float *B, // [K x N] row-major
float beta, // Scaling factor for C
float *C); // [M x N] row-major (in-out: C = alpha*A*B + beta*C)
29 changes: 29 additions & 0 deletions cuda-mma/include/timer.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,29 @@
#pragma once

#include <cuda_runtime.h>

// ---------------------------------------------------------------------------
// CUDA event-based timer. Usage:
//
// GpuTimer t;
// t.start();
// kernel<<<...>>>(...);
// float ms = t.stop(); // blocks until kernel finishes
//
// Events are recorded on the default stream, so this measures work
// submitted there. Event-API return codes are deliberately unchecked to
// keep the timing path minimal.
// ---------------------------------------------------------------------------
struct GpuTimer {
    cudaEvent_t _start, _stop;   // owned event handles

    GpuTimer() { cudaEventCreate(&_start); cudaEventCreate(&_stop); }
    ~GpuTimer() { cudaEventDestroy(_start); cudaEventDestroy(_stop); }

    // The events are owned handles: a copied timer would destroy them a
    // second time in its destructor. Make the type move/copy-proof.
    GpuTimer(const GpuTimer&) = delete;
    GpuTimer& operator=(const GpuTimer&) = delete;

    // Marks the start of the timed region (asynchronous).
    void start() { cudaEventRecord(_start); }

    // Returns elapsed milliseconds since start(); blocks until all device
    // work recorded before the stop event has completed.
    float stop() {
        cudaEventRecord(_stop);
        cudaEventSynchronize(_stop);
        float ms = 0.0f;
        cudaEventElapsedTime(&ms, _start, _stop);
        return ms;
    }
};
17 changes: 17 additions & 0 deletions cuda-mma/include/utils.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
#pragma once

#include <cstdlib>

// Utility macros and functions for the CUDA MMA example.

// Integer ceiling division: smallest q with q*(N) >= (M), for positive
// operands. NOTE: evaluates both arguments more than once — do not pass
// expressions with side effects.
#define CEIL_DIV(M, N) (((M) + (N)-1) / (N))

// Fill an array with uniform random floats in [-1, 1].
// Uses the process-wide RNG — seed with srand() for reproducibility.
void fill_random(float* data, int n);

// CPU reference: C += A * B (row-major, m×k × k×n → m×n).
// Note the accumulate semantics: C must be initialized by the caller.
void matmul_cpu(const float* A, const float* B, float* C, int m, int n, int k);

// Element-wise comparison with absolute + relative tolerance.
// Prints the first few mismatches to stderr; returns true if all match.
bool verify(const float* ref, const float* gpu, int total_elements,
float atol = 1e-5f, float rtol = 1e-5f);
71 changes: 71 additions & 0 deletions cuda-mma/main.cu
Original file line number Diff line number Diff line change
@@ -0,0 +1,71 @@
#include <cstdio>
#include <cstdlib>
#include <filesystem>

#include "utils.h"
#include "benchmarks.h"

#define WARMUP 3
#define ITERS 5

// Signature shared by every benchmark driver: (size S, host A, host B)
// -> average kernel time in milliseconds.
using BenchmarkFn = float (*)(int, const float*, const float*);

// Runs `fn` once per entry of `sizes`, printing a roofline-style table to
// stdout and mirroring the rows into the CSV at `csv_path`. If the CSV
// cannot be opened, a warning is printed and the sweep still runs —
// console output is never lost to a bad path.
static void run_sweep(const char* label, BenchmarkFn fn,
                      const int* sizes, int n_sizes,
                      const float* h_A, const float* h_B,
                      const char* csv_path)
{
    printf("%s roofline sweep (%d iters, %d warmup)\n", label, ITERS, WARMUP);
    printf("%-6s %9s %9s %11s %9s\n",
           "Size", "Time(ms)", "GFLOP/s", "BW(GB/s)", "AI(F/B)");
    printf("------ --------- --------- ----------- ---------\n");

    FILE* csv = fopen(csv_path, "w");
    if (csv) {
        fprintf(csv, "size,time_ms,gflops,bandwidth_gbs,arithmetic_intensity\n");
    } else {
        // Bug fix: the original wrote through an unchecked NULL FILE*.
        fprintf(stderr, "Warning: could not open '%s' for writing; "
                        "CSV output disabled for this sweep.\n", csv_path);
    }

    for (int i = 0; i < n_sizes; ++i) {
        int S = sizes[i];

        float avg_ms = fn(S, h_A, h_B);

        // GEMM does 2*S^3 flops (multiply + add per inner-product term).
        double flops = 2.0 * S * S * S;
        // Minimum traffic model: read A, read B, read+write C.
        double bytes = ((double)S * S
                        + (double)S * S
                        + 2.0 * S * S)
                       * sizeof(float);
        double ai        = flops / bytes;
        double gflops    = flops / (avg_ms * 1e-3) / 1e9;
        double bandwidth = bytes / (avg_ms * 1e-3) / 1e9;

        printf("%-6d %9.3f %9.1f %11.1f %9.2f\n",
               S, avg_ms, gflops, bandwidth, ai);
        if (csv) {
            fprintf(csv, "%d,%.3f,%.3f,%.3f,%.4f\n",
                    S, avg_ms, gflops, bandwidth, ai);
        }
    }
    printf("\n");
    if (csv) fclose(csv);
}

// Entry point: benchmarks all three SGEMM variants over a sweep of
// square sizes and writes one CSV per variant into output/.
int main() {
    // Fixed seed so the random inputs are identical run-to-run.
    srand(42);

    // CSV files from the sweeps land here.
    std::filesystem::create_directories("output");

    const int sizes[] = {128, 256, 512, 1024, 2048, 4096, 8192, 16384, 32768};
    const int n_sizes = sizeof(sizes) / sizeof(sizes[0]);
    const int largest = sizes[n_sizes - 1];

    // Allocate the host operands once at the maximum size; every smaller
    // sweep just reads a prefix of the same buffers.
    // NOTE(review): at S=32768 each matrix is 4 GiB of host memory —
    // confirm the target machine can hold both plus the device copies.
    float* host_a = new float[(size_t)largest * largest];
    float* host_b = new float[(size_t)largest * largest];
    fill_random(host_a, largest * largest);
    fill_random(host_b, largest * largest);

    run_sweep("Naive", benchmark_naive, sizes, n_sizes, host_a, host_b, "output/naive.csv");
    run_sweep("Tiled", benchmark_tiled, sizes, n_sizes, host_a, host_b, "output/tiled.csv");
    run_sweep("Coalesced", benchmark_coalesced, sizes, n_sizes, host_a, host_b, "output/coalesced.csv");

    delete[] host_a;
    delete[] host_b;

    return 0;
}
54 changes: 54 additions & 0 deletions cuda-mma/scripts/plot_results.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,54 @@
#!/usr/bin/env python3
"""Plot roofline-style results produced by the cuda-mma benchmark binary.

Reads output/{naive,tiled,coalesced}.csv (any subset may be missing) and
writes a three-panel figure — throughput, bandwidth, arithmetic
intensity — to output/roofline.png.
"""
import os
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.ticker as ticker

# CSVs live next to the repo root's output/ directory, not the CWD.
OUTPUT_DIR = os.path.join(os.path.dirname(__file__), "..", "output")

# Plot label -> CSV file written by the corresponding benchmark sweep.
kernels = {
    "Naive": "naive.csv",
    "Tiled": "tiled.csv",
    "Coalesced": "coalesced.csv",
}

def load(filename):
    """Return the sweep CSV as a DataFrame, or None if it doesn't exist."""
    path = os.path.join(OUTPUT_DIR, filename)
    if not os.path.exists(path):
        return None
    return pd.read_csv(path)

fig, axes = plt.subplots(1, 3, figsize=(15, 5))
ax_gflops, ax_bw, ax_ai = axes

for label, filename in kernels.items():
    df = load(filename)
    if df is None:
        # Bug fix: the warning previously printed a literal "(unknown)"
        # instead of interpolating the missing file's name.
        print(f"Warning: {filename} not found, skipping.")
        continue
    ax_gflops.plot(df["size"], df["gflops"], marker="o", label=label)
    ax_bw.plot (df["size"], df["bandwidth_gbs"], marker="o", label=label)
    ax_ai.plot (df["size"], df["arithmetic_intensity"], marker="o", label=label)

# Shared axis cosmetics: sizes are powers of two, metrics span decades.
for ax in axes:
    ax.set_xscale("log", base=2)
    ax.set_yscale("log")
    ax.xaxis.set_major_formatter(ticker.FuncFormatter(lambda x, _: f"{int(x)}"))
    ax.set_xlabel("Matrix size (S×S)")
    ax.legend()
    ax.grid(True, which="both", linestyle="--", linewidth=0.5)

ax_gflops.set_title("Throughput")
ax_gflops.set_ylabel("GFLOP/s")

ax_bw.set_title("Memory Bandwidth")
ax_bw.set_ylabel("GB/s")

ax_ai.set_title("Arithmetic Intensity")
ax_ai.set_ylabel("FLOP/Byte")

fig.tight_layout()
out_path = os.path.join(OUTPUT_DIR, "roofline.png")
plt.savefig(out_path, dpi=150)
print(f"Saved {out_path}")
plt.show()
Loading