Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions cuda-mma/.gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
build/
output/
57 changes: 57 additions & 0 deletions cuda-mma/Makefile
Original file line number Diff line number Diff line change
@@ -0,0 +1,57 @@
# ---------------------------------------------------------------------------
# Build system for cuda-mma
# ---------------------------------------------------------------------------

NVCC := nvcc
CXX_FLAGS := -std=c++17 -O3

# sm_80 = Ampere (A100 / RTX 30xx) sm_86 = RTX 30xx consumer
# sm_70 = Volta (V100) sm_75 = Turing (RTX 20xx / T4)
# sm_89 = Ada (RTX 40xx)
# Override on the command line: make ARCH=sm_75
ARCH ?= sm_80

# -rdc=true: relocatable device code, so device symbols may span .cu files.
# --generate-line-info: embeds source-line info for Nsight profiling.
NVCC_FLAGS := $(CXX_FLAGS) -arch=$(ARCH) \
-rdc=true \
--generate-line-info \
-Xcompiler -Wall \
-I. -Iinclude

BIN_DIR := build/bin
TARGET := $(BIN_DIR)/cuda_mma
TEST := $(BIN_DIR)/test_correctness
# NOTE(review): each binary recompiles all of its sources whenever any
# source or header changes — fine at this size; move to per-object rules
# if build times grow.
SRCS := main.cu src/kernels.cu src/utils.cu src/benchmarks.cu
TEST_SRCS := test/test_correctness.cu src/kernels.cu src/utils.cu
HDRS := include/cuda_check.cuh include/kernels.cuh \
include/timer.h include/utils.h include/benchmarks.h

.PHONY: all clean run test profile

all: $(TARGET) $(TEST)

# Order-only prerequisite target (after the | below): the directory is
# created once and its timestamp never retriggers a relink.
$(BIN_DIR):
mkdir -p $(BIN_DIR)

$(TARGET): $(SRCS) $(HDRS) | $(BIN_DIR)
$(NVCC) $(NVCC_FLAGS) -o $@ $(SRCS)

$(TEST): $(TEST_SRCS) $(HDRS) | $(BIN_DIR)
$(NVCC) $(NVCC_FLAGS) -o $@ $(TEST_SRCS)

run: $(TARGET)
./$(TARGET)

test: $(TEST)
./$(TEST)

# Requires Nsight Compute (ncu).
# Metrics: global load bytes, global store bytes, achieved occupancy,
# and predicated-on FFMA instruction count.
profile: $(TARGET)
ncu --metrics \
l1tex__t_bytes_pipe_lsu_mem_global_op_ld.sum,\
l1tex__t_bytes_pipe_lsu_mem_global_op_st.sum,\
sm__warps_active.avg.pct_of_peak_sustained_active,\
smsp__sass_thread_inst_executed_op_ffma_pred_on.sum \
./$(TARGET)

# NOTE(review): removes build/ only; generated CSV/PNG files in output/
# are intentionally(?) kept — confirm.
clean:
rm -rf build
5 changes: 5 additions & 0 deletions cuda-mma/include/benchmarks.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
#pragma once

// Host-side benchmark drivers, one per SGEMM kernel variant (defined in
// src/benchmarks.cu). Each takes a square problem size S and host input
// matrices h_A, h_B (each at least S*S floats) and returns a time in
// milliseconds — main.cu interprets the return value as the average
// kernel time over its ITERS runs.
float benchmark_naive(int S, const float* h_A, const float* h_B);
float benchmark_tiled(int S, const float* h_A, const float* h_B);
float benchmark_coalesced(int S, const float* h_A, const float* h_B);
40 changes: 40 additions & 0 deletions cuda-mma/include/cuda_check.cuh
Original file line number Diff line number Diff line change
@@ -0,0 +1,40 @@
// cuda_check.cuh
//
// Error-checking macros for CUDA host code. All three print the failing
// location and error string to stderr and terminate the process — this
// project treats any CUDA error as fatal.
#pragma once

#include <cuda_runtime.h>
#include <cstdio>
#include <cstdlib>

// ── Host-side API checks ──────────────────────────────────────────────
// Wrap every runtime API call:  CUDA_CHECK(cudaMalloc(&p, bytes));
// Prints the stringized call site plus cudaGetErrorString on failure.
#define CUDA_CHECK(call) \
do { \
cudaError_t _e = (call); \
if (_e != cudaSuccess) { \
fprintf(stderr, "[CUDA ERROR] %s:%d %s\n → %s\n", \
__FILE__, __LINE__, #call, \
cudaGetErrorString(_e)); \
std::exit(EXIT_FAILURE); \
} \
} while (0)

// ── Kernel launch checks ──────────────────────────────────────────────
// Call immediately after a <<<...>>> launch to catch configuration
// errors (invalid grid/block dims, excessive shared memory). Note that
// cudaGetLastError both reads AND clears the sticky error state.
#define CHECK_LAST_ERROR() \
do { \
cudaError_t _e = cudaGetLastError(); \
if (_e != cudaSuccess) { \
fprintf(stderr, "[KERNEL LAUNCH ERROR] %s:%d → %s\n", \
__FILE__, __LINE__, cudaGetErrorString(_e)); \
std::exit(EXIT_FAILURE); \
} \
} while (0)

// ── Post-kernel execution checks ──────────────────────────────────────
// Blocks until all previously issued device work completes, surfacing
// asynchronous execution errors (e.g. illegal address). The sync is
// expensive — use at coarse boundaries, not inside hot loops.
#define CHECK_SYNC() \
do { \
cudaError_t _e = cudaDeviceSynchronize(); \
if (_e != cudaSuccess) { \
fprintf(stderr, "[KERNEL EXEC ERROR] %s:%d → %s\n", \
__FILE__, __LINE__, cudaGetErrorString(_e)); \
std::exit(EXIT_FAILURE); \
} \
} while (0)
38 changes: 38 additions & 0 deletions cuda-mma/include/kernels.cuh
Original file line number Diff line number Diff line change
@@ -0,0 +1,38 @@
#pragma once

#include <cuda_runtime.h>

// SGEMM kernel declarations; definitions live in src/kernels.cu.
// All kernels compute C = alpha * A * B + beta * C over row-major data.
//
// NOTE(review): the templated kernels are declared but not defined here.
// With -rdc=true they link only if src/kernels.cu explicitly instantiates
// every BLOCKSIZE / TILE_SIZE value the callers use — confirm those
// instantiations exist.

// One thread per output element; no shared-memory reuse.
__global__ void sgemm_naive(
size_t M, // Number of rows in A and C
size_t N, // Number of columns in B and C
size_t K, // Number of columns in A and rows in B
float alpha, // Scaling factor for the product of A and B
const float *A, // [M x K] row-major
const float *B, // [K x N] row-major
float beta, // Scaling factor for C
float *C); // [M x N] row-major (in-out: C = alpha*A*B + beta*C)


// Variant with thread-to-element mapping arranged for coalesced global
// loads/stores; BLOCKSIZE is a compile-time block dimension.
template <int BLOCKSIZE>
__global__ void sgemm_coalesced(
size_t M, // Number of rows in A and C
size_t N, // Number of columns in B and C
size_t K, // Number of columns in A and rows in B
float alpha, // Scaling factor for the product of A and B
const float *A, // [M x K] row-major
const float *B, // [K x N] row-major
float beta, // Scaling factor for C
float *C); // [M x N] row-major (in-out: C = alpha*A*B + beta*C)


// Shared-memory tiled variant; TILE_SIZE is the compile-time tile edge.
template <int TILE_SIZE>
__global__ void sgemm_tiled(
size_t M, // Number of rows in A and C
size_t N, // Number of columns in B and C
size_t K, // Number of columns in A and rows in B
float alpha, // Scaling factor for the product of A and B
const float *A, // [M x K] row-major
const float *B, // [K x N] row-major
float beta, // Scaling factor for C
float *C); // [M x N] row-major (in-out: C = alpha*A*B + beta*C)
29 changes: 29 additions & 0 deletions cuda-mma/include/timer.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,29 @@
#pragma once

#include <cuda_runtime.h>

// ---------------------------------------------------------------------------
// CUDA event-based timer. Usage:
//
// GpuTimer t;
// t.start();
// kernel<<<...>>>(...);
// float ms = t.stop(); // blocks until kernel finishes
//
// Events are recorded on the default stream, so this measures work
// submitted there. Event-API return codes are deliberately unchecked to
// keep the timing path minimal.
// ---------------------------------------------------------------------------
struct GpuTimer {
    cudaEvent_t _start, _stop;   // owned event handles

    GpuTimer() { cudaEventCreate(&_start); cudaEventCreate(&_stop); }
    ~GpuTimer() { cudaEventDestroy(_start); cudaEventDestroy(_stop); }

    // The events are owned handles: a copied timer would destroy them a
    // second time in its destructor. Make the type move/copy-proof.
    GpuTimer(const GpuTimer&) = delete;
    GpuTimer& operator=(const GpuTimer&) = delete;

    // Marks the start of the timed region (asynchronous).
    void start() { cudaEventRecord(_start); }

    // Returns elapsed milliseconds since start(); blocks until all device
    // work recorded before the stop event has completed.
    float stop() {
        cudaEventRecord(_stop);
        cudaEventSynchronize(_stop);
        float ms = 0.0f;
        cudaEventElapsedTime(&ms, _start, _stop);
        return ms;
    }
};
17 changes: 17 additions & 0 deletions cuda-mma/include/utils.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
#pragma once

#include <cstdlib>

// Utility macros and functions for the CUDA MMA example.

// Integer ceiling division: smallest q with q*(N) >= (M), for positive
// operands. NOTE: evaluates both arguments more than once — do not pass
// expressions with side effects.
#define CEIL_DIV(M, N) (((M) + (N)-1) / (N))

// Fill an array with uniform random floats in [-1, 1].
// Uses the process-wide RNG — seed with srand() for reproducibility.
void fill_random(float* data, int n);

// CPU reference: C += A * B (row-major, m×k × k×n → m×n).
// Note the accumulate semantics: C must be initialized by the caller.
void matmul_cpu(const float* A, const float* B, float* C, int m, int n, int k);

// Element-wise comparison with absolute + relative tolerance.
// Prints the first few mismatches to stderr; returns true if all match.
bool verify(const float* ref, const float* gpu, int total_elements,
float atol = 1e-5f, float rtol = 1e-5f);
71 changes: 71 additions & 0 deletions cuda-mma/main.cu
Original file line number Diff line number Diff line change
@@ -0,0 +1,71 @@
#include <cstdio>
#include <cstdlib>
#include <filesystem>

#include "utils.h"
#include "benchmarks.h"

#define WARMUP 3
#define ITERS 5

// Signature shared by every benchmark driver: (size S, host A, host B)
// -> average kernel time in milliseconds.
using BenchmarkFn = float (*)(int, const float*, const float*);

// Runs `fn` once per entry of `sizes`, printing a roofline-style table to
// stdout and mirroring the rows into the CSV at `csv_path`. If the CSV
// cannot be opened, a warning is printed and the sweep still runs —
// console output is never lost to a bad path.
static void run_sweep(const char* label, BenchmarkFn fn,
                      const int* sizes, int n_sizes,
                      const float* h_A, const float* h_B,
                      const char* csv_path)
{
    printf("%s roofline sweep (%d iters, %d warmup)\n", label, ITERS, WARMUP);
    printf("%-6s %9s %9s %11s %9s\n",
           "Size", "Time(ms)", "GFLOP/s", "BW(GB/s)", "AI(F/B)");
    printf("------ --------- --------- ----------- ---------\n");

    FILE* csv = fopen(csv_path, "w");
    if (csv) {
        fprintf(csv, "size,time_ms,gflops,bandwidth_gbs,arithmetic_intensity\n");
    } else {
        // Bug fix: the original wrote through an unchecked NULL FILE*.
        fprintf(stderr, "Warning: could not open '%s' for writing; "
                        "CSV output disabled for this sweep.\n", csv_path);
    }

    for (int i = 0; i < n_sizes; ++i) {
        int S = sizes[i];

        float avg_ms = fn(S, h_A, h_B);

        // GEMM does 2*S^3 flops (multiply + add per inner-product term).
        double flops = 2.0 * S * S * S;
        // Minimum traffic model: read A, read B, read+write C.
        double bytes = ((double)S * S
                        + (double)S * S
                        + 2.0 * S * S)
                       * sizeof(float);
        double ai        = flops / bytes;
        double gflops    = flops / (avg_ms * 1e-3) / 1e9;
        double bandwidth = bytes / (avg_ms * 1e-3) / 1e9;

        printf("%-6d %9.3f %9.1f %11.1f %9.2f\n",
               S, avg_ms, gflops, bandwidth, ai);
        if (csv) {
            fprintf(csv, "%d,%.3f,%.3f,%.3f,%.4f\n",
                    S, avg_ms, gflops, bandwidth, ai);
        }
    }
    printf("\n");
    if (csv) fclose(csv);
}

// Entry point: benchmarks all three SGEMM variants over a sweep of
// square sizes and writes one CSV per variant into output/.
int main() {
    // Fixed seed so the random inputs are identical run-to-run.
    srand(42);

    // CSV files from the sweeps land here.
    std::filesystem::create_directories("output");

    const int sizes[] = {128, 256, 512, 1024, 2048, 4096, 8192, 16384, 32768};
    const int n_sizes = sizeof(sizes) / sizeof(sizes[0]);
    const int largest = sizes[n_sizes - 1];

    // Allocate the host operands once at the maximum size; every smaller
    // sweep just reads a prefix of the same buffers.
    // NOTE(review): at S=32768 each matrix is 4 GiB of host memory —
    // confirm the target machine can hold both plus the device copies.
    float* host_a = new float[(size_t)largest * largest];
    float* host_b = new float[(size_t)largest * largest];
    fill_random(host_a, largest * largest);
    fill_random(host_b, largest * largest);

    run_sweep("Naive", benchmark_naive, sizes, n_sizes, host_a, host_b, "output/naive.csv");
    run_sweep("Tiled", benchmark_tiled, sizes, n_sizes, host_a, host_b, "output/tiled.csv");
    run_sweep("Coalesced", benchmark_coalesced, sizes, n_sizes, host_a, host_b, "output/coalesced.csv");

    delete[] host_a;
    delete[] host_b;

    return 0;
}
54 changes: 54 additions & 0 deletions cuda-mma/scripts/plot_results.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,54 @@
#!/usr/bin/env python3
"""Plot roofline-style results produced by the cuda-mma benchmark binary.

Reads output/{naive,tiled,coalesced}.csv (any subset may be missing) and
writes a three-panel figure — throughput, bandwidth, arithmetic
intensity — to output/roofline.png.
"""
import os
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.ticker as ticker

# CSVs live next to the repo root's output/ directory, not the CWD.
OUTPUT_DIR = os.path.join(os.path.dirname(__file__), "..", "output")

# Plot label -> CSV file written by the corresponding benchmark sweep.
kernels = {
    "Naive": "naive.csv",
    "Tiled": "tiled.csv",
    "Coalesced": "coalesced.csv",
}

def load(filename):
    """Return the sweep CSV as a DataFrame, or None if it doesn't exist."""
    path = os.path.join(OUTPUT_DIR, filename)
    if not os.path.exists(path):
        return None
    return pd.read_csv(path)

fig, axes = plt.subplots(1, 3, figsize=(15, 5))
ax_gflops, ax_bw, ax_ai = axes

for label, filename in kernels.items():
    df = load(filename)
    if df is None:
        # Bug fix: the warning previously printed a literal "(unknown)"
        # instead of interpolating the missing file's name.
        print(f"Warning: {filename} not found, skipping.")
        continue
    ax_gflops.plot(df["size"], df["gflops"], marker="o", label=label)
    ax_bw.plot (df["size"], df["bandwidth_gbs"], marker="o", label=label)
    ax_ai.plot (df["size"], df["arithmetic_intensity"], marker="o", label=label)

# Shared axis cosmetics: sizes are powers of two, metrics span decades.
for ax in axes:
    ax.set_xscale("log", base=2)
    ax.set_yscale("log")
    ax.xaxis.set_major_formatter(ticker.FuncFormatter(lambda x, _: f"{int(x)}"))
    ax.set_xlabel("Matrix size (S×S)")
    ax.legend()
    ax.grid(True, which="both", linestyle="--", linewidth=0.5)

ax_gflops.set_title("Throughput")
ax_gflops.set_ylabel("GFLOP/s")

ax_bw.set_title("Memory Bandwidth")
ax_bw.set_ylabel("GB/s")

ax_ai.set_title("Arithmetic Intensity")
ax_ai.set_ylabel("FLOP/Byte")

fig.tight_layout()
out_path = os.path.join(OUTPUT_DIR, "roofline.png")
plt.savefig(out_path, dpi=150)
print(f"Saved {out_path}")
plt.show()
Loading