diff --git a/perf/.gitignore b/perf/.gitignore new file mode 100644 index 0000000..06e0f96 --- /dev/null +++ b/perf/.gitignore @@ -0,0 +1 @@ +.CondaPkg diff --git a/perf/arraydiff.jl b/perf/arraydiff.jl new file mode 100644 index 0000000..b79b44e --- /dev/null +++ b/perf/arraydiff.jl @@ -0,0 +1,109 @@ +module ArrayDiffNeural + +# ArrayDiff reverse-mode AD of the 2-layer MLP loss +# +# L = sum((W2 * tanh(W1 * X) - y) .^ 2) +# +# `W1` is the only `@variable`; `W2`, `X`, `y` are baked in as constant +# matrix blocks. With `gpu=true`, the AD tape, input vector, and gradient +# buffer all live on the device (`Mode{CuVector{T}}`); with `gpu=false`, +# everything stays on the host (`Mode{Vector{T}}`). +# +# `/n` is dropped from the loss to match `hand_cuda.jl` — ArrayDiff's `:/` +# operator currently has a latent bug on non-scalar tape offsets, and a +# constant scaling factor doesn't change the gradient pathway. + +using Random +using BenchmarkTools +using CUDA +using JuMP +using ArrayDiff +import MathOptInterface as MOI + +# Output dimension `m`. Fixed at 2: a 1-row `W2` would parse as a `:vcat` +# with a single child, which the current ArrayDiff parser unpacks via +# `idx1, idx2 = children_indices` and so requires at least two rows. +const OUT_DIM = 2 + +function _build(::Type{T}, h::Int, d::Int, n::Int, gpu::Bool) where {T<:Real} + Random.seed!(0) + W1 = randn(T, h, d) + W2 = randn(T, OUT_DIM, h) + X = randn(T, d, n) + y = randn(T, OUT_DIM, n) + + # JuMP works in Float64 internally; ArrayDiff converts the parsed + # `Expression{Float64}` constants into the `Mode`'s eltype when filling + # its tape. Promote constants to Float64 explicitly so the conversion + # is one-way and obvious. + W2_64 = Float64.(W2) + X_64 = Float64.(X) + y_64 = Float64.(y) + + model = JuMP.Model() + @variable(model, W1v[1:h, 1:d], container = ArrayDiff.ArrayOfVariables) + Y = W2_64 * tanh.(W1v * X_64) + diff = Y .- y_64 + loss = sum(diff .^ 2) + + mode = + gpu ? ArrayDiff.Mode{CUDA.CuVector{T}}() : ArrayDiff.Mode{Vector{T}}() + ad = ArrayDiff.model(mode) + MOI.Nonlinear.set_objective(ad, JuMP.moi_function(loss)) + ev = MOI.Nonlinear.Evaluator( + ad, + mode, + JuMP.index.(JuMP.all_variables(model)), + ) + MOI.initialize(ev, [:Grad]) + return (W1 = W1, evaluator = ev) +end + +""" + neural(T, h, d, n; gpu::Bool = false) -> BenchmarkTools.Trial + +Benchmark `∂L/∂W1` of the 2-layer MLP loss using ArrayDiff. Model parsing +and `MOI.initialize` (the expensive per-shape build) happen once before the +trial; the timed block is just `MOI.eval_objective_gradient` plus a +`CUDA.synchronize` when `gpu=true`. +""" +function neural( + ::Type{T}, + h::Int, + d::Int, + n::Int; + gpu::Bool = false, +) where {T<:Real} + state = _build(T, h, d, n, gpu) + if gpu + x = CUDA.CuVector{T}(vec(state.W1)) + g = CUDA.zeros(T, h * d) + else + x = vec(state.W1) + g = zeros(T, h * d) + end + # `_reverse_mode` short-circuits when `last_x == x`, so without `setup` + # the first benchmark sample runs the full forward+reverse pass and every + # subsequent sample exits immediately — the reported timing would just + # measure the equality check. Reset `last_x` to NaN before each iteration + # so every sample re-runs the AD pipeline. + return @benchmark( + begin + if $gpu + # `@allowscalar` covers residual scalar leaves the BLOCK + # rewrite can't fold; the hot path is bulk. 
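+                # `eval_objective_gradient` writes ∂L/∂W1 into `g` in
+                # place; the `CUDA.synchronize()` below makes the
+                # otherwise-asynchronous kernel launches count toward the
+                # sample's timing.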
+ CUDA.@allowscalar MOI.eval_objective_gradient( + $state.evaluator, + $g, + $x, + ) + CUDA.synchronize() + else + MOI.eval_objective_gradient($state.evaluator, $g, $x) + end + end, + setup = (fill!($state.evaluator.backend.last_x, NaN)), + ) +end + +end # module diff --git a/perf/arraydiff_gpu.jl b/perf/arraydiff_gpu.jl deleted file mode 100644 index e2fb29b..0000000 --- a/perf/arraydiff_gpu.jl +++ /dev/null @@ -1,297 +0,0 @@ -# Benchmark `ArrayDiff` GPU gradient computation against the hand-written -# CUDA `reverse_diff` reference (the same one as `perf/cuda_vs_pytorch.jl`). -# -# Float32 throughout — this is the precision that maps onto the tensor cores -# / SP pipeline a GPU is actually optimized for, and matches what a typical -# ML workload (PyTorch default) feeds the device. -# -# What this measures -# ------------------ -# A 2-layer MLP `loss = sum((W2 * tanh.(W1 * X) - y).^2) / n`, where `W1` is -# the only `@variable` (so `MOI.eval_objective_gradient` returns ∂loss/∂W1) -# and `W2`, `X`, `y` are constants. -# -# * Hand-CUDA Float32 (allocating): `reverse_diff(W1, W2, X, y)` — same -# hand-written formula as `perf/cuda_vs_pytorch.jl`, allocates fresh -# intermediates per call. This is the "naive" hand-tuned baseline. -# -# * Hand-CUDA Float32 (preallocated): `reverse_diff_prealloc!(buf, ...)` — -# same arithmetic but every intermediate lives in a `HandPrealloc` buffer -# that is reused across calls. This is the apples-to-apples comparison -# against ArrayDiff, which preallocates its tape once at -# `MOI.initialize` and never allocates again. -# -# * ArrayDiff CPU: same model with `Mode{Vector{Float32}}()` — sanity check -# for what the existing CPU AD path costs at this problem size. -# -# * ArrayDiff GPU: same model with `Mode{CuVector{Float32}}()` — the path -# this benchmark exists to measure. -# -# Run -# --- -# cd ~/.julia/dev/ArrayDiff -# julia --project=perf -e 'using Pkg; Pkg.instantiate()' -# julia --project=perf perf/arraydiff_gpu.jl - -using Random, LinearAlgebra, Printf -using BenchmarkTools -using CUDA -using JuMP -using ArrayDiff -import MathOptInterface as MOI - -# ------------------------------------------------------------------------- -# Hand-written CUDA reverse_diff — same formulas as -# `perf/cuda_vs_pytorch.jl::forward_pass`/`reverse_diff`, but *without* the -# `/ n` scaling. ArrayDiff's `:/` operator currently has a latent bug with -# non-scalar tape offsets (it reads `forward_storage[node_idx]` instead of -# `forward_storage[storage_offset[node_idx]+1]`), so we drop the constant -# scaling factor on both sides — it doesn't change what the gradient -# pathway exercises. -# -# These functions are eltype-generic (they're just dotted broadcasts and -# `*`), so the same code runs at `Float32` and `Float64` depending on the -# eltype of the `CuArray` inputs. -# ------------------------------------------------------------------------- -function forward_pass(W1, W2, X, y) - y_1 = tanh.(W1 * X) - J_1 = 1 .- y_1 .^ 2 - J_2 = 2 .* (W2 * y_1 .- y) - return y_1, J_1, J_2 -end - -function reverse_diff(W1, W2, X, y) - _, J_1, J_2 = forward_pass(W1, W2, X, y) - return (J_1 .* (W2' * J_2)) * X' -end - -# ------------------------------------------------------------------------- -# Hand-written CUDA reverse_diff — preallocated variant. -# -# `reverse_diff` above allocates ~6 fresh `CuArray`s per call (one for -# `tanh.(...)`, one for `1 .- y_1.^2`, one for `2 .* (...)`, plus three for -# the chained `*` / `.*` / `*` in the reverse step). 
At h = 4096 each is a -# multi-MB matrix, and the allocator overhead dominates the actual GEMM -# compute. To be a fair "hand-tuned" baseline against ArrayDiff (which -# allocates its tape exactly once at `MOI.initialize` and reuses it), we -# pre-allocate all intermediates in `HandPrealloc` and run forward + reverse -# with `mul!` + in-place broadcasts. -# -# Buffer reuse: `y_1` doubles as the `W1 * X` output, then is overwritten -# in place by `tanh`. `J_2` doubles as the `W2 * y_1` output and is then -# overwritten by `2 .* (J_2 - y)`. `W2T_J2` is scaled in place by `J_1`. -# Five device buffers cover everything. -# ------------------------------------------------------------------------- -struct HandPrealloc{T<:Real} - y_1::CuArray{T,2} # h × n (= tanh.(W1 * X)) - J_1::CuArray{T,2} # h × n (= 1 .- y_1.^2) - J_2::CuArray{T,2} # out × n (= 2 .* (W2*y_1 - y)) - W2T_J2::CuArray{T,2} # h × n (= W2' * J_2, then ⊙= J_1) - grad::CuArray{T,2} # h × d (= W2T_J2 * X') -end - -function HandPrealloc{T}(h::Int, d::Int, n::Int, out_dim::Int) where {T} - return HandPrealloc{T}( - CuArray{T}(undef, h, n), - CuArray{T}(undef, h, n), - CuArray{T}(undef, out_dim, n), - CuArray{T}(undef, h, n), - CuArray{T}(undef, h, d), - ) -end - -function reverse_diff_prealloc!(buf::HandPrealloc, W1, W2, X, y) - # Forward - LinearAlgebra.mul!(buf.y_1, W1, X) # y_1 = W1 * X - buf.y_1 .= tanh.(buf.y_1) # y_1 = tanh.(y_1) - buf.J_1 .= 1 .- buf.y_1 .^ 2 # J_1 = 1 .- y_1.^2 - LinearAlgebra.mul!(buf.J_2, W2, buf.y_1) # J_2 = W2 * y_1 - buf.J_2 .= 2 .* (buf.J_2 .- y) # J_2 = 2 .* (W2*y_1 - y) - # Reverse — (J_1 .* (W2' * J_2)) * X' - LinearAlgebra.mul!(buf.W2T_J2, W2', buf.J_2) # W2T_J2 = W2' * J_2 - buf.W2T_J2 .= buf.J_1 .* buf.W2T_J2 # W2T_J2 ⊙= J_1 - LinearAlgebra.mul!(buf.grad, buf.W2T_J2, X') # grad = W2T_J2 * X' - return buf.grad -end - -# ------------------------------------------------------------------------- -# ArrayDiff path. Builds a JuMP model with `W1` as the only `@variable` and -# `W2 / X / y` baked in as constant matrices, parses to ArrayDiff, and -# returns an evaluator + a closure that computes ∂loss/∂W1 in `g`. -# ------------------------------------------------------------------------- -function build_arraydiff( - W2::Matrix{T}, - X::Matrix{T}, - y::Matrix{T}, - h::Int, - d::Int, - n::Int, - mode::ArrayDiff.Mode, -) where {T<:Real} - # JuMP works in Float64 internally; ArrayDiff converts the parsed - # `Expression{Float64}` constants into the `Mode`'s eltype (here Float32) - # when filling its tape. We promote constants to Float64 for the JuMP - # build to keep that conversion explicit / one-way. - W2_64 = Float64.(W2) - X_64 = Float64.(X) - y_64 = Float64.(y) - model = JuMP.Model() - @variable(model, W1[1:h, 1:d], container = ArrayDiff.ArrayOfVariables) - Y = W2_64 * tanh.(W1 * X_64) - diff = Y .- y_64 - loss = sum(diff .^ 2) # `/ n` dropped — see `forward_pass` comment. - ad = ArrayDiff.model(mode) - MOI.Nonlinear.set_objective(ad, JuMP.moi_function(loss)) - evaluator = MOI.Nonlinear.Evaluator( - ad, - mode, - JuMP.index.(JuMP.all_variables(model)), - ) - MOI.initialize(evaluator, [:Grad]) - return evaluator -end - -function arraydiff_grad_cpu!(g, evaluator, x) - MOI.eval_objective_gradient(evaluator, g, x) - return g -end - -# `@allowscalar` covers the residual scalar leaves that the BLOCK rewrite -# can't fold (e.g. a stand-alone `NODE_VALUE` inside a non-block subterm). -# The hot path is bulk; this is a guard so we don't trip GPUArrays' policy -# on the leftovers. 
-function arraydiff_grad_gpu!(g, evaluator, x) - CUDA.@allowscalar MOI.eval_objective_gradient(evaluator, g, x) - return g -end - -# ------------------------------------------------------------------------- -# One (h, d, n) sweep. -# ------------------------------------------------------------------------- -function run_one(; h::Int, d::Int = 13, n::Int = 178, rtol::Float32 = 1.0f-3) - println("\n" * "="^72) - @printf "h = %d, d = %d, n = %d (Float32)\n" h d n - println("="^72) - - T = Float32 - Random.seed!(0) - # `W2` and `y` are deliberately given 2 output rows (instead of the - # 1-row used in `perf/cuda_vs_pytorch.jl`). The current `:vcat` - # forward unpacks `idx1, idx2 = children_indices` and so requires at - # least two rows; a single-row matrix would be parsed as a `vcat` - # with one child and would error. Using 2 rows changes nothing about - # the gradient pathway being measured. - out_dim = 2 - W1 = randn(T, h, d) - W2 = randn(T, out_dim, h) - X = randn(T, d, n) - y = randn(T, out_dim, n) - - # Reference: hand-written CUDA (allocating). - W1g = CuArray(W1) - W2g = CuArray(W2) - Xg = CuArray(X) - yg = CuArray(y) - grad_ref = Array(reverse_diff(W1g, W2g, Xg, yg)) - CUDA.synchronize() - - # Hand-CUDA preallocated. Same arithmetic, but every intermediate is - # owned by `hand_buf` and reused across calls. - hand_buf = HandPrealloc{T}(h, d, n, out_dim) - grad_prealloc = Array(reverse_diff_prealloc!(hand_buf, W1g, W2g, Xg, yg)) - CUDA.synchronize() - - # ArrayDiff CPU. - print("ArrayDiff CPU build (h=$h) ... "); - flush(stdout) - t_cpu_build = @elapsed ev_cpu = - build_arraydiff(W2, X, y, h, d, n, ArrayDiff.Mode{Vector{T}}()) - @printf "%.2f s\n" t_cpu_build - x_cpu = vec(W1) - g_cpu = zeros(T, length(x_cpu)) - arraydiff_grad_cpu!(g_cpu, ev_cpu, x_cpu) - grad_cpu = reshape(g_cpu, h, d) - - # ArrayDiff GPU. Both `x` and `g` live on the GPU — same convention a - # GPU-resident solver (e.g. one whose ADAM step is on `CuVector`) would - # use: the AD tape, the input vector, and the gradient buffer all stay - # on the device, so there's no D2H round-trip on the gradient hot path. - print("ArrayDiff GPU build (h=$h) ... "); - flush(stdout) - t_gpu_build = @elapsed ev_gpu = build_arraydiff( - W2, - X, - y, - h, - d, - n, - ArrayDiff.Mode{CUDA.CuVector{T}}(), - ) - @printf "%.2f s\n" t_gpu_build - x_gpu = CUDA.CuVector{T}(vec(W1)) - g_gpu = CUDA.zeros(T, length(x_gpu)) - arraydiff_grad_gpu!(g_gpu, ev_gpu, x_gpu) - CUDA.synchronize() - grad_gpu = reshape(Array(g_gpu), h, d) - - # Numerical equivalence (rtol loosened for Float32 reductions). 
- candidates = [ - ("Hand-CUDA prealloc", grad_prealloc), - ("ArrayDiff CPU", grad_cpu), - ("ArrayDiff GPU", grad_gpu), - ] - for (name, g) in candidates - maxdiff = maximum(abs.(grad_ref .- g)) - relmag = maxdiff / max(maximum(abs.(grad_ref)), eps(T)) - ok = isapprox(grad_ref, g; rtol = rtol) - @printf "%-20s vs hand-CUDA alloc: max|Δ| = %.3e (rel %.2e) match=%s\n" name maxdiff relmag ok - end - - println("\n--- benchmark (median of N samples, post-sync) ---") - bj = @benchmark begin - reverse_diff($W1g, $W2g, $Xg, $yg) - CUDA.synchronize() - end samples = 30 evals = 1 seconds = 10 - bjp = @benchmark begin - reverse_diff_prealloc!($hand_buf, $W1g, $W2g, $Xg, $yg) - CUDA.synchronize() - end samples = 30 evals = 1 seconds = 10 - bcpu = @benchmark begin - arraydiff_grad_cpu!($g_cpu, $ev_cpu, $x_cpu) - end samples = 30 evals = 1 seconds = 10 - # `seconds = 60` for the GPU path because, at h = 4096, the - # scalar-cudaMemcpy path can run into the 10s cap on a single sample at - # warmup; bump the budget so we get a reliable median. - bgpu = @benchmark begin - arraydiff_grad_gpu!($g_gpu, $ev_gpu, $x_gpu) - CUDA.synchronize() - end samples = 30 evals = 1 seconds = 60 - @printf "Hand-CUDA alloc : median %12.2f µs\n" 1e-3 * median(bj).time - @printf "Hand-CUDA prealloc : median %12.2f µs\n" 1e-3 * median(bjp).time - @printf "ArrayDiff CPU : median %12.2f µs\n" 1e-3 * median(bcpu).time - @printf "ArrayDiff GPU : median %12.2f µs\n" 1e-3 * median(bgpu).time - - return nothing -end - -# ------------------------------------------------------------------------- -# Main -# ------------------------------------------------------------------------- -function main() - if !CUDA.functional() - error("CUDA is not functional in this environment.") - end - CUDA.math_mode!(CUDA.FAST_MATH) - println( - "CUDA.jl device : ", - CUDA.name(CUDA.device()), - " (math_mode=FAST_MATH)", - ) - for h in (16, 256, 4096) - run_one(; h = h) - GC.gc(true) - CUDA.reclaim() - end -end - -main() diff --git a/perf/cuda_vs_pytorch.jl b/perf/cuda_vs_pytorch.jl deleted file mode 100644 index e216088..0000000 --- a/perf/cuda_vs_pytorch.jl +++ /dev/null @@ -1,821 +0,0 @@ -# Compare hardcoded CUDA.jl forward+reverse for a 2-layer MLP gradient -# against PyTorch (via PythonCall) doing the equivalent loss + .backward(). -# -# Goals -# 1. Numerical equality of ∂L/∂W1 between the two paths. -# 2. Per-kernel CUDA trace from each, side by side, so we can see whether -# they issue the same GPU operations under the hood. -# 3. Wall-clock benchmark per hidden size h. -# -# Run -# cd ~/.julia/dev/ArrayDiff -# julia --project=perf -e 'using Pkg; Pkg.instantiate()' -# julia --project=perf perf/cuda_vs_pytorch.jl -# -# Formulas are transcribed verbatim from autodiff.jl:398-426. -# Loss: L = sum((W2 * tanh(W1 * X) - y) .^ 2) / size(y, 2) -# Returned grad: ∂L/∂W1 (∈ R^{h×d}). 
- -using Random, LinearAlgebra, Printf -using CUDA -using CUDA: AS -using BenchmarkTools -using PythonCall -using Lux -import Mooncake - -# ------------------------------------------------------------------------- -# Hardcoded CUDA.jl path -# ------------------------------------------------------------------------- -function forward_pass(W1, W2, X, y) - y_1 = tanh.(W1 * X) - J_1 = 1 .- y_1 .^ 2 - J_2 = 2 .* (W2 * y_1 .- y) ./ size(y, 2) - return y_1, J_1, J_2 -end - -function reverse_diff(W1, W2, X, y) - _, J_1, J_2 = forward_pass(W1, W2, X, y) - return (J_1 .* (W2' * J_2)) * X' -end - -# ------------------------------------------------------------------------- -# Hardcoded CUDA.jl path — vectorized custom-kernel version -# -# Replaces the two big elementwise broadcasts: -# 1. tanh.(W1 * X) and 1 .- y_1.^2 → one fused @cuda kernel -# 2. J_1 .* (W2' * J_2) → one @cuda kernel -# with kernels that issue `ld.global.v4.f32` / `st.global.v4.f32` PTX, -# matching PyTorch's `vectorized_elementwise_kernel<4, ...>` shape. -# -# The third broadcast (J_2 from a 1×n vector) is left as a regular .= since -# n is tiny (e.g. 178) and vectorization wouldn't measurably help. -# -# Pattern is taken from CUDA.jl's own ldg.jl tests, which assert that -# NTuple{4, Base.VecElement{Float32}} loads via Core.LLVMPtr lower to -# `ld.global.v4` PTX. -# ------------------------------------------------------------------------- -const Float4 = NTuple{4,Base.VecElement{Float32}} - -@inline _f4ptr(arr::CuDeviceArray{Float32}) = - reinterpret(Core.LLVMPtr{Float4,AS.Global}, pointer(arr)) - -@inline _vec4(t1, t2, t3, t4) = ( - Base.VecElement(t1), - Base.VecElement(t2), - Base.VecElement(t3), - Base.VecElement(t4), -) - -function _tanh_and_jac_kernel!(y, J, x, n::Int) - i = (blockIdx().x - 1) * blockDim().x + threadIdx().x - base = 4 * (i - 1) - if base + 4 <= n - v = unsafe_load(_f4ptr(x), i, Val(16)) - t1 = tanh(v[1].value); - t2 = tanh(v[2].value) - t3 = tanh(v[3].value); - t4 = tanh(v[4].value) - unsafe_store!(_f4ptr(y), _vec4(t1, t2, t3, t4), i, Val(16)) - unsafe_store!( - _f4ptr(J), - _vec4(1.0f0 - t1*t1, 1.0f0 - t2*t2, 1.0f0 - t3*t3, 1.0f0 - t4*t4), - i, - Val(16), - ) - elseif base < n - for k in 1:(n-base) - @inbounds t = tanh(x[base+k]) - @inbounds y[base+k] = t - @inbounds J[base+k] = 1.0f0 - t*t - end - end - return nothing -end - -function tanh_and_jac!( - y::CuArray{Float32}, - J::CuArray{Float32}, - x::CuArray{Float32}, -) - n = length(x) - threads = 256 - blocks = cld(cld(n, 4), threads) - @cuda threads=threads blocks=blocks _tanh_and_jac_kernel!(y, J, x, n) - return (y, J) -end - -function _vmul_kernel!(out, a, b, n::Int) - i = (blockIdx().x - 1) * blockDim().x + threadIdx().x - base = 4 * (i - 1) - if base + 4 <= n - va = unsafe_load(_f4ptr(a), i, Val(16)) - vb = unsafe_load(_f4ptr(b), i, Val(16)) - unsafe_store!( - _f4ptr(out), - _vec4( - va[1].value*vb[1].value, - va[2].value*vb[2].value, - va[3].value*vb[3].value, - va[4].value*vb[4].value, - ), - i, - Val(16), - ) - elseif base < n - for k in 1:(n-base) - @inbounds out[base+k] = a[base+k] * b[base+k] - end - end - return nothing -end - -function vmul!(out::CuArray{Float32}, a::CuArray{Float32}, b::CuArray{Float32}) - n = length(a) - threads = 256 - blocks = cld(cld(n, 4), threads) - @cuda threads=threads blocks=blocks _vmul_kernel!(out, a, b, n) - return out -end - -function reverse_diff_v4(W1, W2, X, y) - Z1 = W1 * X # GEMM (h, n) - y_1 = similar(Z1) - J_1 = similar(Z1) - tanh_and_jac!(y_1, J_1, Z1) # fused tanh + (1 - y²), vec=4 - J_2 = 2 .* (W2 * 
y_1 .- y) ./ size(y, 2) # 1×n broadcast, kept as-is - tmp = W2' * J_2 # GEMM (h, n) - out = similar(tmp) - vmul!(out, J_1, tmp) # J_1 .* tmp, vec=4 - return out * X' # GEMM (h, d) -end - -# ------------------------------------------------------------------------- -# v5: vec=4 elementwise + SIMT cuBLAS GEMM (no TF32 tensor cores) -# -# Under FAST_MATH, gemmExComputeType picks CUBLAS_COMPUTE_32F_FAST_TF32, which -# routes Float32 matmul through TF32 tensor cores. For our awkward k dims -# (k=13 for W1*X, k=178 for out*X', k=1 for W2'*J_2), tensor cores can't be -# filled efficiently and the resulting cutlass_80_tensorop kernel runs much -# slower than a SIMT FP32 GEMM. We bypass gemmExComputeType by calling -# cublasGemmEx directly with CUBLAS_COMPUTE_32F + CUBLAS_GEMM_DEFAULT. -# ------------------------------------------------------------------------- -function _gemm_simt!( - C::CuArray{Float32,2}, - transA::Char, - A::CuArray{Float32,2}, - transB::Char, - B::CuArray{Float32,2}; - alpha::Float32 = 1.0f0, - beta::Float32 = 0.0f0, -) - m = size(A, transA == 'N' ? 1 : 2) - k = size(A, transA == 'N' ? 2 : 1) - n = size(B, transB == 'N' ? 2 : 1) - lda = max(1, stride(A, 2)) - ldb = max(1, stride(B, 2)) - ldc = max(1, stride(C, 2)) - # CUDA.jl puts the cuBLAS handle in CUBLAS_POINTER_MODE_DEVICE, so alpha/beta - # MUST be device pointers (host Ref triggers UVA fault handling — 100× slowdown). - α = CUDA.CuRef{Float32}(alpha); - β = CUDA.CuRef{Float32}(beta) - h = CUDA.CUBLAS.handle() - # Under FAST_MATH the handle's math mode is CUBLAS_TF32_TENSOR_OP_MATH, which - # forces TF32 tensor cores even when we ask for CUBLAS_COMPUTE_32F. Flip it to - # DEFAULT_MATH for this call so cuBLAS picks a SIMT FP32 kernel. - CUDA.CUBLAS.math_mode!(h, CUDA.DEFAULT_MATH) - try - CUDA.CUBLAS.cublasGemmEx( - h, - transA, - transB, - m, - n, - k, - α, - A, - Float32, - lda, - B, - Float32, - ldb, - β, - C, - Float32, - ldc, - CUDA.CUBLAS.CUBLAS_COMPUTE_32F, - CUDA.CUBLAS.CUBLAS_GEMM_DEFAULT, - ) - finally - CUDA.CUBLAS.math_mode!(h, CUDA.math_mode()) # restore (FAST_MATH → TF32 tensor op) - end - return C -end - -function reverse_diff_v5(W1, W2, X, y) - h, d = size(W1) - nn = size(X, 2) - - Z1 = CuArray{Float32}(undef, h, nn) - _gemm_simt!(Z1, 'N', W1, 'N', X) # SIMT: (h,d) * (d,n) - - y_1 = similar(Z1) - J_1 = similar(Z1) - tanh_and_jac!(y_1, J_1, Z1) - - J_2 = 2 .* (W2 * y_1 .- y) ./ size(y, 2) # tiny, leave as broadcast - - tmp = CuArray{Float32}(undef, h, nn) - _gemm_simt!(tmp, 'T', W2, 'N', J_2) # SIMT: W2' * J_2 (k=1) - - out = similar(tmp) - vmul!(out, J_1, tmp) - - result = CuArray{Float32}(undef, h, d) - _gemm_simt!(result, 'N', out, 'T', X) # SIMT: out * X' (k=n) - return result -end - -# ------------------------------------------------------------------------- -# v6: vec=4 elementwise + cuBLASLt with per-shape heuristic-picked algo -# -# cuBLAS's standard heuristic, even with CUBLAS_COMPUTE_32F + DEFAULT_MATH, -# picks `cutlass_80_simt_sgemm_*` for our awkward shapes. PyTorch's process -# happens to land on `magma_sgemmEx_kernel` for the same compute type — same -# library, different choice. cuBLASLt exposes a fuller heuristic API with a -# workspace budget that often unlocks better algos. We build a matmul -# descriptor + layouts per (transA, transB, m, n, k) shape, ask cuBLASLt for -# its best algo, and reuse the cached plan on every call. 
-# ------------------------------------------------------------------------- -const _LT_WS_BYTES = Csize_t(32 * 1024 * 1024) # 32 MiB workspace - -# Lazy: created on first use, kept alive for the process. -const _LT_STATE = Ref{Any}(nothing) - -function _lt_state() - s = _LT_STATE[] - if s === nothing - h_ref = Ref{CUDA.CUBLAS.cublasLtHandle_t}(C_NULL) - CUDA.CUBLAS.cublasLtCreate(h_ref) - ws = CUDA.CuArray{UInt8}(undef, Int(_LT_WS_BYTES)) - s = (handle = h_ref[], ws = ws) - _LT_STATE[] = s - end - return s::NamedTuple{(:handle, :ws)} -end - -mutable struct LtPlan - desc::CUDA.CUBLAS.cublasLtMatmulDesc_t - Adesc::CUDA.CUBLAS.cublasLtMatrixLayout_t - Bdesc::CUDA.CUBLAS.cublasLtMatrixLayout_t - Cdesc::CUDA.CUBLAS.cublasLtMatrixLayout_t - algo::CUDA.CUBLAS.cublasLtMatmulAlgo_t -end - -function _build_lt_plan( - transA::Char, - transB::Char, - m::Int, - n::Int, - k::Int, - lda::Int, - ldb::Int, - ldc::Int, -) - state = _lt_state() - handle = state.handle - R32 = CUDA.CUDACore.R_32F # cudaDataType for Float32 - - desc_ref = Ref{CUDA.CUBLAS.cublasLtMatmulDesc_t}(C_NULL) - CUDA.CUBLAS.cublasLtMatmulDescCreate( - desc_ref, - CUDA.CUBLAS.CUBLAS_COMPUTE_32F, - R32, - ) - desc = desc_ref[] - - # Set transpose attributes. - tA = (transA == 'N') ? CUDA.CUBLAS.CUBLAS_OP_N : CUDA.CUBLAS.CUBLAS_OP_T - tB = (transB == 'N') ? CUDA.CUBLAS.CUBLAS_OP_N : CUDA.CUBLAS.CUBLAS_OP_T - let r = Ref(tA) - CUDA.CUBLAS.cublasLtMatmulDescSetAttribute( - desc, - CUDA.CUBLAS.CUBLASLT_MATMUL_DESC_TRANSA, - r, - sizeof(tA), - ) - end - let r = Ref(tB) - CUDA.CUBLAS.cublasLtMatmulDescSetAttribute( - desc, - CUDA.CUBLAS.CUBLASLT_MATMUL_DESC_TRANSB, - r, - sizeof(tB), - ) - end - - # Layout shape is the *storage* shape (pre-transpose). - Arows = transA == 'N' ? m : k - Acols = transA == 'N' ? k : m - Brows = transB == 'N' ? k : n - Bcols = transB == 'N' ? n : k - - Aref = Ref{CUDA.CUBLAS.cublasLtMatrixLayout_t}(C_NULL) - Bref = Ref{CUDA.CUBLAS.cublasLtMatrixLayout_t}(C_NULL) - Cref = Ref{CUDA.CUBLAS.cublasLtMatrixLayout_t}(C_NULL) - CUDA.CUBLAS.cublasLtMatrixLayoutCreate( - Aref, - R32, - UInt64(Arows), - UInt64(Acols), - Int64(lda), - ) - CUDA.CUBLAS.cublasLtMatrixLayoutCreate( - Bref, - R32, - UInt64(Brows), - UInt64(Bcols), - Int64(ldb), - ) - CUDA.CUBLAS.cublasLtMatrixLayoutCreate( - Cref, - R32, - UInt64(m), - UInt64(n), - Int64(ldc), - ) - - # Preference: tell the heuristic how much workspace it can use. - pref_ref = Ref{CUDA.CUBLAS.cublasLtMatmulPreference_t}(C_NULL) - CUDA.CUBLAS.cublasLtMatmulPreferenceCreate(pref_ref) - pref = pref_ref[] - let r = Ref(_LT_WS_BYTES) - CUDA.CUBLAS.cublasLtMatmulPreferenceSetAttribute( - pref, - CUDA.CUBLAS.CUBLASLT_MATMUL_PREF_MAX_WORKSPACE_BYTES, - r, - sizeof(_LT_WS_BYTES), - ) - end - - # Heuristic: top-1 algorithm. 
- heur = Vector{CUDA.CUBLAS.cublasLtMatmulHeuristicResult_t}(undef, 1) - returned = Ref{Cint}(0) - CUDA.CUBLAS.cublasLtMatmulAlgoGetHeuristic( - handle, - desc, - Aref[], - Bref[], - Cref[], - Cref[], - pref, - Cint(1), - heur, - returned, - ) - returned[] < 1 && error( - "cuBLASLt has no algo for shape (m=$m,n=$n,k=$k,trans=$transA$transB)", - ) - - return LtPlan(desc, Aref[], Bref[], Cref[], heur[1].algo) -end - -function _gemm_lt!( - plan::LtPlan, - C::CuArray{Float32,2}, - A::CuArray{Float32,2}, - B::CuArray{Float32,2}; - alpha::Float32 = 1.0f0, - beta::Float32 = 0.0f0, -) - state = _lt_state() - # cuBLASLt's matmul descriptor defaults to CUBLASLT_POINTER_MODE_HOST - # (independent of the cuBLAS handle's pointer mode), so alpha/beta are - # plain host Refs here — using CuRef would trigger UVA faults. - α = Ref{Float32}(alpha) - β = Ref{Float32}(beta) - algo_ref = Ref(plan.algo) - CUDA.CUBLAS.cublasLtMatmul( - state.handle, - plan.desc, - α, - A, - plan.Adesc, - B, - plan.Bdesc, - β, - C, - plan.Cdesc, - C, - plan.Cdesc, # D = C in place - algo_ref, - state.ws, - sizeof(state.ws), - CUDA.stream(), - ) - return C -end - -# Three plans for our specific 2-layer MLP shape. -struct LtPlans - p1::LtPlan # W1 * X : (h,d) * (d,n) → (h,n) - p2::LtPlan # W2' * J_2 : store (1,h),'T' * (1,n) → (h,n) - p3::LtPlan # out * X' : (h,n) * store (d,n),'T' → (h,d) -end - -function build_lt_plans( - W1::CuArray{Float32,2}, - W2::CuArray{Float32,2}, - X::CuArray{Float32,2}, -) - h, d = size(W1) - nn = size(X, 2) - p1 = _build_lt_plan('N', 'N', h, nn, d, h, d, h) - p2 = _build_lt_plan('T', 'N', h, nn, 1, 1, 1, h) - p3 = _build_lt_plan('N', 'T', h, d, nn, h, d, h) - return LtPlans(p1, p2, p3) -end - -function reverse_diff_v6(plans::LtPlans, W1, W2, X, y) - h, d = size(W1) - nn = size(X, 2) - - Z1 = CuArray{Float32}(undef, h, nn) - _gemm_lt!(plans.p1, Z1, W1, X) - - y_1 = similar(Z1) - J_1 = similar(Z1) - tanh_and_jac!(y_1, J_1, Z1) - - J_2 = 2 .* (W2 * y_1 .- y) ./ size(y, 2) - - tmp = CuArray{Float32}(undef, h, nn) - _gemm_lt!(plans.p2, tmp, W2, J_2) - - out = similar(tmp) - vmul!(out, J_1, tmp) - - result = CuArray{Float32}(undef, h, d) - _gemm_lt!(plans.p3, result, out, X) - return result -end - -# ------------------------------------------------------------------------- -# Lux + Mooncake path -# -# Builds an equivalent 2-layer MLP `Y = W2 * tanh(W1 * X)` (no bias) using -# Lux, plugs in the *same* CuArray weights so the gradient is comparable, -# and uses Mooncake (the modern Julia 1.12-friendly reverse-mode AD) for the -# backward. Goes through the same CUDA.jl + cuBLAS stack as `reverse_diff`, -# so kernels should look similar — what we're measuring is the AD/dispatch -# overhead Lux+Mooncake add on top. -# ------------------------------------------------------------------------- -struct LuxMooncake{M,P,S,L,R} - model::M - ps::P - st::S - loss_fn::L - rule::R -end - -function build_lux( - W1g::CuArray{Float32,2}, - W2g::CuArray{Float32,2}, - Xg::CuArray, - yg::CuArray, -) - h, d = size(W1g) - model = Lux.Chain( - Lux.Dense(d => h, tanh; use_bias = false), - Lux.Dense(h => 1, identity; use_bias = false), - ) - ps = (layer_1 = (weight = W1g,), layer_2 = (weight = W2g,)) - st = Lux.initialstates(Random.default_rng(), model) - - # Closure captures Xg, yg, model, st — only `p` is the differentiated arg. 
- loss_fn = let model = model, st = st, Xg = Xg, yg = yg - p -> begin - y_hat, _ = model(Xg, p, st) - return sum((y_hat .- yg) .^ 2) / size(yg, 2) - end - end - - # build_rrule is the expensive step (compiles the reverse pass for these - # types) — do it once at setup so the per-call cost in the benchmark is - # just the actual fwd+bwd execution. - rule = Mooncake.build_rrule(loss_fn, ps) - return LuxMooncake(model, ps, st, loss_fn, rule) -end - -function lux_grad(lm::LuxMooncake) - _, (_, ∂ps) = Mooncake.value_and_gradient!!(lm.rule, lm.loss_fn, lm.ps) - return ∂ps.layer_1.weight -end - -# ------------------------------------------------------------------------- -# PyTorch path -# ------------------------------------------------------------------------- -const torch = pyimport("torch") -const np = pyimport("numpy") -const profiler = pyimport("torch.profiler") - -# Build torch tensors once and reuse them across benchmark iterations, -# mirroring how the Julia path passes already-on-GPU CuArrays. -function build_torch_tensors(W1::Matrix, W2::Matrix, X::Matrix, y::Matrix) - npW1 = np.ascontiguousarray(np.asarray(PyArray(W1))) - npW2 = np.ascontiguousarray(np.asarray(PyArray(W2))) - npX = np.ascontiguousarray(np.asarray(PyArray(X))) - npY = np.ascontiguousarray(np.asarray(PyArray(y))) - W1t = torch.from_numpy(npW1).to("cuda").requires_grad_(true) - W2t = torch.from_numpy(npW2).to("cuda") - Xt = torch.from_numpy(npX).to("cuda") - yt = torch.from_numpy(npY).to("cuda") - return W1t, W2t, Xt, yt -end - -# Define eager + torch.compile'd versions in one Python namespace so the only -# difference between them is the `torch.compile` call. Each call goes through -# exactly one PythonCall round-trip, so wall-clock differences reflect what's -# actually happening on the GPU rather than per-op FFI cost. -const _grad_fn_eager, _grad_fn_compiled = let - # `import torch` inside _eager so it lands in the function's __globals__ - # at call time — @pyexec runs with separate globals/locals dicts, and a - # top-level `import torch` would only populate locals, leaving _eager - # unable to resolve `torch` when invoked later. - nt = @pyexec """ -import torch -def _eager(W1, W2, X, y): - import torch - y1 = torch.tanh(torch.matmul(W1, X)) - diff = torch.matmul(W2, y1) - y - loss = (diff * diff).sum() / y.shape[1] - return torch.autograd.grad(loss, W1)[0] -# mode="default" — change to "reduce-overhead" for CUDA Graphs, or -# "max-autotune" for an aggressive autotune pass. -_compiled = torch.compile(_eager) -""" => (_eager::Py, _compiled::Py) - (nt._eager, nt._compiled) -end - -pytorch_grad_eager(W1t, W2t, Xt, yt) = _grad_fn_eager(W1t, W2t, Xt, yt) -pytorch_grad_compiled(W1t, W2t, Xt, yt) = _grad_fn_compiled(W1t, W2t, Xt, yt) - -torch_to_julia(t) = pyconvert(Array, t.detach().cpu().numpy()) - -# ------------------------------------------------------------------------- -# Trace helpers -# ------------------------------------------------------------------------- -function julia_trace(f) - # Warmup so JIT + cuBLAS handle init don't show up. - f(); - CUDA.synchronize() - return CUDA.@profile trace = true begin - f() - CUDA.synchronize() - end -end - -# CUDATools.Profile.ProfileResults' default `show` calls `format_bytes` on a -# column that can contain `Inf` (e.g. for non-memcpy kernels), which throws -# `InexactError(Int64, Inf)`. Walk `trace.device` directly to sidestep it. 
-function summarize_julia_trace(io::IO, trace) - dev = trace.device - names_ = dev.name - starts = dev.start - stops = dev.stop - counts = Dict{String,Int}() - totals = Dict{String,Float64}() # in seconds - order = String[] - for i in eachindex(names_) - nm = String(names_[i]) - if !haskey(counts, nm) - push!(order, nm) - counts[nm] = 0 - totals[nm] = 0.0 - end - counts[nm] += 1 - totals[nm] += stops[i] - starts[i] - end - sorted = sort(order; by = nm -> -totals[nm]) - @printf io " %-66s %6s %10s\n" "kernel" "count" "total µs" - println(io, " ", "-"^66, " ", "-"^6, " ", "-"^10) - for nm in sorted - label = length(nm) <= 66 ? nm : first(nm, 63) * "..." - @printf io " %-66s %6d %10.2f\n" label counts[nm] 1e6 * totals[nm] - end -end - -function pytorch_trace(f) - f(); - torch.cuda.synchronize() # warmup - ProfilerActivity = profiler.ProfilerActivity - prof = profiler.profile(activities = pylist([ProfilerActivity.CUDA])) - prof.__enter__() - try - f() - torch.cuda.synchronize() - finally - prof.__exit__(pybuiltins.None, pybuiltins.None, pybuiltins.None) - end - return prof.key_averages().table(sort_by = "cuda_time_total") -end - -const _pygc = pyimport("gc") - -# ------------------------------------------------------------------------- -# Benchmark + verify for one (h, d, n) -# ------------------------------------------------------------------------- -function run_one(; h::Int, d::Int = 13, n::Int = 178, rtol::Float32 = 1.0f-3) - println("\n" * "="^72) - @printf "h = %d, d = %d, n = %d\n" h d n - println("="^72) - - Random.seed!(0) - W1 = randn(Float32, h, d) - W2 = randn(Float32, 1, h) - X = randn(Float32, d, n) - y = randn(Float32, 1, n) - - # ----- Julia / CUDA.jl ----- - W1g = CuArray(W1); - W2g = CuArray(W2); - Xg = CuArray(X); - yg = CuArray(y) - grad_julia = Array(reverse_diff(W1g, W2g, Xg, yg)) - grad_julia_v4 = Array(reverse_diff_v4(W1g, W2g, Xg, yg)) - grad_julia_v5 = Array(reverse_diff_v5(W1g, W2g, Xg, yg)) - print("cuBLASLt build_lt_plans for h=$h ... "); - flush(stdout) - t_lt_build = @elapsed lt_plans = build_lt_plans(W1g, W2g, Xg) - @printf "%.3f s\n" t_lt_build - grad_julia_v6 = Array(reverse_diff_v6(lt_plans, W1g, W2g, Xg, yg)) - CUDA.synchronize() - - # Lux + Mooncake setup. build_rrule compiles the reverse pass for these - # types (one-time cost per shape); first call afterwards still does some - # JIT, so we time both separately. - print("Lux+Mooncake build_rrule for h=$h ... "); - flush(stdout) - t_lux_build = @elapsed lm = build_lux(W1g, W2g, Xg, yg) - @printf "%.2f s, " t_lux_build - print("first call ... "); - flush(stdout) - t_lux_first = @elapsed begin - lux_grad(lm); - CUDA.synchronize() - end - @printf "%.2f s\n" t_lux_first - grad_lux = Array(lux_grad(lm)) - CUDA.synchronize() - - # ----- PyTorch ----- - W1t, W2t, Xt, yt = build_torch_tensors(W1, W2, X, y) - grad_pytorch_eager = torch_to_julia(pytorch_grad_eager(W1t, W2t, Xt, yt)) - torch.cuda.synchronize() - - # First call to the compiled fn for this shape triggers Inductor codegen - # (can take seconds). Time it so the user knows. - print("torch.compile codegen for h=$h ... 
"); - flush(stdout) - t_compile = @elapsed begin - pytorch_grad_compiled(W1t, W2t, Xt, yt) - torch.cuda.synchronize() - end - @printf "%.2f s\n" t_compile - grad_pytorch_compiled = - torch_to_julia(pytorch_grad_compiled(W1t, W2t, Xt, yt)) - torch.cuda.synchronize() - - # ----- Numerical equivalence ----- - for (name, g) in [ - ("Julia v4 (vec=4) ", grad_julia_v4), - ("Julia v5 (vec=4+SIMT)", grad_julia_v5), - ("Julia v6 (vec=4+Lt)", grad_julia_v6), - ("Lux + Mooncake ", grad_lux), - ("PyTorch eager ", grad_pytorch_eager), - ("PyTorch compiled ", grad_pytorch_compiled), - ] - maxdiff = maximum(abs.(grad_julia .- g)) - relmag = maxdiff / max(maximum(abs.(grad_julia)), eps(Float32)) - ok = isapprox(grad_julia, g; rtol = rtol, atol = 1.0f-4) - @printf "%s vs Julia broadcast: max|Δ| = %.3e (rel %.2e) match=%s\n" name maxdiff relmag ok - end - - # ----- Benchmarks ----- - # samples=30 evals=1 caps total iterations so the PyTorch caching allocator - # doesn't blow up at h=4096; setup= clears it between samples. - println("\n--- benchmark (median of 30 samples, post-sync) ---") - bj = @benchmark begin - reverse_diff($W1g, $W2g, $Xg, $yg) - CUDA.synchronize() - end samples=30 evals=1 seconds=10 - bj4 = @benchmark begin - reverse_diff_v4($W1g, $W2g, $Xg, $yg) - CUDA.synchronize() - end samples=30 evals=1 seconds=10 - bj5 = @benchmark begin - reverse_diff_v5($W1g, $W2g, $Xg, $yg) - CUDA.synchronize() - end samples=30 evals=1 seconds=10 - bj6 = @benchmark begin - reverse_diff_v6($lt_plans, $W1g, $W2g, $Xg, $yg) - CUDA.synchronize() - end samples=30 evals=1 seconds=10 - bjlux = @benchmark begin - lux_grad($lm) - CUDA.synchronize() - end samples=30 evals=1 seconds=10 - be = @benchmark begin - pytorch_grad_eager($W1t, $W2t, $Xt, $yt) - $torch.cuda.synchronize() - end samples=30 evals=1 seconds=10 setup=( - $(_pygc).collect(); - $torch.cuda.empty_cache() - ) - bc = @benchmark begin - pytorch_grad_compiled($W1t, $W2t, $Xt, $yt) - $torch.cuda.synchronize() - end samples=30 evals=1 seconds=10 setup=( - $(_pygc).collect(); - $torch.cuda.empty_cache() - ) - @printf "Julia broadcast : median %8.3f µs\n" 1e-3 * median(bj).time - @printf "Julia vec=4 : median %8.3f µs\n" 1e-3 * median(bj4).time - @printf "Julia vec=4 + SIMT : median %8.3f µs\n" 1e-3 * median(bj5).time - @printf "Julia vec=4 + cuBLASLt: median %8.3f µs\n" 1e-3 * median(bj6).time - @printf "Lux + Mooncake : median %8.3f µs\n" 1e-3 * median(bjlux).time - @printf "PyTorch eager : median %8.3f µs\n" 1e-3 * median(be).time - @printf "PyTorch compiled : median %8.3f µs\n" 1e-3 * median(bc).time - - # ----- CUDA traces ----- - println("\n--- CUDA trace: Julia broadcast ---") - summarize_julia_trace( - stdout, - julia_trace(() -> reverse_diff(W1g, W2g, Xg, yg)), - ) - - println("\n--- CUDA trace: Julia vec=4 ---") - summarize_julia_trace( - stdout, - julia_trace(() -> reverse_diff_v4(W1g, W2g, Xg, yg)), - ) - - println("\n--- CUDA trace: Julia vec=4 + SIMT ---") - summarize_julia_trace( - stdout, - julia_trace(() -> reverse_diff_v5(W1g, W2g, Xg, yg)), - ) - - println("\n--- CUDA trace: Julia vec=4 + cuBLASLt ---") - summarize_julia_trace( - stdout, - julia_trace(() -> reverse_diff_v6(lt_plans, W1g, W2g, Xg, yg)), - ) - - println("\n--- CUDA trace: Lux + Mooncake ---") - summarize_julia_trace(stdout, julia_trace(() -> lux_grad(lm))) - - println("\n--- CUDA trace: PyTorch eager ---") - println(pytorch_trace(() -> pytorch_grad_eager(W1t, W2t, Xt, yt))) - - println("\n--- CUDA trace: PyTorch compiled ---") - println(pytorch_trace(() -> 
pytorch_grad_compiled(W1t, W2t, Xt, yt))) - - return nothing -end - -# ------------------------------------------------------------------------- -# Main -# ------------------------------------------------------------------------- -function main() - if !CUDA.functional() - error("CUDA is not functional in this environment.") - end - if !pyconvert(Bool, torch.cuda.is_available()) - error("PyTorch reports CUDA is not available.") - end - # Match PyTorch's default of fast tanh / fast intrinsics. Affects BOTH - # Julia versions equally, so the broadcast-vs-vec=4 comparison still - # isolates the kernel-design effect. - CUDA.math_mode!(CUDA.FAST_MATH) - println( - "CUDA.jl device : ", - CUDA.name(CUDA.device()), - " (math_mode=FAST_MATH)", - ) - println( - "PyTorch device : ", - pyconvert(String, torch.cuda.get_device_name(0)), - ) - - for h in (16, 256, 4096) - run_one(; h = h) - # Release per-h tensors from both caching allocators before the next sweep. - GC.gc(true) - CUDA.reclaim() - _pygc.collect() - torch.cuda.empty_cache() - end -end - -main() diff --git a/perf/hand_cuda.jl b/perf/hand_cuda.jl new file mode 100644 index 0000000..b69de98 --- /dev/null +++ b/perf/hand_cuda.jl @@ -0,0 +1,114 @@ +module HandCuda + +using BenchmarkTools + +# Hand-written CUDA reverse-mode gradient of the 2-layer MLP loss +# +# L = sum((W2 * tanh(W1 * X) - y) .^ 2) +# +# `W1 ∈ R^{h×d}` is the only differentiated input; `W2 ∈ R^{m×h}`, +# `X ∈ R^{d×n}`, `y ∈ R^{m×n}` are constants. The `/n` scaling is dropped to +# match the `arraydiff.jl` path, which can't currently express it cleanly +# because of a latent bug in ArrayDiff's `:/` operator on non-scalar tape +# offsets — a constant scaling factor changes nothing about the gradient +# pathway being measured. +# +# Buffers are preallocated in `Buffers{T}` and reused across calls (Hand-CUDA +# is meant as the lower-bound baseline for ArrayDiff, which preallocates its +# tape once at `MOI.initialize`). Five device buffers cover everything: `y_1` +# also holds `W1*X` (overwritten in place by `tanh`); `J_2` also holds +# `W2*y_1` (overwritten by `2 .* (J_2 .- y)`); `W2T_J2` is scaled in place by +# `J_1`. + +using Random +using LinearAlgebra +using CUDA + +# Output dimension `m`. Fixed at 2 to match `arraydiff.jl`, which can't parse +# a 1-row `:vcat` (the forward unpacks two children). +const OUT_DIM = 2 + +struct Buffers{M<:AbstractMatrix} + y_1::M # h × n = tanh.(W1 * X) + J_1::M # h × n = 1 .- y_1.^2 + J_2::M # m × n = 2 .* (W2*y_1 .- y) + W2T_J2::M # h × n = W2' * J_2, then ⊙= J_1 + grad::M # h × d = W2T_J2 * X' +end + +function Buffers{M}(h::Int, d::Int, n::Int) where {M} + return Buffers{M}( + M(undef, h, n), + M(undef, h, n), + M(undef, OUT_DIM, n), + M(undef, h, n), + M(undef, h, d), + ) +end + +function gradient!(buf::Buffers, W1, W2, X, y) + LinearAlgebra.mul!(buf.y_1, W1, X) # y_1 = W1 * X + buf.y_1 .= tanh.(buf.y_1) # y_1 = tanh.(y_1) + buf.J_1 .= 1 .- buf.y_1 .^ 2 + LinearAlgebra.mul!(buf.J_2, W2, buf.y_1) + buf.J_2 .= 2 .* (buf.J_2 .- y) + LinearAlgebra.mul!(buf.W2T_J2, W2', buf.J_2) + buf.W2T_J2 .= buf.J_1 .* buf.W2T_J2 + LinearAlgebra.mul!(buf.grad, buf.W2T_J2, X') + return buf.grad +end + +# Allocating reverse pass — same arithmetic, but every intermediate is a +# fresh `CuArray`. Useful as a "what does naive Julia broadcast cost?" +# baseline; expect ~6 transient allocations per call (one per dotted result). 
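+#
+# For reference, the chain rule behind both `gradient!` above and
+# `gradient_alloc` below (a standard matrix-calculus sketch): with
+# Z = W1 * X, A = tanh.(Z), R = W2 * A .- y, and L = sum(R .^ 2),
+#
+#   ∂L/∂R  = 2 .* R                       (= J_2)
+#   ∂L/∂A  = W2' * (∂L/∂R)
+#   ∂L/∂Z  = (1 .- A .^ 2) .* (∂L/∂A)     (tanh' = 1 - tanh^2, the J_1 factor)
+#   ∂L/∂W1 = (∂L/∂Z) * X'
+#
+# which is exactly `(J_1 .* (W2' * J_2)) * X'`.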
+function gradient_alloc(W1, W2, X, y)
+    y_1 = tanh.(W1 * X)
+    J_1 = 1 .- y_1 .^ 2
+    J_2 = 2 .* (W2 * y_1 .- y)
+    return (J_1 .* (W2' * J_2)) * X'
+end
+
+"""
+    neural(T, h, d, n; prealloc::Bool = true, gpu::Bool = false) -> BenchmarkTools.Trial
+
+Benchmark `∂L/∂W1` of the 2-layer MLP loss with the hand-written reverse
+pass (device arrays when `gpu = true`, host arrays otherwise). Returns the
+`BenchmarkTools.Trial`; each sample leaves the `(h, d)` gradient in
+`buf.grad` as a side effect.
+
+`prealloc = true` (default) reuses a single `Buffers` instance across all
+samples — the apples-to-apples baseline against ArrayDiff (which
+preallocates its tape once at `MOI.initialize`). `prealloc = false` runs
+the same arithmetic with allocating dotted broadcasts, so each call mints
+fresh arrays for every intermediate.
+"""
+function neural(
+    ::Type{T},
+    h::Int,
+    d::Int,
+    n::Int;
+    prealloc::Bool = true,
+    gpu::Bool = false,
+) where {T<:Real}
+    Random.seed!(0)
+    W1 = randn(T, h, d)
+    W2 = randn(T, OUT_DIM, h)
+    X = randn(T, d, n)
+    y = randn(T, OUT_DIM, n)
+    if gpu
+        W1g, W2g, Xg, yg = CuArray(W1), CuArray(W2), CuArray(X), CuArray(y)
+        CUDA.synchronize()
+    else
+        W1g, W2g, Xg, yg = W1, W2, X, y
+    end
+    # Allocate the reusable buffer set once, outside the timed block, so the
+    # `prealloc = true` path measures buffer *reuse* (as documented above)
+    # rather than five fresh allocations per sample.
+    buf = Buffers{typeof(W1g)}(h, d, n)
+    return @benchmark begin
+        if $prealloc
+            gradient!($buf, $W1g, $W2g, $Xg, $yg)
+        else
+            gradient_alloc($W1g, $W2g, $Xg, $yg)
+        end
+        if $gpu
+            CUDA.synchronize()
+        end
+    end
+end
+
+end # module
diff --git a/perf/lux.jl b/perf/lux.jl
new file mode 100644
index 0000000..e6774e7
--- /dev/null
+++ b/perf/lux.jl
@@ -0,0 +1,74 @@
+module LuxNeural
+
+# Lux + Mooncake reverse-mode AD of the 2-layer MLP loss
+#
+# L = sum((W2 * tanh(W1 * X) - y) .^ 2)
+#
+# Same arithmetic as the other `perf/*.jl` paths, expressed as a Lux `Chain`
+# of two bias-less `Dense` layers (`tanh` then `identity`) and differentiated
+# with `Mooncake.value_and_gradient!!`. Goes through the same CUDA.jl /
+# cuBLAS stack as `hand_cuda.jl` — what we're isolating is the AD/dispatch
+# overhead that Lux + Mooncake add on top.
+
+using Random
+using BenchmarkTools
+using CUDA
+using Lux
+import Mooncake
+
+const OUT_DIM = 2
+
+function _build(::Type{T}, h::Int, d::Int, n::Int, gpu::Bool) where {T<:Real}
+    Random.seed!(0)
+    W1 = randn(T, h, d)
+    W2 = randn(T, OUT_DIM, h)
+    X = randn(T, d, n)
+    y = randn(T, OUT_DIM, n)
+    if gpu
+        W1, W2, X, y = CuArray(W1), CuArray(W2), CuArray(X), CuArray(y)
+        CUDA.synchronize()
+    end
+
+    model = Lux.Chain(
+        Lux.Dense(d => h, tanh; use_bias = false),
+        Lux.Dense(h => OUT_DIM, identity; use_bias = false),
+    )
+    ps = (layer_1 = (weight = W1,), layer_2 = (weight = W2,))
+    st = Lux.initialstates(Random.default_rng(), model)
+
+    # Closure captures everything except `p`, which is what we differentiate.
+    loss_fn = let model = model, st = st, X = X, y = y
+        p -> begin
+            y_hat, _ = model(X, p, st)
+            return sum((y_hat .- y) .^ 2)
+        end
+    end
+    rule = Mooncake.build_rrule(loss_fn, ps)
+    return (rule = rule, loss_fn = loss_fn, ps = ps)
+end
+
+"""
+    neural(T, h, d, n; gpu::Bool = false) -> BenchmarkTools.Trial
+
+Benchmark the gradient of the 2-layer MLP loss using Lux + Mooncake. Note
+that `value_and_gradient!!` differentiates the full `ps` tuple, so the timed
+work covers `∂L/∂W2` alongside `∂L/∂W1`. `Mooncake.build_rrule` (the
+expensive per-shape compile) happens once before the trial; the timed block
+is just `value_and_gradient!!` plus a `CUDA.synchronize` when `gpu=true`.
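+
+A typical call, with the same illustrative sizes as `perf/neural_bench.jl`:
+
+    LuxNeural.neural(Float32, 4096, 13, 178; gpu = true)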
+""" +function neural( + ::Type{T}, + h::Int, + d::Int, + n::Int; + gpu::Bool = false, +) where {T<:Real} + state = _build(T, h, d, n, gpu) + return @benchmark begin + Mooncake.value_and_gradient!!($state.rule, $state.loss_fn, $state.ps) + if $gpu + CUDA.synchronize() + end + end +end + +end # module diff --git a/perf/neural_bench.jl b/perf/neural_bench.jl new file mode 100644 index 0000000..310c7be --- /dev/null +++ b/perf/neural_bench.jl @@ -0,0 +1,25 @@ +include("lux.jl") +include("hand_cuda.jl") +include("pytorch.jl") +include("arraydiff.jl") + +function compare(T, h, d, n; gpu) + println("Lux") + display(LuxNeural.neural(T, h, d, n; gpu)) + println("Hand-CUDA without prealloc") + display(HandCuda.neural(T, h, d, n; prealloc = false, gpu)) + println("Hand-CUDA with prealloc") + display(HandCuda.neural(T, h, d, n; prealloc = true, gpu)) + println("PyTorch eager") + display(PyTorchNeural.neural(T, h, d, n; eager = true, gpu)) + println("PyTorch compiled") + display(PyTorchNeural.neural(T, h, d, n; eager = false, gpu)) + println("ArrayDiff") + return display(ArrayDiffNeural.neural(T, h, d, n; gpu)) +end + +T, h, d, n = Float32, 4096, 13, 178 +println("CPU") +compare(T, h, d, n; gpu = false) +println("GPU") +compare(T, h, d, n; gpu = true) diff --git a/perf/pytorch.jl b/perf/pytorch.jl new file mode 100644 index 0000000..b33dcb5 --- /dev/null +++ b/perf/pytorch.jl @@ -0,0 +1,99 @@ +module PyTorchNeural + +# PyTorch autograd of the 2-layer MLP loss +# +# L = sum((W2 * tanh(W1 * X) - y) .^ 2) +# +# `neural(T, h, d, n; eager, gpu)` builds tensors + warms up the function +# (paying any `torch.compile` codegen cost), then returns a +# `BenchmarkTools.Trial` that times only the per-call forward+backward. + +using Random +using BenchmarkTools +using PythonCall + +const torch = pyimport("torch") +const np = pyimport("numpy") + +# Output dimension `m`. Fixed at 2 to match the other modules. +const OUT_DIM = 2 + +const _grad_fn_eager, _grad_fn_compiled = let + # `import torch` inside `_eager` so it lands in the function's + # `__globals__` at call time — `@pyexec` runs with separate globals and + # locals dicts, and the top-level `import torch` only populates locals, + # leaving `_eager` unable to resolve `torch` when invoked later. + # + # `_compiled = torch.compile(_eager)` runs Inductor codegen on the first + # call for each new input shape (multi-second cost); subsequent calls + # reuse the cached graph. + nt = @pyexec """ +import torch +def _eager(W1, W2, X, y): + import torch + y1 = torch.tanh(torch.matmul(W1, X)) + diff = torch.matmul(W2, y1) - y + loss = (diff * diff).sum() + return torch.autograd.grad(loss, W1)[0] +_compiled = torch.compile(_eager) +""" => (_eager::Py, _compiled::Py) + (nt._eager, nt._compiled) +end + +const _torch_dtype = + Dict{DataType,Py}(Float32 => torch.float32, Float64 => torch.float64) + +# Julia is column-major, NumPy is row-major. `PyArray(::Matrix)` exposes the +# same memory as an F-contiguous numpy view; `np.ascontiguousarray` copies it +# into C-contiguous layout so `torch.from_numpy` gets the shape PyTorch +# expects without surprises. +function _to_device(arr::AbstractArray, dtype::Py, device::String) + np_arr = np.ascontiguousarray(np.asarray(PyArray(arr))) + return torch.from_numpy(np_arr).to(device, dtype = dtype) +end + +""" + neural(T, h, d, n; eager::Bool = true, gpu::Bool = false) -> BenchmarkTools.Trial + +Benchmark `∂L/∂W1` of the 2-layer MLP loss using PyTorch autograd. 
Tensor +setup and the warmup call (which pays `torch.compile`'s codegen if +`eager=false`) happen once before the trial; the timed block is just the +per-call forward+backward plus a `cuda.synchronize` when `gpu=true`. + +`eager = true` (default) uses standard eager-mode autograd. `eager = false` +runs the same function under `torch.compile`, which fuses elementwise ops +into Triton kernels. +""" +function neural( + ::Type{T}, + h::Int, + d::Int, + n::Int; + eager::Bool = true, + gpu::Bool = false, +) where {T<:Real} + Random.seed!(0) + W1 = randn(T, h, d) + W2 = randn(T, OUT_DIM, h) + X = randn(T, d, n) + y = randn(T, OUT_DIM, n) + dt = _torch_dtype[T] + device = gpu ? "cuda" : "cpu" + W1t = _to_device(W1, dt, device).requires_grad_(true) + W2t = _to_device(W2, dt, device) + Xt = _to_device(X, dt, device) + yt = _to_device(y, dt, device) + fn = eager ? _grad_fn_eager : _grad_fn_compiled + fn(W1t, W2t, Xt, yt) # warmup (codegen for compiled path) + if gpu + torch.cuda.synchronize() + end + return @benchmark begin + $fn($W1t, $W2t, $Xt, $yt) + if $gpu + $torch.cuda.synchronize() + end + end +end + +end # module
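+
+# Hypothetical post-processing helper — a minimal sketch appended here only
+# for illustration, not part of the benchmark suite proper. It compares two
+# `BenchmarkTools.Trial`s (e.g. the values returned by the `neural`
+# functions in these modules); `speedup` is an illustrative name.
+import BenchmarkTools as _BT
+
+# Median-time ratio a/b, e.g.
+#   speedup(PyTorchNeural.neural(Float32, 4096, 13, 178; gpu = true),
+#           HandCuda.neural(Float32, 4096, 13, 178; gpu = true))
+speedup(a::_BT.Trial, b::_BT.Trial) = _BT.ratio(_BT.median(a), _BT.median(b))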