From 1e4a01aa99a94329fd0bf2dc33728975705800b3 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Beno=C3=AEt=20Legat?= <benoit.legat@gmail.com>
Date: Wed, 6 May 2026 17:52:16 +0200
Subject: [PATCH 1/2] Add prealloc in benchmark

---
 perf/arraydiff_gpu.jl | 192 +++++++++++++++++++++++++++++-------------
 1 file changed, 132 insertions(+), 60 deletions(-)

diff --git a/perf/arraydiff_gpu.jl b/perf/arraydiff_gpu.jl
index 25ada54..232d9d1 100644
--- a/perf/arraydiff_gpu.jl
+++ b/perf/arraydiff_gpu.jl
@@ -1,6 +1,9 @@
 # Benchmark `ArrayDiff` GPU gradient computation against the hand-written
-# CUDA `reverse_diff` reference (the same one as `perf/cuda_vs_pytorch.jl`,
-# but in `Float64` so it matches `ArrayDiff`'s tape eltype).
+# CUDA `reverse_diff` reference (the same one as `perf/cuda_vs_pytorch.jl`).
+#
+# Float32 throughout — this is the precision that maps onto the tensor cores
+# / SP pipeline a GPU is actually optimized for, and matches what a typical
+# ML workload (PyTorch default) feeds the device.
 #
 # What this measures
 # ------------------
@@ -8,31 +11,22 @@
 # the only `@variable` (so `MOI.eval_objective_gradient` returns ∂loss/∂W1)
 # and `W2`, `X`, `y` are constants.
 #
-#   * Hand-CUDA Float64: `reverse_diff(W1, W2, X, y)` — the same hand-written
-#     formula as the existing perf script, just promoted to `Float64` so the
-#     numbers are directly comparable to ArrayDiff.
+#   * Hand-CUDA Float32 (allocating): `reverse_diff(W1, W2, X, y)` — same
+#     hand-written formula as `perf/cuda_vs_pytorch.jl`, allocates fresh
+#     intermediates per call. This is the "naive" hand-tuned baseline.
+#
+#   * Hand-CUDA Float32 (preallocated): `reverse_diff_prealloc!(buf, ...)` —
+#     same arithmetic but every intermediate lives in a `HandPrealloc` buffer
+#     that is reused across calls. This is the apples-to-apples comparison
+#     against ArrayDiff, which preallocates its tape once at
+#     `MOI.initialize` and never allocates again.
 #
-#   * ArrayDiff CPU: same model with `Mode{Vector{Float64}}()` — sanity check
+#   * ArrayDiff CPU: same model with `Mode{Vector{Float32}}()` — sanity check
 #     for what the existing CPU AD path costs at this problem size.
 #
-#   * ArrayDiff GPU: same model with `Mode{CuVector{Float64}}()` — the path
+#   * ArrayDiff GPU: same model with `Mode{CuVector{Float32}}()` — the path
 #     this benchmark exists to measure.
 #
-# Caveats (read this before reading the numbers)
-# ----------------------------------------------
-# The current `ArrayDiff` GPU path still does scalar reads/writes on the
-# tape for every `NODE_VARIABLE` (one per `W1[i,j]`) and every `NODE_VALUE`
-# (one per scalar inside the `vcat(row(...), row(...))` expansion of
-# constant matrices `W2`, `X`, `y`). On a `CuArray` each scalar load/store
-# is a separate `cudaMemcpy`, so for `h = 4096` and `d = 13` that's tens of
-# thousands of round-trips per call. We expect the GPU path to be much
-# slower than the hand-written CUDA reference until those leaves are
-# loaded/extracted in bulk — see the `NODE_VARIABLE_BLOCK` /
-# `NODE_VALUE_BLOCK` follow-up.
-#
-# The whole `optimize!` loop is wrapped in `CUDA.@allowscalar` so the
-# scalar paths don't error out under GPUArrays' default scalar policy.
-#
 # Run
 # ---
 #   cd ~/.julia/dev/ArrayDiff
@@ -47,13 +41,17 @@ using ArrayDiff
 import MathOptInterface as MOI
 
 # -------------------------------------------------------------------------
-# Hand-written CUDA Float64 reverse_diff — same formulas as
-# `perf/cuda_vs_pytorch.jl::forward_pass`/`reverse_diff`, but in `Float64`
-# and *without* the `/ n` scaling. ArrayDiff's `:/` operator currently
-# has a latent bug with non-scalar tape offsets (it reads
-# `forward_storage[node_idx]` instead of `forward_storage[storage_offset[node_idx]+1]`),
-# so we drop the constant scaling factor on both sides — it doesn't change
-# what the gradient pathway exercises.
+# Hand-written CUDA reverse_diff — same formulas as
+# `perf/cuda_vs_pytorch.jl::forward_pass`/`reverse_diff`, but *without* the
+# `/ n` scaling. ArrayDiff's `:/` operator currently has a latent bug with
+# non-scalar tape offsets (it reads `forward_storage[node_idx]` instead of
+# `forward_storage[storage_offset[node_idx]+1]`), so we drop the constant
+# scaling factor on both sides — it doesn't change what the gradient
+# pathway exercises.
+#
+# These functions are eltype-generic (they're just dotted broadcasts and
+# `*`), so the same code runs at `Float32` and `Float64` depending on the
+# eltype of the `CuArray` inputs.
 # -------------------------------------------------------------------------
 function forward_pass(W1, W2, X, y)
     y_1 = tanh.(W1 * X)
@@ -67,24 +65,80 @@ function reverse_diff(W1, W2, X, y)
     return (J_1 .* (W2' * J_2)) * X'
 end
 
+# -------------------------------------------------------------------------
+# Hand-written CUDA reverse_diff — preallocated variant.
+#
+# `reverse_diff` above allocates ~6 fresh `CuArray`s per call (one for
+# `tanh.(...)`, one for `1 .- y_1.^2`, one for `2 .* (...)`, plus three for
+# the chained `*` / `.*` / `*` in the reverse step). At h = 4096 each is a
+# multi-MB matrix, and the allocator overhead dominates the actual GEMM
+# compute. To be a fair "hand-tuned" baseline against ArrayDiff (which
+# allocates its tape exactly once at `MOI.initialize` and reuses it), we
+# pre-allocate all intermediates in `HandPrealloc` and run forward + reverse
+# with `mul!` + in-place broadcasts.
+#
+# Buffer reuse: `y_1` doubles as the `W1 * X` output, then is overwritten
+# in place by `tanh`. `J_2` doubles as the `W2 * y_1` output and is then
+# overwritten by `2 .* (J_2 - y)`. `W2T_J2` is scaled in place by `J_1`.
+# Five device buffers cover everything.
+# -------------------------------------------------------------------------
+struct HandPrealloc{T<:Real}
+    y_1::CuArray{T,2}      # h × n   (= tanh.(W1 * X))
+    J_1::CuArray{T,2}      # h × n   (= 1 .- y_1.^2)
+    J_2::CuArray{T,2}      # out × n (= 2 .* (W2*y_1 - y))
+    W2T_J2::CuArray{T,2}   # h × n   (= W2' * J_2, then ⊙= J_1)
+    grad::CuArray{T,2}     # h × d   (= W2T_J2 * X')
+end
+
+function HandPrealloc{T}(h::Int, d::Int, n::Int, out_dim::Int) where {T}
+    return HandPrealloc{T}(
+        CuArray{T}(undef, h, n),
+        CuArray{T}(undef, h, n),
+        CuArray{T}(undef, out_dim, n),
+        CuArray{T}(undef, h, n),
+        CuArray{T}(undef, h, d),
+    )
+end
+
+function reverse_diff_prealloc!(buf::HandPrealloc, W1, W2, X, y)
+    # Forward
+    LinearAlgebra.mul!(buf.y_1, W1, X)            # y_1 = W1 * X
+    buf.y_1 .= tanh.(buf.y_1)                     # y_1 = tanh.(y_1)
+    buf.J_1 .= 1 .- buf.y_1 .^ 2                  # J_1 = 1 .- y_1.^2
+    LinearAlgebra.mul!(buf.J_2, W2, buf.y_1)      # J_2 = W2 * y_1
+    buf.J_2 .= 2 .* (buf.J_2 .- y)                # J_2 = 2 .* (W2*y_1 - y)
+    # Reverse — (J_1 .* (W2' * J_2)) * X'
+    LinearAlgebra.mul!(buf.W2T_J2, W2', buf.J_2)  # W2T_J2 = W2' * J_2
+    buf.W2T_J2 .= buf.J_1 .* buf.W2T_J2           # W2T_J2 ⊙= J_1
+    LinearAlgebra.mul!(buf.grad, buf.W2T_J2, X')  # grad = W2T_J2 * X'
+    return buf.grad
+end
+
 # -------------------------------------------------------------------------
 # ArrayDiff path. Builds a JuMP model with `W1` as the only `@variable` and
 # `W2 / X / y` baked in as constant matrices, parses to ArrayDiff, and
 # returns an evaluator + a closure that computes ∂loss/∂W1 in `g`.
 # -------------------------------------------------------------------------
 function build_arraydiff(
-    W2::Matrix{Float64},
-    X::Matrix{Float64},
-    y::Matrix{Float64},
+    W2::Matrix{T},
+    X::Matrix{T},
+    y::Matrix{T},
     h::Int,
     d::Int,
     n::Int,
     mode::ArrayDiff.Mode,
-)
+) where {T<:Real}
+    # JuMP works in Float64 internally; ArrayDiff converts the parsed
+    # `Expression{Float64}` constants into the `Mode`'s eltype (here Float32)
+    # when filling its tape. We promote constants to Float64 for the JuMP
+    # build to keep that conversion explicit / one-way.
+    W2_64 = Float64.(W2)
+    X_64 = Float64.(X)
+    y_64 = Float64.(y)
     model = JuMP.Model()
     @variable(model, W1[1:h, 1:d], container = ArrayDiff.ArrayOfVariables)
-    Y = W2 * tanh.(W1 * X)
-    diff = Y .- y
+    Y = W2_64 * tanh.(W1 * X_64)
+    diff = Y .- y_64
     loss = sum(diff .^ 2)  # `/ n` dropped — see `forward_pass` comment.
     ad = ArrayDiff.model(mode)
     MOI.Nonlinear.set_objective(ad, JuMP.moi_function(loss))
@@ -97,16 +151,15 @@ function build_arraydiff(
     return evaluator
 end
 
-# CPU path — `Mode()` defaults to `Vector{Float64}` storage, no `@allowscalar`
-# needed.
 function arraydiff_grad_cpu!(g, evaluator, x)
     MOI.eval_objective_gradient(evaluator, g, x)
     return g
 end
 
-# GPU path — `Mode{CuVector{Float64}}()`. The current implementation still
-# uses scalar tape access for `NODE_VARIABLE`/`NODE_VALUE` leaves, so the
-# call needs `@allowscalar` to get past GPUArrays' scalar-indexing policy.
+# `@allowscalar` covers the residual scalar leaves that the BLOCK rewrite
+# can't fold (e.g. a stand-alone `NODE_VALUE` inside a non-block subterm).
+# The hot path is bulk; this is a guard so we don't trip GPUArrays' policy
+# on the leftovers.
 function arraydiff_grad_gpu!(g, evaluator, x)
     CUDA.@allowscalar MOI.eval_objective_gradient(evaluator, g, x)
     return g
@@ -115,11 +168,12 @@ end
 # -------------------------------------------------------------------------
 # One (h, d, n) sweep.
 # -------------------------------------------------------------------------
-function run_one(; h::Int, d::Int = 13, n::Int = 178, rtol::Float64 = 1e-6)
+function run_one(; h::Int, d::Int = 13, n::Int = 178, rtol::Float32 = 1f-3)
     println("\n" * "="^72)
-    @printf "h = %d, d = %d, n = %d\n" h d n
+    @printf "h = %d, d = %d, n = %d  (Float32)\n" h d n
     println("="^72)
 
+    T = Float32
     Random.seed!(0)
     # `W2` and `y` are deliberately given 2 output rows (instead of the
     # 1-row used in `perf/cuda_vs_pytorch.jl`). The current `:vcat`
@@ -127,12 +181,13 @@ function run_one(; h::Int, d::Int = 13, n::Int = 178, rtol::Float64 = 1e-6)
     # least two rows; a single-row matrix would be parsed as a `vcat`
     # with one child and would error. Using 2 rows changes nothing about
     # the gradient pathway being measured.
-    W1 = randn(Float64, h, d)
-    W2 = randn(Float64, 2, h)
-    X = randn(Float64, d, n)
-    y = randn(Float64, 2, n)
+    out_dim = 2
+    W1 = randn(T, h, d)
+    W2 = randn(T, out_dim, h)
+    X = randn(T, d, n)
+    y = randn(T, out_dim, n)
 
-    # Reference: hand-written CUDA in Float64.
+    # Reference: hand-written CUDA (allocating).
     W1g = CuArray(W1)
     W2g = CuArray(W2)
     Xg = CuArray(X)
@@ -140,14 +195,21 @@ function run_one(; h::Int, d::Int = 13, n::Int = 178, rtol::Float64 = 1e-6)
     grad_ref = Array(reverse_diff(W1g, W2g, Xg, yg))
     CUDA.synchronize()
 
+    # Hand-CUDA preallocated. Same arithmetic, but every intermediate is
+    # owned by `hand_buf` and reused across calls.
+    hand_buf = HandPrealloc{T}(h, d, n, out_dim)
+    grad_prealloc =
+        Array(reverse_diff_prealloc!(hand_buf, W1g, W2g, Xg, yg))
+    CUDA.synchronize()
+
     # ArrayDiff CPU.
     print("ArrayDiff CPU build (h=$h) ... ");
     flush(stdout)
-    t_cpu_build =
-        @elapsed ev_cpu = build_arraydiff(W2, X, y, h, d, n, ArrayDiff.Mode())
+    t_cpu_build = @elapsed ev_cpu =
+        build_arraydiff(W2, X, y, h, d, n, ArrayDiff.Mode{Vector{T}}())
     @printf "%.2f s\n" t_cpu_build
     x_cpu = vec(W1)
-    g_cpu = zeros(Float64, length(x_cpu))
+    g_cpu = zeros(T, length(x_cpu))
     arraydiff_grad_cpu!(g_cpu, ev_cpu, x_cpu)
     grad_cpu = reshape(g_cpu, h, d)
 
@@ -164,21 +226,26 @@ function run_one(; h::Int, d::Int = 13, n::Int = 178, rtol::Float64 = 1e-6)
         h,
         d,
         n,
-        ArrayDiff.Mode{CUDA.CuVector{Float64}}(),
+        ArrayDiff.Mode{CUDA.CuVector{T}}(),
     )
     @printf "%.2f s\n" t_gpu_build
-    x_gpu = CUDA.CuVector{Float64}(vec(W1))
-    g_gpu = CUDA.zeros(Float64, length(x_gpu))
+    x_gpu = CUDA.CuVector{T}(vec(W1))
+    g_gpu = CUDA.zeros(T, length(x_gpu))
     arraydiff_grad_gpu!(g_gpu, ev_gpu, x_gpu)
     CUDA.synchronize()
     grad_gpu = reshape(Array(g_gpu), h, d)
 
-    # Numerical equivalence.
-    for (name, g) in [("ArrayDiff CPU", grad_cpu), ("ArrayDiff GPU", grad_gpu)]
+    # Numerical equivalence (rtol loosened for Float32 reductions).
+    candidates = [
+        ("Hand-CUDA prealloc", grad_prealloc),
+        ("ArrayDiff CPU", grad_cpu),
+        ("ArrayDiff GPU", grad_gpu),
+    ]
+    for (name, g) in candidates
         maxdiff = maximum(abs.(grad_ref .- g))
-        relmag = maxdiff / max(maximum(abs.(grad_ref)), eps(Float64))
+        relmag = maxdiff / max(maximum(abs.(grad_ref)), eps(T))
         ok = isapprox(grad_ref, g; rtol = rtol)
-        @printf "%-15s vs hand-CUDA: max|Δ| = %.3e (rel %.2e)  match=%s\n" name maxdiff relmag ok
+        @printf "%-20s vs hand-CUDA alloc: max|Δ| = %.3e (rel %.2e)  match=%s\n" name maxdiff relmag ok
     end
 
     println("\n--- benchmark (median of N samples, post-sync) ---")
@@ -186,6 +253,10 @@ function run_one(; h::Int, d::Int = 13, n::Int = 178, rtol::Float64 = 1e-6)
         reverse_diff($W1g, $W2g, $Xg, $yg)
         CUDA.synchronize()
     end samples = 30 evals = 1 seconds = 10
+    bjp = @benchmark begin
+        reverse_diff_prealloc!($hand_buf, $W1g, $W2g, $Xg, $yg)
+        CUDA.synchronize()
+    end samples = 30 evals = 1 seconds = 10
     bcpu = @benchmark begin
         arraydiff_grad_cpu!($g_cpu, $ev_cpu, $x_cpu)
     end samples = 30 evals = 1 seconds = 10
@@ -196,9 +267,10 @@ function run_one(; h::Int, d::Int = 13, n::Int = 178, rtol::Float64 = 1e-6)
         arraydiff_grad_gpu!($g_gpu, $ev_gpu, $x_gpu)
         CUDA.synchronize()
     end samples = 30 evals = 1 seconds = 60
-    @printf "Hand-CUDA Float64 : median %12.2f µs\n" 1e-3 * median(bj).time
-    @printf "ArrayDiff CPU     : median %12.2f µs\n" 1e-3 * median(bcpu).time
-    @printf "ArrayDiff GPU     : median %12.2f µs\n" 1e-3 * median(bgpu).time
+    @printf "Hand-CUDA alloc    : median %12.2f µs\n" 1e-3 * median(bj).time
+    @printf "Hand-CUDA prealloc : median %12.2f µs\n" 1e-3 * median(bjp).time
+    @printf "ArrayDiff CPU      : median %12.2f µs\n" 1e-3 * median(bcpu).time
+    @printf "ArrayDiff GPU      : median %12.2f µs\n" 1e-3 * median(bgpu).time
 
     return nothing
 end

From 8e8eca41c1213a3990068db54e8491be63bb4aef Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Beno=C3=AEt=20Legat?= <benoit.legat@gmail.com>
Date: Wed, 6 May 2026 18:03:11 +0200
Subject: [PATCH 2/2] Fix format

---
 perf/arraydiff_gpu.jl | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/perf/arraydiff_gpu.jl b/perf/arraydiff_gpu.jl
index 232d9d1..e2fb29b 100644
--- a/perf/arraydiff_gpu.jl
+++ b/perf/arraydiff_gpu.jl
@@ -168,7 +168,7 @@ end
 # -------------------------------------------------------------------------
 # One (h, d, n) sweep.
 # -------------------------------------------------------------------------
-function run_one(; h::Int, d::Int = 13, n::Int = 178, rtol::Float32 = 1f-3)
+function run_one(; h::Int, d::Int = 13, n::Int = 178, rtol::Float32 = 1.0f-3)
     println("\n" * "="^72)
     @printf "h = %d, d = %d, n = %d  (Float32)\n" h d n
     println("="^72)
@@ -198,8 +198,7 @@ function run_one(; h::Int, d::Int = 13, n::Int = 178, rtol::Float32 = 1f-3)
     # Hand-CUDA preallocated. Same arithmetic, but every intermediate is
     # owned by `hand_buf` and reused across calls.
     hand_buf = HandPrealloc{T}(h, d, n, out_dim)
-    grad_prealloc =
-        Array(reverse_diff_prealloc!(hand_buf, W1g, W2g, Xg, yg))
+    grad_prealloc = Array(reverse_diff_prealloc!(hand_buf, W1g, W2g, Xg, yg))
     CUDA.synchronize()
 
     # ArrayDiff CPU.