From 3d5e0052746b56d83b34f42b20194e748e813917 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Beno=C3=AEt=20Legat?= Date: Wed, 6 May 2026 16:28:29 +0200 Subject: [PATCH 1/3] Add benchmark vs hardcoded CUDA --- perf/Project.toml | 6 ++ perf/arraydiff_gpu.jl | 220 ++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 226 insertions(+) create mode 100644 perf/arraydiff_gpu.jl diff --git a/perf/Project.toml b/perf/Project.toml index f7d180d..4aa5df2 100644 --- a/perf/Project.toml +++ b/perf/Project.toml @@ -4,12 +4,18 @@ authors = ["Benoît Legat "] version = "0.0.0" [deps] +ArrayDiff = "c45fa1ca-6901-44ac-ae5b-5513a4852d50" BenchmarkTools = "6e4b80f9-dd63-53aa-95a3-0cdb28fa8baf" CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba" CondaPkg = "992eb4ea-22a4-4c89-a5bb-47a3300528ab" +JuMP = "4076af6c-e467-56ae-b986-b466b2749572" LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e" Lux = "b2108857-7c20-44ae-9111-449ecde12c47" +MathOptInterface = "b8f27783-ece8-5eb3-8dc8-9495eed66fee" Mooncake = "da2b9cff-9c12-43a0-ae48-6db2b0edb7d6" Printf = "de0858da-6303-5e67-8744-51eddeeeb8d7" PythonCall = "6099a3de-0909-46bc-b1f4-468b9a2dfc0d" Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c" + +[sources] +ArrayDiff = {path = ".."} diff --git a/perf/arraydiff_gpu.jl b/perf/arraydiff_gpu.jl new file mode 100644 index 0000000..f7cb8df --- /dev/null +++ b/perf/arraydiff_gpu.jl @@ -0,0 +1,220 @@ +# Benchmark `ArrayDiff` GPU gradient computation against the hand-written +# CUDA `reverse_diff` reference (the same one as `perf/cuda_vs_pytorch.jl`, +# but in `Float64` so it matches `ArrayDiff`'s tape eltype). +# +# What this measures +# ------------------ +# A 2-layer MLP `loss = sum((W2 * tanh.(W1 * X) - y).^2) / n`, where `W1` is +# the only `@variable` (so `MOI.eval_objective_gradient` returns ∂loss/∂W1) +# and `W2`, `X`, `y` are constants. +# +# * Hand-CUDA Float64: `reverse_diff(W1, W2, X, y)` — the same hand-written +# formula as the existing perf script, just promoted to `Float64` so the +# numbers are directly comparable to ArrayDiff. +# +# * ArrayDiff CPU: same model with `Mode{Vector{Float64}}()` — sanity check +# for what the existing CPU AD path costs at this problem size. +# +# * ArrayDiff GPU: same model with `Mode{CuVector{Float64}}()` — the path +# this benchmark exists to measure. +# +# Caveats (read this before reading the numbers) +# ---------------------------------------------- +# The current `ArrayDiff` GPU path still does scalar reads/writes on the +# tape for every `NODE_VARIABLE` (one per `W1[i,j]`) and every `NODE_VALUE` +# (one per scalar inside the `vcat(row(...), row(...))` expansion of +# constant matrices `W2`, `X`, `y`). On a `CuArray` each scalar load/store +# is a separate `cudaMemcpy`, so for `h = 4096` and `d = 13` that's tens of +# thousands of round-trips per call. We expect the GPU path to be much +# slower than the hand-written CUDA reference until those leaves are +# loaded/extracted in bulk — see the `NODE_VARIABLE_BLOCK` / +# `NODE_VALUE_BLOCK` follow-up. +# +# The whole `optimize!` loop is wrapped in `CUDA.@allowscalar` so the +# scalar paths don't error out under GPUArrays' default scalar policy. +# +# Run +# --- +# cd ~/.julia/dev/ArrayDiff +# julia --project=perf -e 'using Pkg; Pkg.instantiate()' +# julia --project=perf perf/arraydiff_gpu.jl + +using Random, LinearAlgebra, Printf +using BenchmarkTools +using CUDA +using JuMP +using ArrayDiff +import MathOptInterface as MOI + +# ------------------------------------------------------------------------- +# Hand-written CUDA Float64 reverse_diff — same formulas as +# `perf/cuda_vs_pytorch.jl::forward_pass`/`reverse_diff`, but in `Float64` +# and *without* the `/ n` scaling. ArrayDiff's `:/` operator currently +# has a latent bug with non-scalar tape offsets (it reads +# `forward_storage[node_idx]` instead of `forward_storage[storage_offset[node_idx]+1]`), +# so we drop the constant scaling factor on both sides — it doesn't change +# what the gradient pathway exercises. +# ------------------------------------------------------------------------- +function forward_pass(W1, W2, X, y) + y_1 = tanh.(W1 * X) + J_1 = 1 .- y_1 .^ 2 + J_2 = 2 .* (W2 * y_1 .- y) + return y_1, J_1, J_2 +end + +function reverse_diff(W1, W2, X, y) + _, J_1, J_2 = forward_pass(W1, W2, X, y) + return (J_1 .* (W2' * J_2)) * X' +end + +# ------------------------------------------------------------------------- +# ArrayDiff path. Builds a JuMP model with `W1` as the only `@variable` and +# `W2 / X / y` baked in as constant matrices, parses to ArrayDiff, and +# returns an evaluator + a closure that computes ∂loss/∂W1 in `g`. +# ------------------------------------------------------------------------- +function build_arraydiff( + W2::Matrix{Float64}, + X::Matrix{Float64}, + y::Matrix{Float64}, + h::Int, + d::Int, + n::Int, + mode::ArrayDiff.Mode, +) + model = JuMP.Model() + @variable(model, W1[1:h, 1:d], container = ArrayDiff.ArrayOfVariables) + Y = W2 * tanh.(W1 * X) + diff = Y .- y + loss = sum(diff .^ 2) # `/ n` dropped — see `forward_pass` comment. + ad = ArrayDiff.model(mode) + MOI.Nonlinear.set_objective(ad, JuMP.moi_function(loss)) + evaluator = MOI.Nonlinear.Evaluator( + ad, + mode, + JuMP.index.(JuMP.all_variables(model)), + ) + MOI.initialize(evaluator, [:Grad]) + return evaluator +end + +# CPU path — `Mode()` defaults to `Vector{Float64}` storage, no `@allowscalar` +# needed. +function arraydiff_grad_cpu!(g, evaluator, x) + MOI.eval_objective_gradient(evaluator, g, x) + return g +end + +# GPU path — `Mode{CuVector{Float64}}()`. The current implementation still +# uses scalar tape access for `NODE_VARIABLE`/`NODE_VALUE` leaves, so the +# call needs `@allowscalar` to get past GPUArrays' scalar-indexing policy. +function arraydiff_grad_gpu!(g, evaluator, x) + CUDA.@allowscalar MOI.eval_objective_gradient(evaluator, g, x) + return g +end + +# ------------------------------------------------------------------------- +# One (h, d, n) sweep. +# ------------------------------------------------------------------------- +function run_one(; h::Int, d::Int = 13, n::Int = 178, rtol::Float64 = 1e-6) + println("\n" * "="^72) + @printf "h = %d, d = %d, n = %d\n" h d n + println("="^72) + + Random.seed!(0) + # `W2` and `y` are deliberately given 2 output rows (instead of the + # 1-row used in `perf/cuda_vs_pytorch.jl`). The current `:vcat` + # forward unpacks `idx1, idx2 = children_indices` and so requires at + # least two rows; a single-row matrix would be parsed as a `vcat` + # with one child and would error. Using 2 rows changes nothing about + # the gradient pathway being measured. + W1 = randn(Float64, h, d) + W2 = randn(Float64, 2, h) + X = randn(Float64, d, n) + y = randn(Float64, 2, n) + + # Reference: hand-written CUDA in Float64. + W1g = CuArray(W1) + W2g = CuArray(W2) + Xg = CuArray(X) + yg = CuArray(y) + grad_ref = Array(reverse_diff(W1g, W2g, Xg, yg)) + CUDA.synchronize() + + # ArrayDiff CPU. + print("ArrayDiff CPU build (h=$h) ... "); flush(stdout) + t_cpu_build = @elapsed ev_cpu = + build_arraydiff(W2, X, y, h, d, n, ArrayDiff.Mode()) + @printf "%.2f s\n" t_cpu_build + x_cpu = vec(W1) + g_cpu = zeros(Float64, length(x_cpu)) + arraydiff_grad_cpu!(g_cpu, ev_cpu, x_cpu) + grad_cpu = reshape(g_cpu, h, d) + + # ArrayDiff GPU. + print("ArrayDiff GPU build (h=$h) ... "); flush(stdout) + t_gpu_build = @elapsed ev_gpu = build_arraydiff( + W2, + X, + y, + h, + d, + n, + ArrayDiff.Mode{CUDA.CuVector{Float64}}(), + ) + @printf "%.2f s\n" t_gpu_build + x_gpu = vec(W1) + g_gpu = zeros(Float64, length(x_gpu)) + arraydiff_grad_gpu!(g_gpu, ev_gpu, x_gpu) + CUDA.synchronize() + grad_gpu = reshape(g_gpu, h, d) + + # Numerical equivalence. + for (name, g) in [ + ("ArrayDiff CPU", grad_cpu), + ("ArrayDiff GPU", grad_gpu), + ] + maxdiff = maximum(abs.(grad_ref .- g)) + relmag = maxdiff / max(maximum(abs.(grad_ref)), eps(Float64)) + ok = isapprox(grad_ref, g; rtol = rtol) + @printf "%-15s vs hand-CUDA: max|Δ| = %.3e (rel %.2e) match=%s\n" name maxdiff relmag ok + end + + println("\n--- benchmark (median of N samples, post-sync) ---") + bj = @benchmark begin + reverse_diff($W1g, $W2g, $Xg, $yg) + CUDA.synchronize() + end samples = 30 evals = 1 seconds = 10 + bcpu = @benchmark begin + arraydiff_grad_cpu!($g_cpu, $ev_cpu, $x_cpu) + end samples = 30 evals = 1 seconds = 10 + # `seconds = 60` for the GPU path because, at h = 4096, the + # scalar-cudaMemcpy path can run into the 10s cap on a single sample at + # warmup; bump the budget so we get a reliable median. + bgpu = @benchmark begin + arraydiff_grad_gpu!($g_gpu, $ev_gpu, $x_gpu) + CUDA.synchronize() + end samples = 30 evals = 1 seconds = 60 + @printf "Hand-CUDA Float64 : median %12.2f µs\n" 1e-3 * median(bj).time + @printf "ArrayDiff CPU : median %12.2f µs\n" 1e-3 * median(bcpu).time + @printf "ArrayDiff GPU : median %12.2f µs\n" 1e-3 * median(bgpu).time + + return nothing +end + +# ------------------------------------------------------------------------- +# Main +# ------------------------------------------------------------------------- +function main() + if !CUDA.functional() + error("CUDA is not functional in this environment.") + end + CUDA.math_mode!(CUDA.FAST_MATH) + println("CUDA.jl device : ", CUDA.name(CUDA.device()), " (math_mode=FAST_MATH)") + for h in (16, 256, 4096) + run_one(; h = h) + GC.gc(true) + CUDA.reclaim() + end +end + +main() From 66a5f5b053b4aac8e4bf9dd45ab2b6055758bb5f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Beno=C3=AEt=20Legat?= Date: Wed, 6 May 2026 16:50:24 +0200 Subject: [PATCH 2/3] Fixes --- perf/arraydiff_gpu.jl | 11 +++++++---- src/mathoptinterface_api.jl | 6 +++++- src/types.jl | 9 ++++++--- 3 files changed, 18 insertions(+), 8 deletions(-) diff --git a/perf/arraydiff_gpu.jl b/perf/arraydiff_gpu.jl index f7cb8df..bcd8133 100644 --- a/perf/arraydiff_gpu.jl +++ b/perf/arraydiff_gpu.jl @@ -150,7 +150,10 @@ function run_one(; h::Int, d::Int = 13, n::Int = 178, rtol::Float64 = 1e-6) arraydiff_grad_cpu!(g_cpu, ev_cpu, x_cpu) grad_cpu = reshape(g_cpu, h, d) - # ArrayDiff GPU. + # ArrayDiff GPU. Both `x` and `g` live on the GPU — same convention a + # GPU-resident solver (e.g. one whose ADAM step is on `CuVector`) would + # use: the AD tape, the input vector, and the gradient buffer all stay + # on the device, so there's no D2H round-trip on the gradient hot path. print("ArrayDiff GPU build (h=$h) ... "); flush(stdout) t_gpu_build = @elapsed ev_gpu = build_arraydiff( W2, @@ -162,11 +165,11 @@ function run_one(; h::Int, d::Int = 13, n::Int = 178, rtol::Float64 = 1e-6) ArrayDiff.Mode{CUDA.CuVector{Float64}}(), ) @printf "%.2f s\n" t_gpu_build - x_gpu = vec(W1) - g_gpu = zeros(Float64, length(x_gpu)) + x_gpu = CUDA.CuVector{Float64}(vec(W1)) + g_gpu = CUDA.zeros(Float64, length(x_gpu)) arraydiff_grad_gpu!(g_gpu, ev_gpu, x_gpu) CUDA.synchronize() - grad_gpu = reshape(g_gpu, h, d) + grad_gpu = reshape(Array(g_gpu), h, d) # Numerical equivalence. for (name, g) in [ diff --git a/src/mathoptinterface_api.jl b/src/mathoptinterface_api.jl index 413f80a..72d35bc 100644 --- a/src/mathoptinterface_api.jl +++ b/src/mathoptinterface_api.jl @@ -43,7 +43,11 @@ function MOI.initialize( d.user_output_buffer = zeros(T, largest_user_input_dimension) d.jac_storage = zeros(T, max(N, largest_user_input_dimension)) d.constraints = _FunctionStorage{T,S}[] - d.last_x = fill(T(NaN), N) + # Allocate `last_x` on the same device as the AD tape (S = Vector for the + # CPU path, S = CuVector for GPU). Filling with NaN forces the + # short-circuit check `last_x == x` to take the recompute branch on the + # first call. + d.last_x = fill!(S(undef, N), T(NaN)) d.want_hess = :Hess in requested_features want_hess_storage = (:HessVec in requested_features) || d.want_hess coloring_storage = Coloring.IndexedSet(N) diff --git a/src/types.jl b/src/types.jl index 00f6c31..3457911 100644 --- a/src/types.jl +++ b/src/types.jl @@ -320,9 +320,12 @@ mutable struct NLPEvaluator{T<:Real,S<:AbstractVector{T}} <: subexpression_reverse_values::Vector{T} subexpression_linearity::Vector{Linearity} - # A cache of the last x. This is used to guide whether we need to re-run - # reverse-mode automatic differentiation. - last_x::Vector{T} + # A cache of the last x. Used to short-circuit `_reverse_mode` when the + # primal hasn't changed. Matches the storage type so the comparison + # `last_x == x` and the `copyto!(last_x, x)` below stay device-local — + # otherwise the gradient hot path would do a full D2H + H2D round-trip + # of `x` on every call when the AD tape is on a `CuVector`. + last_x::S # Temporary storage for computing Jacobians. This is also used as temporary # storage for the input of multivariate functions. From d131cbb804394f965d5829a4f1ca9bc460088ba0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Beno=C3=AEt=20Legat?= Date: Wed, 6 May 2026 17:23:59 +0200 Subject: [PATCH 3/3] Fix format --- perf/arraydiff_gpu.jl | 21 ++++++++++++--------- 1 file changed, 12 insertions(+), 9 deletions(-) diff --git a/perf/arraydiff_gpu.jl b/perf/arraydiff_gpu.jl index bcd8133..25ada54 100644 --- a/perf/arraydiff_gpu.jl +++ b/perf/arraydiff_gpu.jl @@ -141,9 +141,10 @@ function run_one(; h::Int, d::Int = 13, n::Int = 178, rtol::Float64 = 1e-6) CUDA.synchronize() # ArrayDiff CPU. - print("ArrayDiff CPU build (h=$h) ... "); flush(stdout) - t_cpu_build = @elapsed ev_cpu = - build_arraydiff(W2, X, y, h, d, n, ArrayDiff.Mode()) + print("ArrayDiff CPU build (h=$h) ... "); + flush(stdout) + t_cpu_build = + @elapsed ev_cpu = build_arraydiff(W2, X, y, h, d, n, ArrayDiff.Mode()) @printf "%.2f s\n" t_cpu_build x_cpu = vec(W1) g_cpu = zeros(Float64, length(x_cpu)) @@ -154,7 +155,8 @@ function run_one(; h::Int, d::Int = 13, n::Int = 178, rtol::Float64 = 1e-6) # GPU-resident solver (e.g. one whose ADAM step is on `CuVector`) would # use: the AD tape, the input vector, and the gradient buffer all stay # on the device, so there's no D2H round-trip on the gradient hot path. - print("ArrayDiff GPU build (h=$h) ... "); flush(stdout) + print("ArrayDiff GPU build (h=$h) ... "); + flush(stdout) t_gpu_build = @elapsed ev_gpu = build_arraydiff( W2, X, @@ -172,10 +174,7 @@ function run_one(; h::Int, d::Int = 13, n::Int = 178, rtol::Float64 = 1e-6) grad_gpu = reshape(Array(g_gpu), h, d) # Numerical equivalence. - for (name, g) in [ - ("ArrayDiff CPU", grad_cpu), - ("ArrayDiff GPU", grad_gpu), - ] + for (name, g) in [("ArrayDiff CPU", grad_cpu), ("ArrayDiff GPU", grad_gpu)] maxdiff = maximum(abs.(grad_ref .- g)) relmag = maxdiff / max(maximum(abs.(grad_ref)), eps(Float64)) ok = isapprox(grad_ref, g; rtol = rtol) @@ -212,7 +211,11 @@ function main() error("CUDA is not functional in this environment.") end CUDA.math_mode!(CUDA.FAST_MATH) - println("CUDA.jl device : ", CUDA.name(CUDA.device()), " (math_mode=FAST_MATH)") + println( + "CUDA.jl device : ", + CUDA.name(CUDA.device()), + " (math_mode=FAST_MATH)", + ) for h in (16, 256, 4096) run_one(; h = h) GC.gc(true)