From 3d5e0052746b56d83b34f42b20194e748e813917 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Beno=C3=AEt=20Legat?= <benoit.legat@gmail.com>
Date: Wed, 6 May 2026 16:28:29 +0200
Subject: [PATCH 1/3] Add benchmark vs hardcoded CUDA

---
 perf/Project.toml     |   6 ++
 perf/arraydiff_gpu.jl | 220 ++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 226 insertions(+)
 create mode 100644 perf/arraydiff_gpu.jl

diff --git a/perf/Project.toml b/perf/Project.toml
index f7d180d..4aa5df2 100644
--- a/perf/Project.toml
+++ b/perf/Project.toml
@@ -4,12 +4,18 @@ authors = ["Benoît Legat <benoit.legat@gmail.com>"]
 version = "0.0.0"
 
 [deps]
+ArrayDiff = "c45fa1ca-6901-44ac-ae5b-5513a4852d50"
 BenchmarkTools = "6e4b80f9-dd63-53aa-95a3-0cdb28fa8baf"
 CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba"
 CondaPkg = "992eb4ea-22a4-4c89-a5bb-47a3300528ab"
+JuMP = "4076af6c-e467-56ae-b986-b466b2749572"
 LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e"
 Lux = "b2108857-7c20-44ae-9111-449ecde12c47"
+MathOptInterface = "b8f27783-ece8-5eb3-8dc8-9495eed66fee"
 Mooncake = "da2b9cff-9c12-43a0-ae48-6db2b0edb7d6"
 Printf = "de0858da-6303-5e67-8744-51eddeeeb8d7"
 PythonCall = "6099a3de-0909-46bc-b1f4-468b9a2dfc0d"
 Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c"
+
+[sources]
+ArrayDiff = {path = ".."}
diff --git a/perf/arraydiff_gpu.jl b/perf/arraydiff_gpu.jl
new file mode 100644
index 0000000..f7cb8df
--- /dev/null
+++ b/perf/arraydiff_gpu.jl
@@ -0,0 +1,220 @@
+# Benchmark `ArrayDiff` GPU gradient computation against the hand-written
+# CUDA `reverse_diff` reference (the same one as `perf/cuda_vs_pytorch.jl`,
+# but in `Float64` so it matches `ArrayDiff`'s tape eltype).
+#
+# What this measures
+# ------------------
+# A 2-layer MLP `loss = sum((W2 * tanh.(W1 * X) - y).^2) / n`, where `W1` is
+# the only `@variable` (so `MOI.eval_objective_gradient` returns ∂loss/∂W1)
+# and `W2`, `X`, `y` are constants.
+#
+#   * Hand-CUDA Float64: `reverse_diff(W1, W2, X, y)` — the same hand-written
+#     formula as the existing perf script, just promoted to `Float64` so the
+#     numbers are directly comparable to ArrayDiff.
+#
+#   * ArrayDiff CPU: same model with `Mode{Vector{Float64}}()` — sanity check
+#     for what the existing CPU AD path costs at this problem size.
+#
+#   * ArrayDiff GPU: same model with `Mode{CuVector{Float64}}()` — the path
+#     this benchmark exists to measure.
+#
+# Caveats (read this before reading the numbers)
+# ----------------------------------------------
+# The current `ArrayDiff` GPU path still does scalar reads/writes on the
+# tape for every `NODE_VARIABLE` (one per `W1[i,j]`) and every `NODE_VALUE`
+# (one per scalar inside the `vcat(row(...), row(...))` expansion of
+# constant matrices `W2`, `X`, `y`). On a `CuArray` each scalar load/store
+# is a separate `cudaMemcpy`, so for `h = 4096` and `d = 13` that's tens of
+# thousands of round-trips per call. We expect the GPU path to be much
+# slower than the hand-written CUDA reference until those leaves are
+# loaded/extracted in bulk — see the `NODE_VARIABLE_BLOCK` /
+# `NODE_VALUE_BLOCK` follow-up.
+#
+# The whole `optimize!` loop is wrapped in `CUDA.@allowscalar` so the
+# scalar paths don't error out under GPUArrays' default scalar policy.
+#
+# Run
+# ---
+#   cd ~/.julia/dev/ArrayDiff
+#   julia --project=perf -e 'using Pkg; Pkg.instantiate()'
+#   julia --project=perf perf/arraydiff_gpu.jl
+
+using Random, LinearAlgebra, Printf
+using BenchmarkTools
+using CUDA
+using JuMP
+using ArrayDiff
+import MathOptInterface as MOI
+
+# -------------------------------------------------------------------------
+# Hand-written CUDA Float64 reverse_diff — same formulas as
+# `perf/cuda_vs_pytorch.jl::forward_pass`/`reverse_diff`, but in `Float64`
+# and *without* the `/ n` scaling. ArrayDiff's `:/` operator currently
+# has a latent bug with non-scalar tape offsets (it reads
+# `forward_storage[node_idx]` instead of `forward_storage[storage_offset[node_idx]+1]`),
+# so we drop the constant scaling factor on both sides — it doesn't change
+# what the gradient pathway exercises.
+# -------------------------------------------------------------------------
+function forward_pass(W1, W2, X, y)
+    y_1 = tanh.(W1 * X)
+    J_1 = 1 .- y_1 .^ 2
+    J_2 = 2 .* (W2 * y_1 .- y)
+    return y_1, J_1, J_2
+end
+
+function reverse_diff(W1, W2, X, y)
+    _, J_1, J_2 = forward_pass(W1, W2, X, y)
+    return (J_1 .* (W2' * J_2)) * X'
+end
+
+# -------------------------------------------------------------------------
+# ArrayDiff path. Builds a JuMP model with `W1` as the only `@variable` and
+# `W2 / X / y` baked in as constant matrices, parses to ArrayDiff, and
+# returns an evaluator + a closure that computes ∂loss/∂W1 in `g`.
+# -------------------------------------------------------------------------
+function build_arraydiff(
+    W2::Matrix{Float64},
+    X::Matrix{Float64},
+    y::Matrix{Float64},
+    h::Int,
+    d::Int,
+    n::Int,
+    mode::ArrayDiff.Mode,
+)
+    model = JuMP.Model()
+    @variable(model, W1[1:h, 1:d], container = ArrayDiff.ArrayOfVariables)
+    Y = W2 * tanh.(W1 * X)
+    diff = Y .- y
+    loss = sum(diff .^ 2)  # `/ n` dropped — see `forward_pass` comment.
+    ad = ArrayDiff.model(mode)
+    MOI.Nonlinear.set_objective(ad, JuMP.moi_function(loss))
+    evaluator = MOI.Nonlinear.Evaluator(
+        ad,
+        mode,
+        JuMP.index.(JuMP.all_variables(model)),
+    )
+    MOI.initialize(evaluator, [:Grad])
+    return evaluator
+end
+
+# CPU path — `Mode()` defaults to `Vector{Float64}` storage, no `@allowscalar`
+# needed.
+function arraydiff_grad_cpu!(g, evaluator, x)
+    MOI.eval_objective_gradient(evaluator, g, x)
+    return g
+end
+
+# GPU path — `Mode{CuVector{Float64}}()`. The current implementation still
+# uses scalar tape access for `NODE_VARIABLE`/`NODE_VALUE` leaves, so the
+# call needs `@allowscalar` to get past GPUArrays' scalar-indexing policy.
+function arraydiff_grad_gpu!(g, evaluator, x)
+    CUDA.@allowscalar MOI.eval_objective_gradient(evaluator, g, x)
+    return g
+end
+
+# -------------------------------------------------------------------------
+# One (h, d, n) sweep.
+# -------------------------------------------------------------------------
+function run_one(; h::Int, d::Int = 13, n::Int = 178, rtol::Float64 = 1e-6)
+    println("\n" * "="^72)
+    @printf "h = %d, d = %d, n = %d\n" h d n
+    println("="^72)
+
+    Random.seed!(0)
+    # `W2` and `y` are deliberately given 2 output rows (instead of the
+    # 1-row used in `perf/cuda_vs_pytorch.jl`). The current `:vcat`
+    # forward unpacks `idx1, idx2 = children_indices` and so requires at
+    # least two rows; a single-row matrix would be parsed as a `vcat`
+    # with one child and would error. Using 2 rows changes nothing about
+    # the gradient pathway being measured.
+    W1 = randn(Float64, h, d)
+    W2 = randn(Float64, 2, h)
+    X = randn(Float64, d, n)
+    y = randn(Float64, 2, n)
+
+    # Reference: hand-written CUDA in Float64.
+    W1g = CuArray(W1)
+    W2g = CuArray(W2)
+    Xg = CuArray(X)
+    yg = CuArray(y)
+    grad_ref = Array(reverse_diff(W1g, W2g, Xg, yg))
+    CUDA.synchronize()
+
+    # ArrayDiff CPU.
+    print("ArrayDiff CPU build (h=$h) ... "); flush(stdout)
+    t_cpu_build = @elapsed ev_cpu =
+        build_arraydiff(W2, X, y, h, d, n, ArrayDiff.Mode())
+    @printf "%.2f s\n" t_cpu_build
+    x_cpu = vec(W1)
+    g_cpu = zeros(Float64, length(x_cpu))
+    arraydiff_grad_cpu!(g_cpu, ev_cpu, x_cpu)
+    grad_cpu = reshape(g_cpu, h, d)
+
+    # ArrayDiff GPU.
+    print("ArrayDiff GPU build (h=$h) ... "); flush(stdout)
+    t_gpu_build = @elapsed ev_gpu = build_arraydiff(
+        W2,
+        X,
+        y,
+        h,
+        d,
+        n,
+        ArrayDiff.Mode{CUDA.CuVector{Float64}}(),
+    )
+    @printf "%.2f s\n" t_gpu_build
+    x_gpu = vec(W1)
+    g_gpu = zeros(Float64, length(x_gpu))
+    arraydiff_grad_gpu!(g_gpu, ev_gpu, x_gpu)
+    CUDA.synchronize()
+    grad_gpu = reshape(g_gpu, h, d)
+
+    # Numerical equivalence.
+    for (name, g) in [
+        ("ArrayDiff CPU", grad_cpu),
+        ("ArrayDiff GPU", grad_gpu),
+    ]
+        maxdiff = maximum(abs.(grad_ref .- g))
+        relmag = maxdiff / max(maximum(abs.(grad_ref)), eps(Float64))
+        ok = isapprox(grad_ref, g; rtol = rtol)
+        @printf "%-15s vs hand-CUDA: max|Δ| = %.3e (rel %.2e)  match=%s\n" name maxdiff relmag ok
+    end
+
+    println("\n--- benchmark (median of N samples, post-sync) ---")
+    bj = @benchmark begin
+        reverse_diff($W1g, $W2g, $Xg, $yg)
+        CUDA.synchronize()
+    end samples = 30 evals = 1 seconds = 10
+    bcpu = @benchmark begin
+        arraydiff_grad_cpu!($g_cpu, $ev_cpu, $x_cpu)
+    end samples = 30 evals = 1 seconds = 10
+    # `seconds = 60` for the GPU path because, at h = 4096, the
+    # scalar-cudaMemcpy path can run into the 10s cap on a single sample at
+    # warmup; bump the budget so we get a reliable median.
+    bgpu = @benchmark begin
+        arraydiff_grad_gpu!($g_gpu, $ev_gpu, $x_gpu)
+        CUDA.synchronize()
+    end samples = 30 evals = 1 seconds = 60
+    @printf "Hand-CUDA Float64 : median %12.2f µs\n" 1e-3 * median(bj).time
+    @printf "ArrayDiff CPU     : median %12.2f µs\n" 1e-3 * median(bcpu).time
+    @printf "ArrayDiff GPU     : median %12.2f µs\n" 1e-3 * median(bgpu).time
+
+    return nothing
+end
+
+# -------------------------------------------------------------------------
+# Main
+# -------------------------------------------------------------------------
+function main()
+    if !CUDA.functional()
+        error("CUDA is not functional in this environment.")
+    end
+    CUDA.math_mode!(CUDA.FAST_MATH)
+    println("CUDA.jl device : ", CUDA.name(CUDA.device()), "  (math_mode=FAST_MATH)")
+    for h in (16, 256, 4096)
+        run_one(; h = h)
+        GC.gc(true)
+        CUDA.reclaim()
+    end
+end
+
+main()

From 66a5f5b053b4aac8e4bf9dd45ab2b6055758bb5f Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Beno=C3=AEt=20Legat?= <benoit.legat@gmail.com>
Date: Wed, 6 May 2026 16:50:24 +0200
Subject: [PATCH 2/3] Fixes

---
 perf/arraydiff_gpu.jl       | 11 +++++++----
 src/mathoptinterface_api.jl |  6 +++++-
 src/types.jl                |  9 ++++++---
 3 files changed, 18 insertions(+), 8 deletions(-)

diff --git a/perf/arraydiff_gpu.jl b/perf/arraydiff_gpu.jl
index f7cb8df..bcd8133 100644
--- a/perf/arraydiff_gpu.jl
+++ b/perf/arraydiff_gpu.jl
@@ -150,7 +150,10 @@ function run_one(; h::Int, d::Int = 13, n::Int = 178, rtol::Float64 = 1e-6)
     arraydiff_grad_cpu!(g_cpu, ev_cpu, x_cpu)
     grad_cpu = reshape(g_cpu, h, d)
 
-    # ArrayDiff GPU.
+    # ArrayDiff GPU. Both `x` and `g` live on the GPU — same convention a
+    # GPU-resident solver (e.g. one whose ADAM step is on `CuVector`) would
+    # use: the AD tape, the input vector, and the gradient buffer all stay
+    # on the device, so there's no D2H round-trip on the gradient hot path.
     print("ArrayDiff GPU build (h=$h) ... "); flush(stdout)
     t_gpu_build = @elapsed ev_gpu = build_arraydiff(
         W2,
@@ -162,11 +165,11 @@ function run_one(; h::Int, d::Int = 13, n::Int = 178, rtol::Float64 = 1e-6)
         ArrayDiff.Mode{CUDA.CuVector{Float64}}(),
     )
     @printf "%.2f s\n" t_gpu_build
-    x_gpu = vec(W1)
-    g_gpu = zeros(Float64, length(x_gpu))
+    x_gpu = CUDA.CuVector{Float64}(vec(W1))
+    g_gpu = CUDA.zeros(Float64, length(x_gpu))
     arraydiff_grad_gpu!(g_gpu, ev_gpu, x_gpu)
     CUDA.synchronize()
-    grad_gpu = reshape(g_gpu, h, d)
+    grad_gpu = reshape(Array(g_gpu), h, d)
 
     # Numerical equivalence.
     for (name, g) in [
diff --git a/src/mathoptinterface_api.jl b/src/mathoptinterface_api.jl
index 413f80a..72d35bc 100644
--- a/src/mathoptinterface_api.jl
+++ b/src/mathoptinterface_api.jl
@@ -43,7 +43,11 @@ function MOI.initialize(
     d.user_output_buffer = zeros(T, largest_user_input_dimension)
     d.jac_storage = zeros(T, max(N, largest_user_input_dimension))
     d.constraints = _FunctionStorage{T,S}[]
-    d.last_x = fill(T(NaN), N)
+    # Allocate `last_x` on the same device as the AD tape (S = Vector for the
+    # CPU path, S = CuVector for GPU). Filling with NaN forces the
+    # short-circuit check `last_x == x` to take the recompute branch on the
+    # first call.
+    d.last_x = fill!(S(undef, N), T(NaN))
     d.want_hess = :Hess in requested_features
     want_hess_storage = (:HessVec in requested_features) || d.want_hess
     coloring_storage = Coloring.IndexedSet(N)
diff --git a/src/types.jl b/src/types.jl
index 00f6c31..3457911 100644
--- a/src/types.jl
+++ b/src/types.jl
@@ -320,9 +320,12 @@ mutable struct NLPEvaluator{T<:Real,S<:AbstractVector{T}} <:
     subexpression_reverse_values::Vector{T}
     subexpression_linearity::Vector{Linearity}
 
-    # A cache of the last x. This is used to guide whether we need to re-run
-    # reverse-mode automatic differentiation.
-    last_x::Vector{T}
+    # A cache of the last x. Used to short-circuit `_reverse_mode` when the
+    # primal hasn't changed. Matches the storage type so the comparison
+    # `last_x == x` and the `copyto!(last_x, x)` below stay device-local —
+    # otherwise the gradient hot path would do a full D2H + H2D round-trip
+    # of `x` on every call when the AD tape is on a `CuVector`.
+    last_x::S
 
     # Temporary storage for computing Jacobians. This is also used as temporary
     # storage for the input of multivariate functions.

From d131cbb804394f965d5829a4f1ca9bc460088ba0 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Beno=C3=AEt=20Legat?= <benoit.legat@gmail.com>
Date: Wed, 6 May 2026 17:23:59 +0200
Subject: [PATCH 3/3] Fix format

---
 perf/arraydiff_gpu.jl | 21 ++++++++++++---------
 1 file changed, 12 insertions(+), 9 deletions(-)

diff --git a/perf/arraydiff_gpu.jl b/perf/arraydiff_gpu.jl
index bcd8133..25ada54 100644
--- a/perf/arraydiff_gpu.jl
+++ b/perf/arraydiff_gpu.jl
@@ -141,9 +141,10 @@ function run_one(; h::Int, d::Int = 13, n::Int = 178, rtol::Float64 = 1e-6)
     CUDA.synchronize()
 
     # ArrayDiff CPU.
-    print("ArrayDiff CPU build (h=$h) ... "); flush(stdout)
-    t_cpu_build = @elapsed ev_cpu =
-        build_arraydiff(W2, X, y, h, d, n, ArrayDiff.Mode())
+    print("ArrayDiff CPU build (h=$h) ... ");
+    flush(stdout)
+    t_cpu_build =
+        @elapsed ev_cpu = build_arraydiff(W2, X, y, h, d, n, ArrayDiff.Mode())
     @printf "%.2f s\n" t_cpu_build
     x_cpu = vec(W1)
     g_cpu = zeros(Float64, length(x_cpu))
@@ -154,7 +155,8 @@ function run_one(; h::Int, d::Int = 13, n::Int = 178, rtol::Float64 = 1e-6)
     # GPU-resident solver (e.g. one whose ADAM step is on `CuVector`) would
     # use: the AD tape, the input vector, and the gradient buffer all stay
     # on the device, so there's no D2H round-trip on the gradient hot path.
-    print("ArrayDiff GPU build (h=$h) ... "); flush(stdout)
+    print("ArrayDiff GPU build (h=$h) ... ");
+    flush(stdout)
     t_gpu_build = @elapsed ev_gpu = build_arraydiff(
         W2,
         X,
@@ -172,10 +174,7 @@ function run_one(; h::Int, d::Int = 13, n::Int = 178, rtol::Float64 = 1e-6)
     grad_gpu = reshape(Array(g_gpu), h, d)
 
     # Numerical equivalence.
-    for (name, g) in [
-        ("ArrayDiff CPU", grad_cpu),
-        ("ArrayDiff GPU", grad_gpu),
-    ]
+    for (name, g) in [("ArrayDiff CPU", grad_cpu), ("ArrayDiff GPU", grad_gpu)]
         maxdiff = maximum(abs.(grad_ref .- g))
         relmag = maxdiff / max(maximum(abs.(grad_ref)), eps(Float64))
         ok = isapprox(grad_ref, g; rtol = rtol)
@@ -212,7 +211,11 @@ function main()
         error("CUDA is not functional in this environment.")
     end
     CUDA.math_mode!(CUDA.FAST_MATH)
-    println("CUDA.jl device : ", CUDA.name(CUDA.device()), "  (math_mode=FAST_MATH)")
+    println(
+        "CUDA.jl device : ",
+        CUDA.name(CUDA.device()),
+        "  (math_mode=FAST_MATH)",
+    )
     for h in (16, 256, 4096)
         run_one(; h = h)
         GC.gc(true)