Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions perf/.gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
.CondaPkg
109 changes: 109 additions & 0 deletions perf/arraydiff.jl
Original file line number Diff line number Diff line change
@@ -0,0 +1,109 @@
module ArrayDiffNeural

# ArrayDiff reverse-mode AD of the 2-layer MLP loss
#
# L = sum((W2 * tanh(W1 * X) - y) .^ 2)
#
# `W1` is the only `@variable`; `W2`, `X`, `y` are baked in as constant
# matrix blocks. With `gpu=true`, the AD tape, input vector, and gradient
# buffer all live on the device (`Mode{CuVector{T}}`); with `gpu=false`,
# everything stays on the host (`Mode{Vector{T}}`).
#
# `/n` is dropped from the loss to match `hand_cuda.jl` — ArrayDiff's `:/`
# operator currently has a latent bug on non-scalar tape offsets, and a
# constant scaling factor doesn't change the gradient pathway.

using Random
using BenchmarkTools
using CUDA
using JuMP
using ArrayDiff
import MathOptInterface as MOI

# Output dimension `m`. Fixed at 2: a 1-row `W2` would parse as a `:vcat`
# with a single child, which the current ArrayDiff parser unpacks via
# `idx1, idx2 = children_indices` and so requires at least two rows.
const OUT_DIM = 2

function _build(::Type{T}, h::Int, d::Int, n::Int, gpu::Bool) where {T<:Real}
Random.seed!(0)
W1 = randn(T, h, d)
W2 = randn(T, OUT_DIM, h)
X = randn(T, d, n)
y = randn(T, OUT_DIM, n)

# JuMP works in Float64 internally; ArrayDiff converts the parsed
# `Expression{Float64}` constants into the `Mode`'s eltype when filling
# its tape. Promote constants to Float64 explicitly so the conversion
# is one-way and obvious.
W2_64 = Float64.(W2)
X_64 = Float64.(X)
y_64 = Float64.(y)

model = JuMP.Model()
@variable(model, W1v[1:h, 1:d], container = ArrayDiff.ArrayOfVariables)
Y = W2_64 * tanh.(W1v * X_64)
diff = Y .- y_64
loss = sum(diff .^ 2)

mode =
gpu ? ArrayDiff.Mode{CUDA.CuVector{T}}() : ArrayDiff.Mode{Vector{T}}()
ad = ArrayDiff.model(mode)
MOI.Nonlinear.set_objective(ad, JuMP.moi_function(loss))
ev = MOI.Nonlinear.Evaluator(
ad,
mode,
JuMP.index.(JuMP.all_variables(model)),
)
MOI.initialize(ev, [:Grad])
return (W1 = W1, evaluator = ev)
end

"""
neural(T, h, d, n; gpu::Bool = false) -> BenchmarkTools.Trial

Benchmark `∂L/∂W1` of the 2-layer MLP loss using ArrayDiff. Model parsing
and `MOI.initialize` (the expensive per-shape build) happen once before the
trial; the timed block is just `MOI.eval_objective_gradient` plus a
`CUDA.synchronize` when `gpu=true`.
"""
function neural(
::Type{T},
h::Int,
d::Int,
n::Int;
gpu::Bool = false,
) where {T<:Real}
state = _build(T, h, d, n, gpu)
if gpu
x = CUDA.CuVector{T}(vec(state.W1))
g = CUDA.zeros(T, h * d)
else
x = vec(state.W1)
g = zeros(T, h * d)
end
# `_reverse_mode` short-circuits when `last_x == x`, so without `setup`
# the first benchmark sample runs the full forward+reverse pass and every
# subsequent sample exits immediately — the reported timing would just
# measure the equality check. Reset `last_x` to NaN before each iteration
# so every sample re-runs the AD pipeline.
return @benchmark(
begin
if $gpu
# `@allowscalar` covers residual scalar leaves the BLOCK
# rewrite can't fold; the hot path is bulk.
CUDA.@allowscalar MOI.eval_objective_gradient(
$state.evaluator,
$g,
$x,
)
CUDA.synchronize()
else
MOI.eval_objective_gradient($state.evaluator, $g, $x)
end
end,
setup = (fill!($state.evaluator.backend.last_x, NaN)),
)
end

end # module
Loading
Loading