diff --git a/src/ArrayDiff.jl b/src/ArrayDiff.jl
index 9595d82..6ad4d05 100644
--- a/src/ArrayDiff.jl
+++ b/src/ArrayDiff.jl
@@ -20,10 +20,11 @@ Fork of `MOI.Nonlinear.SparseReverseMode` to add array support.
 The type parameter `S` is the storage type used for the AD tape (forward,
 partials, and reverse storage of each subexpression). It must satisfy
-`S<:AbstractVector{Float64}`. Defaults to `Vector{Float64}`. Pass a different
-`S` (for example `CuVector{Float64}`) to keep the tape on a GPU.
+`S<:AbstractVector{<:Real}`. Defaults to `Vector{Float64}`. Pass a different
+`S` (for example `Vector{Float32}` or `CuVector{Float64}`) to run AD in
+another precision or keep the tape on a GPU.
 """
-struct Mode{S<:AbstractVector{Float64}} <:
+struct Mode{S<:AbstractVector{<:Real}} <:
        MOI.Nonlinear.AbstractAutomaticDifferentiation
 end
 
 Mode() = Mode{Vector{Float64}}()
@@ -65,7 +66,7 @@ include("evaluator.jl")
 include("array_nonlinear_function.jl")
 include("parse_moi.jl")
 
-model(::Mode{S}) where {S} = Model()
+model(::Mode{S}) where {S} = Model{eltype(S)}()
 
 # Extend MOI.Nonlinear.set_objective so that solvers calling
 # MOI.Nonlinear.set_objective(arraydiff_model, snf) dispatch here.
@@ -84,8 +85,8 @@ function Evaluator(
     model::ArrayDiff.Model,
     ::Mode{S},
     ordered_variables::Vector{MOI.VariableIndex},
-) where {S<:AbstractVector{Float64}}
-    return Evaluator(model, NLPEvaluator{S}(model, ordered_variables))
+) where {S<:AbstractVector{<:Real}}
+    return Evaluator(model, NLPEvaluator{eltype(S),S}(model, ordered_variables))
 end
 
 # Called by solvers via MOI.Nonlinear.Evaluator(nlp_model, ad_backend, vars).
diff --git a/src/JuMP/moi_bridge.jl b/src/JuMP/moi_bridge.jl
index a0e818d..6ebcb70 100644
--- a/src/JuMP/moi_bridge.jl
+++ b/src/JuMP/moi_bridge.jl
@@ -12,9 +12,9 @@ function _to_moi_arg(x::GenericArrayExpr{V,N}) where {V,N}
     return ArrayNonlinearFunction{N}(x.head, args, x.size, x.broadcasted)
 end
 
-_to_moi_arg(x::Matrix{Float64}) = x
+_to_moi_arg(x::Matrix{<:Real}) = x
 
-_to_moi_arg(x::Real) = Float64(x)
+_to_moi_arg(x::Real) = x
 
 function JuMP.moi_function(x::GenericArrayExpr{V,N}) where {V,N}
     return _to_moi_arg(x)
diff --git a/src/array_nonlinear_function.jl b/src/array_nonlinear_function.jl
index c1f7491..49cfce4 100644
--- a/src/array_nonlinear_function.jl
+++ b/src/array_nonlinear_function.jl
@@ -87,7 +87,7 @@ function _map_indices_arg(index_map::F, x::ArrayOfContiguousVariables) where {F}
     return MOI.Utilities.map_indices(index_map, x)
 end
 
-function _map_indices_arg(::F, x::Matrix{Float64}) where {F}
+function _map_indices_arg(::F, x::Matrix{<:Real}) where {F}
     return x
 end
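
A minimal usage sketch (reviewer aside, not part of the patch) of the widened
`Mode{S}` parameter from `src/ArrayDiff.jl` above; the GPU line assumes
CUDA.jl is loaded:

using ArrayDiff

mode64 = ArrayDiff.Mode()                   # default, == Mode{Vector{Float64}}()
mode32 = ArrayDiff.Mode{Vector{Float32}}()  # single-precision tape
# mode_gpu = ArrayDiff.Mode{CUDA.CuVector{Float64}}()  # Float64 tape on a GPU

# `model` now forwards the storage eltype, so model constants match the tape:
ad = ArrayDiff.model(mode32)                # builds a Model{Float32}
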
diff --git a/src/mathoptinterface_api.jl b/src/mathoptinterface_api.jl
index 126630f..413f80a 100644
--- a/src/mathoptinterface_api.jl
+++ b/src/mathoptinterface_api.jl
@@ -20,9 +20,9 @@ function MOI.features_available(d::NLPEvaluator)
 end
 
 function MOI.initialize(
-    d::NLPEvaluator{S},
+    d::NLPEvaluator{T,S},
     requested_features::Vector{Symbol},
-) where {S<:AbstractVector{Float64}}
+) where {T<:Real,S<:AbstractVector{T}}
     # Check that we support the features requested by the user.
     available_features = MOI.features_available(d)
     for feature in requested_features
@@ -40,10 +40,10 @@ function MOI.initialize(
     end
     d.objective = nothing
     d.residual = nothing
-    d.user_output_buffer = zeros(largest_user_input_dimension)
-    d.jac_storage = zeros(max(N, largest_user_input_dimension))
-    d.constraints = _FunctionStorage{S}[]
-    d.last_x = fill(NaN, N)
+    d.user_output_buffer = zeros(T, largest_user_input_dimension)
+    d.jac_storage = zeros(T, max(N, largest_user_input_dimension))
+    d.constraints = _FunctionStorage{T,S}[]
+    d.last_x = fill(T(NaN), N)
     d.want_hess = :Hess in requested_features
     want_hess_storage = (:HessVec in requested_features) || d.want_hess
     coloring_storage = Coloring.IndexedSet(N)
@@ -67,9 +67,9 @@ function MOI.initialize(
     subexpression_edgelist =
         Vector{Set{Tuple{Int,Int}}}(undef, num_subexpressions)
     d.subexpressions =
-        Vector{_SubexpressionStorage{S}}(undef, num_subexpressions)
-    d.subexpression_forward_values = zeros(num_subexpressions)
-    d.subexpression_reverse_values = zeros(num_subexpressions)
+        Vector{_SubexpressionStorage{T,S}}(undef, num_subexpressions)
+    d.subexpression_forward_values = zeros(T, num_subexpressions)
+    d.subexpression_reverse_values = zeros(T, num_subexpressions)
     for k in d.subexpression_order # Only load expressions which actually are used
         d.subexpression_forward_values[k] = NaN
@@ -145,6 +145,7 @@ function MOI.initialize(
             moi_index_to_consecutive_index,
             shared_partials_storage_ϵ,
             d,
+            S,
         )
         residual = _FunctionStorage(
             subexpr,
@@ -235,7 +236,7 @@ function MOI.eval_objective_gradient(d::NLPEvaluator, g, x)
         error("No nonlinear objective.")
     end
     _reverse_mode(d, x)
-    fill!(g, 0.0)
+    fill!(g, zero(eltype(g)))
     _extract_reverse_pass(g, d, something(d.objective))
     return
 end
diff --git a/src/reverse_mode.jl b/src/reverse_mode.jl
index 63bff8a..5a90faa 100644
--- a/src/reverse_mode.jl
+++ b/src/reverse_mode.jl
@@ -945,9 +945,9 @@ function _extract_reverse_pass(
     f::_FunctionStorage,
 ) where {T}
     for i in f.dependent_subexpressions
-        d.subexpression_reverse_values[i] = 0.0
+        d.subexpression_reverse_values[i] = zero(T)
     end
-    _extract_reverse_pass_inner(g, f, d.subexpression_reverse_values, 1.0)
+    _extract_reverse_pass_inner(g, f, d.subexpression_reverse_values, one(T))
     for i in length(f.dependent_subexpressions):-1:1
         k = f.dependent_subexpressions[i]
         _extract_reverse_pass_inner(
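
Why `zero(T)`/`one(T)` instead of `0.0`/`1.0` in the reverse pass above: a
`Float64` literal silently promotes a `Float32` tape. A standalone
illustration in plain Julia (reviewer aside, no package code):

seed_literal(x::AbstractVector) = 1.0 .* x                  # literal promotes
seed_generic(x::AbstractVector{T}) where {T} = one(T) .* x  # stays in T

x32 = Float32[1, 2, 3]
@assert eltype(seed_literal(x32)) == Float64  # accidental promotion
@assert eltype(seed_generic(x32)) == Float32  # precision preserved
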
diff --git a/src/sizes.jl b/src/sizes.jl
index dde5277..7fc3c90 100644
--- a/src/sizes.jl
+++ b/src/sizes.jl
@@ -438,11 +438,11 @@ function _infer_sizes(
     return sizes
 end
 
-struct _SubexpressionStorage{S<:AbstractVector{Float64}}
+struct _SubexpressionStorage{T<:Real,S<:AbstractVector{T}}
     nodes::Vector{Node}
     adj::SparseArrays.SparseMatrixCSC{Bool,Int}
     sizes::Sizes
-    const_values::Vector{Float64}
+    const_values::Vector{T}
     forward_storage::S
     partials_storage::S
     reverse_storage::S
@@ -452,19 +452,19 @@ struct _SubexpressionStorage{S<:AbstractVector{Float64}}
     function _SubexpressionStorage(
         nodes::Vector{Node},
         adj::SparseArrays.SparseMatrixCSC{Bool,Int},
-        const_values::Vector{Float64},
+        const_values::Vector{T},
         block_shapes::Dict{Int,Vector{Int}},
         partials_storage_ϵ::Vector{Float64},
         linearity::Linearity,
-        ::Type{S} = Vector{Float64},
-    ) where {S<:AbstractVector{Float64}}
+        ::Type{S} = Vector{T},
+    ) where {T<:Real,S<:AbstractVector{T}}
         sizes = _infer_sizes(nodes, adj, block_shapes)
         N = _length(sizes)
         # Pre-load value blocks into forward_storage once at construction;
         # each block is a contiguous-to-contiguous bulk copy. Individual
         # `NODE_VALUE` scalars (rare — exponents, constant divisors, etc) and
         # variable nodes are loaded by `_forward_eval` in the per-node loop.
-        cpu_buffer = zeros(N)
+        cpu_buffer = zeros(T, N)
        for k in 1:length(nodes)
             node = nodes[k]
             if node.type == NODE_VALUE_BLOCK
@@ -475,14 +475,14 @@ struct _SubexpressionStorage{S<:AbstractVector{Float64}}
             end
         end
         forward_storage = convert(S, cpu_buffer)
-        return new{S}(
+        return new{T,S}(
             nodes,
             adj,
             sizes,
             const_values,
             forward_storage,
-            fill!(S(undef, N), 0.0), # partials_storage,
-            fill!(S(undef, N), 0.0), # reverse_storage,
+            fill!(S(undef, N), zero(T)), # partials_storage,
+            fill!(S(undef, N), zero(T)), # reverse_storage,
             partials_storage_ϵ,
             linearity,
         )
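
The constructor above allocates tape buffers with `fill!(S(undef, N), zero(T))`
rather than `zeros(N)`, so allocation works for any storage type, not just
`Vector{Float64}`. A self-contained sketch of the same pattern (reviewer
aside; names are illustrative, not from the package):

function make_buffers(::Type{S}, N::Integer) where {T<:Real,S<:AbstractVector{T}}
    cpu_buffer = zeros(T, N)            # staged on the CPU with eltype T
    forward = convert(S, cpu_buffer)    # one bulk copy into the target storage
    partials = fill!(S(undef, N), zero(T))
    reverse = fill!(S(undef, N), zero(T))
    return forward, partials, reverse
end

f, p, r = make_buffers(Vector{Float32}, 4)
@assert eltype(f) == Float32 && all(iszero, p) && all(iszero, r)
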
""" -mutable struct NLPEvaluator{S<:AbstractVector{Float64}} <: +mutable struct NLPEvaluator{T<:Real,S<:AbstractVector{T}} <: MOI.AbstractNLPEvaluator data::Model ordered_variables::Vector{MOI.VariableIndex} - objective::Union{Nothing,_FunctionStorage{S}} - residual::Union{Nothing,_FunctionStorage{S}} - constraints::Vector{_FunctionStorage{S}} - subexpressions::Vector{_SubexpressionStorage{S}} + objective::Union{Nothing,_FunctionStorage{T,S}} + residual::Union{Nothing,_FunctionStorage{T,S}} + constraints::Vector{_FunctionStorage{T,S}} + subexpressions::Vector{_SubexpressionStorage{T,S}} subexpression_order::Vector{Int} # Storage for the subexpressions in reverse-mode automatic differentiation. - subexpression_forward_values::Vector{Float64} - subexpression_reverse_values::Vector{Float64} + subexpression_forward_values::Vector{T} + subexpression_reverse_values::Vector{T} subexpression_linearity::Vector{Linearity} # A cache of the last x. This is used to guide whether we need to re-run # reverse-mode automatic differentiation. - last_x::Vector{Float64} + last_x::Vector{T} # Temporary storage for computing Jacobians. This is also used as temporary # storage for the input of multivariate functions. - jac_storage::Vector{Float64} + jac_storage::Vector{T} # Temporary storage for the gradient of multivariate functions - user_output_buffer::Vector{Float64} + user_output_buffer::Vector{T} # storage for computing hessians # these Float64 vectors are reinterpreted to hold multiple epsilon components @@ -343,10 +343,10 @@ mutable struct NLPEvaluator{S<:AbstractVector{Float64}} <: hessian_sparsity::Vector{Tuple{Int64,Int64}} max_chunk::Int # chunk size for which we've allocated storage - function NLPEvaluator{S}( + function NLPEvaluator{T,S}( data::Model, ordered_variables::Vector{MOI.VariableIndex}, - ) where {S<:AbstractVector{Float64}} - return new{S}(data, ordered_variables) + ) where {T<:Real,S<:AbstractVector{T}} + return new{T,S}(data, ordered_variables) end end diff --git a/test/JuMP.jl b/test/JuMP.jl index 3043900..9c44143 100644 --- a/test/JuMP.jl +++ b/test/JuMP.jl @@ -164,8 +164,8 @@ function test_parse_moi() return end -function _eval(model, func, x) - mode = ArrayDiff.Mode() +function _eval(model::JuMP.GenericModel{T}, func, x) where {T} + mode = ArrayDiff.Mode{Vector{T}}() ad = ArrayDiff.model(mode) MOI.Nonlinear.set_objective(ad, JuMP.moi_function(func)) evaluator = MOI.Nonlinear.Evaluator( @@ -176,7 +176,7 @@ function _eval(model, func, x) MOI.initialize(evaluator, [:Grad]) val = MOI.eval_objective(evaluator, x) g = zero(x) - MOI.eval_objective_gradient(evaluator, g, Float64.(collect(1:8))) + MOI.eval_objective_gradient(evaluator, g, T.(collect(1:8))) MOI.Nonlinear.set_objective(ad, nothing) @test isnothing(ad.objective) return val, g @@ -188,18 +188,20 @@ function _test_neural( plus::Bool, wrap::Bool, swap::Bool, + T::Type, ) n = 2 - X = [1.0 0.5; 0.3 0.8] - target = [0.5 0.2; 0.1 0.7] + X = T[1.0 0.5; 0.3 0.8] + target = T[0.5 0.2; 0.1 0.7] if wrap - X = ArrayDiff.MatrixExpr(:+, Any[X], size(X), false) - target = ArrayDiff.MatrixExpr(:+, Any[target], size(target), false) + ME = ArrayDiff.GenericMatrixExpr{JuMP.GenericVariableRef{T}} + X = ME(:+, Any[X], size(X), false) + target = ME(:+, Any[target], size(target), false) end if plus target = -target end - model = Model() + model = GenericModel{T}() @variable(model, W1[1:n, 1:n], container = ArrayDiff.ArrayOfVariables) @variable(model, W2[1:n, 1:n], container = ArrayDiff.ArrayOfVariables) # Use distinct starting values to break 
diff --git a/test/JuMP.jl b/test/JuMP.jl
index 3043900..9c44143 100644
--- a/test/JuMP.jl
+++ b/test/JuMP.jl
@@ -164,8 +164,8 @@ function test_parse_moi()
     return
 end
 
-function _eval(model, func, x)
-    mode = ArrayDiff.Mode()
+function _eval(model::JuMP.GenericModel{T}, func, x) where {T}
+    mode = ArrayDiff.Mode{Vector{T}}()
     ad = ArrayDiff.model(mode)
     MOI.Nonlinear.set_objective(ad, JuMP.moi_function(func))
     evaluator = MOI.Nonlinear.Evaluator(
@@ -176,7 +176,7 @@ function _eval(model, func, x)
     MOI.initialize(evaluator, [:Grad])
     val = MOI.eval_objective(evaluator, x)
     g = zero(x)
-    MOI.eval_objective_gradient(evaluator, g, Float64.(collect(1:8)))
+    MOI.eval_objective_gradient(evaluator, g, T.(collect(1:8)))
     MOI.Nonlinear.set_objective(ad, nothing)
     @test isnothing(ad.objective)
     return val, g
@@ -188,18 +188,20 @@ function _test_neural(
     plus::Bool,
     wrap::Bool,
     swap::Bool,
+    T::Type,
 )
     n = 2
-    X = [1.0 0.5; 0.3 0.8]
-    target = [0.5 0.2; 0.1 0.7]
+    X = T[1.0 0.5; 0.3 0.8]
+    target = T[0.5 0.2; 0.1 0.7]
     if wrap
-        X = ArrayDiff.MatrixExpr(:+, Any[X], size(X), false)
-        target = ArrayDiff.MatrixExpr(:+, Any[target], size(target), false)
+        ME = ArrayDiff.GenericMatrixExpr{JuMP.GenericVariableRef{T}}
+        X = ME(:+, Any[X], size(X), false)
+        target = ME(:+, Any[target], size(target), false)
     end
     if plus
         target = -target
     end
-    model = Model()
+    model = GenericModel{T}()
     @variable(model, W1[1:n, 1:n], container = ArrayDiff.ArrayOfVariables)
     @variable(model, W2[1:n, 1:n], container = ArrayDiff.ArrayOfVariables)
     # Use distinct starting values to break symmetry
@@ -238,8 +240,8 @@ function _test_neural(
     else
         loss = sum(E .^ 2)
     end
-    W1_val = [0.3 -0.2; 0.1 0.4]
-    W2_val = [-0.1 0.5; 0.2 -0.3]
+    W1_val = T[0.3 -0.2; 0.1 0.4]
+    W2_val = T[-0.1 0.5; 0.2 -0.3]
     obj, g = _eval(model, loss, [vec(W1_val); vec(W2_val)])
     # Reference computed from the same hand-written forward/reverse formulas
     # as `perf/cuda_vs_pytorch.jl::forward_pass`/`reverse_diff`, adapted to
@@ -247,15 +249,15 @@ function _test_neural(
     # over both `W1` and `W2`). `_eval` evaluates the objective at `xstart`
     # and the gradient at `x = [1, ..., 8]`, so we need the references at the
     # corresponding inputs.
-    X_const = [1.0 0.5; 0.3 0.8]
-    target_const = [0.5 0.2; 0.1 0.7]
+    X_const = T[1.0 0.5; 0.3 0.8]
+    target_const = T[0.5 0.2; 0.1 0.7]
     obj_val = _ref_objective(W1_val, W2_val, X_const, target_const)
     if with_norm
         obj_val = sqrt(obj_val)
     end
     @test obj ≈ obj_val
-    W1_at_grad = reshape([1.0, 2.0, 3.0, 4.0], 2, 2)
-    W2_at_grad = reshape([5.0, 6.0, 7.0, 8.0], 2, 2)
+    W1_at_grad = reshape(T[1.0, 2.0, 3.0, 4.0], 2, 2)
+    W2_at_grad = reshape(T[5.0, 6.0, 7.0, 8.0], 2, 2)
     grad_sumsq = _ref_gradient(W1_at_grad, W2_at_grad, X_const, target_const)
     if with_norm
         # `d/dx ‖E‖₂ = (1/(2‖E‖₂)) · d/dx ‖E‖₂² = grad_sumsq / (2 sqrt(sumsq))`,
@@ -299,7 +301,16 @@ function test_neural()
         @testset "$(plus ? "+" : "-")" for plus in bin
             @testset "$(wrap ? "wrap" : "nowrap")" for wrap in bin
                 @testset "$(swap ? "swap" : "noswap")" for swap in bin
-                    _test_neural(with_norm, broadcast, plus, wrap, swap)
+                    @testset "$T" for T in [Float64, Float32]
+                        _test_neural(
+                            with_norm,
+                            broadcast,
+                            plus,
+                            wrap,
+                            swap,
+                            T,
+                        )
+                    end
                 end
             end
         end
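
A closing note on tolerances (reviewer aside): the unchanged `@test obj ≈
obj_val` assertions can pass at both precisions because `isapprox` defaults
its relative tolerance to `sqrt(eps(T))` of the promoted argument type, so
`≈` automatically loosens from roughly `1.5e-8` at `Float64` to roughly
`3.5e-4` at `Float32`:

@assert isapprox(sqrt(eps(Float64)), 1.5e-8; rtol = 0.01)
@assert isapprox(sqrt(eps(Float32)), 3.45e-4; rtol = 0.01)

# A 1e-5 discrepancy passes `≈` at Float32 but fails it at Float64:
@assert 1.0f0 ≈ 1.0f0 + 1.0f-5
@assert !(1.0 ≈ 1.0 + 1.0e-5)
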