From f8010e91364c38c954658488848e89621451b63a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Beno=C3=AEt=20Legat?= Date: Wed, 6 May 2026 13:12:17 +0200 Subject: [PATCH 1/3] Vectorized loading --- src/graph_tools.jl | 69 ++++++++++++++++++------ src/mathoptinterface_api.jl | 2 +- src/parse_moi.jl | 104 ++++++++---------------------------- src/reverse_mode.jl | 25 ++++++++- src/sizes.jl | 32 +++++++++-- src/types.jl | 25 ++++++--- test/ReverseAD.jl | 10 +++- 7 files changed, 157 insertions(+), 110 deletions(-) diff --git a/src/graph_tools.jl b/src/graph_tools.jl index 5edf062..b8dcfe4 100644 --- a/src/graph_tools.jl +++ b/src/graph_tools.jl @@ -23,6 +23,16 @@ field, which should be interpreted as follows: * `NODE_CALL_UNIVARIATE_BROADCASTED`: the index into `operators.univariate_operators` * `NODE_CALL_MULTIVARIATE_BROADCASTED`: the index into `operators.multivariate_operators` * `NODE_CALL_REDUCE`: the index into `operators.multivariate_operators` + * `NODE_MOI_VARIABLE_BLOCK`: a contiguous block of `MOI.VariableIndex`. The + `index` field is the value of the FIRST `MOI.VariableIndex` in the block; + the shape `(m, n)` is stored in `Expression.block_shapes`. The block holds + `m * n` variables in column-major order. + * `NODE_VARIABLE_BLOCK`: same as `NODE_MOI_VARIABLE_BLOCK` but after MOI + variable indices have been remapped to consecutive 1-based internal + indices. `index` is the FIRST internal index of the block. + * `NODE_VALUE_BLOCK`: a contiguous block of constants. `index` is the start + index in the `.values` field; the next `m * n` entries (column-major) are + the block's data. Shape stored in `Expression.block_shapes`. """ @enum( NodeType, @@ -51,6 +61,12 @@ field, which should be interpreted as follows: NODE_CALL_MULTIVARIATE_BROADCASTED, # Index into the multivariate operators, with reduction NODE_CALL_REDUCE, + # Block-of-variables node, before MOI → internal index remap. + NODE_MOI_VARIABLE_BLOCK, + # Block-of-variables node, after MOI → internal index remap. + NODE_VARIABLE_BLOCK, + # Block-of-constants node. + NODE_VALUE_BLOCK, ) @enum(Linearity, CONSTANT, LINEAR, PIECEWISE_LINEAR, NONLINEAR) @@ -96,6 +112,16 @@ function _replace_moi_variables( moi_index_to_consecutive_index[MOI.VariableIndex(node.index)], node.parent, ) + elseif node.type == NODE_MOI_VARIABLE_BLOCK + # `node.index` is the FIRST MOI variable index in the block. For an + # `ArrayOfContiguousVariables`, all variables in the block are + # contiguous in MOI's ordering, so we just remap the first index + # and reuse it as the consecutive offset. The block's length comes + # from `Expression.block_shapes[i]`, which the caller threads + # through separately. + first_consec = + moi_index_to_consecutive_index[MOI.VariableIndex(node.index)] + new_nodes[i] = Node(NODE_VARIABLE_BLOCK, first_consec, node.parent) else new_nodes[i] = node end @@ -122,10 +148,10 @@ function _classify_linearity( children_arr = SparseArrays.rowvals(adj) for k in length(nodes):-1:1 node = nodes[k] - if node.type == NODE_VARIABLE + if node.type == NODE_VARIABLE || node.type == NODE_VARIABLE_BLOCK linearity[k] = LINEAR continue - elseif node.type == NODE_VALUE + elseif node.type == NODE_VALUE || node.type == NODE_VALUE_BLOCK linearity[k] = CONSTANT continue elseif node.type == NODE_PARAMETER @@ -218,24 +244,29 @@ function _classify_linearity( end """ - _compute_gradient_sparsity!( - indices::Coloring.IndexedSet, - nodes::Vector{Nonlinear.Node}, - ) + _compute_gradient_sparsity!(indices::Coloring.IndexedSet, f) + +Compute the sparsity pattern of the gradient of an expression (that is, a list +of which variable indices are present). -Compute the sparsity pattern of the gradient of an expression (that is, a list of -which variable indices are present). +`f` is duck-typed (as its type is defined later) to a +`_SubexpressionStorage`-like object exposing `f.nodes` and `f.sizes`. +For `NODE_VARIABLE_BLOCK` nodes, the block's length is read from `f.sizes` +(via `_length`), and the block contributes that many consecutive variable +indices starting at `nodes[k].index`. """ -function _compute_gradient_sparsity!( - indices::Coloring.IndexedSet, - nodes::Vector{Node}, -) - for node in nodes +function _compute_gradient_sparsity!(indices::Coloring.IndexedSet, f) + for (k, node) in enumerate(f.nodes) if node.type == NODE_VARIABLE push!(indices, node.index) - elseif node.type == NODE_MOI_VARIABLE + elseif node.type == NODE_VARIABLE_BLOCK + len = _length(f.sizes, k) + for i in 0:(len - 1) + push!(indices, node.index + i) + end + elseif node.type == NODE_MOI_VARIABLE || node.type == NODE_MOI_VARIABLE_BLOCK error( - "Internal error: Invalid to compute sparsity if NODE_MOI_VARIABLE " * + "Internal error: Invalid to compute sparsity if $(node.type) " * "nodes are present.", ) end @@ -341,6 +372,7 @@ function _compute_hessian_sparsity( child_group_variables = Dict{Int,Set{Int}}() for (k, node) in enumerate(nodes) @assert node.type != NODE_MOI_VARIABLE + @assert node.type != NODE_MOI_VARIABLE_BLOCK if input_linearity[k] == CONSTANT continue # No hessian contribution from constant nodes end @@ -376,6 +408,13 @@ function _compute_hessian_sparsity( child_group_variables[child_group_idx], nodes[r].index, ) + elseif nodes[r].type == NODE_VARIABLE_BLOCK + # TODO `NODE_VARIABLE_BLOCK` would need its block shape to + # enumerate the variable indices. + error( + "Internal error: Hessian sparsity for " * + "NODE_VARIABLE_BLOCK is not yet supported.", + ) elseif nodes[r].type == NODE_SUBEXPRESSION sub_vars = subexpression_variables[nodes[r].index] if !haskey(child_group_variables, child_group_idx) diff --git a/src/mathoptinterface_api.jl b/src/mathoptinterface_api.jl index bab095a..126630f 100644 --- a/src/mathoptinterface_api.jl +++ b/src/mathoptinterface_api.jl @@ -87,7 +87,7 @@ function MOI.initialize( max(max_expr_with_sub_length, length(subex.nodes)) if d.want_hess empty!(coloring_storage) - _compute_gradient_sparsity!(coloring_storage, subex.nodes) + _compute_gradient_sparsity!(coloring_storage, subex) # union with all dependent expressions for idx in _list_subexpressions(subex.nodes) union!(coloring_storage, subexpression_variables[idx]) diff --git a/src/parse_moi.jl b/src/parse_moi.jl index 31b09f1..4598058 100644 --- a/src/parse_moi.jl +++ b/src/parse_moi.jl @@ -140,98 +140,38 @@ end # ── ArrayOfContiguousVariables ─────────────────────────────────────────────────── function _parse_moi_stack!( - stack::Vector{Tuple{Int,Any}}, - data::Model, - expr::Expression, - x::ArrayOfContiguousVariables{2}, - parent_index::Int, -) - m, n = x.size - # Build vcat(row(v11, v12, ...), row(v21, v22, ...), ...). - # - # The outer loop is `1:m` (forward order), NOT `m:-1:1`. The `:row` nodes - # we push end up at consecutive positions in `expr.nodes`, and `:vcat` - # later reads its children in tape-index order (CSC `nzrange`) — so the - # row with the smallest tape index becomes row 1 of the output matrix. - # If the outer loop ran in reverse, `row_m` would land at the smallest - # tape index and `:vcat` would silently place it as row 1, producing a - # row-flipped matrix on the tape (a latent bug, fixed here). - # - # The inner loop stays `n:-1:1` because the items go on the stack and pop - # in LIFO order — pushing in reverse j order gives forward j-order on - # pop, which matches the column-major layout below. - vcat_id = data.operators.multivariate_operator_to_id[:vcat] - row_id = data.operators.multivariate_operator_to_id[:row] - push!(expr.nodes, Node(NODE_CALL_MULTIVARIATE, vcat_id, parent_index)) - vcat_idx = length(expr.nodes) - for i in 1:m - push!(expr.nodes, Node(NODE_CALL_MULTIVARIATE, row_id, vcat_idx)) - row_idx = length(expr.nodes) - for j in n:-1:1 - vi = MOI.VariableIndex(x.offset + (j - 1) * m + i) - push!(stack, (row_idx, vi)) - end - end - return -end - -function _parse_moi_stack!( - stack::Vector{Tuple{Int,Any}}, - data::Model, + ::Vector{Tuple{Int,Any}}, + ::Model, expr::Expression, - x::ArrayOfContiguousVariables{1}, + x::ArrayOfContiguousVariables, parent_index::Int, ) - m = x.size[1] - vect_id = data.operators.multivariate_operator_to_id[:vect] - push!(expr.nodes, Node(NODE_CALL_MULTIVARIATE, vect_id, parent_index)) - vect_idx = length(expr.nodes) - for i in m:-1:1 - vi = MOI.VariableIndex(x.offset + i) - push!(stack, (vect_idx, vi)) - end + # Emit a single block node. The block represents the contiguous range of + # MOI variable indices `x.offset+1, ...`, laid out in + # column-major order (matching `Array{Float64}` and `Base.LinearIndices`), + # which is the layout `_view_array` will see at evaluation time. + push!(expr.nodes, Node(NODE_MOI_VARIABLE_BLOCK, x.offset + 1, parent_index)) + expr.block_shapes[length(expr.nodes)] = collect(x.size) return end -# ── Constant matrices and vectors ──────────────────────────────────────────── +# ── Constant arrays ──────────────────────────────────────────── function _parse_moi_stack!( - stack::Vector{Tuple{Int,Any}}, - data::Model, - expr::Expression, - x::AbstractMatrix{<:Real}, - parent_index::Int, -) - m, n = size(x) - # See the `ArrayOfContiguousVariables{2}` overload for the rationale on - # the `1:m` outer loop (the previous `m:-1:1` produced a row-flipped - # matrix on the tape). - vcat_id = data.operators.multivariate_operator_to_id[:vcat] - row_id = data.operators.multivariate_operator_to_id[:row] - push!(expr.nodes, Node(NODE_CALL_MULTIVARIATE, vcat_id, parent_index)) - vcat_idx = length(expr.nodes) - for i in 1:m - push!(expr.nodes, Node(NODE_CALL_MULTIVARIATE, row_id, vcat_idx)) - row_idx = length(expr.nodes) - for j in n:-1:1 - push!(stack, (row_idx, x[i, j])) - end - end - return -end - -function _parse_moi_stack!( - stack::Vector{Tuple{Int,Any}}, - data::Model, + ::Vector{Tuple{Int,Any}}, + ::Model, expr::Expression, - x::AbstractVector{<:Real}, + x::AbstractArray{<:Real}, parent_index::Int, ) - vect_id = data.operators.multivariate_operator_to_id[:vect] - push!(expr.nodes, Node(NODE_CALL_MULTIVARIATE, vect_id, parent_index)) - vect_idx = length(expr.nodes) - for i in length(x):-1:1 - push!(stack, (vect_idx, x[i])) - end + # Emit a single value block. We push the flat values to + # `expr.values` in column-major order (matching `Array{Float64}`'s memory + # layout); `node.index` records the start of that contiguous range so + # `_SubexpressionStorage` can copy it into the tape in one block at + # construction time. + start_idx = length(expr.values) + 1 + append!(expr.values, x) + push!(expr.nodes, Node(NODE_VALUE_BLOCK, start_idx, parent_index)) + expr.block_shapes[length(expr.nodes)] = collect(size(x)) return end diff --git a/src/reverse_mode.jl b/src/reverse_mode.jl index c789461..b4125cb 100644 --- a/src/reverse_mode.jl +++ b/src/reverse_mode.jl @@ -125,6 +125,18 @@ function _forward_eval( # f.forward_storage[k] = x[node.index] elseif node.type == NODE_VALUE f.forward_storage[j] = f.const_values[node.index] + elseif node.type == NODE_VARIABLE_BLOCK + # Contiguous-to-contiguous copy from `x` into the tape: on CPU a + # `copyto!`, on GPU a single `cudaMemcpy`. This is the fast path + # for matrix variables from `ArrayOfContiguousVariables{2}`. + tape_range = _storage_range(f.sizes, k) + len = length(tape_range) + copyto!( + view(f.forward_storage, tape_range), + view(x, node.index:(node.index + len - 1)), + ) + elseif node.type == NODE_VALUE_BLOCK + # Pre-loaded into `forward_storage` at construction. elseif node.type == NODE_SUBEXPRESSION f.forward_storage[j] = d.subexpression_forward_values[node.index] elseif node.type == NODE_PARAMETER @@ -955,7 +967,18 @@ function _extract_reverse_pass_inner( ) where {T} @assert length(f.reverse_storage) >= _length(f.sizes) for (k, node) in enumerate(f.nodes) - if node.type == NODE_VARIABLE + if node.type == NODE_VARIABLE_BLOCK + # Each block has a contiguous tape range and a contiguous `output` + # range: gather the adjoint, transfer to host in one memcpy, and + # accumulate into the matching slice of `output`. + tape_range = _storage_range(f.sizes, k) + len = length(tape_range) + x_range = node.index:(node.index + len - 1) + cpu_buf = + convert(Vector{T}, view(f.reverse_storage, tape_range)) + view(output, x_range) .+= scale .* cpu_buf + elseif node.type == NODE_VARIABLE + # Per-leaf scalar — rare, so the per-leaf `cudaMemcpy` is fine. output[node.index] += scale * @s f.reverse_storage[k] elseif node.type == NODE_SUBEXPRESSION subexpressions[node.index] += scale * @s f.reverse_storage[k] diff --git a/src/sizes.jl b/src/sizes.jl index 1c76881..ac41c97 100644 --- a/src/sizes.jl +++ b/src/sizes.jl @@ -179,7 +179,7 @@ macro j(expr) end # /!\ Can only be called in decreasing `k` order -function _add_size!(sizes::Sizes, k::Int, size::Tuple) +function _add_size!(sizes::Sizes, k::Int, size) sizes.ndims[k] = length(size) sizes.size_offset[k] = length(sizes.size) append!(sizes.size, size) @@ -207,6 +207,7 @@ end function _infer_sizes( nodes::Vector{Node}, adj::SparseArrays.SparseMatrixCSC{Bool,Int}, + block_shapes::Dict{Int,Vector{Int}} = Dict{Int,Vector{Int}}(), ) sizes = Sizes( zeros(Int, length(nodes)), @@ -217,6 +218,15 @@ function _infer_sizes( children_arr = SparseArrays.rowvals(adj) for k in length(nodes):-1:1 node = nodes[k] + # Block leaves carry their shape in `block_shapes`; they're 2D (m, n) + # by construction. + if node.type == NODE_VARIABLE_BLOCK || + node.type == NODE_VALUE_BLOCK || + node.type == NODE_MOI_VARIABLE_BLOCK + shape = block_shapes[k] + _add_size!(sizes, k, shape) + continue + end children_indices = SparseArrays.nzrange(adj, k) N = length(children_indices) if node.type == NODE_CALL_MULTIVARIATE @@ -438,18 +448,34 @@ struct _SubexpressionStorage{S<:AbstractVector{Float64}} nodes::Vector{Node}, adj::SparseArrays.SparseMatrixCSC{Bool,Int}, const_values::Vector{Float64}, + block_shapes::Dict{Int,Vector{Int}}, partials_storage_ϵ::Vector{Float64}, linearity::Linearity, ::Type{S} = Vector{Float64}, ) where {S<:AbstractVector{Float64}} - sizes = _infer_sizes(nodes, adj) + sizes = _infer_sizes(nodes, adj, block_shapes) N = _length(sizes) + # Pre-load value blocks into forward_storage once at construction; + # each block is a contiguous-to-contiguous bulk copy. Individual + # `NODE_VALUE` scalars (rare — exponents, constant divisors, etc) and + # variable nodes are loaded by `_forward_eval` in the per-node loop. + cpu_buffer = zeros(N) + for k in 1:length(nodes) + node = nodes[k] + if node.type == NODE_VALUE_BLOCK + j = sizes.storage_offset[k] + 1 + len = _length(sizes, k) + cpu_buffer[j:(j + len - 1)] .= + view(const_values, node.index:(node.index + len - 1)) + end + end + forward_storage = convert(S, cpu_buffer) return new{S}( nodes, adj, sizes, const_values, - fill!(S(undef, N), 0.0), # forward_storage, + forward_storage, fill!(S(undef, N), 0.0), # partials_storage, fill!(S(undef, N), 0.0), # reverse_storage, partials_storage_ϵ, diff --git a/src/types.jl b/src/types.jl index 1127f62..03c321f 100644 --- a/src/types.jl +++ b/src/types.jl @@ -8,20 +8,33 @@ struct Expression nodes::Vector{Node} values::Vector{Float64} + block_shapes::Dict{Int,Vector{Int}} end The core type that represents a nonlinear expression. See the MathOptInterface documentation for information on how the nodes and values form an expression tree. + +`block_shapes[k]` is the shape of node `k` (as a `Vector{Int}` of dimensions: +`[m]` for a 1D vector block, `[m, n]` for a 2D matrix block, `[m, n, p]` for a +3D tensor block, ...) when `nodes[k]` is one of `NODE_MOI_VARIABLE_BLOCK`, +`NODE_VARIABLE_BLOCK`, or `NODE_VALUE_BLOCK`. Block nodes are leaves that +stand in for an entire `prod(shape)`-element array of variables (or +constants), preserving contiguity end-to-end so the AD tape can be filled and +gathered with single contiguous bulk operations. """ struct Expression{T} nodes::Vector{Node} values::Vector{T} - Expression{T}() where {T} = new{T}(Node[], T[]) + block_shapes::Dict{Int,Vector{Int}} + Expression{T}() where {T} = + new{T}(Node[], T[], Dict{Int,Vector{Int}}()) end function Base.:(==)(x::Expression, y::Expression) - return x.nodes == y.nodes && x.values == y.values + return x.nodes == y.nodes && + x.values == y.values && + x.block_shapes == y.block_shapes end """ @@ -103,6 +116,7 @@ function _subexpression_and_linearity( nodes, adj, convert(Vector{Float64}, expr.values), + copy(expr.block_shapes), partials_storage_ϵ, linearity[1], S, @@ -133,12 +147,9 @@ struct _FunctionStorage{S<:AbstractVector{Float64}} linearity::Vector{Linearity}, ) where {S<:AbstractVector{Float64}} empty!(coloring_storage) - _compute_gradient_sparsity!(coloring_storage, expr.nodes) + _compute_gradient_sparsity!(coloring_storage, expr) for k in dependent_subexpressions - _compute_gradient_sparsity!( - coloring_storage, - subexpressions[k].nodes, - ) + _compute_gradient_sparsity!(coloring_storage, subexpressions[k]) end grad_sparsity = sort!(collect(coloring_storage)) empty!(coloring_storage) diff --git a/test/ReverseAD.jl b/test/ReverseAD.jl index d13e15f..eed7c10 100644 --- a/test/ReverseAD.jl +++ b/test/ReverseAD.jl @@ -562,7 +562,15 @@ function test_linearity() end if length(indices) > 0 empty!(indexed_set) - ArrayDiff._compute_gradient_sparsity!(indexed_set, nodes) + f = ArrayDiff._SubexpressionStorage( + nodes, + adj, + convert(Vector{Float64}, expr.values), + expr.block_shapes, + Float64[], + ret[1], + ) + ArrayDiff._compute_gradient_sparsity!(indexed_set, f) ix = sort(collect(indexed_set)) @test indices == ix end From bcc45c3245063756a6e749e4bc575f2b5c85616d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Beno=C3=AEt=20Legat?= Date: Wed, 6 May 2026 13:26:13 +0200 Subject: [PATCH 2/3] Fix format --- src/graph_tools.jl | 5 +++-- src/reverse_mode.jl | 7 +++---- src/sizes.jl | 4 ++-- src/types.jl | 3 +-- 4 files changed, 9 insertions(+), 10 deletions(-) diff --git a/src/graph_tools.jl b/src/graph_tools.jl index b8dcfe4..4d20d47 100644 --- a/src/graph_tools.jl +++ b/src/graph_tools.jl @@ -261,10 +261,11 @@ function _compute_gradient_sparsity!(indices::Coloring.IndexedSet, f) push!(indices, node.index) elseif node.type == NODE_VARIABLE_BLOCK len = _length(f.sizes, k) - for i in 0:(len - 1) + for i in 0:(len-1) push!(indices, node.index + i) end - elseif node.type == NODE_MOI_VARIABLE || node.type == NODE_MOI_VARIABLE_BLOCK + elseif node.type == NODE_MOI_VARIABLE || + node.type == NODE_MOI_VARIABLE_BLOCK error( "Internal error: Invalid to compute sparsity if $(node.type) " * "nodes are present.", diff --git a/src/reverse_mode.jl b/src/reverse_mode.jl index b4125cb..ba35fdb 100644 --- a/src/reverse_mode.jl +++ b/src/reverse_mode.jl @@ -133,7 +133,7 @@ function _forward_eval( len = length(tape_range) copyto!( view(f.forward_storage, tape_range), - view(x, node.index:(node.index + len - 1)), + view(x, node.index:(node.index+len-1)), ) elseif node.type == NODE_VALUE_BLOCK # Pre-loaded into `forward_storage` at construction. @@ -973,9 +973,8 @@ function _extract_reverse_pass_inner( # accumulate into the matching slice of `output`. tape_range = _storage_range(f.sizes, k) len = length(tape_range) - x_range = node.index:(node.index + len - 1) - cpu_buf = - convert(Vector{T}, view(f.reverse_storage, tape_range)) + x_range = node.index:(node.index+len-1) + cpu_buf = convert(Vector{T}, view(f.reverse_storage, tape_range)) view(output, x_range) .+= scale .* cpu_buf elseif node.type == NODE_VARIABLE # Per-leaf scalar — rare, so the per-leaf `cudaMemcpy` is fine. diff --git a/src/sizes.jl b/src/sizes.jl index ac41c97..b603a3a 100644 --- a/src/sizes.jl +++ b/src/sizes.jl @@ -465,8 +465,8 @@ struct _SubexpressionStorage{S<:AbstractVector{Float64}} if node.type == NODE_VALUE_BLOCK j = sizes.storage_offset[k] + 1 len = _length(sizes, k) - cpu_buffer[j:(j + len - 1)] .= - view(const_values, node.index:(node.index + len - 1)) + cpu_buffer[j:(j+len-1)] .= + view(const_values, node.index:(node.index+len-1)) end end forward_storage = convert(S, cpu_buffer) diff --git a/src/types.jl b/src/types.jl index 03c321f..589bb92 100644 --- a/src/types.jl +++ b/src/types.jl @@ -27,8 +27,7 @@ struct Expression{T} nodes::Vector{Node} values::Vector{T} block_shapes::Dict{Int,Vector{Int}} - Expression{T}() where {T} = - new{T}(Node[], T[], Dict{Int,Vector{Int}}()) + Expression{T}() where {T} = new{T}(Node[], T[], Dict{Int,Vector{Int}}()) end function Base.:(==)(x::Expression, y::Expression) From 78634c5d4858221bc15e15b6a6e4effd512e7840 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Beno=C3=AEt=20Legat?= Date: Wed, 6 May 2026 15:20:06 +0200 Subject: [PATCH 3/3] Fix alloc --- src/reverse_mode.jl | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/src/reverse_mode.jl b/src/reverse_mode.jl index ba35fdb..3f8a372 100644 --- a/src/reverse_mode.jl +++ b/src/reverse_mode.jl @@ -968,14 +968,11 @@ function _extract_reverse_pass_inner( @assert length(f.reverse_storage) >= _length(f.sizes) for (k, node) in enumerate(f.nodes) if node.type == NODE_VARIABLE_BLOCK - # Each block has a contiguous tape range and a contiguous `output` - # range: gather the adjoint, transfer to host in one memcpy, and - # accumulate into the matching slice of `output`. tape_range = _storage_range(f.sizes, k) len = length(tape_range) x_range = node.index:(node.index+len-1) - cpu_buf = convert(Vector{T}, view(f.reverse_storage, tape_range)) - view(output, x_range) .+= scale .* cpu_buf + view(output, x_range) .+= + scale .* view(f.reverse_storage, tape_range) elseif node.type == NODE_VARIABLE # Per-leaf scalar — rare, so the per-leaf `cudaMemcpy` is fine. output[node.index] += scale * @s f.reverse_storage[k]