From f8010e91364c38c954658488848e89621451b63a Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Beno=C3=AEt=20Legat?= <benoit.legat@gmail.com>
Date: Wed, 6 May 2026 13:12:17 +0200
Subject: [PATCH 1/3] Vectorized loading

---
 src/graph_tools.jl          |  69 ++++++++++++++++++------
 src/mathoptinterface_api.jl |   2 +-
 src/parse_moi.jl            | 104 ++++++++----------------------------
 src/reverse_mode.jl         |  25 ++++++++-
 src/sizes.jl                |  32 +++++++++--
 src/types.jl                |  25 ++++++---
 test/ReverseAD.jl           |  10 +++-
 7 files changed, 157 insertions(+), 110 deletions(-)

diff --git a/src/graph_tools.jl b/src/graph_tools.jl
index 5edf062..b8dcfe4 100644
--- a/src/graph_tools.jl
+++ b/src/graph_tools.jl
@@ -23,6 +23,16 @@ field, which should be interpreted as follows:
  * `NODE_CALL_UNIVARIATE_BROADCASTED`: the index into `operators.univariate_operators`
  * `NODE_CALL_MULTIVARIATE_BROADCASTED`: the index into `operators.multivariate_operators`
  * `NODE_CALL_REDUCE`: the index into `operators.multivariate_operators`
+ * `NODE_MOI_VARIABLE_BLOCK`: a contiguous block of `MOI.VariableIndex`. The
+   `index` field is the value of the FIRST `MOI.VariableIndex` in the block;
+   the shape `(m, n)` is stored in `Expression.block_shapes`. The block holds
+   `m * n` variables in column-major order.
+ * `NODE_VARIABLE_BLOCK`: same as `NODE_MOI_VARIABLE_BLOCK` but after MOI
+   variable indices have been remapped to consecutive 1-based internal
+   indices. `index` is the FIRST internal index of the block.
+ * `NODE_VALUE_BLOCK`: a contiguous block of constants. `index` is the start
+   index in the `.values` field; the next `m * n` entries (column-major) are
+   the block's data. Shape stored in `Expression.block_shapes`.
 """
 @enum(
     NodeType,
@@ -51,6 +61,12 @@ field, which should be interpreted as follows:
     NODE_CALL_MULTIVARIATE_BROADCASTED,
     # Index into the multivariate operators, with reduction
     NODE_CALL_REDUCE,
+    # Block-of-variables node, before MOI → internal index remap.
+    NODE_MOI_VARIABLE_BLOCK,
+    # Block-of-variables node, after MOI → internal index remap.
+    NODE_VARIABLE_BLOCK,
+    # Block-of-constants node.
+    NODE_VALUE_BLOCK,
 )
 
 @enum(Linearity, CONSTANT, LINEAR, PIECEWISE_LINEAR, NONLINEAR)
@@ -96,6 +112,16 @@ function _replace_moi_variables(
                 moi_index_to_consecutive_index[MOI.VariableIndex(node.index)],
                 node.parent,
             )
+        elseif node.type == NODE_MOI_VARIABLE_BLOCK
+            # `node.index` is the FIRST MOI variable index in the block. For an
+            # `ArrayOfContiguousVariables`, all variables in the block are
+            # contiguous in MOI's ordering, so we just remap the first index
+            # and reuse it as the consecutive offset. The block's length comes
+            # from `Expression.block_shapes[i]`, which the caller threads
+            # through separately.
+            first_consec =
+                moi_index_to_consecutive_index[MOI.VariableIndex(node.index)]
+            new_nodes[i] = Node(NODE_VARIABLE_BLOCK, first_consec, node.parent)
         else
             new_nodes[i] = node
         end
@@ -122,10 +148,10 @@ function _classify_linearity(
     children_arr = SparseArrays.rowvals(adj)
     for k in length(nodes):-1:1
         node = nodes[k]
-        if node.type == NODE_VARIABLE
+        if node.type == NODE_VARIABLE || node.type == NODE_VARIABLE_BLOCK
             linearity[k] = LINEAR
             continue
-        elseif node.type == NODE_VALUE
+        elseif node.type == NODE_VALUE || node.type == NODE_VALUE_BLOCK
             linearity[k] = CONSTANT
             continue
         elseif node.type == NODE_PARAMETER
@@ -218,24 +244,29 @@ function _classify_linearity(
 end
 
 """
-    _compute_gradient_sparsity!(
-        indices::Coloring.IndexedSet,
-        nodes::Vector{Nonlinear.Node},
-    )
+    _compute_gradient_sparsity!(indices::Coloring.IndexedSet, f)
+
+Compute the sparsity pattern of the gradient of an expression (that is, a list
+of which variable indices are present).
 
-Compute the sparsity pattern of the gradient of an expression (that is, a list of
-which variable indices are present).
+`f` is duck-typed (as its type is defined later) to a
+`_SubexpressionStorage`-like object exposing `f.nodes` and `f.sizes`.
+For `NODE_VARIABLE_BLOCK` nodes, the block's length is read from `f.sizes`
+(via `_length`), and the block contributes that many consecutive variable
+indices starting at `nodes[k].index`.
 """
-function _compute_gradient_sparsity!(
-    indices::Coloring.IndexedSet,
-    nodes::Vector{Node},
-)
-    for node in nodes
+function _compute_gradient_sparsity!(indices::Coloring.IndexedSet, f)
+    for (k, node) in enumerate(f.nodes)
         if node.type == NODE_VARIABLE
             push!(indices, node.index)
-        elseif node.type == NODE_MOI_VARIABLE
+        elseif node.type == NODE_VARIABLE_BLOCK
+            len = _length(f.sizes, k)
+            for i in 0:(len - 1)
+                push!(indices, node.index + i)
+            end
+        elseif node.type == NODE_MOI_VARIABLE || node.type == NODE_MOI_VARIABLE_BLOCK
             error(
-                "Internal error: Invalid to compute sparsity if NODE_MOI_VARIABLE " *
+                "Internal error: Invalid to compute sparsity if $(node.type) " *
                 "nodes are present.",
             )
         end
@@ -341,6 +372,7 @@ function _compute_hessian_sparsity(
     child_group_variables = Dict{Int,Set{Int}}()
     for (k, node) in enumerate(nodes)
         @assert node.type != NODE_MOI_VARIABLE
+        @assert node.type != NODE_MOI_VARIABLE_BLOCK
         if input_linearity[k] == CONSTANT
             continue  # No hessian contribution from constant nodes
         end
@@ -376,6 +408,13 @@ function _compute_hessian_sparsity(
                         child_group_variables[child_group_idx],
                         nodes[r].index,
                     )
+                elseif nodes[r].type == NODE_VARIABLE_BLOCK
+                    # TODO `NODE_VARIABLE_BLOCK` would need its block shape to
+                    # enumerate the variable indices.
+                    error(
+                        "Internal error: Hessian sparsity for " *
+                        "NODE_VARIABLE_BLOCK is not yet supported.",
+                    )
                 elseif nodes[r].type == NODE_SUBEXPRESSION
                     sub_vars = subexpression_variables[nodes[r].index]
                     if !haskey(child_group_variables, child_group_idx)
diff --git a/src/mathoptinterface_api.jl b/src/mathoptinterface_api.jl
index bab095a..126630f 100644
--- a/src/mathoptinterface_api.jl
+++ b/src/mathoptinterface_api.jl
@@ -87,7 +87,7 @@ function MOI.initialize(
             max(max_expr_with_sub_length, length(subex.nodes))
         if d.want_hess
             empty!(coloring_storage)
-            _compute_gradient_sparsity!(coloring_storage, subex.nodes)
+            _compute_gradient_sparsity!(coloring_storage, subex)
             # union with all dependent expressions
             for idx in _list_subexpressions(subex.nodes)
                 union!(coloring_storage, subexpression_variables[idx])
diff --git a/src/parse_moi.jl b/src/parse_moi.jl
index 31b09f1..4598058 100644
--- a/src/parse_moi.jl
+++ b/src/parse_moi.jl
@@ -140,98 +140,38 @@ end
 # ── ArrayOfContiguousVariables ───────────────────────────────────────────────────
 
 function _parse_moi_stack!(
-    stack::Vector{Tuple{Int,Any}},
-    data::Model,
-    expr::Expression,
-    x::ArrayOfContiguousVariables{2},
-    parent_index::Int,
-)
-    m, n = x.size
-    # Build vcat(row(v11, v12, ...), row(v21, v22, ...), ...).
-    #
-    # The outer loop is `1:m` (forward order), NOT `m:-1:1`. The `:row` nodes
-    # we push end up at consecutive positions in `expr.nodes`, and `:vcat`
-    # later reads its children in tape-index order (CSC `nzrange`) — so the
-    # row with the smallest tape index becomes row 1 of the output matrix.
-    # If the outer loop ran in reverse, `row_m` would land at the smallest
-    # tape index and `:vcat` would silently place it as row 1, producing a
-    # row-flipped matrix on the tape (a latent bug, fixed here).
-    #
-    # The inner loop stays `n:-1:1` because the items go on the stack and pop
-    # in LIFO order — pushing in reverse j order gives forward j-order on
-    # pop, which matches the column-major layout below.
-    vcat_id = data.operators.multivariate_operator_to_id[:vcat]
-    row_id = data.operators.multivariate_operator_to_id[:row]
-    push!(expr.nodes, Node(NODE_CALL_MULTIVARIATE, vcat_id, parent_index))
-    vcat_idx = length(expr.nodes)
-    for i in 1:m
-        push!(expr.nodes, Node(NODE_CALL_MULTIVARIATE, row_id, vcat_idx))
-        row_idx = length(expr.nodes)
-        for j in n:-1:1
-            vi = MOI.VariableIndex(x.offset + (j - 1) * m + i)
-            push!(stack, (row_idx, vi))
-        end
-    end
-    return
-end
-
-function _parse_moi_stack!(
-    stack::Vector{Tuple{Int,Any}},
-    data::Model,
+    ::Vector{Tuple{Int,Any}},
+    ::Model,
     expr::Expression,
-    x::ArrayOfContiguousVariables{1},
+    x::ArrayOfContiguousVariables,
     parent_index::Int,
 )
-    m = x.size[1]
-    vect_id = data.operators.multivariate_operator_to_id[:vect]
-    push!(expr.nodes, Node(NODE_CALL_MULTIVARIATE, vect_id, parent_index))
-    vect_idx = length(expr.nodes)
-    for i in m:-1:1
-        vi = MOI.VariableIndex(x.offset + i)
-        push!(stack, (vect_idx, vi))
-    end
+    # Emit a single block node. The block represents the contiguous range of
+    # MOI variable indices `x.offset+1, ...`, laid out in
+    # column-major order (matching `Array{Float64}` and `Base.LinearIndices`),
+    # which is the layout `_view_array` will see at evaluation time.
+    push!(expr.nodes, Node(NODE_MOI_VARIABLE_BLOCK, x.offset + 1, parent_index))
+    expr.block_shapes[length(expr.nodes)] = collect(x.size)
     return
 end
 
-# ── Constant matrices and vectors ────────────────────────────────────────────
+# ── Constant arrays ────────────────────────────────────────────
 
 function _parse_moi_stack!(
-    stack::Vector{Tuple{Int,Any}},
-    data::Model,
-    expr::Expression,
-    x::AbstractMatrix{<:Real},
-    parent_index::Int,
-)
-    m, n = size(x)
-    # See the `ArrayOfContiguousVariables{2}` overload for the rationale on
-    # the `1:m` outer loop (the previous `m:-1:1` produced a row-flipped
-    # matrix on the tape).
-    vcat_id = data.operators.multivariate_operator_to_id[:vcat]
-    row_id = data.operators.multivariate_operator_to_id[:row]
-    push!(expr.nodes, Node(NODE_CALL_MULTIVARIATE, vcat_id, parent_index))
-    vcat_idx = length(expr.nodes)
-    for i in 1:m
-        push!(expr.nodes, Node(NODE_CALL_MULTIVARIATE, row_id, vcat_idx))
-        row_idx = length(expr.nodes)
-        for j in n:-1:1
-            push!(stack, (row_idx, x[i, j]))
-        end
-    end
-    return
-end
-
-function _parse_moi_stack!(
-    stack::Vector{Tuple{Int,Any}},
-    data::Model,
+    ::Vector{Tuple{Int,Any}},
+    ::Model,
     expr::Expression,
-    x::AbstractVector{<:Real},
+    x::AbstractArray{<:Real},
     parent_index::Int,
 )
-    vect_id = data.operators.multivariate_operator_to_id[:vect]
-    push!(expr.nodes, Node(NODE_CALL_MULTIVARIATE, vect_id, parent_index))
-    vect_idx = length(expr.nodes)
-    for i in length(x):-1:1
-        push!(stack, (vect_idx, x[i]))
-    end
+    # Emit a single value block. We push the flat values to
+    # `expr.values` in column-major order (matching `Array{Float64}`'s memory
+    # layout); `node.index` records the start of that contiguous range so
+    # `_SubexpressionStorage` can copy it into the tape in one block at
+    # construction time.
+    start_idx = length(expr.values) + 1
+    append!(expr.values, x)
+    push!(expr.nodes, Node(NODE_VALUE_BLOCK, start_idx, parent_index))
+    expr.block_shapes[length(expr.nodes)] = collect(size(x))
     return
 end
diff --git a/src/reverse_mode.jl b/src/reverse_mode.jl
index c789461..b4125cb 100644
--- a/src/reverse_mode.jl
+++ b/src/reverse_mode.jl
@@ -125,6 +125,18 @@ function _forward_eval(
             #     f.forward_storage[k] = x[node.index]
         elseif node.type == NODE_VALUE
             f.forward_storage[j] = f.const_values[node.index]
+        elseif node.type == NODE_VARIABLE_BLOCK
+            # Contiguous-to-contiguous copy from `x` into the tape: on CPU a
+            # `copyto!`, on GPU a single `cudaMemcpy`. This is the fast path
+            # for matrix variables from `ArrayOfContiguousVariables{2}`.
+            tape_range = _storage_range(f.sizes, k)
+            len = length(tape_range)
+            copyto!(
+                view(f.forward_storage, tape_range),
+                view(x, node.index:(node.index + len - 1)),
+            )
+        elseif node.type == NODE_VALUE_BLOCK
+            # Pre-loaded into `forward_storage` at construction.
         elseif node.type == NODE_SUBEXPRESSION
             f.forward_storage[j] = d.subexpression_forward_values[node.index]
         elseif node.type == NODE_PARAMETER
@@ -955,7 +967,18 @@ function _extract_reverse_pass_inner(
 ) where {T}
     @assert length(f.reverse_storage) >= _length(f.sizes)
     for (k, node) in enumerate(f.nodes)
-        if node.type == NODE_VARIABLE
+        if node.type == NODE_VARIABLE_BLOCK
+            # Each block has a contiguous tape range and a contiguous `output`
+            # range: gather the adjoint, transfer to host in one memcpy, and
+            # accumulate into the matching slice of `output`.
+            tape_range = _storage_range(f.sizes, k)
+            len = length(tape_range)
+            x_range = node.index:(node.index + len - 1)
+            cpu_buf =
+                convert(Vector{T}, view(f.reverse_storage, tape_range))
+            view(output, x_range) .+= scale .* cpu_buf
+        elseif node.type == NODE_VARIABLE
+            # Per-leaf scalar — rare, so the per-leaf `cudaMemcpy` is fine.
             output[node.index] += scale * @s f.reverse_storage[k]
         elseif node.type == NODE_SUBEXPRESSION
             subexpressions[node.index] += scale * @s f.reverse_storage[k]
diff --git a/src/sizes.jl b/src/sizes.jl
index 1c76881..ac41c97 100644
--- a/src/sizes.jl
+++ b/src/sizes.jl
@@ -179,7 +179,7 @@ macro j(expr)
 end
 
 # /!\ Can only be called in decreasing `k` order
-function _add_size!(sizes::Sizes, k::Int, size::Tuple)
+function _add_size!(sizes::Sizes, k::Int, size)
     sizes.ndims[k] = length(size)
     sizes.size_offset[k] = length(sizes.size)
     append!(sizes.size, size)
@@ -207,6 +207,7 @@ end
 function _infer_sizes(
     nodes::Vector{Node},
     adj::SparseArrays.SparseMatrixCSC{Bool,Int},
+    block_shapes::Dict{Int,Vector{Int}} = Dict{Int,Vector{Int}}(),
 )
     sizes = Sizes(
         zeros(Int, length(nodes)),
@@ -217,6 +218,15 @@ function _infer_sizes(
     children_arr = SparseArrays.rowvals(adj)
     for k in length(nodes):-1:1
         node = nodes[k]
+        # Block leaves carry their shape in `block_shapes`; they're 2D (m, n)
+        # by construction.
+        if node.type == NODE_VARIABLE_BLOCK ||
+           node.type == NODE_VALUE_BLOCK ||
+           node.type == NODE_MOI_VARIABLE_BLOCK
+            shape = block_shapes[k]
+            _add_size!(sizes, k, shape)
+            continue
+        end
         children_indices = SparseArrays.nzrange(adj, k)
         N = length(children_indices)
         if node.type == NODE_CALL_MULTIVARIATE
@@ -438,18 +448,34 @@ struct _SubexpressionStorage{S<:AbstractVector{Float64}}
         nodes::Vector{Node},
         adj::SparseArrays.SparseMatrixCSC{Bool,Int},
         const_values::Vector{Float64},
+        block_shapes::Dict{Int,Vector{Int}},
         partials_storage_ϵ::Vector{Float64},
         linearity::Linearity,
         ::Type{S} = Vector{Float64},
     ) where {S<:AbstractVector{Float64}}
-        sizes = _infer_sizes(nodes, adj)
+        sizes = _infer_sizes(nodes, adj, block_shapes)
         N = _length(sizes)
+        # Pre-load value blocks into forward_storage once at construction;
+        # each block is a contiguous-to-contiguous bulk copy. Individual
+        # `NODE_VALUE` scalars (rare — exponents, constant divisors, etc) and
+        # variable nodes are loaded by `_forward_eval` in the per-node loop.
+        cpu_buffer = zeros(N)
+        for k in 1:length(nodes)
+            node = nodes[k]
+            if node.type == NODE_VALUE_BLOCK
+                j = sizes.storage_offset[k] + 1
+                len = _length(sizes, k)
+                cpu_buffer[j:(j + len - 1)] .=
+                    view(const_values, node.index:(node.index + len - 1))
+            end
+        end
+        forward_storage = convert(S, cpu_buffer)
         return new{S}(
             nodes,
             adj,
             sizes,
             const_values,
-            fill!(S(undef, N), 0.0),  # forward_storage,
+            forward_storage,
             fill!(S(undef, N), 0.0),  # partials_storage,
             fill!(S(undef, N), 0.0),  # reverse_storage,
             partials_storage_ϵ,
diff --git a/src/types.jl b/src/types.jl
index 1127f62..03c321f 100644
--- a/src/types.jl
+++ b/src/types.jl
@@ -8,20 +8,33 @@
     struct Expression
         nodes::Vector{Node}
         values::Vector{Float64}
+        block_shapes::Dict{Int,Vector{Int}}
     end
 
 The core type that represents a nonlinear expression. See the MathOptInterface
 documentation for information on how the nodes and values form an expression
 tree.
+
+`block_shapes[k]` is the shape of node `k` (as a `Vector{Int}` of dimensions:
+`[m]` for a 1D vector block, `[m, n]` for a 2D matrix block, `[m, n, p]` for a
+3D tensor block, ...) when `nodes[k]` is one of `NODE_MOI_VARIABLE_BLOCK`,
+`NODE_VARIABLE_BLOCK`, or `NODE_VALUE_BLOCK`. Block nodes are leaves that
+stand in for an entire `prod(shape)`-element array of variables (or
+constants), preserving contiguity end-to-end so the AD tape can be filled and
+gathered with single contiguous bulk operations.
 """
 struct Expression{T}
     nodes::Vector{Node}
     values::Vector{T}
-    Expression{T}() where {T} = new{T}(Node[], T[])
+    block_shapes::Dict{Int,Vector{Int}}
+    Expression{T}() where {T} =
+        new{T}(Node[], T[], Dict{Int,Vector{Int}}())
 end
 
 function Base.:(==)(x::Expression, y::Expression)
-    return x.nodes == y.nodes && x.values == y.values
+    return x.nodes == y.nodes &&
+           x.values == y.values &&
+           x.block_shapes == y.block_shapes
 end
 
 """
@@ -103,6 +116,7 @@ function _subexpression_and_linearity(
         nodes,
         adj,
         convert(Vector{Float64}, expr.values),
+        copy(expr.block_shapes),
         partials_storage_ϵ,
         linearity[1],
         S,
@@ -133,12 +147,9 @@ struct _FunctionStorage{S<:AbstractVector{Float64}}
         linearity::Vector{Linearity},
     ) where {S<:AbstractVector{Float64}}
         empty!(coloring_storage)
-        _compute_gradient_sparsity!(coloring_storage, expr.nodes)
+        _compute_gradient_sparsity!(coloring_storage, expr)
         for k in dependent_subexpressions
-            _compute_gradient_sparsity!(
-                coloring_storage,
-                subexpressions[k].nodes,
-            )
+            _compute_gradient_sparsity!(coloring_storage, subexpressions[k])
         end
         grad_sparsity = sort!(collect(coloring_storage))
         empty!(coloring_storage)
diff --git a/test/ReverseAD.jl b/test/ReverseAD.jl
index d13e15f..eed7c10 100644
--- a/test/ReverseAD.jl
+++ b/test/ReverseAD.jl
@@ -562,7 +562,15 @@ function test_linearity()
         end
         if length(indices) > 0
             empty!(indexed_set)
-            ArrayDiff._compute_gradient_sparsity!(indexed_set, nodes)
+            f = ArrayDiff._SubexpressionStorage(
+                nodes,
+                adj,
+                convert(Vector{Float64}, expr.values),
+                expr.block_shapes,
+                Float64[],
+                ret[1],
+            )
+            ArrayDiff._compute_gradient_sparsity!(indexed_set, f)
             ix = sort(collect(indexed_set))
             @test indices == ix
         end

From bcc45c3245063756a6e749e4bc575f2b5c85616d Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Beno=C3=AEt=20Legat?= <benoit.legat@gmail.com>
Date: Wed, 6 May 2026 13:26:13 +0200
Subject: [PATCH 2/3] Fix format

---
 src/graph_tools.jl  | 5 +++--
 src/reverse_mode.jl | 7 +++----
 src/sizes.jl        | 4 ++--
 src/types.jl        | 3 +--
 4 files changed, 9 insertions(+), 10 deletions(-)

diff --git a/src/graph_tools.jl b/src/graph_tools.jl
index b8dcfe4..4d20d47 100644
--- a/src/graph_tools.jl
+++ b/src/graph_tools.jl
@@ -261,10 +261,11 @@ function _compute_gradient_sparsity!(indices::Coloring.IndexedSet, f)
             push!(indices, node.index)
         elseif node.type == NODE_VARIABLE_BLOCK
             len = _length(f.sizes, k)
-            for i in 0:(len - 1)
+            for i in 0:(len-1)
                 push!(indices, node.index + i)
             end
-        elseif node.type == NODE_MOI_VARIABLE || node.type == NODE_MOI_VARIABLE_BLOCK
+        elseif node.type == NODE_MOI_VARIABLE ||
+               node.type == NODE_MOI_VARIABLE_BLOCK
             error(
                 "Internal error: Invalid to compute sparsity if $(node.type) " *
                 "nodes are present.",
diff --git a/src/reverse_mode.jl b/src/reverse_mode.jl
index b4125cb..ba35fdb 100644
--- a/src/reverse_mode.jl
+++ b/src/reverse_mode.jl
@@ -133,7 +133,7 @@ function _forward_eval(
             len = length(tape_range)
             copyto!(
                 view(f.forward_storage, tape_range),
-                view(x, node.index:(node.index + len - 1)),
+                view(x, node.index:(node.index+len-1)),
             )
         elseif node.type == NODE_VALUE_BLOCK
             # Pre-loaded into `forward_storage` at construction.
@@ -973,9 +973,8 @@ function _extract_reverse_pass_inner(
             # accumulate into the matching slice of `output`.
             tape_range = _storage_range(f.sizes, k)
             len = length(tape_range)
-            x_range = node.index:(node.index + len - 1)
-            cpu_buf =
-                convert(Vector{T}, view(f.reverse_storage, tape_range))
+            x_range = node.index:(node.index+len-1)
+            cpu_buf = convert(Vector{T}, view(f.reverse_storage, tape_range))
             view(output, x_range) .+= scale .* cpu_buf
         elseif node.type == NODE_VARIABLE
             # Per-leaf scalar — rare, so the per-leaf `cudaMemcpy` is fine.
diff --git a/src/sizes.jl b/src/sizes.jl
index ac41c97..b603a3a 100644
--- a/src/sizes.jl
+++ b/src/sizes.jl
@@ -465,8 +465,8 @@ struct _SubexpressionStorage{S<:AbstractVector{Float64}}
             if node.type == NODE_VALUE_BLOCK
                 j = sizes.storage_offset[k] + 1
                 len = _length(sizes, k)
-                cpu_buffer[j:(j + len - 1)] .=
-                    view(const_values, node.index:(node.index + len - 1))
+                cpu_buffer[j:(j+len-1)] .=
+                    view(const_values, node.index:(node.index+len-1))
             end
         end
         forward_storage = convert(S, cpu_buffer)
diff --git a/src/types.jl b/src/types.jl
index 03c321f..589bb92 100644
--- a/src/types.jl
+++ b/src/types.jl
@@ -27,8 +27,7 @@ struct Expression{T}
     nodes::Vector{Node}
     values::Vector{T}
     block_shapes::Dict{Int,Vector{Int}}
-    Expression{T}() where {T} =
-        new{T}(Node[], T[], Dict{Int,Vector{Int}}())
+    Expression{T}() where {T} = new{T}(Node[], T[], Dict{Int,Vector{Int}}())
 end
 
 function Base.:(==)(x::Expression, y::Expression)

From 78634c5d4858221bc15e15b6a6e4effd512e7840 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Beno=C3=AEt=20Legat?= <benoit.legat@gmail.com>
Date: Wed, 6 May 2026 15:20:06 +0200
Subject: [PATCH 3/3] Fix alloc

---
 src/reverse_mode.jl | 7 ++-----
 1 file changed, 2 insertions(+), 5 deletions(-)

diff --git a/src/reverse_mode.jl b/src/reverse_mode.jl
index ba35fdb..3f8a372 100644
--- a/src/reverse_mode.jl
+++ b/src/reverse_mode.jl
@@ -968,14 +968,11 @@ function _extract_reverse_pass_inner(
     @assert length(f.reverse_storage) >= _length(f.sizes)
     for (k, node) in enumerate(f.nodes)
         if node.type == NODE_VARIABLE_BLOCK
-            # Each block has a contiguous tape range and a contiguous `output`
-            # range: gather the adjoint, transfer to host in one memcpy, and
-            # accumulate into the matching slice of `output`.
             tape_range = _storage_range(f.sizes, k)
             len = length(tape_range)
             x_range = node.index:(node.index+len-1)
-            cpu_buf = convert(Vector{T}, view(f.reverse_storage, tape_range))
-            view(output, x_range) .+= scale .* cpu_buf
+            view(output, x_range) .+=
+                scale .* view(f.reverse_storage, tape_range)
         elseif node.type == NODE_VARIABLE
             # Per-leaf scalar — rare, so the per-leaf `cudaMemcpy` is fine.
             output[node.index] += scale * @s f.reverse_storage[k]