blegat · blegat · May 6, 2026 · May 6, 2026 · May 6, 2026 · May 6, 2026
diff --git a/src/graph_tools.jl b/src/graph_tools.jl
@@ -23,6 +23,16 @@ field, which should be interpreted as follows:
  * `NODE_CALL_UNIVARIATE_BROADCASTED`: the index into `operators.univariate_operators`
  * `NODE_CALL_MULTIVARIATE_BROADCASTED`: the index into `operators.multivariate_operators`
  * `NODE_CALL_REDUCE`: the index into `operators.multivariate_operators`
+ * `NODE_MOI_VARIABLE_BLOCK`: a contiguous block of `MOI.VariableIndex`. The
+   `index` field is the value of the FIRST `MOI.VariableIndex` in the block;
+   the shape `(m, n)` is stored in `Expression.block_shapes`. The block holds
+   `m * n` variables in column-major order.
+ * `NODE_VARIABLE_BLOCK`: same as `NODE_MOI_VARIABLE_BLOCK` but after MOI
+   variable indices have been remapped to consecutive 1-based internal
+   indices. `index` is the FIRST internal index of the block.
+ * `NODE_VALUE_BLOCK`: a contiguous block of constants. `index` is the start
+   index in the `.values` field; the next `m * n` entries (column-major) are
+   the block's data. Shape stored in `Expression.block_shapes`.
 """
 @enum(
     NodeType,
@@ -51,6 +61,12 @@ field, which should be interpreted as follows:
     NODE_CALL_MULTIVARIATE_BROADCASTED,
     # Index into the multivariate operators, with reduction
     NODE_CALL_REDUCE,
+    # Block-of-variables node, before MOI → internal index remap.
+    NODE_MOI_VARIABLE_BLOCK,
+    # Block-of-variables node, after MOI → internal index remap.
+    NODE_VARIABLE_BLOCK,
+    # Block-of-constants node.
+    NODE_VALUE_BLOCK,
 )
 
 @enum(Linearity, CONSTANT, LINEAR, PIECEWISE_LINEAR, NONLINEAR)
@@ -96,6 +112,16 @@ function _replace_moi_variables(
                 moi_index_to_consecutive_index[MOI.VariableIndex(node.index)],
                 node.parent,
             )
+        elseif node.type == NODE_MOI_VARIABLE_BLOCK
+            # `node.index` is the FIRST MOI variable index in the block. For an
+            # `ArrayOfContiguousVariables`, all variables in the block are
+            # contiguous in MOI's ordering, so we just remap the first index
+            # and reuse it as the consecutive offset. The block's length comes
+            # from `Expression.block_shapes[i]`, which the caller threads
+            # through separately.
+            first_consec =
+                moi_index_to_consecutive_index[MOI.VariableIndex(node.index)]
+            new_nodes[i] = Node(NODE_VARIABLE_BLOCK, first_consec, node.parent)
         else
             new_nodes[i] = node
         end
@@ -122,10 +148,10 @@ function _classify_linearity(
     children_arr = SparseArrays.rowvals(adj)
     for k in length(nodes):-1:1
         node = nodes[k]
-        if node.type == NODE_VARIABLE
+        if node.type == NODE_VARIABLE || node.type == NODE_VARIABLE_BLOCK
             linearity[k] = LINEAR
             continue
-        elseif node.type == NODE_VALUE
+        elseif node.type == NODE_VALUE || node.type == NODE_VALUE_BLOCK
             linearity[k] = CONSTANT
             continue
         elseif node.type == NODE_PARAMETER
@@ -218,24 +244,30 @@ function _classify_linearity(
 end
 
 """
-    _compute_gradient_sparsity!(
-        indices::Coloring.IndexedSet,
-        nodes::Vector{Nonlinear.Node},
-    )
+    _compute_gradient_sparsity!(indices::Coloring.IndexedSet, f)
+
+Compute the sparsity pattern of the gradient of an expression (that is, a list
+of which variable indices are present).
 
-Compute the sparsity pattern of the gradient of an expression (that is, a list of
-which variable indices are present).
+`f` is duck-typed (as its type is defined later) to a
+`_SubexpressionStorage`-like object exposing `f.nodes` and `f.sizes`.
+For `NODE_VARIABLE_BLOCK` nodes, the block's length is read from `f.sizes`
+(via `_length`), and the block contributes that many consecutive variable
+indices starting at `nodes[k].index`.
 """
-function _compute_gradient_sparsity!(
-    indices::Coloring.IndexedSet,
-    nodes::Vector{Node},
-)
-    for node in nodes
+function _compute_gradient_sparsity!(indices::Coloring.IndexedSet, f)
+    for (k, node) in enumerate(f.nodes)
         if node.type == NODE_VARIABLE
             push!(indices, node.index)
-        elseif node.type == NODE_MOI_VARIABLE
+        elseif node.type == NODE_VARIABLE_BLOCK
+            len = _length(f.sizes, k)
+            for i in 0:(len-1)
+                push!(indices, node.index + i)
+            end
+        elseif node.type == NODE_MOI_VARIABLE ||
+               node.type == NODE_MOI_VARIABLE_BLOCK
             error(
-                "Internal error: Invalid to compute sparsity if NODE_MOI_VARIABLE " *
+                "Internal error: Invalid to compute sparsity if $(node.type) " *
                 "nodes are present.",
             )
         end
@@ -341,6 +373,7 @@ function _compute_hessian_sparsity(
     child_group_variables = Dict{Int,Set{Int}}()
     for (k, node) in enumerate(nodes)
         @assert node.type != NODE_MOI_VARIABLE
+        @assert node.type != NODE_MOI_VARIABLE_BLOCK
         if input_linearity[k] == CONSTANT
             continue  # No hessian contribution from constant nodes
         end
@@ -376,6 +409,13 @@ function _compute_hessian_sparsity(
                         child_group_variables[child_group_idx],
                         nodes[r].index,
                     )
+                elseif nodes[r].type == NODE_VARIABLE_BLOCK
+                    # TODO `NODE_VARIABLE_BLOCK` would need its block shape to
+                    # enumerate the variable indices.
+                    error(
+                        "Internal error: Hessian sparsity for " *
+                        "NODE_VARIABLE_BLOCK is not yet supported.",
+                    )
                 elseif nodes[r].type == NODE_SUBEXPRESSION
                     sub_vars = subexpression_variables[nodes[r].index]
                     if !haskey(child_group_variables, child_group_idx)

diff --git a/src/mathoptinterface_api.jl b/src/mathoptinterface_api.jl
@@ -87,7 +87,7 @@ function MOI.initialize(
             max(max_expr_with_sub_length, length(subex.nodes))
         if d.want_hess
             empty!(coloring_storage)
-            _compute_gradient_sparsity!(coloring_storage, subex.nodes)
+            _compute_gradient_sparsity!(coloring_storage, subex)
             # union with all dependent expressions
             for idx in _list_subexpressions(subex.nodes)
                 union!(coloring_storage, subexpression_variables[idx])

diff --git a/src/parse_moi.jl b/src/parse_moi.jl
@@ -140,98 +140,38 @@ end
 # ── ArrayOfContiguousVariables ───────────────────────────────────────────────────
 
 function _parse_moi_stack!(
-    stack::Vector{Tuple{Int,Any}},
-    data::Model,
-    expr::Expression,
-    x::ArrayOfContiguousVariables{2},
-    parent_index::Int,
-)
-    m, n = x.size
-    # Build vcat(row(v11, v12, ...), row(v21, v22, ...), ...).
-    #
-    # The outer loop is `1:m` (forward order), NOT `m:-1:1`. The `:row` nodes
-    # we push end up at consecutive positions in `expr.nodes`, and `:vcat`
-    # later reads its children in tape-index order (CSC `nzrange`) — so the
-    # row with the smallest tape index becomes row 1 of the output matrix.
-    # If the outer loop ran in reverse, `row_m` would land at the smallest
-    # tape index and `:vcat` would silently place it as row 1, producing a
-    # row-flipped matrix on the tape (a latent bug, fixed here).
-    #
-    # The inner loop stays `n:-1:1` because the items go on the stack and pop
-    # in LIFO order — pushing in reverse j order gives forward j-order on
-    # pop, which matches the column-major layout below.
-    vcat_id = data.operators.multivariate_operator_to_id[:vcat]
-    row_id = data.operators.multivariate_operator_to_id[:row]
-    push!(expr.nodes, Node(NODE_CALL_MULTIVARIATE, vcat_id, parent_index))
-    vcat_idx = length(expr.nodes)
-    for i in 1:m
-        push!(expr.nodes, Node(NODE_CALL_MULTIVARIATE, row_id, vcat_idx))
-        row_idx = length(expr.nodes)
-        for j in n:-1:1
-            vi = MOI.VariableIndex(x.offset + (j - 1) * m + i)
-            push!(stack, (row_idx, vi))
-        end
-    end
-    return
-end
-
-function _parse_moi_stack!(
-    stack::Vector{Tuple{Int,Any}},
-    data::Model,
+    ::Vector{Tuple{Int,Any}},
+    ::Model,
     expr::Expression,
-    x::ArrayOfContiguousVariables{1},
+    x::ArrayOfContiguousVariables,
     parent_index::Int,
 )
-    m = x.size[1]
-    vect_id = data.operators.multivariate_operator_to_id[:vect]
-    push!(expr.nodes, Node(NODE_CALL_MULTIVARIATE, vect_id, parent_index))
-    vect_idx = length(expr.nodes)
-    for i in m:-1:1
-        vi = MOI.VariableIndex(x.offset + i)
-        push!(stack, (vect_idx, vi))
-    end
+    # Emit a single block node. The block represents the contiguous range of
+    # MOI variable indices `x.offset+1, ...`, laid out in
+    # column-major order (matching `Array{Float64}` and `Base.LinearIndices`),
+    # which is the layout `_view_array` will see at evaluation time.
+    push!(expr.nodes, Node(NODE_MOI_VARIABLE_BLOCK, x.offset + 1, parent_index))
+    expr.block_shapes[length(expr.nodes)] = collect(x.size)
     return
 end
 
-# ── Constant matrices and vectors ────────────────────────────────────────────
+# ── Constant arrays ────────────────────────────────────────────
 
 function _parse_moi_stack!(
-    stack::Vector{Tuple{Int,Any}},
-    data::Model,
-    expr::Expression,
-    x::AbstractMatrix{<:Real},
-    parent_index::Int,
-)
-    m, n = size(x)
-    # See the `ArrayOfContiguousVariables{2}` overload for the rationale on
-    # the `1:m` outer loop (the previous `m:-1:1` produced a row-flipped
-    # matrix on the tape).
-    vcat_id = data.operators.multivariate_operator_to_id[:vcat]
-    row_id = data.operators.multivariate_operator_to_id[:row]
-    push!(expr.nodes, Node(NODE_CALL_MULTIVARIATE, vcat_id, parent_index))
-    vcat_idx = length(expr.nodes)
-    for i in 1:m
-        push!(expr.nodes, Node(NODE_CALL_MULTIVARIATE, row_id, vcat_idx))
-        row_idx = length(expr.nodes)
-        for j in n:-1:1
-            push!(stack, (row_idx, x[i, j]))
-        end
-    end
-    return
-end
-
-function _parse_moi_stack!(
-    stack::Vector{Tuple{Int,Any}},
-    data::Model,
+    ::Vector{Tuple{Int,Any}},
+    ::Model,
     expr::Expression,
-    x::AbstractVector{<:Real},
+    x::AbstractArray{<:Real},
     parent_index::Int,
 )
-    vect_id = data.operators.multivariate_operator_to_id[:vect]
-    push!(expr.nodes, Node(NODE_CALL_MULTIVARIATE, vect_id, parent_index))
-    vect_idx = length(expr.nodes)
-    for i in length(x):-1:1
-        push!(stack, (vect_idx, x[i]))
-    end
+    # Emit a single value block. We push the flat values to
+    # `expr.values` in column-major order (matching `Array{Float64}`'s memory
+    # layout); `node.index` records the start of that contiguous range so
+    # `_SubexpressionStorage` can copy it into the tape in one block at
+    # construction time.
+    start_idx = length(expr.values) + 1
+    append!(expr.values, x)
+    push!(expr.nodes, Node(NODE_VALUE_BLOCK, start_idx, parent_index))
+    expr.block_shapes[length(expr.nodes)] = collect(size(x))
     return
 end
diff --git a/src/reverse_mode.jl b/src/reverse_mode.jl
@@ -125,6 +125,18 @@ function _forward_eval(
             #     f.forward_storage[k] = x[node.index]
         elseif node.type == NODE_VALUE
             f.forward_storage[j] = f.const_values[node.index]
+        elseif node.type == NODE_VARIABLE_BLOCK
+            # Contiguous-to-contiguous copy from `x` into the tape: on CPU a
+            # `copyto!`, on GPU a single `cudaMemcpy`. This is the fast path
+            # for matrix variables from `ArrayOfContiguousVariables{2}`.
+            tape_range = _storage_range(f.sizes, k)
+            len = length(tape_range)
+            copyto!(
+                view(f.forward_storage, tape_range),
+                view(x, node.index:(node.index+len-1)),
+            )
+        elseif node.type == NODE_VALUE_BLOCK
+            # Pre-loaded into `forward_storage` at construction.
         elseif node.type == NODE_SUBEXPRESSION
             f.forward_storage[j] = d.subexpression_forward_values[node.index]
         elseif node.type == NODE_PARAMETER
@@ -955,7 +967,14 @@ function _extract_reverse_pass_inner(
 ) where {T}
     @assert length(f.reverse_storage) >= _length(f.sizes)
     for (k, node) in enumerate(f.nodes)
-        if node.type == NODE_VARIABLE
+        if node.type == NODE_VARIABLE_BLOCK
+            tape_range = _storage_range(f.sizes, k)
+            len = length(tape_range)
+            x_range = node.index:(node.index+len-1)
+            view(output, x_range) .+=
+                scale .* view(f.reverse_storage, tape_range)
+        elseif node.type == NODE_VARIABLE
+            # Per-leaf scalar — rare, so the per-leaf `cudaMemcpy` is fine.
             output[node.index] += scale * @s f.reverse_storage[k]
         elseif node.type == NODE_SUBEXPRESSION
             subexpressions[node.index] += scale * @s f.reverse_storage[k]

diff --git a/src/sizes.jl b/src/sizes.jl
@@ -179,7 +179,7 @@ macro j(expr)
 end
 
 # /!\ Can only be called in decreasing `k` order
-function _add_size!(sizes::Sizes, k::Int, size::Tuple)
+function _add_size!(sizes::Sizes, k::Int, size)
     sizes.ndims[k] = length(size)
     sizes.size_offset[k] = length(sizes.size)
     append!(sizes.size, size)
@@ -207,6 +207,7 @@ end
 function _infer_sizes(
     nodes::Vector{Node},
     adj::SparseArrays.SparseMatrixCSC{Bool,Int},
+    block_shapes::Dict{Int,Vector{Int}} = Dict{Int,Vector{Int}}(),
 )
     sizes = Sizes(
         zeros(Int, length(nodes)),
@@ -217,6 +218,15 @@ function _infer_sizes(
     children_arr = SparseArrays.rowvals(adj)
     for k in length(nodes):-1:1
         node = nodes[k]
+        # Block leaves carry their shape in `block_shapes`; they're 2D (m, n)
+        # by construction.
+        if node.type == NODE_VARIABLE_BLOCK ||
+           node.type == NODE_VALUE_BLOCK ||
+           node.type == NODE_MOI_VARIABLE_BLOCK
+            shape = block_shapes[k]
+            _add_size!(sizes, k, shape)
+            continue
+        end
         children_indices = SparseArrays.nzrange(adj, k)
         N = length(children_indices)
         if node.type == NODE_CALL_MULTIVARIATE
@@ -438,18 +448,34 @@ struct _SubexpressionStorage{S<:AbstractVector{Float64}}
         nodes::Vector{Node},
         adj::SparseArrays.SparseMatrixCSC{Bool,Int},
         const_values::Vector{Float64},
+        block_shapes::Dict{Int,Vector{Int}},
         partials_storage_ϵ::Vector{Float64},
         linearity::Linearity,
         ::Type{S} = Vector{Float64},
     ) where {S<:AbstractVector{Float64}}
-        sizes = _infer_sizes(nodes, adj)
+        sizes = _infer_sizes(nodes, adj, block_shapes)
         N = _length(sizes)
+        # Pre-load value blocks into forward_storage once at construction;
+        # each block is a contiguous-to-contiguous bulk copy. Individual
+        # `NODE_VALUE` scalars (rare — exponents, constant divisors, etc) and
+        # variable nodes are loaded by `_forward_eval` in the per-node loop.
+        cpu_buffer = zeros(N)
+        for k in 1:length(nodes)
+            node = nodes[k]
+            if node.type == NODE_VALUE_BLOCK
+                j = sizes.storage_offset[k] + 1
+                len = _length(sizes, k)
+                cpu_buffer[j:(j+len-1)] .=
+                    view(const_values, node.index:(node.index+len-1))
+            end
+        end
+        forward_storage = convert(S, cpu_buffer)
         return new{S}(
             nodes,
             adj,
             sizes,
             const_values,
-            fill!(S(undef, N), 0.0),  # forward_storage,
+            forward_storage,
             fill!(S(undef, N), 0.0),  # partials_storage,
             fill!(S(undef, N), 0.0),  # reverse_storage,
             partials_storage_ϵ,