From b8f137d11b2c77c14f201e5389b3882959cd7993 Mon Sep 17 00:00:00 2001
From: Boris Kaus <kaus@uni-mainz.de>
Date: Fri, 24 Apr 2026 22:32:10 +0200
Subject: [PATCH 01/39] add ex19 (first version)

---
 Project.toml                     |   3 +-
 examples/ex19.jl                 | 316 +++++++++++++++++++++++++++++++
 src/autowrapped/SNES_wrappers.jl |   2 +-
 3 files changed, 319 insertions(+), 2 deletions(-)
 create mode 100644 examples/ex19.jl

diff --git a/Project.toml b/Project.toml
index 20894395..96dcceb5 100644
--- a/Project.toml
+++ b/Project.toml
@@ -34,6 +34,7 @@ julia = "^1.10"
 
 [extras]
 CairoMakie = "13f3f980-e62b-5c42-98c6-ff1f3baf88f0"
+KernelAbstractions = "63c18a36-062a-441e-b654-da1e3ab1ce7c"
 ForwardDiff = "f6369f11-7733-5829-9624-2563aa707210"
 Plots = "91a5bcdd-55d7-5caf-9e0b-520d859cae80"
 Printf = "de0858da-6303-5e67-8744-51eddeeeb8d7"
@@ -43,4 +44,4 @@ Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40"
 UnicodePlots = "b8865327-cd53-5732-bb35-84acbb429228"
 
 [targets]
-test = ["ForwardDiff", "UnicodePlots", "Test", "Plots", "SparseDiffTools", "Printf", "Random", "CairoMakie"]
+test = ["ForwardDiff", "UnicodePlots", "Test", "Plots", "SparseDiffTools", "Printf", "Random", "CairoMakie", "KernelAbstractions"]
diff --git a/examples/ex19.jl b/examples/ex19.jl
new file mode 100644
index 00000000..e3863d46
--- /dev/null
+++ b/examples/ex19.jl
@@ -0,0 +1,316 @@
+# EXCLUDE FROM TESTING
+#=
+  ex19.jl — 2D Driven Cavity: velocity–vorticity–temperature formulation
+  Port of PETSc snes/tutorials/ex19.c
+
+  Solves on the unit square [0,1]²:
+    −∇²u  − ∂ω/∂y               = 0
+    −∇²v  + ∂ω/∂x               = 0
+    −∇²ω  + ∇·(uω, vω) − Gr·∂T/∂x = 0
+    −∇²T  + Pr·∇·(uT, vT)       = 0
+
+  4 DOFs per node: (u, v, ω, T).  Convective terms are upwinded.
+
+  Boundary conditions:
+    All walls:   u = v = 0  (no-slip),  except top lid: u = lidvelocity
+    Left:        T = 0  (cold, Dirichlet)
+    Right:       T = 1  (hot, Dirichlet, only when Gr > 0)
+    Top/bottom:  ∂T/∂n = 0  (insulated, Neumann)
+    ω:           derived from the no-slip condition at each wall
+
+  Usage:
+    julia --project ex19.jl
+    julia --project ex19.jl -snes_monitor -ksp_monitor -da_grid_x 32 -da_grid_y 32
+    mpiexec -n 4 julia --project ex19.jl -snes_monitor -pc_type mg -da_grid_x 64 -da_grid_y 64
+
+  To switch to GPU (once withlocalarray! supports device arrays):
+    Replace  `KernelAbstractions.CPU()`  with  `CUDABackend()`  and adapt the
+    array-wrapping inside the residual callback accordingly.
+=#
+
+using MPI
+using PETSc
+using KernelAbstractions
+
+backend = KernelAbstractions.CPU()
+
+
+# ── Physical parameters ──────────────────────────────────────────────────────
+Base.@kwdef mutable struct AppCtx{T}   # T: scalar type of the parameters
+    lidvelocity :: T = one(T)          # u imposed on the top wall (j == my)
+    prandtl     :: T = one(T)          # multiplies the advection term of the T-equation
+    grashof     :: T = one(T)          # multiplies ∂T/∂x in the ω-equation
+end
+
+# ── Residual kernel ───────────────────────────────────────────────────────────
+#
+# Iterates over the locally-owned grid: (li, lj) ∈ 1..nx_own × 1..ny_own.
+#
+# Array layout (plain 1-based, no OffsetArray — GPU-compatible):
+#   x_par[dof, xi, xj]   ghost input,  xi = li + ox, xj = lj + oy
+#   f_par[dof, li, lj]   owned output
+#
+# ox = xs - xsg, oy = ys - ysg  (ghost offsets; 0 at a domain boundary, 1 otherwise)
+#
+# Corner priority matches the C code: left/right walls processed last, so they
+# take precedence over bottom/top at corners.  Achieved here by checking i==1
+# and i==mx before j==1 and j==my.
+#
+@kernel function cavity_residual_kernel!(
+    f_par,              # owned output,  indexed [dof, li, lj]
+    x_par,              # ghosted input, indexed [dof, xi, xj]
+    dhx, dhy,           # mx-1, my-1
+    hx,  hy,            # 1/dhx, 1/dhy
+    hydhx, hxdhy,       # hy*dhx, hx*dhy — scaling of the Laplacian terms
+    grashof, prandtl, lid,   # Grashof number, Prandtl number, lid velocity
+    mx :: Int, my :: Int,    # global grid size in x and y
+    xs :: Int, ys :: Int,    # global 1-based index of the first owned node
+    ox :: Int, oy :: Int,    # shift from owned index to ghost-array index
+)
+    li, lj = @index(Global, NTuple)   # position inside the owned patch
+
+    i  = xs + li - 1   # global 1-based x-coordinate
+    j  = ys + lj - 1   # global 1-based y-coordinate
+    xi = li + ox        # ghost-array x-index for this point
+    xj = lj + oy        # ghost-array y-index for this point
+
+    # ── Boundary conditions ───────────────────────────────────────────────────
+
+    if i == 1                           # left wall — cold (T = 0)
+        f_par[1, li, lj] = x_par[1, xi,   xj  ]
+        f_par[2, li, lj] = x_par[2, xi,   xj  ]
+        f_par[3, li, lj] = x_par[3, xi,   xj  ] -
+            (x_par[2, xi+1, xj] - x_par[2, xi, xj]) * dhx   # ω = ∂v/∂x, one-sided
+        f_par[4, li, lj] = x_par[4, xi,   xj  ]
+
+    elseif i == mx                      # right wall — hot (T = 1 when Gr > 0)
+        f_par[1, li, lj] = x_par[1, xi,   xj  ]
+        f_par[2, li, lj] = x_par[2, xi,   xj  ]
+        f_par[3, li, lj] = x_par[3, xi,   xj  ] -
+            (x_par[2, xi, xj] - x_par[2, xi-1, xj]) * dhx   # ω = ∂v/∂x, one-sided
+        f_par[4, li, lj] = x_par[4, xi,   xj  ] -
+            (grashof > zero(grashof) ? one(grashof) : zero(grashof))   # wall value of T
+
+    elseif j == 1                       # bottom wall — no-slip, insulated
+        f_par[1, li, lj] = x_par[1, xi,   xj  ]
+        f_par[2, li, lj] = x_par[2, xi,   xj  ]
+        f_par[3, li, lj] = x_par[3, xi,   xj  ] +
+            (x_par[1, xi, xj+1] - x_par[1, xi, xj]) * dhy   # ω = −∂u/∂y, one-sided
+        f_par[4, li, lj] = x_par[4, xi,   xj  ] - x_par[4, xi, xj+1]   # ∂T/∂y = 0
+
+    elseif j == my                      # top wall — moving lid, insulated
+        f_par[1, li, lj] = x_par[1, xi,   xj  ] - lid
+        f_par[2, li, lj] = x_par[2, xi,   xj  ]
+        f_par[3, li, lj] = x_par[3, xi,   xj  ] +
+            (x_par[1, xi, xj] - x_par[1, xi, xj-1]) * dhy   # ω = −∂u/∂y, one-sided
+        f_par[4, li, lj] = x_par[4, xi,   xj  ] - x_par[4, xi, xj-1]   # ∂T/∂y = 0
+
+    else                                # ── interior point ───────────────────
+
+        # Upwind split of advecting velocities
+        vx  = x_par[1, xi, xj];  avx = abs(vx)
+        vxp = oftype(vx, 0.5) * (vx + avx)   # max(vx, 0)
+        vxm = oftype(vx, 0.5) * (vx - avx)   # min(vx, 0)
+
+        vy  = x_par[2, xi, xj];  avy = abs(vy)
+        vyp = oftype(vy, 0.5) * (vy + avy)   # max(vy, 0)
+        vym = oftype(vy, 0.5) * (vy - avy)   # min(vy, 0)
+
+        # u-equation:  −∇²u − ∂ω/∂y = 0
+        cu  = x_par[1, xi, xj]
+        uxx = (2cu - x_par[1, xi-1, xj] - x_par[1, xi+1, xj]) * hydhx
+        uyy = (2cu - x_par[1, xi, xj-1] - x_par[1, xi, xj+1]) * hxdhy
+        f_par[1, li, lj] = uxx + uyy -
+            oftype(cu, 0.5) * (x_par[3, xi, xj+1] - x_par[3, xi, xj-1]) * hx
+
+        # v-equation:  −∇²v + ∂ω/∂x = 0
+        cv  = x_par[2, xi, xj]
+        vxx = (2cv - x_par[2, xi-1, xj] - x_par[2, xi+1, xj]) * hydhx
+        vyy = (2cv - x_par[2, xi, xj-1] - x_par[2, xi, xj+1]) * hxdhy
+        f_par[2, li, lj] = vxx + vyy +
+            oftype(cv, 0.5) * (x_par[3, xi+1, xj] - x_par[3, xi-1, xj]) * hy
+
+        # ω-equation:  −∇²ω + ∇·(uω, vω) − Gr·∂T/∂x = 0
+        cω  = x_par[3, xi, xj]
+        wxx = (2cω - x_par[3, xi-1, xj] - x_par[3, xi+1, xj]) * hydhx
+        wyy = (2cω - x_par[3, xi, xj-1] - x_par[3, xi, xj+1]) * hxdhy
+        f_par[3, li, lj] = wxx + wyy +
+            (vxp * (cω - x_par[3, xi-1, xj]) + vxm * (x_par[3, xi+1, xj] - cω)) * hy +
+            (vyp * (cω - x_par[3, xi, xj-1]) + vym * (x_par[3, xi, xj+1] - cω)) * hx -
+            oftype(cω, 0.5) * grashof * (x_par[4, xi+1, xj] - x_par[4, xi-1, xj]) * hy
+
+        # T-equation:  −∇²T + Pr·∇·(uT, vT) = 0
+        cT  = x_par[4, xi, xj]
+        txx = (2cT - x_par[4, xi-1, xj] - x_par[4, xi+1, xj]) * hydhx
+        tyy = (2cT - x_par[4, xi, xj-1] - x_par[4, xi, xj+1]) * hxdhy
+        f_par[4, li, lj] = txx + tyy + prandtl * (
+            (vxp * (cT - x_par[4, xi-1, xj]) + vxm * (x_par[4, xi+1, xj] - cT)) * hy +
+            (vyp * (cT - x_par[4, xi, xj-1]) + vym * (x_par[4, xi, xj+1] - cT)) * hx)
+    end
+end
+
+# ── Setup ─────────────────────────────────────────────────────────────────────
+opts = isinteractive() ? NamedTuple() : PETSc.parse_options(ARGS)
+
+petsclib = PETSc.getlib(; PetscScalar = Float64)
+PETSc.initialize(petsclib)
+
+T       = Float64
+PetscInt = petsclib.PetscInt
+comm    = MPI.COMM_WORLD
+
+# DMDA: 4×4 default (matches ex19.c); override via -da_grid_x / -da_grid_y
+da = PETSc.DMDA(
+    petsclib, comm,
+    (PETSc.DM_BOUNDARY_NONE, PETSc.DM_BOUNDARY_NONE),
+    (4, 4),
+    4,   # DOFs per node: (u, v, ω, T)
+    1,   # stencil width
+    PETSc.DMDA_STENCIL_STAR;
+    opts...,
+)
+
+snes = PETSc.SNES(petsclib, comm; opts...)
+PETSc.setDM!(snes, da)
+
+# Actual grid size after setfromoptions (may differ from the 4×4 default)
+info = PETSc.getinfo(da)
+mx   = info.global_size[1]
+my   = info.global_size[2]
+
+user = AppCtx{T}(
+    lidvelocity = T(1) / (mx * my),   # ex19.c default: 1/(mx*my), not 1/(mx-1)
+    prandtl     = T(1),
+    grashof     = T(1),
+)
+
+# Precomputed grid metrics
+dhx   = T(mx - 1);   dhy   = T(my - 1)
+hx    = one(T) / dhx; hy    = one(T) / dhy
+hydhx = hy * dhx;    hxdhy = hx * dhy
+
+# ── Initial condition: u = v = ω = 0, T linear in x ─────────────────────────
+x = PETSc.DMGlobalVec(da)
+
+PETSc.withlocalarray!(x; read = false) do x_arr
+    # Owned index box of this rank; only its extents and its x-start are used.
+    own = PETSc.getcorners(da)
+    i0  = own.lower[1]
+    ni  = own.upper[1] - i0 + 1
+    nj  = own.upper[2] - own.lower[2] + 1
+    dx  = one(T) / (mx - 1)
+    field = reshape(x_arr, 4, ni, nj)
+    buoyant = user.grashof > 0
+    for lj in 1:nj, li in 1:ni
+        # u, v, ω start at zero; T is (i - 1)*dx in the global 1-based index i.
+        field[1:3, li, lj] .= zero(T)
+        field[4, li, lj] = buoyant ? T(i0 + li - 2) * dx : zero(T)
+    end
+end
+
+# ── Residual callback ─────────────────────────────────────────────────────────
+#
+# To run on GPU, replace CPU() with CUDABackend() (or ROCBackend()) and adapt
+# the array wrapping once withlocalarray! supports device arrays (see vec.jl).
+#
+r       = similar(x)
+
+PETSc.setfunction!(snes, r) do g_fx, snes, g_x
+    da = PETSc.getDM(snes)
+
+    l_x = PETSc.DMLocalVec(da)
+    PETSc.dm_global_to_local!(g_x, l_x, da, PETSc.INSERT_VALUES)
+
+    PETSc.withlocalarray!(
+        (g_fx, l_x);
+        read  = (false, true),
+        write = (true,  false),
+    ) do fx, lx
+        corners       = PETSc.getcorners(da)
+        ghost_corners = PETSc.getghostcorners(da)
+
+        xs  = corners.lower[1];        ys  = corners.lower[2]
+        xe  = corners.upper[1];        ye  = corners.upper[2]
+        xsg = ghost_corners.lower[1];  ysg = ghost_corners.lower[2]
+        xeg = ghost_corners.upper[1];  yeg = ghost_corners.upper[2]
+
+        nx_own = xe  - xs  + 1;  ny_own = ye  - ys  + 1
+        nx_g   = xeg - xsg + 1;  ny_g   = yeg - ysg + 1
+
+        # Plain [dof, x, y] arrays — no OffsetArray, safe for KA on GPU
+        x_par = reshape(lx, 4, nx_g,   ny_g)
+        f_par = reshape(fx, 4, nx_own, ny_own)
+
+        # Ghost offset: ghost-array index for owned start = 1 + ox (0 at domain wall)
+        ox = xs - xsg
+        oy = ys - ysg
+
+        cavity_residual_kernel!(backend, 64)(
+            f_par, x_par,
+            dhx, dhy, hx, hy, hydhx, hxdhy,
+            user.grashof, user.prandtl, user.lidvelocity,
+            mx, my, xs, ys, ox, oy;
+            ndrange = (nx_own, ny_own),
+        )
+        KernelAbstractions.synchronize(backend)
+    end
+
+    PETSc.destroy(l_x)
+    return PetscInt(0)
+end
+
+# ── Jacobian (finite differences via PETSc's built-in column-by-column FD) ───
+#
+# Pass SNESComputeJacobianDefault directly as the C function pointer, exactly
+# like the C code does:
+#   SNESSetJacobian(snes, J, J, SNESComputeJacobianDefault, NULL)
+# This avoids a nested Julia→C→Julia callback chain and is more robust
+# in parallel.  For production, replace with coloring-based FD by swapping
+# SNESComputeJacobianDefault → SNESComputeJacobianDefaultColor.
+#
+J = LibPETSc.DMCreateMatrix(petsclib, da)
+LibPETSc.SNESSetJacobian(petsclib, snes, J, J,
+    cglobal((:SNESComputeJacobianDefault, petsclib.petsc_library)), C_NULL)
+
+# ── Solve ─────────────────────────────────────────────────────────────────────
+PETSc.solve!(x, snes)
+
+if MPI.Comm_rank(comm) == 0
+    its = LibPETSc.SNESGetIterationNumber(petsclib, snes)
+    println("SNES converged in $its Newton iterations.")
+end
+
+# ── Cleanup ───────────────────────────────────────────────────────────────────
+# Explicitly destroy the PetscOptions stored on the SNES before finalization.
+# Its GC finalizer calls PetscOptionsDestroy, which can use MPI internally.
+# If GC runs it after MPI is alive but in a different collective-sync state
+# across ranks, it triggers intermittent crashes.  Destroying it explicitly
+# here (while all ranks are synchronized and PETSc/MPI are still fully active)
+# is safe and prevents any later GC-driven call.
+if !isnothing(snes.opts)
+    PETSc.destroy(snes.opts)
+    snes.opts = nothing
+end
+
+# Run a full GC now so any lingering VecRestoreArray finalizers from
+# withlocalarray! run while PETSc is still valid, then barrier all ranks.
+GC.gc(true)
+MPI.Barrier(comm)
+
+# SNES holds internal PETSc references to J, da, and r — destroy it first so
+# those reference counts are decremented before we explicitly free the objects.
+PETSc.destroy(snes)
+PETSc.destroy(J)
+PETSc.destroy(x)
+PETSc.destroy(r)
+PETSc.destroy(da)
+PETSc.finalize(petsclib)
+
+# On macOS ARM64 with MPICH ch4:ofi, MPICH's C atexit handler crashes during
+# process teardown (SIGSEGV in libfabric/OFI cleanup).  Using quick_exit(0)
+# after explicitly finalizing PETSc and MPI bypasses all C atexit() handlers
+# (while still running at_quick_exit() handlers) and avoids the crash.
+# All MPI communication is already complete at this point.
+MPI.Barrier(comm)
+MPI.Finalize()
+ccall(:quick_exit, Cvoid, (Cint,), 0)
diff --git a/src/autowrapped/SNES_wrappers.jl b/src/autowrapped/SNES_wrappers.jl
index 1f3d04cd..c2557e1f 100644
--- a/src/autowrapped/SNES_wrappers.jl
+++ b/src/autowrapped/SNES_wrappers.jl
@@ -5118,7 +5118,7 @@ function SNESComputeJacobianDefault(petsclib::PetscLibType, snes::PetscSNES, x1:
                (:SNESComputeJacobianDefault, $petsc_library),
                PetscErrorCode,
                (CSNES, CVec, CMat, CMat, Ptr{Cvoid}),
-               snes, x1, J, B, ctx,
+               snes, x1, J, B, C_NULL,
               )
 
 

From 743f089ab2cb2599cc9b283ef498934051dba232 Mon Sep 17 00:00:00 2001
From: Boris Kaus <kaus@uni-mainz.de>
Date: Sat, 25 Apr 2026 13:07:00 +0200
Subject: [PATCH 02/39] make testsuite work with locally installed PETSc
 library

---
 test/runtests.jl | 15 ++++++++++++---
 1 file changed, 12 insertions(+), 3 deletions(-)

diff --git a/test/runtests.jl b/test/runtests.jl
index 19f69e55..4e87c216 100644
--- a/test/runtests.jl
+++ b/test/runtests.jl
@@ -1,12 +1,21 @@
 using Test
 using MPI: MPI, mpiexec
-using PETSc, PETSc_jll, Pkg
+using PETSc, Pkg
 
 # Make sure that all dependencies are installed also on a clean system
 Pkg.instantiate()
 
-import MPIPreferences
-@info "Testing PETSc.jl with" MPIPreferences.binary MPIPreferences.abi PETSc_jll.host_platform
+# When set_library! has been used, petsc_library is a path string and PETSc_jll
+# is not loaded.  Only import JLL-specific symbols when using the default binaries.
+const _using_custom_lib = PETSc.petsclibs[1].petsc_library isa AbstractString
+
+if _using_custom_lib
+    @info "Testing PETSc.jl with custom library" path=PETSc.petsclibs[1].petsc_library
+else
+    using PETSc_jll
+    import MPIPreferences
+    @info "Testing PETSc.jl with" MPIPreferences.binary MPIPreferences.abi PETSc_jll.host_platform
+end
 
 # Do the MPI tests first so we do not have mpi running inside MPI
 mpi_tests = ("mpivec.jl", "mpimat.jl", "ksp.jl", "dmstag.jl")

From 688e39361de8c5ead3451e8ba22e1d89bb1c73dc Mon Sep 17 00:00:00 2001
From: Boris Kaus <kaus@uni-mainz.de>
Date: Sat, 25 Apr 2026 13:31:45 +0200
Subject: [PATCH 03/39] add CUDA support for the example

---
 Project.toml        |  7 ++++
 examples/ex19.jl    | 28 ++++++++++---
 ext/PETScCUDAExt.jl | 96 +++++++++++++++++++++++++++++++++++++++++++++
 src/PETSc.jl        |  1 +
 src/vec.jl          | 15 +++++++
 5 files changed, 141 insertions(+), 6 deletions(-)
 create mode 100644 ext/PETScCUDAExt.jl

diff --git a/Project.toml b/Project.toml
index 96dcceb5..2a0e1037 100644
--- a/Project.toml
+++ b/Project.toml
@@ -18,6 +18,7 @@ Statistics = "10745b16-79ce-11e8-11f9-7d13ad32a3b2"
 UnicodePlots = "b8865327-cd53-5732-bb35-84acbb429228"
 
 [compat]
+CUDA = "5"
 ForwardDiff = "0.10, 1"
 Libdl = "^1.10"
 LinearAlgebra = "^1.10"
@@ -32,6 +33,12 @@ Statistics = "^1.10"
 UnicodePlots = "3.0"
 julia = "^1.10"
 
+[weakdeps]
+CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba"
+
+[extensions]
+PETScCUDAExt = "CUDA"
+
 [extras]
 CairoMakie = "13f3f980-e62b-5c42-98c6-ff1f3baf88f0"
 KernelAbstractions = "63c18a36-062a-441e-b654-da1e3ab1ce7c"
diff --git a/examples/ex19.jl b/examples/ex19.jl
index e3863d46..7c3af774 100644
--- a/examples/ex19.jl
+++ b/examples/ex19.jl
@@ -23,16 +23,26 @@
     julia --project ex19.jl -snes_monitor -ksp_monitor -da_grid_x 32 -da_grid_y 32
     mpiexec -n 4 julia --project ex19.jl -snes_monitor -pc_type mg -da_grid_x 64 -da_grid_y 64
 
-  To switch to GPU (once withlocalarray! supports device arrays):
-    Replace  `KernelAbstractions.CPU()`  with  `CUDABackend()`  and adapt the
-    array-wrapping inside the residual callback accordingly.
+  GPU usage: set  useCUDA = true  then run as above.
+    Requires PETSc built with --with-cuda, and CUDA.jl in the environment.
+
+  With  useCUDA = true  the DMDA is switched to "cuda" vectors and
+  "aijcusparse" matrices, and the residual kernel runs on CUDABackend().
 =#
 
+# ── GPU switch ────────────────────────────────────────────────────────────────
+const useCUDA = false
+
 using MPI
 using PETSc
 using KernelAbstractions
 
-backend = KernelAbstractions.CPU()
+if useCUDA
+    using CUDA
+    const backend = CUDABackend()
+else
+    const backend = KernelAbstractions.CPU()
+end
 
 
 # ── Physical parameters ──────────────────────────────────────────────────────
@@ -153,7 +163,7 @@ end
 opts = isinteractive() ? NamedTuple() : PETSc.parse_options(ARGS)
 
 petsclib = PETSc.getlib(; PetscScalar = Float64)
-PETSc.initialize(petsclib)
+PETSc.initialize(petsclib; log_view = true)
 
 T       = Float64
 PetscInt = petsclib.PetscInt
@@ -170,6 +180,11 @@ da = PETSc.DMDA(
     opts...,
 )
 
+if useCUDA
+    LibPETSc.DMSetVecType(petsclib, da, "cuda")
+    LibPETSc.DMSetMatType(petsclib, da, "aijcusparse")
+end
+
 snes = PETSc.SNES(petsclib, comm; opts...)
 PETSc.setDM!(snes, da)
 
@@ -221,7 +236,7 @@ PETSc.setfunction!(snes, r) do g_fx, snes, g_x
     l_x = PETSc.DMLocalVec(da)
     PETSc.dm_global_to_local!(g_x, l_x, da, PETSc.INSERT_VALUES)
 
-    PETSc.withlocalarray!(
+    PETSc.withlocalarray_device!(
         (g_fx, l_x);
         read  = (false, true),
         write = (true,  false),
@@ -273,6 +288,7 @@ LibPETSc.SNESSetJacobian(petsclib, snes, J, J,
     cglobal((:SNESComputeJacobianDefault, petsclib.petsc_library)), C_NULL)
 
 # ── Solve ─────────────────────────────────────────────────────────────────────
+@show Threads.nthreads()
 PETSc.solve!(x, snes)
 
 if MPI.Comm_rank(comm) == 0
diff --git a/ext/PETScCUDAExt.jl b/ext/PETScCUDAExt.jl
new file mode 100644
index 00000000..d2c67313
--- /dev/null
+++ b/ext/PETScCUDAExt.jl
@@ -0,0 +1,96 @@
+module PETScCUDAExt
+
+using PETSc
+using PETSc: LibPETSc, AbstractPetscVec
+using PETSc.LibPETSc: PetscMemType, PETSC_MEMTYPE_HOST
+using CUDA
+
+# ── Internal helper: check out the storage of a single Vec ───────────────────
+#
+# Calls the VecGetArray{,Read,Write}AndMemType variant selected by read/write
+# and then looks at the memory type PETSc reports for the returned storage:
+#   PETSC_MEMTYPE_HOST → the wrapper's array is handed back unchanged
+#   any other value    → its address is re-wrapped as a non-owning CuArray
+#
+# Either result carries a finalizer issuing the matching VecRestoreArray* call,
+# so finalizing the returned array is how the caller gives the storage back.
+function _unsafe_localarray_device(
+    vec::AbstractPetscVec{PetscLib};
+    read::Bool = true,
+    write::Bool = true,
+) where {PetscLib}
+
+    rw = write && read
+    shell, memtype = if rw
+        LibPETSc.VecGetArrayAndMemType(PetscLib, vec)
+    elseif write
+        LibPETSc.VecGetArrayWriteAndMemType(PetscLib, vec)
+    else
+        LibPETSc.VecGetArrayReadAndMemType(PetscLib, vec)
+    end
+
+    # One restore routine, shared by the host and the device finalizer.
+    function restore!(arr)
+        if rw
+            LibPETSc.VecRestoreArrayAndMemType(PetscLib, vec, arr)
+        elseif write
+            LibPETSc.VecRestoreArrayWriteAndMemType(PetscLib, vec, arr)
+        else
+            LibPETSc.VecRestoreArrayReadAndMemType(PetscLib, vec, arr)
+        end
+        return nothing
+    end
+
+    if memtype === PETSC_MEMTYPE_HOST
+        # Host-resident data: the wrapper's array is what the caller needs.
+        finalizer(shell) do a
+            restore!(a)
+        end
+        return shell
+    end
+
+    # Device-resident data: `shell` is a Julia Vector whose data pointer holds
+    # the device address.  Re-type that address as a CuPtr and wrap it with
+    # own = false so that CUDA.jl does not take ownership of the memory.
+    elty    = eltype(shell)
+    devptr  = reinterpret(CuPtr{elty}, UInt(pointer(shell)))
+    dev_arr = CUDA.unsafe_wrap(CuArray, devptr, length(shell); own = false)
+
+    # The closure references `shell`, keeping it reachable while dev_arr is,
+    # and hands it back to PETSc when dev_arr is finalized.
+    finalizer(dev_arr) do _
+        restore!(shell)
+    end
+    return dev_arr
+end
+
+# ── CUDA-aware methods of withlocalarray_device! ─────────────────────────────
+#
+# Hands the callback a CuArray when a Vec lives on the GPU and a plain Array
+# when it lives on the host; no host↔device copy is made in either case.
+# Both signatures require at least one Vec, which makes them strictly narrower
+# than PETSc's generic fallbacks: they are added as new methods instead of
+# overwriting them (overwriting is an error when precompiling on Julia ≥ 1.10).
+function PETSc.withlocalarray_device!(
+    f!,
+    vecs::Tuple{AbstractPetscVec, Vararg{AbstractPetscVec, M}};
+    read::Union{Bool, Tuple{Bool, Vararg{Bool, M}}}  = true,
+    write::Union{Bool, Tuple{Bool, Vararg{Bool, M}}} = true,
+) where {M}
+    read  isa Tuple || (read  = ntuple(_ -> read,  M + 1))
+    write isa Tuple || (write = ntuple(_ -> write, M + 1))
+
+    arrays = map(vecs, read, write) do v, r, w
+        _unsafe_localarray_device(v; read = r, write = w)
+    end
+    try
+        return f!(arrays...)
+    finally
+        foreach(Base.finalize, arrays)  # restore every Vec even if f! throws
+    end
+end
+
+PETSc.withlocalarray_device!(f!, v::AbstractPetscVec, vs::AbstractPetscVec...; kw...) =
+    PETSc.withlocalarray_device!(f!, (v, vs...); kw...)
+
+end # module
diff --git a/src/PETSc.jl b/src/PETSc.jl
index 97b613bb..79514c82 100644
--- a/src/PETSc.jl
+++ b/src/PETSc.jl
@@ -24,6 +24,7 @@ export LibPETSc
 export audit_petsc_file
 export set_petsclib
 export set_library!, unset_library!, library_info
+export withlocalarray_device!
 
 using Libdl
 
diff --git a/src/vec.jl b/src/vec.jl
index 89a9eab4..2265dbce 100644
--- a/src/vec.jl
+++ b/src/vec.jl
@@ -310,6 +310,21 @@ function withlocalarray!(
 end
 withlocalarray!(f!, vecs...; kwargs...) = withlocalarray!(f!, vecs; kwargs...)
 
+"""
+    withlocalarray_device!(f!, vecs...; read, write)
+
+Like [`withlocalarray!`](@ref) but returns a device array (e.g. `CuArray`) when
+the underlying PETSc vector lives on GPU (i.e. `PetscMemType` is not HOST).
+
+When CUDA.jl is loaded the `PETScCUDAExt` extension overrides this function to
+wrap the device pointer returned by `VecGetArrayAndMemType` into a `CuArray`
+without any host↔device copy.  When CUDA.jl is not loaded, or when the vector
+lives on the host, this falls back to [`withlocalarray!`](@ref).
+"""
+withlocalarray_device!(f!, vecs::NTuple{N, AbstractPetscVec}; kwargs...) where {N} =
+    withlocalarray!(f!, vecs; kwargs...)
+withlocalarray_device!(f!, vecs...; kwargs...) = withlocalarray_device!(f!, vecs; kwargs...)
+
 
 """
     ghostupdatebegin!(

From 3b1ff845dc34961b25db3e29318b963e541b2464 Mon Sep 17 00:00:00 2001
From: Boris Kaus <kaus@uni-mainz.de>
Date: Sat, 25 Apr 2026 13:48:13 +0200
Subject: [PATCH 04/39] use AbstractString instead of string

---
 src/dm.jl | 20 ++++++++++++++++++++
 1 file changed, 20 insertions(+)

diff --git a/src/dm.jl b/src/dm.jl
index 49561639..1014d759 100644
--- a/src/dm.jl
+++ b/src/dm.jl
@@ -445,3 +445,23 @@ function MatAIJ(da::AbstractPetscDM{PetscLib}) where {PetscLib}
     J = LibPETSc.DMCreateMatrix(getlib(PetscLib), da)
     return J
 end
+
+function LibPETSc.DMSetVecType(
+    petsclib,
+    dm::AbstractPetscDM,
+    ctype::AbstractString,
+)
+    GC.@preserve ctype begin
+        LibPETSc.DMSetVecType(petsclib, dm, Base.unsafe_convert(Ptr{Cchar}, ctype))
+    end
+end
+
+function LibPETSc.DMSetMatType(
+    petsclib,
+    dm::AbstractPetscDM,
+    ctype::AbstractString,
+)
+    GC.@preserve ctype begin
+        LibPETSc.DMSetMatType(petsclib, dm, Base.unsafe_convert(Ptr{Cchar}, ctype))
+    end
+end

From 5e06d1fbcbe71c89badaa61c2020ba2f0f691277 Mon Sep 17 00:00:00 2001
From: Boris Kaus <kaus@uni-mainz.de>
Date: Sat, 25 Apr 2026 13:50:23 +0200
Subject: [PATCH 05/39] use consistent AbstractString

---
 src/dm.jl                    |  19 ------
 src/string_wrappers.jl       | 112 ++++++++++-------------------------
 src/string_wrappers_extra.jl |  38 ++----------
 src/ts.jl                    |   8 +--
 4 files changed, 39 insertions(+), 138 deletions(-)

diff --git a/src/dm.jl b/src/dm.jl
index 1014d759..a7a9f692 100644
--- a/src/dm.jl
+++ b/src/dm.jl
@@ -446,22 +446,3 @@ function MatAIJ(da::AbstractPetscDM{PetscLib}) where {PetscLib}
     return J
 end
 
-function LibPETSc.DMSetVecType(
-    petsclib,
-    dm::AbstractPetscDM,
-    ctype::AbstractString,
-)
-    GC.@preserve ctype begin
-        LibPETSc.DMSetVecType(petsclib, dm, Base.unsafe_convert(Ptr{Cchar}, ctype))
-    end
-end
-
-function LibPETSc.DMSetMatType(
-    petsclib,
-    dm::AbstractPetscDM,
-    ctype::AbstractString,
-)
-    GC.@preserve ctype begin
-        LibPETSc.DMSetMatType(petsclib, dm, Base.unsafe_convert(Ptr{Cchar}, ctype))
-    end
-end
diff --git a/src/string_wrappers.jl b/src/string_wrappers.jl
index 9d6c3106..c4a103c1 100644
--- a/src/string_wrappers.jl
+++ b/src/string_wrappers.jl
@@ -1,95 +1,43 @@
-# Convenience wrappers for PETSc SetType functions that accept Julia strings
-# instead of C string pointers
-#
-# These wrappers are defined in the parent PETSc module and delegate to
-# LibPETSc functions with automatic string-to-pointer conversion.
+# Convenience overloads for PETSc Set*Type functions.
+# Each accepts AbstractString and converts to the Ptr{Cchar} the C API expects.
+# GC.@preserve keeps the String alive across the ccall inside the LibPETSc wrapper.
 
-"""
-    MatSetType(petsclib, mat, type::String)
-
-Convenience wrapper for setting matrix type using a Julia string.
-
-# Example
-```julia
-mat = LibPETSc.MatCreate(petsclib, LibPETSc.PETSC_COMM_SELF)
-LibPETSc.MatSetType(petsclib, mat, "seqaij")
-```
-"""
-function LibPETSc.MatSetType(petsclib::LibPETSc.PetscLibType, mat, type::String)
-    c_str = Vector{UInt8}(type * "\0")
-    ptr = Base.unsafe_convert(Ptr{Int8}, pointer(c_str))
-    LibPETSc.MatSetType(petsclib, mat, ptr)
-    return nothing
+function LibPETSc.MatSetType(petsclib, mat, type::AbstractString)
+    s = String(type)
+    GC.@preserve s LibPETSc.MatSetType(petsclib, mat, Base.unsafe_convert(Ptr{Cchar}, s))
 end
 
-"""
-    VecSetType(petsclib, vec, type::String)
-
-Convenience wrapper for setting vector type using a Julia string.
-
-# Example
-```julia
-vec = LibPETSc.VecCreate(petsclib, LibPETSc.PETSC_COMM_SELF)
-LibPETSc.VecSetType(petsclib, vec, "seq")
-```
-"""
-function LibPETSc.VecSetType(petsclib::LibPETSc.PetscLibType, vec, type::String)
-    c_str = Vector{UInt8}(type * "\0")
-    ptr = Base.unsafe_convert(Ptr{Int8}, pointer(c_str))
-    LibPETSc.VecSetType(petsclib, vec, ptr)
-    return nothing
+function LibPETSc.VecSetType(petsclib, vec, type::AbstractString)
+    s = String(type)
+    GC.@preserve s LibPETSc.VecSetType(petsclib, vec, Base.unsafe_convert(Ptr{Cchar}, s))
 end
 
-"""
-    KSPSetType(petsclib, ksp, type::String)
-
-Convenience wrapper for setting KSP solver type using a Julia string.
-
-# Example
-```julia
-ksp = LibPETSc.KSPCreate(petsclib, LibPETSc.PETSC_COMM_SELF)
-LibPETSc.KSPSetType(petsclib, ksp, "gmres")
-```
-"""
-function LibPETSc.KSPSetType(petsclib::LibPETSc.PetscLibType, ksp, type::String)
-    c_str = Vector{UInt8}(type * "\0")
-    ptr = Base.unsafe_convert(Ptr{Int8}, pointer(c_str))
-    LibPETSc.KSPSetType(petsclib, ksp, ptr)
-    return nothing
+function LibPETSc.KSPSetType(petsclib, ksp, type::AbstractString)
+    s = String(type)
+    GC.@preserve s LibPETSc.KSPSetType(petsclib, ksp, Base.unsafe_convert(Ptr{Cchar}, s))
 end
 
-"""
-    SNESSetType(petsclib, snes, type::String)
-
-Convenience wrapper for setting SNES solver type using a Julia string.
+function LibPETSc.PCSetType(petsclib, pc, type::AbstractString)
+    s = String(type)
+    GC.@preserve s LibPETSc.PCSetType(petsclib, pc, Base.unsafe_convert(Ptr{Cchar}, s))
+end
 
-# Example
-```julia
-snes = LibPETSc.SNESCreate(petsclib, LibPETSc.PETSC_COMM_SELF)
-LibPETSc.SNESSetType(petsclib, snes, "newtonls")
-```
-"""
-function LibPETSc.SNESSetType(petsclib::LibPETSc.PetscLibType, snes, type::String)
-    c_str = Vector{UInt8}(type * "\0")
-    ptr = Base.unsafe_convert(Ptr{Int8}, pointer(c_str))
-    LibPETSc.SNESSetType(petsclib, snes, ptr)
-    return nothing
+function LibPETSc.SNESSetType(petsclib, snes, type::AbstractString)
+    s = String(type)
+    GC.@preserve s LibPETSc.SNESSetType(petsclib, snes, Base.unsafe_convert(Ptr{Cchar}, s))
 end
 
-"""
-    DMSetType(petsclib, dm, type::String)
+function LibPETSc.DMSetType(petsclib, dm, type::AbstractString)
+    s = String(type)
+    GC.@preserve s LibPETSc.DMSetType(petsclib, dm, Base.unsafe_convert(Ptr{Cchar}, s))
+end
 
-Convenience wrapper for setting DM type using a Julia string.
+function LibPETSc.DMSetVecType(petsclib, dm, type::AbstractString)
+    s = String(type)
+    GC.@preserve s LibPETSc.DMSetVecType(petsclib, dm, Base.unsafe_convert(Ptr{Cchar}, s))
+end
 
-# Example
-```julia
-dm = LibPETSc.DMCreate(petsclib, LibPETSc.PETSC_COMM_SELF)
-LibPETSc.DMSetType(petsclib, dm, "da")
-```
-"""
-function LibPETSc.DMSetType(petsclib::LibPETSc.PetscLibType, dm, type::String)
-    c_str = Vector{UInt8}(type * "\0")
-    ptr = Base.unsafe_convert(Ptr{Int8}, pointer(c_str))
-    LibPETSc.DMSetType(petsclib, dm, ptr)
-    return nothing
+function LibPETSc.DMSetMatType(petsclib, dm, type::AbstractString)
+    s = String(type)
+    GC.@preserve s LibPETSc.DMSetMatType(petsclib, dm, Base.unsafe_convert(Ptr{Cchar}, s))
 end
diff --git a/src/string_wrappers_extra.jl b/src/string_wrappers_extra.jl
index 2a28baef..1c943c02 100644
--- a/src/string_wrappers_extra.jl
+++ b/src/string_wrappers_extra.jl
@@ -1,35 +1,9 @@
-"""
-    TSSetType(petsclib, ts, type::String)
-
-Convenience wrapper for setting TS (time-stepping) type using a Julia string.
-
-# Example
-```julia
-ts = LibPETSc.TSCreate(petsclib, LibPETSc.PETSC_COMM_SELF)
-LibPETSc.TSSetType(petsclib, ts, "bdf")
-```
-"""
-function LibPETSc.TSSetType(petsclib::LibPETSc.PetscLibType, ts, type::String)
-    c_str = Vector{UInt8}(type * "\0")
-    ptr = Base.unsafe_convert(Ptr{Int8}, pointer(c_str))
-    LibPETSc.TSSetType(petsclib, ts, ptr)
-    return nothing
+function LibPETSc.TSSetType(petsclib, ts, type::AbstractString)
+    s = String(type)
+    GC.@preserve s LibPETSc.TSSetType(petsclib, ts, Base.unsafe_convert(Ptr{Cchar}, s))
 end
 
-"""
-    TaoSetType(petsclib, tao, type::String)
-
-Convenience wrapper for setting Tao solver type using a Julia string.
-
-# Example
-```julia
-tao = LibPETSc.TaoCreate(petsclib)
-LibPETSc.TaoSetType(petsclib, tao, "lmvm")
-```
-"""
-function LibPETSc.TaoSetType(petsclib::LibPETSc.PetscLibType, tao, type::String)
-    c_str = Vector{UInt8}(type * "\0")
-    ptr = Base.unsafe_convert(Ptr{Int8}, pointer(c_str))
-    LibPETSc.TaoSetType(petsclib, tao, ptr)
-    return nothing
+function LibPETSc.TaoSetType(petsclib, tao, type::AbstractString)
+    s = String(type)
+    GC.@preserve s LibPETSc.TaoSetType(petsclib, tao, Base.unsafe_convert(Ptr{Cchar}, s))
 end
diff --git a/src/ts.jl b/src/ts.jl
index e972f021..c084e438 100644
--- a/src/ts.jl
+++ b/src/ts.jl
@@ -221,12 +221,10 @@ string such as `"none"` or `"basic"`.
 function LibPETSc.TSAdaptSetType(
     petsclib::LibPETSc.PetscLibType,
     adapt::LibPETSc.TSAdapt,
-    type::String,
+    type::AbstractString,
 )
-    c_str = Vector{UInt8}(type * "\0")
-    ptr = Base.unsafe_convert(Ptr{Cchar}, pointer(c_str))
-    LibPETSc.TSAdaptSetType(petsclib, adapt, ptr)
-    return nothing
+    s = String(type)
+    GC.@preserve s LibPETSc.TSAdaptSetType(petsclib, adapt, Base.unsafe_convert(Ptr{Cchar}, s))
 end
 
 """

From 8443f9f19864f6e8d6c05e85e0ffc7f0d140c03b Mon Sep 17 00:00:00 2001
From: Boris Kaus <kaus@uni-mainz.de>
Date: Sat, 25 Apr 2026 13:50:44 +0200
Subject: [PATCH 06/39] version

---
 Project.toml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/Project.toml b/Project.toml
index 2a0e1037..1369e621 100644
--- a/Project.toml
+++ b/Project.toml
@@ -1,6 +1,6 @@
 name = "PETSc"
 uuid = "ace2c81b-2b5f-4b1e-a30d-d662738edfe0"
-version = "0.4.9"
+version = "0.4.10-DEV"
 authors = ["Boris Kaus <kaus@uni-mainz.de>", "Viral B. Shah <virals@gmail.com>", "Valentin Churavy <v.churavy@gmail.com>", "Erik Schnetter <eschnetter@perimeterinstitute.ca>", "Jeremy E. Kozdon <jeremy@kozdon.net>", "Simon Byrne <simonbyrne@gmail.com>"]
 
 [deps]

From 2340d01d73299b184b3fada4dc7ac6700c53eb37 Mon Sep 17 00:00:00 2001
From: Boris Kaus <kaus@uni-mainz.de>
Date: Sat, 25 Apr 2026 14:04:08 +0200
Subject: [PATCH 07/39] undo

---
 src/autowrapped/petsc_library.jl | 1 +
 src/string_wrappers.jl           | 5 -----
 2 files changed, 1 insertion(+), 5 deletions(-)

diff --git a/src/autowrapped/petsc_library.jl b/src/autowrapped/petsc_library.jl
index 7a9151ee..070a3f12 100644
--- a/src/autowrapped/petsc_library.jl
+++ b/src/autowrapped/petsc_library.jl
@@ -388,6 +388,7 @@ include("PetscDraw_wrappers.jl")
 include("PetscRegressor_wrappers.jl")
 include("PF_wrappers.jl")
 include("IS_wrappers.jl")
+# include("PC_wrappers.jl")  # excluded: PC type in ccall signatures needs fixing
 include("TS_wrappers.jl")
 include("AO_wrappers.jl")
 include("Tao_wrappers.jl")
diff --git a/src/string_wrappers.jl b/src/string_wrappers.jl
index c4a103c1..5916ecf8 100644
--- a/src/string_wrappers.jl
+++ b/src/string_wrappers.jl
@@ -17,11 +17,6 @@ function LibPETSc.KSPSetType(petsclib, ksp, type::AbstractString)
     GC.@preserve s LibPETSc.KSPSetType(petsclib, ksp, Base.unsafe_convert(Ptr{Cchar}, s))
 end
 
-function LibPETSc.PCSetType(petsclib, pc, type::AbstractString)
-    s = String(type)
-    GC.@preserve s LibPETSc.PCSetType(petsclib, pc, Base.unsafe_convert(Ptr{Cchar}, s))
-end
-
 function LibPETSc.SNESSetType(petsclib, snes, type::AbstractString)
     s = String(type)
     GC.@preserve s LibPETSc.SNESSetType(petsclib, snes, Base.unsafe_convert(Ptr{Cchar}, s))

From a308fc2aed81cd0800c890a7b1b1d0e842c34c92 Mon Sep 17 00:00:00 2001
From: Boris Kaus <kaus@uni-mainz.de>
Date: Sat, 25 Apr 2026 14:16:18 +0200
Subject: [PATCH 08/39] bugfixes

---
 src/autowrapped/Vec_wrappers.jl | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/src/autowrapped/Vec_wrappers.jl b/src/autowrapped/Vec_wrappers.jl
index 12edff86..6df7e12c 100644
--- a/src/autowrapped/Vec_wrappers.jl
+++ b/src/autowrapped/Vec_wrappers.jl
@@ -1340,10 +1340,10 @@ function VecGetArrayAndMemType(petsclib::PetscLibType, x::PetscVec) end
               )
 
 	a = unsafe_wrap(Array, a_[], VecGetLocalSize(petsclib, x); own = false)
-	mtype = unsafe_string(mtype_[])
+	mtype = mtype_[]
 
 	return a,mtype
-end 
+end
 
 """
 	VecRestoreArrayAndMemType(petsclib::PetscLibType,x::PetscVec, a::Vector{PetscScalar}) 
@@ -1414,10 +1414,10 @@ function VecGetArrayReadAndMemType(petsclib::PetscLibType, x::PetscVec) end
               )
 
 	a = unsafe_wrap(Array, a_[], VecGetLocalSize(petsclib, x); own = false)
-	mtype = unsafe_string(mtype_[])
+	mtype = mtype_[]
 
 	return a,mtype
-end 
+end
 
 """
 	VecRestoreArrayReadAndMemType(petsclib::PetscLibType,x::PetscVec, a::Vector{PetscScalar}) 

From ee7131d6615602bb23315d7fe939ec5e87b2d6bd Mon Sep 17 00:00:00 2001
From: Boris Kaus <kaus@uni-mainz.de>
Date: Sat, 25 Apr 2026 17:20:06 +0200
Subject: [PATCH 09/39] next attempt

---
 examples/ex19.jl                | 116 +++++++++++++++++++++-----------
 src/autowrapped/Vec_wrappers.jl |   4 +-
 src/snes.jl                     |  26 ++++---
 3 files changed, 92 insertions(+), 54 deletions(-)

diff --git a/examples/ex19.jl b/examples/ex19.jl
index 7c3af774..598d03c9 100644
--- a/examples/ex19.jl
+++ b/examples/ex19.jl
@@ -31,7 +31,7 @@
 =#
 
 # ── GPU switch ────────────────────────────────────────────────────────────────
-const useCUDA = false
+const useCUDA = true
 
 using MPI
 using PETSc
@@ -39,6 +39,7 @@ using KernelAbstractions
 
 if useCUDA
     using CUDA
+    import CUDA: CuArray, CuPtr, unsafe_wrap
     const backend = CUDABackend()
 else
     const backend = KernelAbstractions.CPU()
@@ -165,9 +166,12 @@ opts = isinteractive() ? NamedTuple() : PETSc.parse_options(ARGS)
 petsclib = PETSc.getlib(; PetscScalar = Float64)
 PETSc.initialize(petsclib; log_view = true)
 
-T       = Float64
+petsclib = PETSc.getlib(; PetscScalar = Float64)
+PETSc.initialize(petsclib; log_view = true)
+
+T        = Float64
 PetscInt = petsclib.PetscInt
-comm    = MPI.COMM_WORLD
+comm     = MPI.COMM_WORLD
 
 # DMDA: 4×4 default (matches ex19.c); override via -da_grid_x / -da_grid_y
 da = PETSc.DMDA(
@@ -223,12 +227,45 @@ PETSc.withlocalarray!(x; read = false) do x_arr
     end
 end
 
-# ── Residual callback ─────────────────────────────────────────────────────────
+# ── Helpers for zero-copy access to PETSc device/host arrays ─────────────────
+#
+# On CPU: plain Julia Arrays via unsafe_localarray (VecGetArray internally).
+# On GPU: CuArrays wrapping the device pointer via VecCUDAGetArrayRead/Write —
+#         no host↔device copies, data stays on the GPU throughout.
 #
-# To run on GPU, replace CPU() with CUDABackend() (or ROCBackend()) and adapt
-# the array wrapping once withlocalarray! supports device arrays (see vec.jl).
+# Returns (fx, lx, fx_ptr, lx_ptr) where fx_ptr/lx_ptr are the Ref handles
+# needed by the corresponding Restore calls.
 #
-r       = similar(x)
+function get_petsc_arrays(petsclib, g_fx, l_x)
+    if useCUDA
+        fx_arr, fx_mtype = LibPETSc.VecGetArrayAndMemType(petsclib, g_fx)
+        lx_arr, lx_mtype = LibPETSc.VecGetArrayReadAndMemType(petsclib, l_x)
+
+        # fx_arr and lx_arr are Julia Vectors whose underlying pointer is a
+        # CUDA device pointer when mtype == PETSC_MEMTYPE_CUDA.
+        # unsafe_wrap creates a CuArray view — zero copy, no host transfer.
+        fx = unsafe_wrap(CuArray, CuPtr{T}(UInt(pointer(fx_arr))), length(fx_arr))
+        lx = unsafe_wrap(CuArray, CuPtr{T}(UInt(pointer(lx_arr))), length(lx_arr))
+        return fx, lx, fx_arr, lx_arr
+    else
+        fx = PETSc.unsafe_localarray(g_fx; read = true,  write = true)
+        lx = PETSc.unsafe_localarray(l_x;  read = true,  write = false)
+        return fx, lx, nothing, nothing
+    end
+end
+
+function restore_petsc_arrays(petsclib, g_fx, l_x, fx_arr, lx_arr, fx, lx)
+    if useCUDA
+        LibPETSc.VecRestoreArrayAndMemType(petsclib, g_fx, fx_arr)
+        LibPETSc.VecRestoreArrayReadAndMemType(petsclib, l_x, lx_arr)
+    else
+        Base.finalize(fx)
+        Base.finalize(lx)
+    end
+end
+
+# ── Residual callback ─────────────────────────────────────────────────────────
+r = similar(x)
 
 PETSc.setfunction!(snes, r) do g_fx, snes, g_x
     da = PETSc.getDM(snes)
@@ -236,40 +273,37 @@ PETSc.setfunction!(snes, r) do g_fx, snes, g_x
     l_x = PETSc.DMLocalVec(da)
     PETSc.dm_global_to_local!(g_x, l_x, da, PETSc.INSERT_VALUES)
 
-    PETSc.withlocalarray_device!(
-        (g_fx, l_x);
-        read  = (false, true),
-        write = (true,  false),
-    ) do fx, lx
-        corners       = PETSc.getcorners(da)
-        ghost_corners = PETSc.getghostcorners(da)
-
-        xs  = corners.lower[1];        ys  = corners.lower[2]
-        xe  = corners.upper[1];        ye  = corners.upper[2]
-        xsg = ghost_corners.lower[1];  ysg = ghost_corners.lower[2]
-        xeg = ghost_corners.upper[1];  yeg = ghost_corners.upper[2]
-
-        nx_own = xe  - xs  + 1;  ny_own = ye  - ys  + 1
-        nx_g   = xeg - xsg + 1;  ny_g   = yeg - ysg + 1
-
-        # Plain [dof, x, y] arrays — no OffsetArray, safe for KA on GPU
-        x_par = reshape(lx, 4, nx_g,   ny_g)
-        f_par = reshape(fx, 4, nx_own, ny_own)
-
-        # Ghost offset: ghost-array index for owned start = 1 + ox (0 at domain wall)
-        ox = xs - xsg
-        oy = ys - ysg
-
-        cavity_residual_kernel!(backend, 64)(
-            f_par, x_par,
-            dhx, dhy, hx, hy, hydhx, hxdhy,
-            user.grashof, user.prandtl, user.lidvelocity,
-            mx, my, xs, ys, ox, oy;
-            ndrange = (nx_own, ny_own),
-        )
-        KernelAbstractions.synchronize(backend)
-    end
+    fx, lx, fx_ptr, lx_ptr = get_petsc_arrays(petsclib, g_fx, l_x)
+
+    corners       = PETSc.getcorners(da)
+    ghost_corners = PETSc.getghostcorners(da)
+
+    xs  = corners.lower[1];        ys  = corners.lower[2]
+    xe  = corners.upper[1];        ye  = corners.upper[2]
+    xsg = ghost_corners.lower[1];  ysg = ghost_corners.lower[2]
+    xeg = ghost_corners.upper[1];  yeg = ghost_corners.upper[2]
+
+    nx_own = xe  - xs  + 1;  ny_own = ye  - ys  + 1
+    nx_g   = xeg - xsg + 1;  ny_g   = yeg - ysg + 1
+
+    # Plain [dof, x, y] arrays — no OffsetArray, safe for KA on GPU
+    x_par = reshape(lx, 4, nx_g,   ny_g)
+    f_par = reshape(fx, 4, nx_own, ny_own)
+
+    # Ghost offset: ghost-array index for owned start = 1 + ox (0 at domain wall)
+    ox = xs - xsg
+    oy = ys - ysg
+
+    cavity_residual_kernel!(backend, 64)(
+        f_par, x_par,
+        dhx, dhy, hx, hy, hydhx, hxdhy,
+        user.grashof, user.prandtl, user.lidvelocity,
+        mx, my, xs, ys, ox, oy;
+        ndrange = (nx_own, ny_own),
+    )
+    KernelAbstractions.synchronize(backend)
 
+    restore_petsc_arrays(petsclib, g_fx, l_x, fx_ptr, lx_ptr, fx, lx)
     PETSc.destroy(l_x)
     return PetscInt(0)
 end
@@ -329,4 +363,4 @@ PETSc.finalize(petsclib)
 # All MPI communication is already complete at this point.
 MPI.Barrier(comm)
 MPI.Finalize()
-ccall(:quick_exit, Cvoid, (Cint,), 0)
+ccall(:quick_exit, Cvoid, (Cint,), 0)
\ No newline at end of file
diff --git a/src/autowrapped/Vec_wrappers.jl b/src/autowrapped/Vec_wrappers.jl
index 6df7e12c..624b92ee 100644
--- a/src/autowrapped/Vec_wrappers.jl
+++ b/src/autowrapped/Vec_wrappers.jl
@@ -1487,10 +1487,10 @@ function VecGetArrayWriteAndMemType(petsclib::PetscLibType, x::PetscVec) end
               )
 
 	a = unsafe_wrap(Array, a_[], VecGetLocalSize(petsclib, x); own = false)
-	mtype = unsafe_string(mtype_[])
+	mtype = mtype_[]
 
 	return a,mtype
-end 
+end
 
 """
 	VecRestoreArrayWriteAndMemType(petsclib::PetscLibType,x::PetscVec, a::Vector{PetscScalar}) 
diff --git a/src/snes.jl b/src/snes.jl
index f63f9209..2c329dc5 100644
--- a/src/snes.jl
+++ b/src/snes.jl
@@ -87,19 +87,22 @@ setfunction!(snes::AbstractPetscSNES, rhs!, vec) = setfunction!(rhs!, snes, vec)
 # Wrapper for calls to setfunction!
 mutable struct Fn_SNESSetFunction{PetscLib} end
 function (w::Fn_SNESSetFunction{PetscLib})(
-    ::CSNES,
+    actual_snes_ptr::CSNES,
     r_x::CVec,
     r_fx::CVec,
     snes_ptr::Ptr{Cvoid},
 ) where {PetscLib}
     snes = unsafe_pointer_to_objref(snes_ptr)
+    # Wrap the actual C SNES for the current MG level so that getDM() inside
+    # the callback returns the correct DM (matches the pattern in Fn_KSPComputeRHS).
+    actual_snes = PetscSNES{PetscLib}(actual_snes_ptr, getlib(PetscLib).age)
     x  = PetscVec{PetscLib}(r_x)
     fx = PetscVec{PetscLib}(r_fx)
 
-    if Base.applicable(snes.f!, fx, snes, x, snes.user_ctx)
-        return snes.f!(fx, snes, x, snes.user_ctx)
+    if Base.applicable(snes.f!, fx, actual_snes, x, snes.user_ctx)
+        return snes.f!(fx, actual_snes, x, snes.user_ctx)
     else
-        return snes.f!(fx, snes, x)
+        return snes.f!(fx, actual_snes, x)
     end
 end
 
@@ -161,13 +164,14 @@ setjacobian!(snes::AbstractPetscSNES, updateJ!, J, PJ = J) =
 # Wrapper for calls to setjacobian!
 mutable struct Fn_SNESSetJacobian{PetscLib} end
 function (w::Fn_SNESSetJacobian{PetscLib})(
-    ::CSNES,
+    actual_snes_ptr::CSNES,
     r_x::CVec,
     r_A::CMat,
     r_P::CMat,
     snes_ptr::Ptr{Cvoid},
 ) where {PetscLib}
     snes = unsafe_pointer_to_objref(snes_ptr)
+    actual_snes = PetscSNES{PetscLib}(actual_snes_ptr, getlib(PetscLib).age)
     x = PetscVec{PetscLib}(r_x)
     A = PetscMat{PetscLib}(r_A)
     P = PetscMat{PetscLib}(r_P)
@@ -175,16 +179,16 @@ function (w::Fn_SNESSetJacobian{PetscLib})(
     same_mat = (P.ptr == A.ptr)
 
     if same_mat
-        if Base.applicable(snes.updateJ!, A, snes, x, snes.user_ctx)
-            return snes.updateJ!(A, snes, x, snes.user_ctx)
+        if Base.applicable(snes.updateJ!, A, actual_snes, x, snes.user_ctx)
+            return snes.updateJ!(A, actual_snes, x, snes.user_ctx)
         else
-            return snes.updateJ!(A, snes, x)
+            return snes.updateJ!(A, actual_snes, x)
         end
     else
-        if Base.applicable(snes.updateJ!, A, P, snes, x, snes.user_ctx)
-            return snes.updateJ!(A, P, snes, x, snes.user_ctx)
+        if Base.applicable(snes.updateJ!, A, P, actual_snes, x, snes.user_ctx)
+            return snes.updateJ!(A, P, actual_snes, x, snes.user_ctx)
         else
-            return snes.updateJ!(A, P, snes, x)
+            return snes.updateJ!(A, P, actual_snes, x)
         end
     end
 end

From 5d5205f4a7d3f68b77949cb0bdc2f90a9de2da31 Mon Sep 17 00:00:00 2001
From: Boris Kaus <kaus@uni-mainz.de>
Date: Sun, 26 Apr 2026 17:58:41 +0200
Subject: [PATCH 10/39] GPU/CPU logic (mg not yet working)

Co-authored-by: Copilot <copilot@github.com>
---
 examples/ex19.jl | 86 ++++++++++++++++++++++++++++++------------------
 1 file changed, 54 insertions(+), 32 deletions(-)

diff --git a/examples/ex19.jl b/examples/ex19.jl
index 598d03c9..c7fb0971 100644
--- a/examples/ex19.jl
+++ b/examples/ex19.jl
@@ -31,7 +31,7 @@
 =#
 
 # ── GPU switch ────────────────────────────────────────────────────────────────
-const useCUDA = true
+const useCUDA = false
 
 using MPI
 using PETSc
@@ -166,9 +166,6 @@ opts = isinteractive() ? NamedTuple() : PETSc.parse_options(ARGS)
 petsclib = PETSc.getlib(; PetscScalar = Float64)
 PETSc.initialize(petsclib; log_view = true)
 
-petsclib = PETSc.getlib(; PetscScalar = Float64)
-PETSc.initialize(petsclib; log_view = true)
-
 T        = Float64
 PetscInt = petsclib.PetscInt
 comm     = MPI.COMM_WORLD
@@ -184,10 +181,10 @@ da = PETSc.DMDA(
     opts...,
 )
 
-if useCUDA
-    LibPETSc.DMSetVecType(petsclib, da, "cuda")
-    LibPETSc.DMSetMatType(petsclib, da, "aijcusparse")
-end
+# NOTE (Stage 1): PETSc vecs stay on CPU so FD coloring arithmetic uses BLAS
+# (not cuBLAS), avoiding the VecPlaceArray+VecAXPY CUDA bug in PETSc's
+# MatFDColoringApply_AIJ.  The residual kernel still runs on the GPU via
+# explicit H2D/kernel/D2H copies in get_petsc_arrays / restore_petsc_arrays.
 
 snes = PETSc.SNES(petsclib, comm; opts...)
 PETSc.setDM!(snes, da)
@@ -227,35 +224,58 @@ PETSc.withlocalarray!(x; read = false) do x_arr
     end
 end
 
-# ── Helpers for zero-copy access to PETSc device/host arrays ─────────────────
+# ── Helpers for PETSc array access with optional GPU residual kernel ──────────
 #
-# On CPU: plain Julia Arrays via unsafe_localarray (VecGetArray internally).
-# On GPU: CuArrays wrapping the device pointer via VecCUDAGetArrayRead/Write —
-#         no host↔device copies, data stays on the GPU throughout.
+# Three cases:
 #
-# Returns (fx, lx, fx_ptr, lx_ptr) where fx_ptr/lx_ptr are the Ref handles
-# needed by the corresponding Restore calls.
+#   useCUDA=false  →  plain CPU arrays via unsafe_localarray.
+#
+#   useCUDA=true, vecs on GPU (PETSC_MEMTYPE_CUDA)
+#              →  zero-copy CuArray wraps; no host↔device transfer.
+#
+#   useCUDA=true, vecs on CPU (PETSC_MEMTYPE_HOST, Stage-1 coloring path)
+#              →  allocate GPU scratch buffers, copy lx H2D before kernel,
+#                 copy fx D2H after kernel, then let PETSc see the result in
+#                 the HOST fx_arr.  FD coloring arithmetic stays on CPU (BLAS)
+#                 which avoids the VecPlaceArray+cuBLAS bug.
+#
+# Returns (fx, lx, fx_arr, lx_arr, fx_bounce)
+#   fx / lx       — arrays passed to the kernel (CuArray or plain Array)
+#   fx_arr/lx_arr — raw PETSc handles for Restore calls (nothing on CPU path)
+#   fx_bounce     — CuArray whose contents must be copied back to fx_arr after
+#                   the kernel; nothing when no copy is needed.
 #
 function get_petsc_arrays(petsclib, g_fx, l_x)
     if useCUDA
         fx_arr, fx_mtype = LibPETSc.VecGetArrayAndMemType(petsclib, g_fx)
         lx_arr, lx_mtype = LibPETSc.VecGetArrayReadAndMemType(petsclib, l_x)
 
-        # fx_arr and lx_arr are Julia Vectors whose underlying pointer is a
-        # CUDA device pointer when mtype == PETSC_MEMTYPE_CUDA.
-        # unsafe_wrap creates a CuArray view — zero copy, no host transfer.
-        fx = unsafe_wrap(CuArray, CuPtr{T}(UInt(pointer(fx_arr))), length(fx_arr))
-        lx = unsafe_wrap(CuArray, CuPtr{T}(UInt(pointer(lx_arr))), length(lx_arr))
-        return fx, lx, fx_arr, lx_arr
+        if lx_mtype == LibPETSc.PETSC_MEMTYPE_DEVICE
+            # Native GPU vecs: zero-copy wrap, no bounce needed.
+            fx = unsafe_wrap(CuArray, CuPtr{T}(UInt(pointer(fx_arr))), length(fx_arr))
+            lx = unsafe_wrap(CuArray, CuPtr{T}(UInt(pointer(lx_arr))), length(lx_arr))
+            return fx, lx, fx_arr, lx_arr, nothing
+        else
+            # CPU vecs (FD coloring path): bounce residual through GPU.
+            lx_gpu = CuArray{T}(undef, length(lx_arr))
+            fx_gpu = CuArray{T}(undef, length(fx_arr))
+            copyto!(lx_gpu, lx_arr)          # H2D: send ghost input to GPU
+            return fx_gpu, lx_gpu, fx_arr, lx_arr, fx_gpu
+        end
     else
         fx = PETSc.unsafe_localarray(g_fx; read = true,  write = true)
         lx = PETSc.unsafe_localarray(l_x;  read = true,  write = false)
-        return fx, lx, nothing, nothing
+        return fx, lx, nothing, nothing, nothing
     end
 end
 
-function restore_petsc_arrays(petsclib, g_fx, l_x, fx_arr, lx_arr, fx, lx)
+function restore_petsc_arrays(petsclib, g_fx, l_x, fx, lx, fx_arr, lx_arr, fx_bounce)
     if useCUDA
+        if fx_bounce !== nothing
+            # D2H: copy GPU residual result back to the HOST PETSc array.
+            CUDA.synchronize()
+            copyto!(fx_arr, fx_bounce)
+        end
         LibPETSc.VecRestoreArrayAndMemType(petsclib, g_fx, fx_arr)
         LibPETSc.VecRestoreArrayReadAndMemType(petsclib, l_x, lx_arr)
     else
@@ -273,7 +293,7 @@ PETSc.setfunction!(snes, r) do g_fx, snes, g_x
     l_x = PETSc.DMLocalVec(da)
     PETSc.dm_global_to_local!(g_x, l_x, da, PETSc.INSERT_VALUES)
 
-    fx, lx, fx_ptr, lx_ptr = get_petsc_arrays(petsclib, g_fx, l_x)
+    fx, lx, fx_arr, lx_arr, fx_bounce = get_petsc_arrays(petsclib, g_fx, l_x)
 
     corners       = PETSc.getcorners(da)
     ghost_corners = PETSc.getghostcorners(da)
@@ -303,23 +323,25 @@ PETSc.setfunction!(snes, r) do g_fx, snes, g_x
     )
     KernelAbstractions.synchronize(backend)
 
-    restore_petsc_arrays(petsclib, g_fx, l_x, fx_ptr, lx_ptr, fx, lx)
+    restore_petsc_arrays(petsclib, g_fx, l_x, fx, lx, fx_arr, lx_arr, fx_bounce)
     PETSc.destroy(l_x)
     return PetscInt(0)
 end
 
-# ── Jacobian (finite differences via PETSc's built-in column-by-column FD) ───
+# ── Jacobian: FD coloring via PETSc's SNESComputeJacobianDefaultColor ────────
+#
+# With ctx = C_NULL, PETSc auto-builds the ISColoring from the DM sparsity
+# pattern and registers SNESComputeFunction (our Julia callback) as the
+# function to perturb.  Because PETSc vecs are CPU (Stage 1 — no DMSetVecType),
+# the VecAXPY inside MatFDColoringApply_AIJ uses BLAS rather than cuBLAS, so
+# the placed HOST dy[] buffer is updated correctly.
 #
-# Pass SNESComputeJacobianDefault directly as the C function pointer, exactly
-# like the C code does:
-#   SNESSetJacobian(snes, J, J, SNESComputeJacobianDefault, NULL)
-# This avoids a nested Julia→C→Julia callback chain and is more robust
-# in parallel.  For production, replace with coloring-based FD by swapping
-# SNESComputeJacobianDefault → SNESComputeJacobianDefaultColor.
+# The residual callback bounces through GPU via H2D/kernel/D2H in
+# get_petsc_arrays / restore_petsc_arrays, so GPU acceleration is preserved.
 #
 J = LibPETSc.DMCreateMatrix(petsclib, da)
 LibPETSc.SNESSetJacobian(petsclib, snes, J, J,
-    cglobal((:SNESComputeJacobianDefault, petsclib.petsc_library)), C_NULL)
+    cglobal((:SNESComputeJacobianDefaultColor, petsclib.petsc_library)), C_NULL)
 
 # ── Solve ─────────────────────────────────────────────────────────────────────
 @show Threads.nthreads()

From 072ab4164d554cae36165063d07c3d8bee54a6b1 Mon Sep 17 00:00:00 2001
From: Boris Kaus <boriskaus@gmail.com>
Date: Sun, 26 Apr 2026 16:09:02 +0000
Subject: [PATCH 11/39] make mg work

Co-authored-by: Copilot <copilot@github.com>
---
 examples/ex19.jl | 19 +++++++++++++++----
 1 file changed, 15 insertions(+), 4 deletions(-)

diff --git a/examples/ex19.jl b/examples/ex19.jl
index c7fb0971..c7f44cc9 100644
--- a/examples/ex19.jl
+++ b/examples/ex19.jl
@@ -31,7 +31,7 @@
 =#
 
 # ── GPU switch ────────────────────────────────────────────────────────────────
-const useCUDA = false
+const useCUDA = true
 
 using MPI
 using PETSc
@@ -314,11 +314,22 @@ PETSc.setfunction!(snes, r) do g_fx, snes, g_x
     ox = xs - xsg
     oy = ys - ysg
 
+    # Recompute grid metrics from the DM so this callback is correct on every
+    # MG level (coarsen/refine changes mx/my; capturing outer-scope values
+    # would give wrong stencil weights and lid velocity on coarse grids).
+    info_  = PETSc.getinfo(da)
+    mx_    = info_.global_size[1]
+    my_    = info_.global_size[2]
+    dhx_   = T(mx_ - 1);    dhy_   = T(my_ - 1)
+    hx_    = one(T) / dhx_; hy_    = one(T) / dhy_
+    hydhx_ = hy_ * dhx_;    hxdhy_ = hx_ * dhy_
+    lid_   = T(1) / dhx_    # lidvelocity = 1/(mx-1)
+
     cavity_residual_kernel!(backend, 64)(
         f_par, x_par,
-        dhx, dhy, hx, hy, hydhx, hxdhy,
-        user.grashof, user.prandtl, user.lidvelocity,
-        mx, my, xs, ys, ox, oy;
+        dhx_, dhy_, hx_, hy_, hydhx_, hxdhy_,
+        user.grashof, user.prandtl, lid_,
+        mx_, my_, xs, ys, ox, oy;
         ndrange = (nx_own, ny_own),
     )
     KernelAbstractions.synchronize(backend)

From fb76e758e70d08670bd8e9f94c43bb5eb9a9c894 Mon Sep 17 00:00:00 2001
From: Boris Kaus <boriskaus@gmail.com>
Date: Mon, 27 Apr 2026 05:20:15 +0000
Subject: [PATCH 12/39] use coloring

Co-authored-by: Copilot <copilot@github.com>
---
 examples/ex19.jl                     | 275 ++++++++++++++++++++++++---
 ext/PETScCUDAExt.jl                  |  17 +-
 src/autowrapped/ISaddons_wrappers.jl |   4 +-
 src/vec.jl                           |  20 +-
 4 files changed, 270 insertions(+), 46 deletions(-)

diff --git a/examples/ex19.jl b/examples/ex19.jl
index c7f44cc9..afe9a3a7 100644
--- a/examples/ex19.jl
+++ b/examples/ex19.jl
@@ -18,20 +18,21 @@
     Top/bottom:  ∂T/∂n = 0  (insulated, Neumann)
     ω:           derived from the no-slip condition at each wall
 
-  Usage:
+  Usage (from the examples/ directory):
     julia --project ex19.jl
-    julia --project ex19.jl -snes_monitor -ksp_monitor -da_grid_x 32 -da_grid_y 32
+    julia --project ex19.jl -snes_monitor -da_grid_x 129 -da_grid_y 129
+    julia --project ex19.jl -snes_monitor -da_grid_x 129 -da_grid_y 129 -log_view
     mpiexec -n 4 julia --project ex19.jl -snes_monitor -pc_type mg -da_grid_x 64 -da_grid_y 64
 
+  Requires: LocalPreferences.toml in examples/ with PetscInt = "Int32" matching the
+  PETSc build (check with: grep sizeof_PetscInt petscconf.h).
+
   GPU usage: set  useCUDA = true  then run as above.
     Requires PETSc built with --with-cuda, and CUDA.jl in the environment.
-
-  Set  useCUDA = true  (below) to run the residual kernel on GPU via CUDA.
-  Requires PETSc built with CUDA support and CUDA.jl installed.
 =#
 
 # ── GPU switch ────────────────────────────────────────────────────────────────
-const useCUDA = true
+const useCUDA = false
 
 using MPI
 using PETSc
@@ -160,11 +161,30 @@ end
     end
 end
 
+# ── FD-coloring GPU helper kernels ────────────────────────────────────────────
+#
+# scatter_perturb_kernel!: add h to selected entries of a vector.
+#   cols[k] is the 1-based Julia index to perturb.
+#
+@kernel function scatter_perturb_kernel!(x, cols, h)
+    k = @index(Global)
+    @inbounds x[cols[k]] += h
+end
+
+# fd_accumulate_kernel!: write (f1 − f0)/h into val at COO indices.
+#   coo_idxs[k] and row_idxs[k] are 1-based Julia indices.
+#
+@kernel function fd_accumulate_kernel!(val, f0, f1, coo_idxs, row_idxs, inv_h)
+    k = @index(Global)
+    @inbounds val[coo_idxs[k]] = (f1[row_idxs[k]] - f0[row_idxs[k]]) * inv_h
+end
+
 # ── Setup ─────────────────────────────────────────────────────────────────────
-opts = isinteractive() ? NamedTuple() : PETSc.parse_options(ARGS)
+opts     = isinteractive() ? NamedTuple() : PETSc.parse_options(filter(a -> a != "-log_view", ARGS))
+log_view = "-log_view" in ARGS
 
-petsclib = PETSc.getlib(; PetscScalar = Float64)
-PETSc.initialize(petsclib; log_view = true)
+petsclib = PETSc.getlib(; PetscScalar = Float64, PetscInt = Int32)
+PETSc.initialize(petsclib; log_view)
 
 T        = Float64
 PetscInt = petsclib.PetscInt
@@ -181,18 +201,26 @@ da = PETSc.DMDA(
     opts...,
 )
 
-# NOTE (Stage 1): PETSc vecs stay on CPU so FD coloring arithmetic uses BLAS
-# (not cuBLAS), avoiding the VecPlaceArray+VecAXPY CUDA bug in PETSc's
-# MatFDColoringApply_AIJ.  The residual kernel still runs on the GPU via
-# explicit H2D/kernel/D2H copies in get_petsc_arrays / restore_petsc_arrays.
+# Stage 2: GPU vecs and GPU matrix enable a fully GPU-resident FD coloring
+# path via COO preallocation.  No host↔device bouncing in residual or Jacobian.
+# NOTE: MG with GPU vecs needs per-level COO rebuild (not yet implemented);
+#       for MG tests use -pc_type lu or default ILU.
+if useCUDA
+    GC.@preserve begin
+        vt = "cuda"
+        mt = "aijcusparse"
+        LibPETSc.DMSetVecType(petsclib, da, pointer(vt))
+        LibPETSc.DMSetMatType(petsclib, da, pointer(mt))
+    end
+end
 
 snes = PETSc.SNES(petsclib, comm; opts...)
 PETSc.setDM!(snes, da)
 
 # Actual grid size after setfromoptions (may differ from the 4×4 default)
 info = PETSc.getinfo(da)
-mx   = info.global_size[1]
-my   = info.global_size[2]
+mx   = Int(info.global_size[1])
+my   = Int(info.global_size[2])
 
 user = AppCtx{T}(
     lidvelocity = T(1) / (mx - 1),
@@ -252,8 +280,8 @@ function get_petsc_arrays(petsclib, g_fx, l_x)
 
         if lx_mtype == LibPETSc.PETSC_MEMTYPE_DEVICE
             # Native GPU vecs: zero-copy wrap, no bounce needed.
-            fx = unsafe_wrap(CuArray, CuPtr{T}(UInt(pointer(fx_arr))), length(fx_arr))
-            lx = unsafe_wrap(CuArray, CuPtr{T}(UInt(pointer(lx_arr))), length(lx_arr))
+            fx = unsafe_wrap(CuArray, CuPtr{T}(UInt64(pointer(fx_arr))), length(fx_arr))
+            lx = unsafe_wrap(CuArray, CuPtr{T}(UInt64(pointer(lx_arr))), length(lx_arr))
             return fx, lx, fx_arr, lx_arr, nothing
         else
             # CPU vecs (FD coloring path): bounce residual through GPU.
@@ -318,8 +346,8 @@ PETSc.setfunction!(snes, r) do g_fx, snes, g_x
     # MG level (coarsen/refine changes mx/my; capturing outer-scope values
     # would give wrong stencil weights and lid velocity on coarse grids).
     info_  = PETSc.getinfo(da)
-    mx_    = info_.global_size[1]
-    my_    = info_.global_size[2]
+    mx_    = Int(info_.global_size[1])
+    my_    = Int(info_.global_size[2])
     dhx_   = T(mx_ - 1);    dhy_   = T(my_ - 1)
     hx_    = one(T) / dhx_; hy_    = one(T) / dhy_
     hydhx_ = hy_ * dhx_;    hxdhy_ = hx_ * dhy_
@@ -339,20 +367,204 @@ PETSc.setfunction!(snes, r) do g_fx, snes, g_x
     return PetscInt(0)
 end
 
-# ── Jacobian: FD coloring via PETSc's SNESComputeJacobianDefaultColor ────────
+# ── Jacobian: manual FD coloring with GPU-efficient COO matrix assembly ───────
 #
-# With ctx = C_NULL, PETSc auto-builds the ISColoring from the DM sparsity
-# pattern and registers SNESComputeFunction (our Julia callback) as the
-# function to perturb.  Because PETSc vecs are CPU (Stage 1 — no DMSetVecType),
-# the VecAXPY inside MatFDColoringApply_AIJ uses BLAS rather than cuBLAS, so
-# the placed HOST dy[] buffer is updated correctly.
-#
-# The residual callback bounces through GPU via H2D/kernel/D2H in
-# get_petsc_arrays / restore_petsc_arrays, so GPU acceleration is preserved.
+# 1) Obtain the DM's ISColoring (PETSc determines which columns can be
+#    perturbed simultaneously without touching the same nonzero row twice).
+# 2) Build the sparse (row, col) COO triplets analytically from the DMDA STAR
+#    stencil; store per-COO the 0-based color of its column.
+# 3) Pre-allocate J via MatSetPreallocationCOO (one-time GPU setup).
+# 4) Each Newton step: loop over colors, scatter +h to owned cols of that
+#    color (GPU kernel), evaluate F(x_pert), accumulate (F1−F0)/h into
+#    val[] (GPU kernel), assemble via MatSetValuesCOO(J, val_dev).
 #
+# NOTE: Assumes serial (single MPI rank).  For parallel runs ghost-column
+#       colors must be communicated; see ISColoringGetColors documentation.
+# NOTE: For MG, coarser levels fall back to SNESComputeJacobianDefaultColor
+#       (correct, but CPU-only FD coloring for those levels).
+
+# ── 1. ISColoring ─────────────────────────────────────────────────────────────
+iscoloring = LibPETSc.DMCreateColoring(petsclib, da, LibPETSc.IS_COLORING_GLOBAL)
+
+# ── 2. Per-column color via raw ISColoringGetColors call ──────────────────────
+#   C API:  ISColoringGetColors(iscoloring, PetscInt *n, PetscInt *nc,
+#                               const ISColoringValue **colors)
+#   Assumes ISColoringValue = unsigned short (UInt16), i.e. PETSC_IS_COLORING_VALUE_TYPE=short in petscconf.h; other PETSc builds need a matching Julia type.
+n_cols_ref     = Ref{PetscInt}(0)
+nc_ref         = Ref{PetscInt}(0)
+colors_ptr_ref = Ref{Ptr{UInt16}}(C_NULL)
+LibPETSc.@chk ccall(
+    (:ISColoringGetColors, petsclib.petsc_library), PetscInt,
+    (LibPETSc.ISColoring, Ptr{PetscInt}, Ptr{PetscInt}, Ptr{Ptr{UInt16}}),
+    iscoloring, n_cols_ref, nc_ref, colors_ptr_ref)
+n_cols   = Int(n_cols_ref[])
+n_colors = Int(nc_ref[])
+# Copy colors to an owned Julia array before we destroy the ISColoring.
+col_colors_host = copy(unsafe_wrap(Vector{UInt16}, colors_ptr_ref[], n_cols; own = false))
+
+# ── 3. Ownership range (0-based PETSc indices) ────────────────────────────────
+row_start, row_end = LibPETSc.VecGetOwnershipRange(petsclib, x)
+col_start = row_start
+@assert n_cols == Int(row_end - row_start) "serial assumption: n_cols ($n_cols) != n_owned_dofs ($(Int(row_end-row_start)))"
+n_local_dofs = Int(row_end - row_start)
+
+# ── 4. Build COO from DMDA STAR stencil ───────────────────────────────────────
+#  For each owned node (ii, jj) and each stencil neighbor, emit dof×dof
+#  (row, col) pairs.  Every entry also records the 0-based color of its column
+#  and the 0-based local row index (row_global − row_start).
+dof_per_node = 4
+coo_corners = PETSc.getcorners(da)
+xs_da = coo_corners.lower[1];  ys_da = coo_corners.lower[2]
+xe_da = coo_corners.upper[1];  ye_da = coo_corners.upper[2]
+
+CPetscInt = petsclib.PetscInt           # matches the actual C sizeof(PetscInt)
+row_coo_host      = CPetscInt[]
+col_coo_host      = CPetscInt[]
+local_row_per_coo = CPetscInt[]   # 0-based local row  (row_global − row_start)
+color_per_coo     = CPetscInt[]   # 0-based color of each COO entry's column
+
+for jj in ys_da:ye_da, ii in xs_da:xe_da
+    ig = ii - 1;  jg = jj - 1           # 0-based global node coords
+    row_base = (jg * mx + ig) * dof_per_node
+
+    neighbors = Tuple{Int,Int}[(ii, jj)]
+    ii > 1  && push!(neighbors, (ii-1, jj))
+    ii < mx && push!(neighbors, (ii+1, jj))
+    jj > 1  && push!(neighbors, (ii, jj-1))
+    jj < my && push!(neighbors, (ii, jj+1))
+
+    for (ni, nj) in neighbors
+        nig = ni - 1;  njg = nj - 1
+        col_base = (njg * mx + nig) * dof_per_node
+        for d_row in 0:dof_per_node-1, d_col in 0:dof_per_node-1
+            r_g = row_base + d_row
+            c_g = col_base + d_col
+            push!(row_coo_host, CPetscInt(r_g))
+            push!(col_coo_host, CPetscInt(c_g))
+            # Serial: local col index = c_g  (col_start == 0)
+            push!(color_per_coo,     CPetscInt(col_colors_host[c_g - Int(col_start) + 1]))
+            push!(local_row_per_coo, CPetscInt(r_g - Int(row_start)))
+        end
+    end
+end
+nnz_coo = length(row_coo_host)
+
+# ── 5. Create J with COO preallocation ────────────────────────────────────────
 J = LibPETSc.DMCreateMatrix(petsclib, da)
-LibPETSc.SNESSetJacobian(petsclib, snes, J, J,
-    cglobal((:SNESComputeJacobianDefaultColor, petsclib.petsc_library)), C_NULL)
+# Direct ccall: use CPetscInt (= petsclib.PetscInt) so this works for both
+# 32-bit and 64-bit PETSc builds.  PetscCount = ptrdiff_t = Int64 always.
+LibPETSc.@chk ccall(
+    (:MatSetPreallocationCOO, petsclib.petsc_library), Cint,
+    (LibPETSc.CMat, Int64, Ptr{CPetscInt}, Ptr{CPetscInt}),
+    J, Int64(nnz_coo), row_coo_host, col_coo_host)
+
+# ── 6. Per-color index arrays for the FD loop ─────────────────────────────────
+# perturb_cols_1b[c]: 1-based local x-indices of owned columns with color c-1.
+# coo_idxs_1b[c]:    1-based COO entry indices whose column color == c-1.
+# local_rows_1b[c]:  1-based local residual-row indices for those COO entries.
+perturb_cols_1b = [Int32[] for _ in 1:n_colors]
+for k_local in 1:n_cols
+    c = Int(col_colors_host[k_local]) + 1   # 1-based color
+    push!(perturb_cols_1b[c], Int32(k_local))
+end
+
+coo_idxs_1b   = [Int32[] for _ in 1:n_colors]
+local_rows_1b = [Int32[] for _ in 1:n_colors]
+for k in 1:nnz_coo
+    c = Int(color_per_coo[k]) + 1            # 1-based color
+    push!(coo_idxs_1b[c],   Int32(k))
+    push!(local_rows_1b[c], Int32(local_row_per_coo[k] + 1))  # 0→1-based
+end
+
+if useCUDA
+    perturb_cols_dev = [CuArray(v) for v in perturb_cols_1b]
+    coo_idxs_dev     = [CuArray(v) for v in coo_idxs_1b]
+    local_rows_dev   = [CuArray(v) for v in local_rows_1b]
+    val_dev          = CUDA.zeros(T, nnz_coo)
+else
+    perturb_cols_dev = perturb_cols_1b
+    coo_idxs_dev     = coo_idxs_1b
+    local_rows_dev   = local_rows_1b
+    val_dev          = zeros(T, nnz_coo)
+end
+
+LibPETSc.ISColoringDestroy(petsclib, iscoloring)
+
+# ── 7. Scratch vectors for the FD loop ────────────────────────────────────────
+x_pert_vec = LibPETSc.VecDuplicate(petsclib, x)
+f0_vec     = LibPETSc.VecDuplicate(petsclib, x)
+f1_vec     = LibPETSc.VecDuplicate(petsclib, x)
+h_eps      = T(sqrt(eps(T)))
+inv_h      = T(1) / h_eps
+
+# ── 8. Custom Jacobian callback ───────────────────────────────────────────────
+PETSc.setjacobian!(snes, J) do Jmat, actual_snes, g_x
+    # For MG: if this is a coarser level (grid size differs from fine grid),
+    # fall back to PETSc's built-in FD coloring (correct for that level's DM).
+    da_level   = PETSc.getDM(actual_snes)
+    info_level = PETSc.getinfo(da_level)
+    if info_level.global_size[1] != mx || info_level.global_size[2] != my
+        LibPETSc.@chk ccall(
+            (:SNESComputeJacobianDefaultColor, petsclib.petsc_library), PetscInt,
+            (LibPETSc.CSNES, LibPETSc.CVec, LibPETSc.CMat, LibPETSc.CMat, Ptr{Cvoid}),
+            actual_snes.ptr, g_x.ptr, Jmat.ptr, Jmat.ptr, C_NULL)
+        return PetscInt(0)
+    end
+
+    # ── Evaluate F(x) → f0 ────────────────────────────────────────────────────
+    LibPETSc.SNESComputeFunction(petsclib, actual_snes, g_x, f0_vec)
+    f0_arr, f0_mtype = LibPETSc.VecGetArrayReadAndMemType(petsclib, f0_vec)
+    f0_dev = (useCUDA && f0_mtype == LibPETSc.PETSC_MEMTYPE_DEVICE) ?
+        unsafe_wrap(CuArray, CuPtr{T}(UInt64(pointer(f0_arr))), n_local_dofs) :
+        f0_arr
+
+    # ── FD loop over colors ────────────────────────────────────────────────────
+    for c in 1:n_colors
+        isempty(perturb_cols_dev[c]) && continue
+
+        # Copy x → x_pert, then scatter +h to owned cols of color c.
+        LibPETSc.VecCopy(petsclib, g_x, x_pert_vec)
+        xp_arr, xp_mtype = LibPETSc.VecGetArrayAndMemType(petsclib, x_pert_vec)
+        xp_dev = (useCUDA && xp_mtype == LibPETSc.PETSC_MEMTYPE_DEVICE) ?
+            unsafe_wrap(CuArray, CuPtr{T}(UInt64(pointer(xp_arr))), n_local_dofs) :
+            xp_arr
+        scatter_perturb_kernel!(backend, 64)(
+            xp_dev, perturb_cols_dev[c], h_eps;
+            ndrange = length(perturb_cols_dev[c]))
+        KernelAbstractions.synchronize(backend)
+        LibPETSc.VecRestoreArrayAndMemType(petsclib, x_pert_vec, xp_arr)
+
+        # Evaluate F(x_pert) → f1.
+        LibPETSc.SNESComputeFunction(petsclib, actual_snes, x_pert_vec, f1_vec)
+
+        # Accumulate (f1 − f0)/h into val[] at the COO indices of color c.
+        f1_arr, f1_mtype = LibPETSc.VecGetArrayReadAndMemType(petsclib, f1_vec)
+        f1_dev = (useCUDA && f1_mtype == LibPETSc.PETSC_MEMTYPE_DEVICE) ?
+            unsafe_wrap(CuArray, CuPtr{T}(UInt64(pointer(f1_arr))), n_local_dofs) :
+            f1_arr
+        fd_accumulate_kernel!(backend, 64)(
+            val_dev, f0_dev, f1_dev,
+            coo_idxs_dev[c], local_rows_dev[c], inv_h;
+            ndrange = length(coo_idxs_dev[c]))
+        KernelAbstractions.synchronize(backend)
+        LibPETSc.VecRestoreArrayReadAndMemType(petsclib, f1_vec, f1_arr)
+    end
+
+    LibPETSc.VecRestoreArrayReadAndMemType(petsclib, f0_vec, f0_arr)
+
+    # ── Assemble J via COO ─────────────────────────────────────────────────────
+    # For GPU matrix (aijcusparse): pass device pointer so PETSc's CUDA kernel
+    # scatters val[] into CSR storage entirely on device (no D2H transfer).
+    if useCUDA
+        LibPETSc.@chk ccall(
+            (:MatSetValuesCOO, petsclib.petsc_library), PetscInt,
+            (LibPETSc.CMat, Ptr{T}, LibPETSc.InsertMode),
+            Jmat.ptr, Ptr{T}(UInt64(pointer(val_dev))), LibPETSc.INSERT_VALUES)
+    else
+        LibPETSc.MatSetValuesCOO(petsclib, Jmat, val_dev, LibPETSc.INSERT_VALUES)
+    end
+    return PetscInt(0)
+end
 
 # ── Solve ─────────────────────────────────────────────────────────────────────
 @show Threads.nthreads()
@@ -384,6 +596,9 @@ MPI.Barrier(comm)
 # those reference counts are decremented before we explicitly free the objects.
 PETSc.destroy(snes)
 PETSc.destroy(J)
+LibPETSc.VecDestroy(petsclib, x_pert_vec)
+LibPETSc.VecDestroy(petsclib, f0_vec)
+LibPETSc.VecDestroy(petsclib, f1_vec)
 PETSc.destroy(x)
 PETSc.destroy(r)
 PETSc.destroy(da)
diff --git a/ext/PETScCUDAExt.jl b/ext/PETScCUDAExt.jl
index d2c67313..273f3d5f 100644
--- a/ext/PETScCUDAExt.jl
+++ b/ext/PETScCUDAExt.jl
@@ -64,15 +64,15 @@ function _unsafe_localarray_device(
     end
 end
 
-# ── Public override of withlocalarray_device! ─────────────────────────────────
+# ── Public hook: register CUDA implementation for withlocalarray_device! ──────
 #
-# Drop-in replacement for withlocalarray! that hands the kernel a CuArray when
-# the Vec lives on GPU, and falls back to a plain Array when it lives on CPU.
-# No host↔device copies are performed in either case.
+# We cannot override PETSc.withlocalarray_device! with the same signature
+# during precompilation (Julia restriction).  Instead we register a closure
+# in __init__ that the base method dispatches to when the hook is non-nothing.
 #
-function PETSc.withlocalarray_device!(
+function _cuda_withlocalarray_device_impl!(
     f!,
-    vecs::NTuple{N, AbstractPetscVec};
+    vecs::NTuple{N};
     read::Union{Bool, NTuple{N, Bool}}  = true,
     write::Union{Bool, NTuple{N, Bool}} = true,
 ) where {N}
@@ -90,7 +90,8 @@ function PETSc.withlocalarray_device!(
     return val
 end
 
-PETSc.withlocalarray_device!(f!, vecs...; kwargs...) =
-    PETSc.withlocalarray_device!(f!, vecs; kwargs...)
+function __init__()
+    PETSc._withlocalarray_device_hook[] = _cuda_withlocalarray_device_impl!
+end
 
 end # module
diff --git a/src/autowrapped/ISaddons_wrappers.jl b/src/autowrapped/ISaddons_wrappers.jl
index 7b22049e..0c9a1cce 100644
--- a/src/autowrapped/ISaddons_wrappers.jl
+++ b/src/autowrapped/ISaddons_wrappers.jl
@@ -1309,12 +1309,12 @@ $(_doc_external("Vec/ISColoringDestroy"))
 function ISColoringDestroy(petsclib::PetscLibType, iscoloring::ISColoring) end
 
 @for_petsc function ISColoringDestroy(petsclib::$UnionPetscLib, iscoloring::ISColoring )
-
+    iscoloring_ref = Ref(iscoloring)
     @chk ccall(
                (:ISColoringDestroy, $petsc_library),
                PetscErrorCode,
                (Ptr{ISColoring},),
-               iscoloring,
+               iscoloring_ref,
               )
 
 
diff --git a/src/vec.jl b/src/vec.jl
index 2265dbce..6653031a 100644
--- a/src/vec.jl
+++ b/src/vec.jl
@@ -316,13 +316,21 @@ withlocalarray!(f!, vecs...; kwargs...) = withlocalarray!(f!, vecs; kwargs...)
 Like [`withlocalarray!`](@ref) but returns a device array (e.g. `CuArray`) when
 the underlying PETSc vector lives on GPU (i.e. `PetscMemType` is not HOST).
 
-When CUDA.jl is loaded the `PETScCUDAExt` extension overrides this function to
-wrap the device pointer returned by `VecGetArrayAndMemType` into a `CuArray`
-without any host↔device copy.  When CUDA.jl is not loaded, or when the vector
-lives on the host, this falls back to [`withlocalarray!`](@ref).
+When CUDA.jl is loaded the `PETScCUDAExt` extension sets the
+`_withlocalarray_device_hook` global to wrap device pointers as `CuArray`s
+without any host↔device copy.  When CUDA.jl is not loaded this falls back
+to [`withlocalarray!`](@ref).
 """
-withlocalarray_device!(f!, vecs::NTuple{N, AbstractPetscVec}; kwargs...) where {N} =
-    withlocalarray!(f!, vecs; kwargs...)
+const _withlocalarray_device_hook = Ref{Any}(nothing)
+
+function withlocalarray_device!(f!, vecs::NTuple{N, AbstractPetscVec}; kwargs...) where {N}
+    hook = _withlocalarray_device_hook[]
+    if hook !== nothing
+        return hook(f!, vecs; kwargs...)
+    else
+        return withlocalarray!(f!, vecs; kwargs...)
+    end
+end
 withlocalarray_device!(f!, vecs...; kwargs...) = withlocalarray_device!(f!, vecs; kwargs...)
 
 

From e0267e91137ce029f6f484e701fc202cbdd75440 Mon Sep 17 00:00:00 2001
From: Boris Kaus <boriskaus@gmail.com>
Date: Mon, 27 Apr 2026 09:04:09 +0000
Subject: [PATCH 13/39] MPI support

Co-authored-by: Copilot <copilot@github.com>
---
 examples/ex19.jl | 137 +++++++++++++++++++++++++++++++----------------
 1 file changed, 90 insertions(+), 47 deletions(-)

diff --git a/examples/ex19.jl b/examples/ex19.jl
index afe9a3a7..f1c4a8a3 100644
--- a/examples/ex19.jl
+++ b/examples/ex19.jl
@@ -32,7 +32,7 @@
 =#
 
 # ── GPU switch ────────────────────────────────────────────────────────────────
-const useCUDA = false
+const useCUDA = true
 
 using MPI
 using PETSc
@@ -369,22 +369,24 @@ end
 
 # ── Jacobian: manual FD coloring with GPU-efficient COO matrix assembly ───────
 #
-# 1) Obtain the DM's ISColoring (PETSc determines which columns can be
-#    perturbed simultaneously without touching the same nonzero row twice).
+# 1) Obtain the DM's IS_COLORING_LOCAL ISColoring.  Colors cover owned + ghost
+#    DOFs in DMDA local ordering; ghost colors are consistent with owning ranks.
 # 2) Build the sparse (row, col) COO triplets analytically from the DMDA STAR
-#    stencil; store per-COO the 0-based color of its column.
+#    stencil; record per-COO the 0-based color of its column, looked up via
+#    ghost-local coordinates (works for owned AND off-rank ghost columns).
 # 3) Pre-allocate J via MatSetPreallocationCOO (one-time GPU setup).
 # 4) Each Newton step: loop over colors, scatter +h to owned cols of that
 #    color (GPU kernel), evaluate F(x_pert), accumulate (F1−F0)/h into
 #    val[] (GPU kernel), assemble via MatSetValuesCOO(J, val_dev).
 #
-# NOTE: Assumes serial (single MPI rank).  For parallel runs ghost-column
-#       colors must be communicated; see ISColoringGetColors documentation.
 # NOTE: For MG, coarser levels fall back to SNESComputeJacobianDefaultColor
 #       (correct, but CPU-only FD coloring for those levels).
 
 # ── 1. ISColoring ─────────────────────────────────────────────────────────────
-iscoloring = LibPETSc.DMCreateColoring(petsclib, da, LibPETSc.IS_COLORING_GLOBAL)
+# IS_COLORING_LOCAL returns colors for all local DOFs (owned + ghost) in DMDA
+# local Vec ordering.  Ghost DOF colors are consistent with the owning rank's
+# assignment, so no extra MPI communication is needed here.
+iscoloring = LibPETSc.DMCreateColoring(petsclib, da, LibPETSc.IS_COLORING_LOCAL)
 
 # ── 2. Per-column color via raw ISColoringGetColors call ──────────────────────
 #   C API:  ISColoringGetColors(iscoloring, PetscInt *n, PetscInt *nc,
@@ -397,35 +399,49 @@ LibPETSc.@chk ccall(
     (:ISColoringGetColors, petsclib.petsc_library), PetscInt,
     (LibPETSc.ISColoring, Ptr{PetscInt}, Ptr{PetscInt}, Ptr{Ptr{UInt16}}),
     iscoloring, n_cols_ref, nc_ref, colors_ptr_ref)
-n_cols   = Int(n_cols_ref[])
-n_colors = Int(nc_ref[])
+n_cols_local = Int(n_cols_ref[])   # IS_COLORING_LOCAL: owned + ghost DOFs
+n_colors     = Int(nc_ref[])
 # Copy colors to an owned Julia array before we destroy the ISColoring.
-col_colors_host = copy(unsafe_wrap(Vector{UInt16}, colors_ptr_ref[], n_cols; own = false))
+col_colors_local = copy(unsafe_wrap(Vector{UInt16}, colors_ptr_ref[], n_cols_local; own = false))
 
 # ── 3. Ownership range (0-based PETSc indices) ────────────────────────────────
 row_start, row_end = LibPETSc.VecGetOwnershipRange(petsclib, x)
-col_start = row_start
-@assert n_cols == Int(row_end - row_start) "serial assumption: n_cols ($n_cols) != n_owned_dofs ($(Int(row_end-row_start)))"
 n_local_dofs = Int(row_end - row_start)
 
 # ── 4. Build COO from DMDA STAR stencil ───────────────────────────────────────
 #  For each owned node (ii, jj) and each stencil neighbor, emit dof×dof
-#  (row, col) pairs.  Every entry also records the 0-based color of its column
-#  and the 0-based local row index (row_global − row_start).
+#  (row, col) pairs using GHOST-LOCAL 0-based indices.  Using local indices
+#  (rather than global natural-ordering) makes the code correct for any MPI
+#  decomposition (1D or 2D), because MatSetValuesLocal /
+#  MatSetPreallocationCOOLocal handle the local→global mapping internally.
 dof_per_node = 4
-coo_corners = PETSc.getcorners(da)
-xs_da = coo_corners.lower[1];  ys_da = coo_corners.lower[2]
-xe_da = coo_corners.upper[1];  ye_da = coo_corners.upper[2]
+coo_corners       = PETSc.getcorners(da)
+ghost_coo_corners = PETSc.getghostcorners(da)
+xs_da  = coo_corners.lower[1];       ys_da  = coo_corners.lower[2]
+xe_da  = coo_corners.upper[1];       ye_da  = coo_corners.upper[2]
+xsg_da = ghost_coo_corners.lower[1]; ysg_da = ghost_coo_corners.lower[2]
+xeg_da = ghost_coo_corners.upper[1]; yeg_da = ghost_coo_corners.upper[2]
+nx_g_da = xeg_da - xsg_da + 1;  ny_g_da = yeg_da - ysg_da + 1
+# Reshape into [dof, ghost_x, ghost_y] — matches DMDA local Vec layout.
+col_colors_mat = reshape(col_colors_local, dof_per_node, nx_g_da, ny_g_da)
 
 CPetscInt = petsclib.PetscInt           # matches the actual C sizeof(PetscInt)
-row_coo_host      = CPetscInt[]
-col_coo_host      = CPetscInt[]
-local_row_per_coo = CPetscInt[]   # 0-based local row  (row_global − row_start)
+# Ghost-local 0-based row / col indices (for MatSetValuesLocal /
+# MatSetPreallocationCOOLocal).  Both rows and cols use the DMDA ghost-local
+# numbering: index = d + ix_ghost * dof + iy_ghost * dof * nx_g
+row_coo_local     = CPetscInt[]
+col_coo_local     = CPetscInt[]
+local_row_per_coo = CPetscInt[]   # 0-based OWNED-local row (= p in VecGetArray)
 color_per_coo     = CPetscInt[]   # 0-based color of each COO entry's column
 
 for jj in ys_da:ye_da, ii in xs_da:xe_da
-    ig = ii - 1;  jg = jj - 1           # 0-based global node coords
-    row_base = (jg * mx + ig) * dof_per_node
+    # Ghost-local (0-based) coordinates of this owned node
+    ix_gh = ii - xsg_da   # 0-based ghost-x
+    iy_gh = jj - ysg_da   # 0-based ghost-y
+    # Owned-local (0-based) position (for f-array indexing)
+    ix_ow = ii - xs_da    # 0-based owned-x
+    iy_ow = jj - ys_da    # 0-based owned-y
+    nx_own_loc = xe_da - xs_da + 1
 
     neighbors = Tuple{Int,Int}[(ii, jj)]
     ii > 1  && push!(neighbors, (ii-1, jj))
@@ -434,38 +450,59 @@ for jj in ys_da:ye_da, ii in xs_da:xe_da
     jj < my && push!(neighbors, (ii, jj+1))
 
     for (ni, nj) in neighbors
-        nig = ni - 1;  njg = nj - 1
-        col_base = (njg * mx + nig) * dof_per_node
+        nix_gh = ni - xsg_da   # 0-based ghost-x of neighbor
+        njy_gh = nj - ysg_da   # 0-based ghost-y of neighbor
         for d_row in 0:dof_per_node-1, d_col in 0:dof_per_node-1
-            r_g = row_base + d_row
-            c_g = col_base + d_col
-            push!(row_coo_host, CPetscInt(r_g))
-            push!(col_coo_host, CPetscInt(c_g))
-            # Serial: local col index = c_g  (col_start == 0)
-            push!(color_per_coo,     CPetscInt(col_colors_host[c_g - Int(col_start) + 1]))
-            push!(local_row_per_coo, CPetscInt(r_g - Int(row_start)))
+            # Ghost-local 0-based indices (used with MatSetValuesLocal)
+            r_local = d_row + ix_gh  * dof_per_node + iy_gh  * dof_per_node * nx_g_da
+            c_local = d_col + nix_gh * dof_per_node + njy_gh * dof_per_node * nx_g_da
+            # Owned-local 0-based row (= position in VecGetArray output)
+            p_owned = d_row + ix_ow * dof_per_node + iy_ow * nx_own_loc * dof_per_node
+            push!(row_coo_local,    CPetscInt(r_local))
+            push!(col_coo_local,    CPetscInt(c_local))
+            push!(color_per_coo,    CPetscInt(col_colors_mat[d_col+1, nix_gh+1, njy_gh+1]))
+            push!(local_row_per_coo, CPetscInt(p_owned))
         end
     end
 end
-nnz_coo = length(row_coo_host)
+nnz_coo = length(row_coo_local)
 
-# ── 5. Create J with COO preallocation ────────────────────────────────────────
+# ── 5. Create J ───────────────────────────────────────────────────────────────
 J = LibPETSc.DMCreateMatrix(petsclib, da)
-# Direct ccall: use CPetscInt (= petsclib.PetscInt) so this works for both
-# 32-bit and 64-bit PETSc builds.  PetscCount = ptrdiff_t = Int64 always.
-LibPETSc.@chk ccall(
-    (:MatSetPreallocationCOO, petsclib.petsc_library), Cint,
-    (LibPETSc.CMat, Int64, Ptr{CPetscInt}, Ptr{CPetscInt}),
-    J, Int64(nnz_coo), row_coo_host, col_coo_host)
+# For GPU: use COO-local preallocation so MatSetValuesCOO can scatter on device.
+# For CPU: DMCreateMatrix already preallocated the correct structure; use
+# MatSetValuesLocal (handles any 1-D or 2-D MPI decomposition correctly).
+if useCUDA
+    LibPETSc.@chk ccall(
+        (:MatSetPreallocationCOOLocal, petsclib.petsc_library), Cint,
+        (LibPETSc.CMat, Int64, Ptr{CPetscInt}, Ptr{CPetscInt}),
+        J, Int64(nnz_coo), row_coo_local, col_coo_local)
+end
 
 # ── 6. Per-color index arrays for the FD loop ─────────────────────────────────
-# perturb_cols_1b[c]: 1-based local x-indices of owned columns with color c-1.
+# perturb_cols_1b[c]: 1-based OWNED-LOCAL indices of owned columns with color c-1.
 # coo_idxs_1b[c]:    1-based COO entry indices whose column color == c-1.
 # local_rows_1b[c]:  1-based local residual-row indices for those COO entries.
+#
+# IMPORTANT: col_colors_local uses the GHOST-LOCAL layout [dof, ghost_x, ghost_y],
+# but VecGetArray returns the OWNED-LOCAL portion (owned DOFs only, re-indexed 1..n_local_dofs).
+# For ranks where ghost DOFs come BEFORE owned DOFs in the ghost-local vec (e.g. rank 1
+# with ghost row below), the ghost-local index of owned DOF p ≠ p.  We must convert
+# owned-local index p → ghost-local index k before looking up the color.
+ox_coo   = xs_da - xsg_da                      # ghost offset in x (grid nodes)
+oy_coo   = ys_da - ysg_da                      # ghost offset in y (grid nodes)
+nx_own   = xe_da - xs_da + 1                   # owned x width
 perturb_cols_1b = [Int32[] for _ in 1:n_colors]
-for k_local in 1:n_cols
-    c = Int(col_colors_host[k_local]) + 1   # 1-based color
-    push!(perturb_cols_1b[c], Int32(k_local))
+for p_local in 1:n_local_dofs   # 1-based owned-local index
+    p0      = p_local - 1       # 0-based
+    d       =  p0 % dof_per_node
+    x_owned = (p0 ÷ dof_per_node) % nx_own          # 0-based owned-x
+    y_owned = (p0 ÷ dof_per_node) ÷ nx_own          # 0-based owned-y
+    # convert to ghost-local 1-based index
+    k_ghost = d + (x_owned + ox_coo) * dof_per_node +
+              (y_owned + oy_coo) * dof_per_node * nx_g_da + 1
+    c = Int(col_colors_local[k_ghost]) + 1   # 1-based color
+    push!(perturb_cols_1b[c], Int32(p_local))
 end
 
 coo_idxs_1b   = [Int32[] for _ in 1:n_colors]
@@ -552,16 +589,22 @@ PETSc.setjacobian!(snes, J) do Jmat, actual_snes, g_x
 
     LibPETSc.VecRestoreArrayReadAndMemType(petsclib, f0_vec, f0_arr)
 
-    # ── Assemble J via COO ─────────────────────────────────────────────────────
-    # For GPU matrix (aijcusparse): pass device pointer so PETSc's CUDA kernel
-    # scatters val[] into CSR storage entirely on device (no D2H transfer).
+    # ── Assemble J via COO (GPU) or MatSetValues (CPU) ────────────────────────
     if useCUDA
         LibPETSc.@chk ccall(
             (:MatSetValuesCOO, petsclib.petsc_library), PetscInt,
             (LibPETSc.CMat, Ptr{T}, LibPETSc.InsertMode),
             Jmat.ptr, Ptr{T}(UInt64(pointer(val_dev))), LibPETSc.INSERT_VALUES)
     else
-        LibPETSc.MatSetValuesCOO(petsclib, Jmat, val_dev, LibPETSc.INSERT_VALUES)
+        LibPETSc.MatZeroEntries(petsclib, Jmat)
+        for k in 1:nnz_coo
+            LibPETSc.MatSetValuesLocal(petsclib, Jmat,
+                PetscInt(1), CPetscInt[row_coo_local[k]],
+                PetscInt(1), CPetscInt[col_coo_local[k]],
+                T[val_dev[k]], LibPETSc.INSERT_VALUES)
+        end
+        LibPETSc.MatAssemblyBegin(petsclib, Jmat, LibPETSc.MAT_FINAL_ASSEMBLY)
+        LibPETSc.MatAssemblyEnd(petsclib, Jmat, LibPETSc.MAT_FINAL_ASSEMBLY)
     end
     return PetscInt(0)
 end

From 5ac3c3d62f5040430aa8adfedbe31e3db8ff58de Mon Sep 17 00:00:00 2001
From: Boris Kaus <boriskaus@gmail.com>
Date: Mon, 27 Apr 2026 14:51:14 +0000
Subject: [PATCH 14/39] mg working (GPU + CPU, serial + parallel)

Co-authored-by: Copilot <copilot@github.com>
---
 examples/ex19.jl | 36 ++++++++++++++++++++++++++++++------
 1 file changed, 30 insertions(+), 6 deletions(-)

diff --git a/examples/ex19.jl b/examples/ex19.jl
index f1c4a8a3..9c837e06 100644
--- a/examples/ex19.jl
+++ b/examples/ex19.jl
@@ -32,7 +32,7 @@
 =#
 
 # ── GPU switch ────────────────────────────────────────────────────────────────
-const useCUDA = true
+const useCUDA = false
 
 using MPI
 using PETSc
@@ -278,16 +278,24 @@ function get_petsc_arrays(petsclib, g_fx, l_x)
         fx_arr, fx_mtype = LibPETSc.VecGetArrayAndMemType(petsclib, g_fx)
         lx_arr, lx_mtype = LibPETSc.VecGetArrayReadAndMemType(petsclib, l_x)
 
-        if lx_mtype == LibPETSc.PETSC_MEMTYPE_DEVICE
-            # Native GPU vecs: zero-copy wrap, no bounce needed.
+        if lx_mtype == LibPETSc.PETSC_MEMTYPE_DEVICE && fx_mtype == LibPETSc.PETSC_MEMTYPE_DEVICE
+            # Both on GPU: zero-copy wrap, no bounce needed.
             fx = unsafe_wrap(CuArray, CuPtr{T}(UInt64(pointer(fx_arr))), length(fx_arr))
             lx = unsafe_wrap(CuArray, CuPtr{T}(UInt64(pointer(lx_arr))), length(lx_arr))
             return fx, lx, fx_arr, lx_arr, nothing
         else
-            # CPU vecs (FD coloring path): bounce residual through GPU.
-            lx_gpu = CuArray{T}(undef, length(lx_arr))
+            # At least one Vec is host-resident (e.g. freshly created coarser MG
+            # output Vec not yet allocated on device, or FD coloring CPU path).
+            # If lx is already on device (common for MG coarser levels after
+            # global→local scatter), wrap it directly; otherwise copy H2D.
+            lx_gpu = if lx_mtype == LibPETSc.PETSC_MEMTYPE_DEVICE
+                unsafe_wrap(CuArray, CuPtr{T}(UInt64(pointer(lx_arr))), length(lx_arr))
+            else
+                tmp = CuArray{T}(undef, length(lx_arr))
+                copyto!(tmp, lx_arr)          # H2D: send ghost input to GPU
+                tmp
+            end
             fx_gpu = CuArray{T}(undef, length(fx_arr))
-            copyto!(lx_gpu, lx_arr)          # H2D: send ghost input to GPU
             return fx_gpu, lx_gpu, fx_arr, lx_arr, fx_gpu
         end
     else
@@ -591,10 +599,26 @@ PETSc.setjacobian!(snes, J) do Jmat, actual_snes, g_x
 
     # ── Assemble J via COO (GPU) or MatSetValues (CPU) ────────────────────────
     if useCUDA
+        # Assemble via GPU pointer so the GPU (cuSPARSE) copy is up to date.
         LibPETSc.@chk ccall(
             (:MatSetValuesCOO, petsclib.petsc_library), PetscInt,
             (LibPETSc.CMat, Ptr{T}, LibPETSc.InsertMode),
             Jmat.ptr, Ptr{T}(UInt64(pointer(val_dev))), LibPETSc.INSERT_VALUES)
+        LibPETSc.MatAssemblyBegin(petsclib, Jmat, LibPETSc.MAT_FINAL_ASSEMBLY)
+        LibPETSc.MatAssemblyEnd(petsclib, Jmat, LibPETSc.MAT_FINAL_ASSEMBLY)
+        # Force GPU→CPU sync so both copies are valid.
+        # MatBindToCPU(PETSC_TRUE) triggers MatSeqAIJCUSPARSECopyFromGPU when
+        # offloadmask==PETSC_OFFLOAD_GPU, making the CPU CSR correct.
+        # MatBindToCPU(PETSC_FALSE) then releases the CPU-only restriction,
+        # leaving offloadmask==PETSC_OFFLOAD_BOTH so that:
+        #   MatGetDiagonal (Jacobi smoother in MG)  → reads CPU copy ✓
+        #   MatPtAP (Galerkin coarse-op formation)  → uses GPU copy  ✓
+        LibPETSc.@chk ccall(
+            (:MatBindToCPU, petsclib.petsc_library), PetscInt,
+            (LibPETSc.CMat, LibPETSc.PetscBool), Jmat.ptr, LibPETSc.PETSC_TRUE)
+        LibPETSc.@chk ccall(
+            (:MatBindToCPU, petsclib.petsc_library), PetscInt,
+            (LibPETSc.CMat, LibPETSc.PetscBool), Jmat.ptr, LibPETSc.PETSC_FALSE)
     else
         LibPETSc.MatZeroEntries(petsclib, Jmat)
         for k in 1:nnz_coo

From a73e4ed074a72255a64908a7c3f8c975c77a4faa Mon Sep 17 00:00:00 2001
From: Boris Kaus <boriskaus@gmail.com>
Date: Mon, 27 Apr 2026 17:03:06 +0000
Subject: [PATCH 15/39] use CString for MatType/VecType; add
 get_petsc_arrays/restore_petsc_arrays along with multiple dispatch for GPU

Co-authored-by: Copilot <copilot@github.com>
---
 examples/ex19.jl                   | 84 ++----------------------------
 ext/PETScCUDAExt.jl                | 59 +++++++++++++++++++++
 src/PETSc.jl                       |  1 +
 src/autowrapped/DM_wrappers.jl     | 12 ++---
 src/autowrapped/senums_wrappers.jl |  4 +-
 src/string_wrappers.jl             | 10 +---
 src/vec.jl                         | 69 ++++++++++++++++++++++++
 7 files changed, 143 insertions(+), 96 deletions(-)

diff --git a/examples/ex19.jl b/examples/ex19.jl
index 9c837e06..ab23bea8 100644
--- a/examples/ex19.jl
+++ b/examples/ex19.jl
@@ -32,7 +32,7 @@
 =#
 
 # ── GPU switch ────────────────────────────────────────────────────────────────
-const useCUDA = false
+const useCUDA = true
 
 using MPI
 using PETSc
@@ -203,15 +203,9 @@ da = PETSc.DMDA(
 
 # Stage 2: GPU vecs and GPU matrix enable a fully GPU-resident FD coloring
 # path via COO preallocation.  No host↔device bouncing in residual or Jacobian.
-# NOTE: MG with GPU vecs needs per-level COO rebuild (not yet implemented);
-#       for MG tests use -pc_type lu or default ILU.
 if useCUDA
-    GC.@preserve begin
-        vt = "cuda"
-        mt = "aijcusparse"
-        LibPETSc.DMSetVecType(petsclib, da, pointer(vt))
-        LibPETSc.DMSetMatType(petsclib, da, pointer(mt))
-    end
+    LibPETSc.DMSetVecType(petsclib, da, "cuda")
+    LibPETSc.DMSetMatType(petsclib, da, "aijcusparse")
 end
 
 snes = PETSc.SNES(petsclib, comm; opts...)
@@ -252,74 +246,6 @@ PETSc.withlocalarray!(x; read = false) do x_arr
     end
 end
 
-# ── Helpers for PETSc array access with optional GPU residual kernel ──────────
-#
-# Three cases:
-#
-#   useCUDA=false  →  plain CPU arrays via unsafe_localarray.
-#
-#   useCUDA=true, vecs on GPU (PETSC_MEMTYPE_CUDA)
-#              →  zero-copy CuArray wraps; no host↔device transfer.
-#
-#   useCUDA=true, vecs on CPU (PETSC_MEMTYPE_HOST, Stage-1 coloring path)
-#              →  allocate GPU scratch buffers, copy lx H2D before kernel,
-#                 copy fx D2H after kernel, then let PETSc see the result in
-#                 the HOST fx_arr.  FD coloring arithmetic stays on CPU (BLAS)
-#                 which avoids the VecPlaceArray+cuBLAS bug.
-#
-# Returns (fx, lx, fx_arr, lx_arr, fx_bounce)
-#   fx / lx       — arrays passed to the kernel (CuArray or plain Array)
-#   fx_arr/lx_arr — raw PETSc handles for Restore calls (nothing on CPU path)
-#   fx_bounce     — CuArray whose contents must be copied back to fx_arr after
-#                   the kernel; nothing when no copy is needed.
-#
-function get_petsc_arrays(petsclib, g_fx, l_x)
-    if useCUDA
-        fx_arr, fx_mtype = LibPETSc.VecGetArrayAndMemType(petsclib, g_fx)
-        lx_arr, lx_mtype = LibPETSc.VecGetArrayReadAndMemType(petsclib, l_x)
-
-        if lx_mtype == LibPETSc.PETSC_MEMTYPE_DEVICE && fx_mtype == LibPETSc.PETSC_MEMTYPE_DEVICE
-            # Both on GPU: zero-copy wrap, no bounce needed.
-            fx = unsafe_wrap(CuArray, CuPtr{T}(UInt64(pointer(fx_arr))), length(fx_arr))
-            lx = unsafe_wrap(CuArray, CuPtr{T}(UInt64(pointer(lx_arr))), length(lx_arr))
-            return fx, lx, fx_arr, lx_arr, nothing
-        else
-            # At least one Vec is host-resident (e.g. freshly created coarser MG
-            # output Vec not yet allocated on device, or FD coloring CPU path).
-            # If lx is already on device (common for MG coarser levels after
-            # global→local scatter), wrap it directly; otherwise copy H2D.
-            lx_gpu = if lx_mtype == LibPETSc.PETSC_MEMTYPE_DEVICE
-                unsafe_wrap(CuArray, CuPtr{T}(UInt64(pointer(lx_arr))), length(lx_arr))
-            else
-                tmp = CuArray{T}(undef, length(lx_arr))
-                copyto!(tmp, lx_arr)          # H2D: send ghost input to GPU
-                tmp
-            end
-            fx_gpu = CuArray{T}(undef, length(fx_arr))
-            return fx_gpu, lx_gpu, fx_arr, lx_arr, fx_gpu
-        end
-    else
-        fx = PETSc.unsafe_localarray(g_fx; read = true,  write = true)
-        lx = PETSc.unsafe_localarray(l_x;  read = true,  write = false)
-        return fx, lx, nothing, nothing, nothing
-    end
-end
-
-function restore_petsc_arrays(petsclib, g_fx, l_x, fx, lx, fx_arr, lx_arr, fx_bounce)
-    if useCUDA
-        if fx_bounce !== nothing
-            # D2H: copy GPU residual result back to the HOST PETSc array.
-            CUDA.synchronize()
-            copyto!(fx_arr, fx_bounce)
-        end
-        LibPETSc.VecRestoreArrayAndMemType(petsclib, g_fx, fx_arr)
-        LibPETSc.VecRestoreArrayReadAndMemType(petsclib, l_x, lx_arr)
-    else
-        Base.finalize(fx)
-        Base.finalize(lx)
-    end
-end
-
 # ── Residual callback ─────────────────────────────────────────────────────────
 r = similar(x)
 
@@ -329,7 +255,7 @@ PETSc.setfunction!(snes, r) do g_fx, snes, g_x
     l_x = PETSc.DMLocalVec(da)
     PETSc.dm_global_to_local!(g_x, l_x, da, PETSc.INSERT_VALUES)
 
-    fx, lx, fx_arr, lx_arr, fx_bounce = get_petsc_arrays(petsclib, g_fx, l_x)
+    fx, lx, fx_arr, lx_arr, fx_bounce = PETSc.get_petsc_arrays(petsclib, g_fx, l_x)
 
     corners       = PETSc.getcorners(da)
     ghost_corners = PETSc.getghostcorners(da)
@@ -370,7 +296,7 @@ PETSc.setfunction!(snes, r) do g_fx, snes, g_x
     )
     KernelAbstractions.synchronize(backend)
 
-    restore_petsc_arrays(petsclib, g_fx, l_x, fx, lx, fx_arr, lx_arr, fx_bounce)
+    PETSc.restore_petsc_arrays(petsclib, g_fx, l_x, fx, lx, fx_arr, lx_arr, fx_bounce)
     PETSc.destroy(l_x)
     return PetscInt(0)
 end
diff --git a/ext/PETScCUDAExt.jl b/ext/PETScCUDAExt.jl
index 273f3d5f..7bab66aa 100644
--- a/ext/PETScCUDAExt.jl
+++ b/ext/PETScCUDAExt.jl
@@ -90,8 +90,67 @@ function _cuda_withlocalarray_device_impl!(
     return val
 end
 
+# ── GPU-aware PETSc array helpers ─────────────────────────────────────────────
+#
+# CUDA implementations of PETSc.get_petsc_arrays / PETSc.restore_petsc_arrays.
+# Registered as hooks in __init__ so the base-module functions dispatch here
+# whenever CUDA.jl is loaded.
+#
+# Sub-cases handled by get:
+#   Both Vecs on GPU  → zero-copy CuArray wraps, fx_bounce = nothing
+#   Otherwise         → fx is always a GPU scratch buffer (bounce);
+#                       lx is wrapped zero-copy if on GPU, else copied H2D
+#
+# restore then D2H-copies the bounce buffer (if any) before calling
+# VecRestoreArray*AndMemType on both Vecs.
+
+function _cuda_get_petsc_arrays_impl(petsclib, g_fx, l_x)
+    T      = petsclib.PetscScalar
+    fx_arr, fx_mtype = LibPETSc.VecGetArrayAndMemType(petsclib, g_fx)
+    lx_arr, lx_mtype = LibPETSc.VecGetArrayReadAndMemType(petsclib, l_x)
+
+    if fx_mtype == LibPETSc.PETSC_MEMTYPE_DEVICE &&
+       lx_mtype == LibPETSc.PETSC_MEMTYPE_DEVICE
+        # Both on GPU: zero-copy wrap, no bounce needed.
+        fx = CUDA.unsafe_wrap(CuArray,
+            reinterpret(CuPtr{T}, UInt(pointer(fx_arr))), length(fx_arr))
+        lx = CUDA.unsafe_wrap(CuArray,
+            reinterpret(CuPtr{T}, UInt(pointer(lx_arr))), length(lx_arr))
+        return fx, lx, fx_arr, lx_arr, nothing
+    else
+        # At least one Vec is host-resident (e.g. freshly created coarser MG
+        # level, or FD-coloring CPU path).  Wrap or copy lx to GPU as needed,
+        # and allocate a GPU scratch buffer for fx so the kernel can write there;
+        # restore_petsc_arrays copies it back D2H after the kernel.
+        lx_gpu = if lx_mtype == LibPETSc.PETSC_MEMTYPE_DEVICE
+            CUDA.unsafe_wrap(CuArray,
+                reinterpret(CuPtr{T}, UInt(pointer(lx_arr))), length(lx_arr))
+        else
+            tmp = CuArray{T}(undef, length(lx_arr))
+            copyto!(tmp, lx_arr)        # H2D: send ghost input to GPU
+            tmp
+        end
+        fx_gpu = CuArray{T}(undef, length(fx_arr))
+        return fx_gpu, lx_gpu, fx_arr, lx_arr, fx_gpu
+    end
+end
+
+function _cuda_restore_petsc_arrays_impl(
+    petsclib, g_fx, l_x, fx, lx, fx_arr, lx_arr, fx_bounce,
+)
+    if fx_bounce !== nothing
+        # D2H: copy GPU residual result back to the host PETSc array.
+        CUDA.synchronize()
+        copyto!(fx_arr, fx_bounce)
+    end
+    LibPETSc.VecRestoreArrayAndMemType(petsclib, g_fx, fx_arr)
+    LibPETSc.VecRestoreArrayReadAndMemType(petsclib, l_x, lx_arr)
+end
+
 function __init__()
     PETSc._withlocalarray_device_hook[] = _cuda_withlocalarray_device_impl!
+    PETSc._get_petsc_arrays_hook[]      = _cuda_get_petsc_arrays_impl
+    PETSc._restore_petsc_arrays_hook[]  = _cuda_restore_petsc_arrays_impl
 end
 
 end # module
diff --git a/src/PETSc.jl b/src/PETSc.jl
index 79514c82..8b8e5b9c 100644
--- a/src/PETSc.jl
+++ b/src/PETSc.jl
@@ -25,6 +25,7 @@ export audit_petsc_file
 export set_petsclib
 export set_library!, unset_library!, library_info
 export withlocalarray_device!
+export get_petsc_arrays, restore_petsc_arrays
 
 using Libdl
 
diff --git a/src/autowrapped/DM_wrappers.jl b/src/autowrapped/DM_wrappers.jl
index a1aeb59e..2ab00b21 100644
--- a/src/autowrapped/DM_wrappers.jl
+++ b/src/autowrapped/DM_wrappers.jl
@@ -2618,14 +2618,14 @@ See also:
 # External Links
 $(_doc_external("DM/DMSetVecType"))
 """
-function DMSetVecType(petsclib::PetscLibType, dm::PetscDM, ctype::VecType) end
+function DMSetVecType(petsclib::PetscLibType, dm::PetscDM, ctype::Union{Cstring, AbstractString}) end
 
-@for_petsc function DMSetVecType(petsclib::$UnionPetscLib, dm::PetscDM, ctype::VecType )
+@for_petsc function DMSetVecType(petsclib::$UnionPetscLib, dm::PetscDM, ctype::Union{Cstring, AbstractString} )
 
     @chk ccall(
                (:DMSetVecType, $petsc_library),
                PetscErrorCode,
-               (CDM, VecType),
+               (CDM, Cstring),
                dm, ctype,
               )
 
@@ -2773,14 +2773,14 @@ See also:
 # External Links
 $(_doc_external("DM/DMSetMatType"))
 """
-function DMSetMatType(petsclib::PetscLibType, dm::PetscDM, ctype::MatType) end
+function DMSetMatType(petsclib::PetscLibType, dm::PetscDM, ctype::Union{Cstring, AbstractString}) end
 
-@for_petsc function DMSetMatType(petsclib::$UnionPetscLib, dm::PetscDM, ctype::MatType )
+@for_petsc function DMSetMatType(petsclib::$UnionPetscLib, dm::PetscDM, ctype::Union{Cstring, AbstractString} )
 
     @chk ccall(
                (:DMSetMatType, $petsc_library),
                PetscErrorCode,
-               (CDM, MatType),
+               (CDM, Cstring),
                dm, ctype,
               )
 
diff --git a/src/autowrapped/senums_wrappers.jl b/src/autowrapped/senums_wrappers.jl
index 251b2fd3..e2748ceb 100644
--- a/src/autowrapped/senums_wrappers.jl
+++ b/src/autowrapped/senums_wrappers.jl
@@ -6,9 +6,9 @@ PetscDrawType=Ptr{Cchar}
 PFType=Ptr{Cchar}
 DMAdaptorType=Ptr{Cchar}
 PetscFEType=Ptr{Cchar}
-VecType=Ptr{Cchar}
+VecType=Cstring
 VecTaggerType=Ptr{Cchar}
-MatType=Ptr{Cchar}
+MatType=Cstring
 MatSolverType=Ptr{Cchar}
 MatProductAlgorithm=Ptr{Cchar}
 MatOrderingType=Ptr{Cchar}
diff --git a/src/string_wrappers.jl b/src/string_wrappers.jl
index 5916ecf8..3893d635 100644
--- a/src/string_wrappers.jl
+++ b/src/string_wrappers.jl
@@ -27,12 +27,4 @@ function LibPETSc.DMSetType(petsclib, dm, type::AbstractString)
     GC.@preserve s LibPETSc.DMSetType(petsclib, dm, Base.unsafe_convert(Ptr{Cchar}, s))
 end
 
-function LibPETSc.DMSetVecType(petsclib, dm, type::AbstractString)
-    s = String(type)
-    GC.@preserve s LibPETSc.DMSetVecType(petsclib, dm, Base.unsafe_convert(Ptr{Cchar}, s))
-end
-
-function LibPETSc.DMSetMatType(petsclib, dm, type::AbstractString)
-    s = String(type)
-    GC.@preserve s LibPETSc.DMSetMatType(petsclib, dm, Base.unsafe_convert(Ptr{Cchar}, s))
-end
+# DMSetVecType and DMSetMatType accept AbstractString directly (VecType/MatType = Cstring).
diff --git a/src/vec.jl b/src/vec.jl
index 6653031a..e989be1c 100644
--- a/src/vec.jl
+++ b/src/vec.jl
@@ -452,3 +452,72 @@ function LinearAlgebra.norm(
     r_val = LibPETSc.VecNorm(PetscLib, v, normtype)
     return r_val
 end
+
+# ── GPU-aware array access helpers ────────────────────────────────────────────
+#
+# `get_petsc_arrays` returns a pair of arrays (read-write and read-only) that
+# are ready to be passed to a compute kernel, together with raw PETSc handles
+# and an optional "bounce" buffer needed for the restore step.
+#
+# When PETScCUDAExt is loaded (i.e. CUDA.jl is in the environment and has been
+# imported) the hooks below are replaced with CUDA-aware implementations that
+# wrap device pointers as CuArrays — zero-copy when both Vecs are already on
+# the device, or with a H2D copy of `l_x` when it is host-resident.  Unless
+# both Vecs are device-resident, a GPU scratch buffer ("bounce") stands in for
+# `g_fx` so the kernel can write into GPU memory; `restore_petsc_arrays` then
+# copies it back D2H before calling VecRestoreArray*AndMemType.
+#
+# On the plain CPU path the hooks are `nothing` and the functions fall back to
+# `unsafe_localarray` with finalizer-based cleanup.
+
+const _get_petsc_arrays_hook     = Ref{Any}(nothing)
+const _restore_petsc_arrays_hook = Ref{Any}(nothing)
+
+"""
+    get_petsc_arrays(petsclib, g_fx, l_x) -> (fx, lx, fx_arr, lx_arr, fx_bounce)
+
+Return arrays for `g_fx` (read-write) and `l_x` (read-only) that are suitable
+for passing to a compute kernel.
+
+When the PETScCUDAExt hook is installed the returned `fx`/`lx` are always
+`CuArray`s: zero-copy wraps if both Vecs are device-resident; otherwise `l_x`
+is wrapped if it is on the device or copied host-to-device if it is not, and
+a GPU scratch buffer is returned as both `fx` and `fx_bounce`; its contents
+must be written back by `restore_petsc_arrays` after the kernel completes.
+
+Without the hook (PETScCUDAExt not loaded) `fx`/`lx` come from
+`unsafe_localarray` and `fx_arr = lx_arr = fx_bounce = nothing`.
+
+See also: [`restore_petsc_arrays`](@ref)
+"""
+function get_petsc_arrays(petsclib, g_fx, l_x)
+    hook = _get_petsc_arrays_hook[]
+    if hook !== nothing
+        return hook(petsclib, g_fx, l_x)
+    end
+    # CPU fallback: plain arrays, cleanup via finalizers
+    fx = unsafe_localarray(g_fx; read = true, write = true)
+    lx = unsafe_localarray(l_x;  read = true, write = false)
+    return fx, lx, nothing, nothing, nothing
+end
+
+"""
+    restore_petsc_arrays(petsclib, g_fx, l_x, fx, lx, fx_arr, lx_arr, fx_bounce)
+
+Restore PETSc Vecs after a kernel launched via [`get_petsc_arrays`](@ref).
+
+On the CUDA path this optionally synchronises the device and copies the bounce
+buffer back to the host PETSc array before calling the matching
+`VecRestoreArray*AndMemType` pair.  On the CPU path it simply finalizes the
+plain arrays returned by `unsafe_localarray`.
+"""
+function restore_petsc_arrays(petsclib, g_fx, l_x, fx, lx, fx_arr, lx_arr, fx_bounce)
+    hook = _restore_petsc_arrays_hook[]
+    if hook !== nothing
+        hook(petsclib, g_fx, l_x, fx, lx, fx_arr, lx_arr, fx_bounce)
+        return
+    end
+    # CPU fallback: finalizers registered by unsafe_localarray do the restore
+    Base.finalize(fx)
+    Base.finalize(lx)
+end

From 9fa9530b927e2d04f52e3e4f0cf397794ef902ba Mon Sep 17 00:00:00 2001
From: Boris Kaus <boriskaus@gmail.com>
Date: Mon, 27 Apr 2026 18:53:31 +0000
Subject: [PATCH 16/39] code refactor part 1

---
 examples/ex19.jl                     | 201 ++++++---------------------
 src/PETSc.jl                         |   1 +
 src/autowrapped/ISaddons_wrappers.jl |  31 +++--
 src/autowrapped/Mat_wrappers.jl      |  14 ++
 src/autowrapped/SNES_wrappers.jl     |   4 +-
 src/dmda.jl                          | 139 ++++++++++++++++++
 6 files changed, 216 insertions(+), 174 deletions(-)

diff --git a/examples/ex19.jl b/examples/ex19.jl
index ab23bea8..0ef6e6e8 100644
--- a/examples/ex19.jl
+++ b/examples/ex19.jl
@@ -255,6 +255,12 @@ PETSc.setfunction!(snes, r) do g_fx, snes, g_x
     l_x = PETSc.DMLocalVec(da)
     PETSc.dm_global_to_local!(g_x, l_x, da, PETSc.INSERT_VALUES)
 
+    # Get arrays for the output (g_fx) and ghost-padded input (l_x) Vecs.
+    # On GPU, returns CuArray wrappers (zero-copy when both Vecs are device-
+    # resident) together with the raw PETSc handles needed for the restore call.
+    # On CPU, returns plain Array views backed by VecGetArray.
+    # fx_bounce is a GPU scratch buffer used when g_fx is host-resident; it is
+    # copied back D2H by restore_petsc_arrays after the kernel completes.
     fx, lx, fx_arr, lx_arr, fx_bounce = PETSc.get_petsc_arrays(petsclib, g_fx, l_x)
 
     corners       = PETSc.getcorners(da)
@@ -316,136 +322,28 @@ end
 # NOTE: For MG, coarser levels fall back to SNESComputeJacobianDefaultColor
 #       (correct, but CPU-only FD coloring for those levels).
 
-# ── 1. ISColoring ─────────────────────────────────────────────────────────────
-# IS_COLORING_LOCAL returns colors for all local DOFs (owned + ghost) in DMDA
-# local Vec ordering.  Ghost DOF colors are consistent with the owning rank's
-# assignment, so no extra MPI communication is needed here.
-iscoloring = LibPETSc.DMCreateColoring(petsclib, da, LibPETSc.IS_COLORING_LOCAL)
-
-# ── 2. Per-column color via raw ISColoringGetColors call ──────────────────────
-#   C API:  ISColoringGetColors(iscoloring, PetscInt *n, PetscInt *nc,
-#                               const ISColoringValue **colors)
-#   ISColoringValue = unsigned short (UInt16) per petscconf.h PETSC_IS_COLORING_VALUE_TYPE=short
-n_cols_ref     = Ref{PetscInt}(0)
-nc_ref         = Ref{PetscInt}(0)
-colors_ptr_ref = Ref{Ptr{UInt16}}(C_NULL)
-LibPETSc.@chk ccall(
-    (:ISColoringGetColors, petsclib.petsc_library), PetscInt,
-    (LibPETSc.ISColoring, Ptr{PetscInt}, Ptr{PetscInt}, Ptr{Ptr{UInt16}}),
-    iscoloring, n_cols_ref, nc_ref, colors_ptr_ref)
-n_cols_local = Int(n_cols_ref[])   # IS_COLORING_LOCAL: owned + ghost DOFs
-n_colors     = Int(nc_ref[])
-# Copy colors to an owned Julia array before we destroy the ISColoring.
-col_colors_local = copy(unsafe_wrap(Vector{UInt16}, colors_ptr_ref[], n_cols_local; own = false))
-
-# ── 3. Ownership range (0-based PETSc indices) ────────────────────────────────
-row_start, row_end = LibPETSc.VecGetOwnershipRange(petsclib, x)
-n_local_dofs = Int(row_end - row_start)
-
-# ── 4. Build COO from DMDA STAR stencil ───────────────────────────────────────
-#  For each owned node (ii, jj) and each stencil neighbor, emit dof×dof
-#  (row, col) pairs using GHOST-LOCAL 0-based indices.  Using local indices
-#  (rather than global natural-ordering) makes the code correct for any MPI
-#  decomposition (1D or 2D), because MatSetValuesLocal /
-#  MatSetPreallocationCOOLocal handle the local→global mapping internally.
-dof_per_node = 4
-coo_corners       = PETSc.getcorners(da)
-ghost_coo_corners = PETSc.getghostcorners(da)
-xs_da  = coo_corners.lower[1];       ys_da  = coo_corners.lower[2]
-xe_da  = coo_corners.upper[1];       ye_da  = coo_corners.upper[2]
-xsg_da = ghost_coo_corners.lower[1]; ysg_da = ghost_coo_corners.lower[2]
-xeg_da = ghost_coo_corners.upper[1]; yeg_da = ghost_coo_corners.upper[2]
-nx_g_da = xeg_da - xsg_da + 1;  ny_g_da = yeg_da - ysg_da + 1
-# Reshape into [dof, ghost_x, ghost_y] — matches DMDA local Vec layout.
-col_colors_mat = reshape(col_colors_local, dof_per_node, nx_g_da, ny_g_da)
-
-CPetscInt = petsclib.PetscInt           # matches the actual C sizeof(PetscInt)
-# Ghost-local 0-based row / col indices (for MatSetValuesLocal /
-# MatSetPreallocationCOOLocal).  Both rows and cols use the DMDA ghost-local
-# numbering: index = d + ix_ghost * dof + iy_ghost * dof * nx_g
-row_coo_local     = CPetscInt[]
-col_coo_local     = CPetscInt[]
-local_row_per_coo = CPetscInt[]   # 0-based OWNED-local row (= p in VecGetArray)
-color_per_coo     = CPetscInt[]   # 0-based color of each COO entry's column
-
-for jj in ys_da:ye_da, ii in xs_da:xe_da
-    # Ghost-local (0-based) coordinates of this owned node
-    ix_gh = ii - xsg_da   # 0-based ghost-x
-    iy_gh = jj - ysg_da   # 0-based ghost-y
-    # Owned-local (0-based) position (for f-array indexing)
-    ix_ow = ii - xs_da    # 0-based owned-x
-    iy_ow = jj - ys_da    # 0-based owned-y
-    nx_own_loc = xe_da - xs_da + 1
-
-    neighbors = Tuple{Int,Int}[(ii, jj)]
-    ii > 1  && push!(neighbors, (ii-1, jj))
-    ii < mx && push!(neighbors, (ii+1, jj))
-    jj > 1  && push!(neighbors, (ii, jj-1))
-    jj < my && push!(neighbors, (ii, jj+1))
-
-    for (ni, nj) in neighbors
-        nix_gh = ni - xsg_da   # 0-based ghost-x of neighbor
-        njy_gh = nj - ysg_da   # 0-based ghost-y of neighbor
-        for d_row in 0:dof_per_node-1, d_col in 0:dof_per_node-1
-            # Ghost-local 0-based indices (used with MatSetValuesLocal)
-            r_local = d_row + ix_gh  * dof_per_node + iy_gh  * dof_per_node * nx_g_da
-            c_local = d_col + nix_gh * dof_per_node + njy_gh * dof_per_node * nx_g_da
-            # Owned-local 0-based row (= position in VecGetArray output)
-            p_owned = d_row + ix_ow * dof_per_node + iy_ow * nx_own_loc * dof_per_node
-            push!(row_coo_local,    CPetscInt(r_local))
-            push!(col_coo_local,    CPetscInt(c_local))
-            push!(color_per_coo,    CPetscInt(col_colors_mat[d_col+1, nix_gh+1, njy_gh+1]))
-            push!(local_row_per_coo, CPetscInt(p_owned))
-        end
-    end
-end
-nnz_coo = length(row_coo_local)
+# ── Coloring + COO index setup ────────────────────────────────────────────────
+# Builds the IS_COLORING_LOCAL coloring for da's STAR stencil, ghost-local COO
+# (row, col) pairs, and per-color owned-column / COO-entry index arrays.
+# 2-D DMDA STAR stencil only; see PETSc.dmda_star_fd_coloring for 3-D notes.
+coloring     = PETSc.dmda_star_fd_coloring(petsclib, da)
+n_colors     = coloring.n_colors
+n_local_dofs = coloring.n_local_dofs
+nnz_coo      = coloring.nnz_coo
+row_coo_local = coloring.row_coo_local
+col_coo_local = coloring.col_coo_local
 
 # ── 5. Create J ───────────────────────────────────────────────────────────────
 J = LibPETSc.DMCreateMatrix(petsclib, da)
-# For GPU: use COO-local preallocation so MatSetValuesCOO can scatter on device.
-# For CPU: DMCreateMatrix already preallocated the correct structure; use
-# MatSetValuesLocal (handles any 1-D or 2-D MPI decomposition correctly).
-if useCUDA
-    LibPETSc.@chk ccall(
-        (:MatSetPreallocationCOOLocal, petsclib.petsc_library), Cint,
-        (LibPETSc.CMat, Int64, Ptr{CPetscInt}, Ptr{CPetscInt}),
-        J, Int64(nnz_coo), row_coo_local, col_coo_local)
-end
+# Register the COO pattern on both CPU and GPU.  This allows MatSetValuesCOO
+# to be used for assembly in both cases, avoiding per-entry hash-table lookups
+# that MatSetValuesLocal incurs.  On GPU it also enables device-side scatter.
+LibPETSc.MatSetPreallocationCOOLocal(petsclib, J, LibPETSc.PetscCount(nnz_coo), row_coo_local, col_coo_local)
 
-# ── 6. Per-color index arrays for the FD loop ─────────────────────────────────
-# perturb_cols_1b[c]: 1-based OWNED-LOCAL indices of owned columns with color c-1.
-# coo_idxs_1b[c]:    1-based COO entry indices whose column color == c-1.
-# local_rows_1b[c]:  1-based local residual-row indices for those COO entries.
-#
-# IMPORTANT: col_colors_local uses the GHOST-LOCAL layout [dof, ghost_x, ghost_y],
-# but VecGetArray returns the OWNED-LOCAL portion (owned DOFs only, re-indexed 1..n_local_dofs).
-# For ranks where ghost DOFs come BEFORE owned DOFs in the ghost-local vec (e.g. rank 1
-# with ghost row below), the ghost-local index of owned DOF p ≠ p.  We must convert
-# owned-local index p → ghost-local index k before looking up the color.
-ox_coo   = xs_da - xsg_da                      # ghost offset in x (grid nodes)
-oy_coo   = ys_da - ysg_da                      # ghost offset in y (grid nodes)
-nx_own   = xe_da - xs_da + 1                   # owned x width
-perturb_cols_1b = [Int32[] for _ in 1:n_colors]
-for p_local in 1:n_local_dofs   # 1-based owned-local index
-    p0      = p_local - 1       # 0-based
-    d       =  p0 % dof_per_node
-    x_owned = (p0 ÷ dof_per_node) % nx_own          # 0-based owned-x
-    y_owned = (p0 ÷ dof_per_node) ÷ nx_own          # 0-based owned-y
-    # convert to ghost-local 1-based index
-    k_ghost = d + (x_owned + ox_coo) * dof_per_node +
-              (y_owned + oy_coo) * dof_per_node * nx_g_da + 1
-    c = Int(col_colors_local[k_ghost]) + 1   # 1-based color
-    push!(perturb_cols_1b[c], Int32(p_local))
-end
-
-coo_idxs_1b   = [Int32[] for _ in 1:n_colors]
-local_rows_1b = [Int32[] for _ in 1:n_colors]
-for k in 1:nnz_coo
-    c = Int(color_per_coo[k]) + 1            # 1-based color
-    push!(coo_idxs_1b[c],   Int32(k))
-    push!(local_rows_1b[c], Int32(local_row_per_coo[k] + 1))  # 0→1-based
-end
+# ── Per-color index arrays for the FD loop ───────────────────────────────────
+perturb_cols_1b = coloring.perturb_cols
+coo_idxs_1b     = coloring.coo_idxs
+local_rows_1b   = coloring.local_rows
 
 if useCUDA
     perturb_cols_dev = [CuArray(v) for v in perturb_cols_1b]
@@ -459,8 +357,6 @@ else
     val_dev          = zeros(T, nnz_coo)
 end
 
-LibPETSc.ISColoringDestroy(petsclib, iscoloring)
-
 # ── 7. Scratch vectors for the FD loop ────────────────────────────────────────
 x_pert_vec = LibPETSc.VecDuplicate(petsclib, x)
 f0_vec     = LibPETSc.VecDuplicate(petsclib, x)
@@ -470,15 +366,11 @@ inv_h      = T(1) / h_eps
 
 # ── 8. Custom Jacobian callback ───────────────────────────────────────────────
 PETSc.setjacobian!(snes, J) do Jmat, actual_snes, g_x
-    # For MG: if this is a coarser level (grid size differs from fine grid),
+    # For MG: if this is a coarser level (different DM than the fine-grid da),
     # fall back to PETSc's built-in FD coloring (correct for that level's DM).
-    da_level   = PETSc.getDM(actual_snes)
-    info_level = PETSc.getinfo(da_level)
-    if info_level.global_size[1] != mx || info_level.global_size[2] != my
-        LibPETSc.@chk ccall(
-            (:SNESComputeJacobianDefaultColor, petsclib.petsc_library), PetscInt,
-            (LibPETSc.CSNES, LibPETSc.CVec, LibPETSc.CMat, LibPETSc.CMat, Ptr{Cvoid}),
-            actual_snes.ptr, g_x.ptr, Jmat.ptr, Jmat.ptr, C_NULL)
+    da_level = PETSc.getDM(actual_snes)
+    if da_level.ptr != da.ptr
+        LibPETSc.SNESComputeJacobianDefaultColor(petsclib, actual_snes, g_x, Jmat, Jmat, C_NULL)
         return PetscInt(0)
     end
 
@@ -523,15 +415,18 @@ PETSc.setjacobian!(snes, J) do Jmat, actual_snes, g_x
 
     LibPETSc.VecRestoreArrayReadAndMemType(petsclib, f0_vec, f0_arr)
 
-    # ── Assemble J via COO (GPU) or MatSetValues (CPU) ────────────────────────
+    # ── Assemble J via COO ─────────────────────────────────────────────────────
+    # On GPU: pass a raw device pointer so cuSPARSE scatters directly on device.
+    # On CPU: pass the Vector{T} directly (uses the vector overload).
+    # Both paths use the COO pattern registered by MatSetPreallocationCOOLocal.
+    if useCUDA
+        LibPETSc.MatSetValuesCOO(petsclib, Jmat, Ptr{T}(UInt64(pointer(val_dev))), LibPETSc.INSERT_VALUES)
+    else
+        LibPETSc.MatSetValuesCOO(petsclib, Jmat, val_dev, LibPETSc.INSERT_VALUES)
+    end
+    LibPETSc.MatAssemblyBegin(petsclib, Jmat, LibPETSc.MAT_FINAL_ASSEMBLY)
+    LibPETSc.MatAssemblyEnd(petsclib, Jmat, LibPETSc.MAT_FINAL_ASSEMBLY)
     if useCUDA
-        # Assemble via GPU pointer so the GPU (cuSPARSE) copy is up to date.
-        LibPETSc.@chk ccall(
-            (:MatSetValuesCOO, petsclib.petsc_library), PetscInt,
-            (LibPETSc.CMat, Ptr{T}, LibPETSc.InsertMode),
-            Jmat.ptr, Ptr{T}(UInt64(pointer(val_dev))), LibPETSc.INSERT_VALUES)
-        LibPETSc.MatAssemblyBegin(petsclib, Jmat, LibPETSc.MAT_FINAL_ASSEMBLY)
-        LibPETSc.MatAssemblyEnd(petsclib, Jmat, LibPETSc.MAT_FINAL_ASSEMBLY)
         # Force GPU→CPU sync so both copies are valid.
         # MatBindToCPU(PETSC_TRUE) triggers MatSeqAIJCUSPARSECopyFromGPU when
         # offloadmask==PETSC_OFFLOAD_GPU, making the CPU CSR correct.
@@ -539,22 +434,8 @@ PETSc.setjacobian!(snes, J) do Jmat, actual_snes, g_x
         # leaving offloadmask==PETSC_OFFLOAD_BOTH so that:
         #   MatGetDiagonal (Jacobi smoother in MG)  → reads CPU copy ✓
         #   MatPtAP (Galerkin coarse-op formation)  → uses GPU copy  ✓
-        LibPETSc.@chk ccall(
-            (:MatBindToCPU, petsclib.petsc_library), PetscInt,
-            (LibPETSc.CMat, LibPETSc.PetscBool), Jmat.ptr, LibPETSc.PETSC_TRUE)
-        LibPETSc.@chk ccall(
-            (:MatBindToCPU, petsclib.petsc_library), PetscInt,
-            (LibPETSc.CMat, LibPETSc.PetscBool), Jmat.ptr, LibPETSc.PETSC_FALSE)
-    else
-        LibPETSc.MatZeroEntries(petsclib, Jmat)
-        for k in 1:nnz_coo
-            LibPETSc.MatSetValuesLocal(petsclib, Jmat,
-                PetscInt(1), CPetscInt[row_coo_local[k]],
-                PetscInt(1), CPetscInt[col_coo_local[k]],
-                T[val_dev[k]], LibPETSc.INSERT_VALUES)
-        end
-        LibPETSc.MatAssemblyBegin(petsclib, Jmat, LibPETSc.MAT_FINAL_ASSEMBLY)
-        LibPETSc.MatAssemblyEnd(petsclib, Jmat, LibPETSc.MAT_FINAL_ASSEMBLY)
+        LibPETSc.MatBindToCPU(petsclib, Jmat, LibPETSc.PETSC_TRUE)
+        LibPETSc.MatBindToCPU(petsclib, Jmat, LibPETSc.PETSC_FALSE)
     end
     return PetscInt(0)
 end
diff --git a/src/PETSc.jl b/src/PETSc.jl
index 8b8e5b9c..2178d57a 100644
--- a/src/PETSc.jl
+++ b/src/PETSc.jl
@@ -26,6 +26,7 @@ export set_petsclib
 export set_library!, unset_library!, library_info
 export withlocalarray_device!
 export get_petsc_arrays, restore_petsc_arrays
+export dmda_star_fd_coloring
 
 using Libdl
 
diff --git a/src/autowrapped/ISaddons_wrappers.jl b/src/autowrapped/ISaddons_wrappers.jl
index 0c9a1cce..a4645084 100644
--- a/src/autowrapped/ISaddons_wrappers.jl
+++ b/src/autowrapped/ISaddons_wrappers.jl
@@ -1387,7 +1387,7 @@ function ISColoringView(petsclib::PetscLibType, iscoloring::ISColoring, viewer::
 end 
 
 """
-	n::PetscInt,nc::PetscInt = ISColoringGetColors(petsclib::PetscLibType,iscoloring::ISColoring, colors::ISColoringValue) 
+	n::PetscInt, nc::PetscInt, colors::Vector{UInt16} = ISColoringGetColors(petsclib::PetscLibType, iscoloring::ISColoring)
 Returns an array with the color for each local node
 
 Not Collective
@@ -1396,9 +1396,9 @@ Input Parameter:
 - `iscoloring` - the coloring context
 
 Output Parameters:
-- `n`      - number of nodes
+- `n`      - number of nodes (DOFs)
 - `nc`     - number of colors
-- `colors` - color for each node
+- `colors` - copy of the color array (one `UInt16` entry per DOF)
 
 Level: advanced
 
@@ -1407,23 +1407,30 @@ Level: advanced
 # External Links
 $(_doc_external("Vec/ISColoringGetColors"))
 """
-function ISColoringGetColors(petsclib::PetscLibType, iscoloring::ISColoring, colors::ISColoringValue) end
+function ISColoringGetColors(petsclib::PetscLibType, iscoloring::ISColoring) end
 
-@for_petsc function ISColoringGetColors(petsclib::$UnionPetscLib, iscoloring::ISColoring, colors::ISColoringValue )
-	n_ = Ref{$PetscInt}()
-	nc_ = Ref{$PetscInt}()
+@for_petsc function ISColoringGetColors(petsclib::$UnionPetscLib, iscoloring::ISColoring)
+    n_  = Ref{$PetscInt}()
+    nc_ = Ref{$PetscInt}()
+    # PETSc returns a pointer into its own storage; we must not free it.
+    colors_ptr_ = Ref{ISColoringValue}(C_NULL)
 
     @chk ccall(
                (:ISColoringGetColors, $petsc_library),
                PetscErrorCode,
-               (ISColoring, Ptr{$PetscInt}, Ptr{$PetscInt}, ISColoringValue),
-               iscoloring, n_, nc_, colors,
+               (ISColoring, Ptr{$PetscInt}, Ptr{$PetscInt}, Ptr{ISColoringValue}),
+               iscoloring, n_, nc_, colors_ptr_,
               )
 
-	n = n_[]
-	nc = nc_[]
+    n  = n_[]
+    nc = nc_[]
+    # ISColoringValue is a PETSc opaque pointer alias for `unsigned short *`.
+    # Reinterpret as Ptr{UInt16} and copy to a Julia-owned Vector so the
+    # caller can safely use the data after ISColoringDestroy.
+    colors_raw = Ptr{UInt16}(UInt(colors_ptr_[]))
+    colors = copy(unsafe_wrap(Vector{UInt16}, colors_raw, Int(n); own = false))
 
-	return n,nc
+    return n, nc, colors
 end 
 
 """
diff --git a/src/autowrapped/Mat_wrappers.jl b/src/autowrapped/Mat_wrappers.jl
index 7f726f56..3be1ef05 100644
--- a/src/autowrapped/Mat_wrappers.jl
+++ b/src/autowrapped/Mat_wrappers.jl
@@ -19698,6 +19698,7 @@ Level: beginner
 $(_doc_external("Mat/MatSetValuesCOO"))
 """
 function MatSetValuesCOO(petsclib::PetscLibType, A::PetscMat, coo_v::Vector{PetscScalar}, imode::InsertMode) end
+function MatSetValuesCOO(petsclib::PetscLibType, A::PetscMat, coo_v::Ptr{PetscScalar}, imode::InsertMode) end
 
 @for_petsc function MatSetValuesCOO(petsclib::$UnionPetscLib, A::PetscMat, coo_v::Vector{$PetscScalar}, imode::InsertMode )
 
@@ -19712,6 +19713,19 @@ function MatSetValuesCOO(petsclib::PetscLibType, A::PetscMat, coo_v::Vector{Pets
 	return nothing
 end 
 
+@for_petsc function MatSetValuesCOO(petsclib::$UnionPetscLib, A::PetscMat, coo_v::Ptr{$PetscScalar}, imode::InsertMode )
+
+    @chk ccall(
+               (:MatSetValuesCOO, $petsc_library),
+               PetscErrorCode,
+               (CMat, Ptr{$PetscScalar}, InsertMode),
+               A, coo_v, imode,
+              )
+
+
+	return nothing
+end 
+
 """
 	MatSetBindingPropagates(petsclib::PetscLibType,A::PetscMat, flg::PetscBool) 
 Sets whether the state of being bound to the CPU for a GPU matrix type propagates to child and some other associated objects
diff --git a/src/autowrapped/SNES_wrappers.jl b/src/autowrapped/SNES_wrappers.jl
index c2557e1f..105796e0 100644
--- a/src/autowrapped/SNES_wrappers.jl
+++ b/src/autowrapped/SNES_wrappers.jl
@@ -5396,9 +5396,9 @@ Options Database Keys:
 # External Links
 $(_doc_external("SNES/SNESComputeJacobianDefaultColor"))
 """
-function SNESComputeJacobianDefaultColor(petsclib::PetscLibType, snes::PetscSNES, x1::PetscVec, J::PetscMat, B::PetscMat, ctx::Cvoid) end
+function SNESComputeJacobianDefaultColor(petsclib::PetscLibType, snes::PetscSNES, x1::PetscVec, J::PetscMat, B::PetscMat, ctx::Ptr{Cvoid}) end
 
-@for_petsc function SNESComputeJacobianDefaultColor(petsclib::$UnionPetscLib, snes::PetscSNES, x1::PetscVec, J::PetscMat, B::PetscMat, ctx::Cvoid )
+@for_petsc function SNESComputeJacobianDefaultColor(petsclib::$UnionPetscLib, snes::PetscSNES, x1::PetscVec, J::PetscMat, B::PetscMat, ctx::Ptr{Cvoid})
 
     @chk ccall(
                (:SNESComputeJacobianDefaultColor, $petsc_library),
diff --git a/src/dmda.jl b/src/dmda.jl
index 96fc9c8c..c8e0f6c8 100644
--- a/src/dmda.jl
+++ b/src/dmda.jl
@@ -204,4 +204,143 @@ function localinteriorlinearindex(da::AbstractPetscDM{PetscLib}) where PetscLib
     upper = CartesianIndex(ndofs(da), ghost_corners.upper)
     ind_local = LinearIndices(lower:upper)[:, l_inds][:]
     return ind_local
+end
+"""
+    dmda_star_fd_coloring(petsclib, da)
+
+Build all data needed for manual FD coloring of a **2-D** DMDA with a STAR
+stencil, using `IS_COLORING_LOCAL` and ghost-local COO indexing.
+
+Specifically, this function:
+1. Creates an `IS_COLORING_LOCAL` `ISColoring` via `DMCreateColoring` and
+   extracts the per-DOF color vector (ghost-local layout).
+2. Enumerates all STAR-stencil (row, col) pairs for every owned node and
+   records their ghost-local 0-based indices and colors.
+3. Builds per-color index arrays (`perturb_cols`, `coo_idxs`, `local_rows`)
+   ready for use in an FD coloring Newton loop.
+
+Returns a `NamedTuple`:
+- `n_colors`      — number of colors
+- `n_local_dofs`  — number of locally owned DOFs (owned nodes × dof/node)
+- `nnz_coo`       — total number of COO entries
+- `row_coo_local` — ghost-local 0-based row indices (`Vector{PetscInt}`)
+- `col_coo_local` — ghost-local 0-based column indices (`Vector{PetscInt}`)
+- `perturb_cols`  — `perturb_cols[c]`: 1-based owned-local column indices
+                    with color `c-1`; used to scatter `+h` perturbations.
+- `coo_idxs`      — `coo_idxs[c]`: 1-based COO entry indices for color `c-1`
+- `local_rows`    — `local_rows[c]`: corresponding 1-based owned-local
+                    residual-row indices; used to read `(f1-f0)/h`.
+
+!!! note "2-D DMDA STAR stencil only"
+    Neighbor enumeration covers only `±x` and `±y` directions.  The ghost-local
+    flat-index formula is `d + ix*dof + iy*dof*nx_g`.  For a 3-D DMDA:
+    - add `±z` neighbors guarded by `kk > 1` / `kk < mz`,
+    - extend the flat-index formula with `+ iz*dof*nx_g*ny_g`,
+    - reshape `col_colors_mat` to `(dof, nx_g, ny_g, nz_g)`,
+    - decode `z_owned` in the `perturb_cols` loop.
+"""
+function dmda_star_fd_coloring(petsclib::PetscLib, da::AbstractPetscDM{PetscLib}) where PetscLib
+    CPetscInt = petsclib.PetscInt
+
+    # ── ISColoring ────────────────────────────────────────────────────────────
+    # IS_COLORING_LOCAL covers owned + ghost DOFs; ghost colors are consistent
+    # with the owning rank so no extra MPI communication is needed here.
+    iscoloring = LibPETSc.DMCreateColoring(petsclib, da, LibPETSc.IS_COLORING_LOCAL)
+    _, nc_pi, col_colors_local = LibPETSc.ISColoringGetColors(petsclib, iscoloring)
+    n_colors = Int(nc_pi)
+    LibPETSc.ISColoringDestroy(petsclib, iscoloring)
+
+    # ── DMDA geometry ─────────────────────────────────────────────────────────
+    info          = getinfo(da)
+    mx            = Int(info.global_size[1])
+    my            = Int(info.global_size[2])
+    dof_per_node  = Int(info.dof)
+    corners       = getcorners(da)
+    ghost_corners = getghostcorners(da)
+    xs_da  = corners.lower[1];       ys_da  = corners.lower[2]
+    xe_da  = corners.upper[1];       ye_da  = corners.upper[2]
+    xsg_da = ghost_corners.lower[1]; ysg_da = ghost_corners.lower[2]
+    xeg_da = ghost_corners.upper[1]; yeg_da = ghost_corners.upper[2]
+    nx_g_da = xeg_da - xsg_da + 1
+    ny_g_da = yeg_da - ysg_da + 1
+    nx_own  = xe_da  - xs_da  + 1
+    ny_own  = ye_da  - ys_da  + 1
+    ox_coo  = xs_da  - xsg_da   # ghost offset in x (grid nodes)
+    oy_coo  = ys_da  - ysg_da   # ghost offset in y (grid nodes)
+    n_local_dofs = nx_own * ny_own * dof_per_node
+
+    # col_colors_mat[d+1, ix_ghost+1, iy_ghost+1] → color of that ghost DOF
+    col_colors_mat = reshape(col_colors_local, dof_per_node, nx_g_da, ny_g_da)
+
+    # ── COO triplets from 2-D STAR stencil ────────────────────────────────────
+    # Ghost-local 0-based row/col indices (for MatSetPreallocationCOOLocal).
+    # Both use DMDA ghost-local numbering: idx = d + ix_g*dof + iy_g*dof*nx_g.
+    row_coo_local     = CPetscInt[]
+    col_coo_local     = CPetscInt[]
+    local_row_per_coo = CPetscInt[]  # 0-based owned-local row (VecGetArray index)
+    color_per_coo     = CPetscInt[]  # 0-based color of each COO entry's column
+
+    for jj in ys_da:ye_da, ii in xs_da:xe_da
+        ix_gh = ii - xsg_da   # 0-based ghost-x of this owned node
+        iy_gh = jj - ysg_da   # 0-based ghost-y of this owned node
+        ix_ow = ii - xs_da    # 0-based owned-x
+        iy_ow = jj - ys_da    # 0-based owned-y
+
+        neighbors = Tuple{Int,Int}[(ii, jj)]
+        ii > 1  && push!(neighbors, (ii-1, jj))
+        ii < mx && push!(neighbors, (ii+1, jj))
+        jj > 1  && push!(neighbors, (ii, jj-1))
+        jj < my && push!(neighbors, (ii, jj+1))
+
+        for (ni, nj) in neighbors
+            nix_gh = ni - xsg_da
+            njy_gh = nj - ysg_da
+            for d_row in 0:dof_per_node-1, d_col in 0:dof_per_node-1
+                r_local = d_row + ix_gh  * dof_per_node + iy_gh  * dof_per_node * nx_g_da
+                c_local = d_col + nix_gh * dof_per_node + njy_gh * dof_per_node * nx_g_da
+                p_owned = d_row + ix_ow  * dof_per_node + iy_ow  * nx_own       * dof_per_node
+                push!(row_coo_local,     CPetscInt(r_local))
+                push!(col_coo_local,     CPetscInt(c_local))
+                push!(color_per_coo,     CPetscInt(col_colors_mat[d_col+1, nix_gh+1, njy_gh+1]))
+                push!(local_row_per_coo, CPetscInt(p_owned))
+            end
+        end
+    end
+    nnz_coo = length(row_coo_local)
+
+    # ── Per-color index arrays ────────────────────────────────────────────────
+    # perturb_cols[c]: 1-based owned-local column indices with color c-1.
+    # col_colors_local uses the ghost-local layout, but VecGetArray returns only
+    # owned DOFs re-indexed 1..n_local_dofs.  We convert owned-local → ghost-local
+    # before looking up the color.
+    perturb_cols = [Int32[] for _ in 1:n_colors]
+    for p_local in 1:n_local_dofs
+        p0      = p_local - 1
+        d       = p0 % dof_per_node
+        x_owned = (p0 ÷ dof_per_node) % nx_own
+        y_owned = (p0 ÷ dof_per_node) ÷ nx_own
+        k_ghost = d + (x_owned + ox_coo) * dof_per_node +
+                      (y_owned + oy_coo) * dof_per_node * nx_g_da + 1
+        c = Int(col_colors_local[k_ghost]) + 1
+        push!(perturb_cols[c], Int32(p_local))
+    end
+
+    coo_idxs   = [Int32[] for _ in 1:n_colors]
+    local_rows = [Int32[] for _ in 1:n_colors]
+    for k in 1:nnz_coo
+        c = Int(color_per_coo[k]) + 1
+        push!(coo_idxs[c],   Int32(k))
+        push!(local_rows[c], Int32(local_row_per_coo[k] + 1))
+    end
+
+    return (;
+        n_colors,
+        n_local_dofs,
+        nnz_coo,
+        row_coo_local,
+        col_coo_local,
+        perturb_cols,
+        coo_idxs,
+        local_rows,
+    )
 end
\ No newline at end of file

From 4d51898383e474ad6bd847c203c1c71d7f4e6cd2 Mon Sep 17 00:00:00 2001
From: Boris Kaus <boriskaus@gmail.com>
Date: Mon, 27 Apr 2026 19:42:54 +0000
Subject: [PATCH 17/39] code refactor part 2

Co-authored-by: Copilot <copilot@github.com>
---
 examples/ex19.jl | 247 ++++++++++++++++++++++++++++-------------------
 src/snes.jl      |   4 +
 2 files changed, 153 insertions(+), 98 deletions(-)

diff --git a/examples/ex19.jl b/examples/ex19.jl
index 0ef6e6e8..71651775 100644
--- a/examples/ex19.jl
+++ b/examples/ex19.jl
@@ -19,15 +19,43 @@
     ω:           derived from the no-slip condition at each wall
 
   Usage (from the examples/ directory):
+    # Basic run (4×4 default grid)
     julia --project ex19.jl
-    julia --project ex19.jl -snes_monitor -da_grid_x 129 -da_grid_y 129
-    julia --project ex19.jl -snes_monitor -da_grid_x 129 -da_grid_y 129 -log_view
-    mpiexec -n 4 julia --project ex19.jl -snes_monitor -pc_type mg -da_grid_x 64 -da_grid_y 64
 
-  Requires: LocalPreferences.toml in examples/ with PetscInt = "Int32" matching the
-  PETSc build (check with: grep sizeof_PetscInt petscconf.h).
+    # Larger grid with SNES convergence output
+    julia --project ex19.jl -snes_monitor -snes_converged_reason -da_grid_x 129 -da_grid_y 129
 
-  GPU usage: set  useCUDA = true  then run as above.
+    # With PETSc performance log
+    julia --project ex19.jl -da_grid_x 129 -da_grid_y 129 -log_view
+
+    # Multigrid preconditioner (3 levels, Chebyshev+Jacobi smoothers)
+    julia --project ex19.jl -da_grid_x 125 -da_grid_y 125 \\
+        -pc_type mg -pc_mg_levels 3 \\
+        -mg_levels_ksp_type chebyshev -mg_levels_pc_type jacobi \\
+        -snes_monitor -ksp_monitor
+
+    # MPI parallel (4 ranks)
+    mpiexec -n 4 julia --project ex19.jl \\
+        -da_grid_x 256 -da_grid_y 256 \\
+        -pc_type mg -pc_mg_levels 4 \\
+        -mg_levels_ksp_type chebyshev -mg_levels_pc_type jacobi
+
+    # GPU (CUDA) — set  useCUDA = true  at the top of the file, then:
+    julia --project ex19.jl -da_grid_x 256 -da_grid_y 256 \\
+        -pc_type mg -pc_mg_levels 4 \\
+        -mg_levels_ksp_type chebyshev -mg_levels_pc_type jacobi
+
+  Jacobian strategy:
+    Fine-grid level: manual FD coloring via PETSc.dmda_star_fd_coloring +
+      MatSetPreallocationCOOLocal + MatSetValuesCOO.  On GPU the entire
+      perturb → F(x+h) → accumulate loop runs on-device with no host copies.
+    Coarser MG levels: fall back to SNESComputeJacobianDefaultColor
+      (correct for each level's DM, CPU-only).
+
+  Requires: LocalPreferences.toml in examples/ with PetscInt = "Int32" matching
+    the PETSc build (check with: grep sizeof_PetscInt petscconf.h).
+
+  GPU usage: set  useCUDA = true  at the top of this file.
     Requires PETSc built with --with-cuda, and CUDA.jl in the environment.
 =#
 
@@ -179,6 +207,70 @@ end
     @inbounds val[coo_idxs[k]] = (f1[row_idxs[k]] - f0[row_idxs[k]]) * inv_h
 end
 
+# ── FD-coloring Jacobian fill ─────────────────────────────────────────────────
+#
+# Fills val_dev[k] with the forward-difference Jacobian value for COO slot k.
+# Loops over colors: scatter +h onto the owned columns of that color, evaluate
+# F(x + h·eₖ), then accumulate (F1 − F0)/h into val_dev at the matching slots.
+# Does NOT call MatSetValuesCOO or MatAssembly; those remain with the caller.
+#
+# Captures from module scope: useCUDA, backend, CuArray, CuPtr,
+#   scatter_perturb_kernel!, fd_accumulate_kernel!, KernelAbstractions.
+#
+function maybe_wrap_device(arr, mtype, n, ::Type{T}, useCUDA) where T
+    if useCUDA && mtype == LibPETSc.PETSC_MEMTYPE_DEVICE
+        return unsafe_wrap(CuArray, CuPtr{T}(UInt64(pointer(arr))), n)
+    else
+        return arr
+    end
+end
+
+function fd_coloring_jac!(
+    petsclib,
+    snes,
+    g_x,
+    f0_vec, f1_vec, x_pert_vec,
+    val_dev  :: AbstractVector{T},
+    n_colors    :: Int,
+    n_local_dofs :: Int,
+    perturb_cols_dev,
+    coo_idxs_dev,
+    local_rows_dev,
+    h_eps :: T,
+    inv_h :: T,
+) where T
+    LibPETSc.SNESComputeFunction(petsclib, snes, g_x, f0_vec)
+    f0_arr, f0_mtype = LibPETSc.VecGetArrayReadAndMemType(petsclib, f0_vec)
+    f0_dev = maybe_wrap_device(f0_arr, f0_mtype, n_local_dofs, T, useCUDA)
+
+    for c in 1:n_colors
+        isempty(perturb_cols_dev[c]) && continue
+
+        LibPETSc.VecCopy(petsclib, g_x, x_pert_vec)
+        xp_arr, xp_mtype = LibPETSc.VecGetArrayAndMemType(petsclib, x_pert_vec)
+        xp_dev = maybe_wrap_device(xp_arr, xp_mtype, n_local_dofs, T, useCUDA)
+        scatter_perturb_kernel!(backend, 64)(
+            xp_dev, perturb_cols_dev[c], h_eps;
+            ndrange = length(perturb_cols_dev[c]))
+        KernelAbstractions.synchronize(backend)
+        LibPETSc.VecRestoreArrayAndMemType(petsclib, x_pert_vec, xp_arr)
+
+        LibPETSc.SNESComputeFunction(petsclib, snes, x_pert_vec, f1_vec)
+
+        f1_arr, f1_mtype = LibPETSc.VecGetArrayReadAndMemType(petsclib, f1_vec)
+        f1_dev = maybe_wrap_device(f1_arr, f1_mtype, n_local_dofs, T, useCUDA)
+        fd_accumulate_kernel!(backend, 64)(
+            val_dev, f0_dev, f1_dev,
+            coo_idxs_dev[c], local_rows_dev[c], inv_h;
+            ndrange = length(coo_idxs_dev[c]))
+        KernelAbstractions.synchronize(backend)
+        LibPETSc.VecRestoreArrayReadAndMemType(petsclib, f1_vec, f1_arr)
+    end
+
+    LibPETSc.VecRestoreArrayReadAndMemType(petsclib, f0_vec, f0_arr)
+    return nothing
+end
+
 # ── Setup ─────────────────────────────────────────────────────────────────────
 opts     = isinteractive() ? NamedTuple() : PETSc.parse_options(filter(a -> a != "-log_view", ARGS))
 log_view = "-log_view" in ARGS
@@ -186,7 +278,7 @@ log_view = "-log_view" in ARGS
 petsclib = PETSc.getlib(; PetscScalar = Float64, PetscInt = Int32)
 PETSc.initialize(petsclib; log_view)
 
-T        = Float64
+_T        = Float64
 PetscInt = petsclib.PetscInt
 comm     = MPI.COMM_WORLD
 
@@ -201,8 +293,8 @@ da = PETSc.DMDA(
     opts...,
 )
 
-# Stage 2: GPU vecs and GPU matrix enable a fully GPU-resident FD coloring
-# path via COO preallocation.  No host↔device bouncing in residual or Jacobian.
+# GPU vecs and GPU matrix enable a fully GPU-resident FD coloring path via
+# COO preallocation.  No host↔device bouncing in residual or Jacobian.
 if useCUDA
     LibPETSc.DMSetVecType(petsclib, da, "cuda")
     LibPETSc.DMSetMatType(petsclib, da, "aijcusparse")
@@ -216,15 +308,15 @@ info = PETSc.getinfo(da)
 mx   = Int(info.global_size[1])
 my   = Int(info.global_size[2])
 
-user = AppCtx{T}(
-    lidvelocity = T(1) / (mx - 1),
-    prandtl     = T(1),
-    grashof     = T(1),
+user = AppCtx{_T}(
+    lidvelocity = _T(1) / (mx - 1),
+    prandtl     = _T(1),
+    grashof     = _T(1),
 )
 
 # Precomputed grid metrics
-dhx   = T(mx - 1);   dhy   = T(my - 1)
-hx    = one(T) / dhx; hy    = one(T) / dhy
+dhx   = _T(mx - 1);   dhy   = _T(my - 1)
+hx    = one(_T) / dhx; hy    = one(_T) / dhy
 hydhx = hy * dhx;    hxdhy = hx * dhy
 
 # ── Initial condition: u = v = ω = 0, T linear in x ─────────────────────────
@@ -235,14 +327,14 @@ PETSc.withlocalarray!(x; read = false) do x_arr
     xs = corners.lower[1];  ys = corners.lower[2]
     xe = corners.upper[1];  ye = corners.upper[2]
     nx_own = xe - xs + 1;   ny_own = ye - ys + 1
-    dx = one(T) / (mx - 1)
+    dx = one(_T) / (mx - 1)
     x_par = reshape(x_arr, 4, nx_own, ny_own)
     for lj in 1:ny_own, li in 1:nx_own
         ig = xs + li - 1
-        x_par[1, li, lj] = zero(T)
-        x_par[2, li, lj] = zero(T)
-        x_par[3, li, lj] = zero(T)
-        x_par[4, li, lj] = user.grashof > 0 ? T(ig - 1) * dx : zero(T)
+        x_par[1, li, lj] = zero(_T)
+        x_par[2, li, lj] = zero(_T)
+        x_par[3, li, lj] = zero(_T)
+        x_par[4, li, lj] = user.grashof > 0 ? _T(ig - 1) * dx : zero(_T)
     end
 end
 
@@ -288,10 +380,10 @@ PETSc.setfunction!(snes, r) do g_fx, snes, g_x
     info_  = PETSc.getinfo(da)
     mx_    = Int(info_.global_size[1])
     my_    = Int(info_.global_size[2])
-    dhx_   = T(mx_ - 1);    dhy_   = T(my_ - 1)
-    hx_    = one(T) / dhx_; hy_    = one(T) / dhy_
+    dhx_   = _T(mx_ - 1);   dhy_   = _T(my_ - 1)
+    hx_    = one(_T) / dhx_; hy_   = one(_T) / dhy_
     hydhx_ = hy_ * dhx_;    hxdhy_ = hx_ * dhy_
-    lid_   = T(1) / dhx_    # lidvelocity = 1/(mx-1)
+    lid_   = _T(1) / dhx_    # lidvelocity = 1/(mx-1)
 
     cavity_residual_kernel!(backend, 64)(
         f_par, x_par,
@@ -309,15 +401,17 @@ end
 
 # ── Jacobian: manual FD coloring with GPU-efficient COO matrix assembly ───────
 #
-# 1) Obtain the DM's IS_COLORING_LOCAL ISColoring.  Colors cover owned + ghost
-#    DOFs in DMDA local ordering; ghost colors are consistent with owning ranks.
-# 2) Build the sparse (row, col) COO triplets analytically from the DMDA STAR
-#    stencil; record per-COO the 0-based color of its column, looked up via
-#    ghost-local coordinates (works for owned AND off-rank ghost columns).
-# 3) Pre-allocate J via MatSetPreallocationCOO (one-time GPU setup).
-# 4) Each Newton step: loop over colors, scatter +h to owned cols of that
-#    color (GPU kernel), evaluate F(x_pert), accumulate (F1−F0)/h into
-#    val[] (GPU kernel), assemble via MatSetValuesCOO(J, val_dev).
+# Setup (one-time):
+#   dmda_star_fd_coloring  — builds IS_COLORING_LOCAL coloring and analytically
+#     derives the STAR-stencil COO (row, col) triplets; returns per-color index
+#     arrays for the owned columns and COO slots to fill.
+#   MatSetPreallocationCOOLocal  — registers the COO pattern (enables on-device
+#     scatter on GPU; avoids per-entry hash-table lookups on CPU).
+#
+# Each Newton step (fd_coloring_jac!):
+#   For each color: copy x → x_pert, scatter +h to owned cols of that color
+#   (GPU kernel), evaluate F(x_pert), accumulate (F1−F0)/h into val[] (GPU
+#   kernel).  Then MatSetValuesCOO assembles J from val[] in one call.
 #
 # NOTE: For MG, coarser levels fall back to SNESComputeJacobianDefaultColor
 #       (correct, but CPU-only FD coloring for those levels).
@@ -326,14 +420,14 @@ end
 # Builds the IS_COLORING_LOCAL coloring for da's STAR stencil, ghost-local COO
 # (row, col) pairs, and per-color owned-column / COO-entry index arrays.
 # 2-D DMDA STAR stencil only; see PETSc.dmda_star_fd_coloring for 3-D notes.
-coloring     = PETSc.dmda_star_fd_coloring(petsclib, da)
-n_colors     = coloring.n_colors
-n_local_dofs = coloring.n_local_dofs
-nnz_coo      = coloring.nnz_coo
+coloring      = PETSc.dmda_star_fd_coloring(petsclib, da)
+n_colors      = coloring.n_colors
+n_local_dofs  = coloring.n_local_dofs
+nnz_coo       = coloring.nnz_coo
 row_coo_local = coloring.row_coo_local
 col_coo_local = coloring.col_coo_local
 
-# ── 5. Create J ───────────────────────────────────────────────────────────────
+# ── Create J ─────────────────────────────────────────────────────────────────
 J = LibPETSc.DMCreateMatrix(petsclib, da)
 # Register the COO pattern on both CPU and GPU.  This allows MatSetValuesCOO
 # to be used for assembly in both cases, avoiding per-entry hash-table lookups
@@ -349,22 +443,22 @@ if useCUDA
     perturb_cols_dev = [CuArray(v) for v in perturb_cols_1b]
     coo_idxs_dev     = [CuArray(v) for v in coo_idxs_1b]
     local_rows_dev   = [CuArray(v) for v in local_rows_1b]
-    val_dev          = CUDA.zeros(T, nnz_coo)
+    val_dev          = CUDA.zeros(_T, nnz_coo)
 else
     perturb_cols_dev = perturb_cols_1b
     coo_idxs_dev     = coo_idxs_1b
     local_rows_dev   = local_rows_1b
-    val_dev          = zeros(T, nnz_coo)
+    val_dev          = zeros(_T, nnz_coo)
 end
 
-# ── 7. Scratch vectors for the FD loop ────────────────────────────────────────
+# ── Scratch vectors for the FD loop ────────────────────────────────────────
 x_pert_vec = LibPETSc.VecDuplicate(petsclib, x)
 f0_vec     = LibPETSc.VecDuplicate(petsclib, x)
 f1_vec     = LibPETSc.VecDuplicate(petsclib, x)
-h_eps      = T(sqrt(eps(T)))
-inv_h      = T(1) / h_eps
+h_eps      = _T(sqrt(eps(_T)))
+inv_h      = _T(1) / h_eps
 
-# ── 8. Custom Jacobian callback ───────────────────────────────────────────────
+# ── Jacobian callback ────────────────────────────────────────────────────────
 PETSc.setjacobian!(snes, J) do Jmat, actual_snes, g_x
     # For MG: if this is a coarser level (different DM than the fine-grid da),
     # fall back to PETSc's built-in FD coloring (correct for that level's DM).
@@ -374,53 +468,21 @@ PETSc.setjacobian!(snes, J) do Jmat, actual_snes, g_x
         return PetscInt(0)
     end
 
-    # ── Evaluate F(x) → f0 ────────────────────────────────────────────────────
-    LibPETSc.SNESComputeFunction(petsclib, actual_snes, g_x, f0_vec)
-    f0_arr, f0_mtype = LibPETSc.VecGetArrayReadAndMemType(petsclib, f0_vec)
-    f0_dev = (useCUDA && f0_mtype == LibPETSc.PETSC_MEMTYPE_DEVICE) ?
-        unsafe_wrap(CuArray, CuPtr{T}(UInt64(pointer(f0_arr))), n_local_dofs) :
-        f0_arr
-
-    # ── FD loop over colors ────────────────────────────────────────────────────
-    for c in 1:n_colors
-        isempty(perturb_cols_dev[c]) && continue
-
-        # Copy x → x_pert, then scatter +h to owned cols of color c.
-        LibPETSc.VecCopy(petsclib, g_x, x_pert_vec)
-        xp_arr, xp_mtype = LibPETSc.VecGetArrayAndMemType(petsclib, x_pert_vec)
-        xp_dev = (useCUDA && xp_mtype == LibPETSc.PETSC_MEMTYPE_DEVICE) ?
-            unsafe_wrap(CuArray, CuPtr{T}(UInt64(pointer(xp_arr))), n_local_dofs) :
-            xp_arr
-        scatter_perturb_kernel!(backend, 64)(
-            xp_dev, perturb_cols_dev[c], h_eps;
-            ndrange = length(perturb_cols_dev[c]))
-        KernelAbstractions.synchronize(backend)
-        LibPETSc.VecRestoreArrayAndMemType(petsclib, x_pert_vec, xp_arr)
-
-        # Evaluate F(x_pert) → f1.
-        LibPETSc.SNESComputeFunction(petsclib, actual_snes, x_pert_vec, f1_vec)
-
-        # Accumulate (f1 − f0)/h into val[] at the COO indices of color c.
-        f1_arr, f1_mtype = LibPETSc.VecGetArrayReadAndMemType(petsclib, f1_vec)
-        f1_dev = (useCUDA && f1_mtype == LibPETSc.PETSC_MEMTYPE_DEVICE) ?
-            unsafe_wrap(CuArray, CuPtr{T}(UInt64(pointer(f1_arr))), n_local_dofs) :
-            f1_arr
-        fd_accumulate_kernel!(backend, 64)(
-            val_dev, f0_dev, f1_dev,
-            coo_idxs_dev[c], local_rows_dev[c], inv_h;
-            ndrange = length(coo_idxs_dev[c]))
-        KernelAbstractions.synchronize(backend)
-        LibPETSc.VecRestoreArrayReadAndMemType(petsclib, f1_vec, f1_arr)
-    end
-
-    LibPETSc.VecRestoreArrayReadAndMemType(petsclib, f0_vec, f0_arr)
+    # ── FD coloring: fill val_dev with Jacobian entries ────────────────────────
+    fd_coloring_jac!(
+        petsclib, actual_snes, g_x,
+        f0_vec, f1_vec, x_pert_vec, val_dev,
+        n_colors, n_local_dofs,
+        perturb_cols_dev, coo_idxs_dev, local_rows_dev,
+        h_eps, inv_h,
+    )
 
     # ── Assemble J via COO ─────────────────────────────────────────────────────
     # On GPU: pass a raw device pointer so cuSPARSE scatters directly on device.
     # On CPU: pass the Vector{T} directly (uses the vector overload).
     # Both paths use the COO pattern registered by MatSetPreallocationCOOLocal.
     if useCUDA
-        LibPETSc.MatSetValuesCOO(petsclib, Jmat, Ptr{T}(UInt64(pointer(val_dev))), LibPETSc.INSERT_VALUES)
+        LibPETSc.MatSetValuesCOO(petsclib, Jmat, Ptr{_T}(UInt64(pointer(val_dev))), LibPETSc.INSERT_VALUES)
     else
         LibPETSc.MatSetValuesCOO(petsclib, Jmat, val_dev, LibPETSc.INSERT_VALUES)
     end
@@ -450,17 +512,6 @@ if MPI.Comm_rank(comm) == 0
 end
 
 # ── Cleanup ───────────────────────────────────────────────────────────────────
-# Explicitly destroy the PetscOptions stored on the SNES before finalization.
-# Its GC finalizer calls PetscOptionsDestroy, which can use MPI internally.
-# If GC runs it after MPI is alive but in a different collective-sync state
-# across ranks, it triggers intermittent crashes.  Destroying it explicitly
-# here (while all ranks are synchronized and PETSc/MPI are still fully active)
-# is safe and prevents any later GC-driven call.
-if !isnothing(snes.opts)
-    PETSc.destroy(snes.opts)
-    snes.opts = nothing
-end
-
 # Run a full GC now so any lingering VecRestoreArray finalizers from
 # withlocalarray! run while PETSc is still valid, then barrier all ranks.
 GC.gc(true)
@@ -470,9 +521,9 @@ MPI.Barrier(comm)
 # those reference counts are decremented before we explicitly free the objects.
 PETSc.destroy(snes)
 PETSc.destroy(J)
-LibPETSc.VecDestroy(petsclib, x_pert_vec)
-LibPETSc.VecDestroy(petsclib, f0_vec)
-LibPETSc.VecDestroy(petsclib, f1_vec)
+PETSc.destroy(x_pert_vec)
+PETSc.destroy(f0_vec)
+PETSc.destroy(f1_vec)
 PETSc.destroy(x)
 PETSc.destroy(r)
 PETSc.destroy(da)
diff --git a/src/snes.jl b/src/snes.jl
index 2c329dc5..e1b2fb1d 100644
--- a/src/snes.jl
+++ b/src/snes.jl
@@ -247,6 +247,10 @@ is garbage collected, but can be called explicitly to free resources immediately
 $(_doc_external("SNES/SNESDestroy"))
 """
 function destroy(snes::AbstractPetscSNES{PetscLib}) where {PetscLib}
+    if !isnothing(snes.opts)
+        destroy(snes.opts)
+        snes.opts = nothing
+    end
     if !(finalized(PetscLib)) && snes.ptr != C_NULL
         LibPETSc.SNESDestroy(PetscLib, snes)
     end

From c2061eecb2788705da4b6e9380a704602d83d678 Mon Sep 17 00:00:00 2001
From: Boris Kaus <boriskaus@gmail.com>
Date: Mon, 27 Apr 2026 19:55:35 +0000
Subject: [PATCH 18/39] add gpu doc page

---
 docs/make.jl        |   1 +
 docs/src/man/gpu.md | 131 ++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 132 insertions(+)
 create mode 100644 docs/src/man/gpu.md

diff --git a/docs/make.jl b/docs/make.jl
index 17f7c94f..d9fcc08d 100644
--- a/docs/make.jl
+++ b/docs/make.jl
@@ -49,6 +49,7 @@ makedocs(;
         ],
         "Utilities" => "man/utilities.md",
         "Running on HPC Systems" => "man/hpc.md",
+        "GPU Support (CUDA)" => "man/gpu.md",
         "FAQ"  => "man/FAQ.md",
         "Contributing"  => "man/contributing.md",
         "Funding" => "man/funding.md",
diff --git a/docs/src/man/gpu.md b/docs/src/man/gpu.md
new file mode 100644
index 00000000..4a924a33
--- /dev/null
+++ b/docs/src/man/gpu.md
@@ -0,0 +1,131 @@
+# GPU Support (CUDA + KernelAbstractions)
+
+Julia has outstanding support for GPU's as it compiles machine code for the particular devices. Importantly, all modern GPUs are suppported, which implies that it is quite straightforward to write GPU kernels in Julia, for example by packages such as [KernelAbstractions](https://github.com/JuliaGPU/KernelAbstractions.jl)
+
+PETSc also has added GPU support in recent years, and PETSc vector and matrix objects, along with manu of the solvfers, can be moved to the GPU.
+
+GPU support in PETSc.jl requires a **locally built PETSc** with CUDA or HIP enabled — the precompiled `PETSc_jll` binaries do not include GPU support. See [Installation](@ref) for instructions on pointing PETSc.jl at a local library.
+The examples below are given for CUDA. Doing this on AMD machines (HIP) will likely work the same biut will require a spevific extension to bve added.
+
+## Prerequisites
+
+1. **PETSc built with CUDA** — configure with `--with-cuda=1` (and the matching `--with-cuda-dir`). Confirm with:
+   ```bash
+   grep -i cuda $PETSC_DIR/$PETSC_ARCH/include/petscconf.h
+   # should show: #define PETSC_HAVE_CUDA 1
+   ```
+
+2. **CUDA.jl** in your Julia environment:
+   ```julia
+   pkg> add CUDA
+   ```
+
+3. **KernelAbstractions.jl** if you want to write portable GPU/CPU kernels:
+   ```julia
+   pkg> add KernelAbstractions
+   ```
+
+## How it works
+
+When CUDA.jl is loaded alongside PETSc.jl, the `PETScCUDAExt` extension is activated automatically. It registers CUDA-aware implementations for the functions below via Julia's package extension mechanism — no extra configuration is needed.
+
+PETSc manages where vector data lives (host or device). The extension inspects the `PetscMemType` returned by `VecGetArray*AndMemType` calls and either wraps the device pointer as a `CuArray` (zero-copy) or allocates a bounce buffer if the data needs to move between host and device.
+
+## Public API
+
+### `withlocalarray_device!`
+
+Callback-based access to the underlying array of one or more Vecs, returning a `CuArray` when the data is on the device:
+
+```julia
+withlocalarray_device!(f!, vecs...; read=true, write=true)
+```
+
+```julia
+using PETSc, CUDA, KernelAbstractions
+
+withlocalarray_device!(my_vec; read=false, write=true) do arr
+    # arr is a CuArray if the Vec lives on the GPU, plain Array otherwise
+    fill!(arr, 42)
+end
+```
+
+For multiple Vecs, pass keyword tuples to control read/write access per Vec:
+
+```julia
+withlocalarray_device!(
+    (x_vec, f_vec);
+    read  = (true,  false),
+    write = (false, true),
+) do x_arr, f_arr
+    my_kernel!(backend)(f_arr, x_arr; ndrange = length(f_arr))
+    KernelAbstractions.synchronize(backend)
+end
+```
+
+### `get_petsc_arrays` / `restore_petsc_arrays`
+
+Lower-level paired get/restore for the residual function pattern, where you need both a global output Vec and a local (ghost-padded) input Vec:
+
+```julia
+fx, lx, fx_arr, lx_arr, fx_bounce = get_petsc_arrays(petsclib, g_fx, l_x)
+# launch kernel writing into fx, reading from lx
+restore_petsc_arrays(petsclib, g_fx, l_x, fx, lx, fx_arr, lx_arr, fx_bounce)
+```
+
+- When both Vecs are on the GPU, `fx` and `lx` are zero-copy `CuArray` wrappers.
+- When `l_x` is host-resident (e.g. on a coarser MG level), the data is copied host→device before the kernel and the result is copied device→host by `restore_petsc_arrays`.
+- On a CPU-only path (CUDA.jl not loaded, or all Vecs on host), `fx`/`lx` are plain `Array`s with no copies.
+
+## Writing portable kernels with KernelAbstractions
+
+Select the backend at the top of your script based on the `useCUDA` flag:
+
+```julia
+using KernelAbstractions
+using CUDA
+import CUDA: CuArray, CuPtr, unsafe_wrap
+
+const backend = CUDABackend()   # or CPU() for a CPU run
+```
+
+Write kernels with `@kernel` so the same code runs on both backends:
+
+```julia
+@kernel function my_kernel!(out, inp)
+    i = @index(Global)
+    out[i] = inp[i] * 2
+end
+
+# launch:
+my_kernel!(backend, 256)(out_arr, inp_arr; ndrange = length(out_arr))
+KernelAbstractions.synchronize(backend)
+```
+
+## Example
+
+[`examples/ex19.jl`](https://github.com/JuliaParallel/PETSc.jl/blob/main/examples/ex19.jl) is a full 2D driven-cavity example (velocity–vorticity–temperature) that demonstrates:
+
+- Switching between CPU and GPU with a single `useCUDA` flag.
+- FD coloring-based Jacobian assembly running entirely on-device.
+- `get_petsc_arrays` / `restore_petsc_arrays` in the residual callback.
+- Multigrid preconditioning with coarser levels falling back to a CPU Jacobian.
+
+To run it on a GPU:
+
+1. Set `const useCUDA = true` near the top of `ex19.jl`.
+2. Ensure your local PETSc build has CUDA support and is linked via `PETSc.set_library!`.
+3. Launch:
+   ```bash
+   julia --project ex19.jl -da_grid_x 256 -da_grid_y 256 \
+       -pc_type mg -pc_mg_levels 4 \
+       -mg_levels_ksp_type chebyshev -mg_levels_pc_type jacobi \
+       -snes_monitor -ksp_monitor
+   ```
+
+!!! note
+    The `PetscInt` type in your `LocalPreferences.toml` must match the PETSc build.
+    Check with `grep sizeof_PetscInt $PETSC_DIR/$PETSC_ARCH/include/petscconf.h`.
+
+
+## Performance 

From 1f4d1a31b3d28234e37901d6b273112d92f2429e Mon Sep 17 00:00:00 2001
From: Boris Kaus <boriskaus@gmail.com>
Date: Mon, 27 Apr 2026 19:56:42 +0000
Subject: [PATCH 19/39] grammar

---
 docs/src/man/gpu.md | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/docs/src/man/gpu.md b/docs/src/man/gpu.md
index 4a924a33..fd3edc90 100644
--- a/docs/src/man/gpu.md
+++ b/docs/src/man/gpu.md
@@ -1,11 +1,11 @@
 # GPU Support (CUDA + KernelAbstractions)
 
-Julia has outstanding support for GPU's as it compiles machine code for the particular devices. Importantly, all modern GPUs are suppported, which implies that it is quite straightforward to write GPU kernels in Julia, for example by packages such as [KernelAbstractions](https://github.com/JuliaGPU/KernelAbstractions.jl)
+Julia has outstanding support for GPUs as it compiles machine code for the particular devices. Importantly, all modern GPUs are supported, which implies that it is quite straightforward to write GPU kernels in Julia, for example using packages such as [KernelAbstractions](https://github.com/JuliaGPU/KernelAbstractions.jl).
 
-PETSc also has added GPU support in recent years, and PETSc vector and matrix objects, along with manu of the solvfers, can be moved to the GPU.
+PETSc has also added GPU support in recent years, and PETSc vector and matrix objects, along with many of the solvers, can be moved to the GPU.
 
 GPU support in PETSc.jl requires a **locally built PETSc** with CUDA or HIP enabled — the precompiled `PETSc_jll` binaries do not include GPU support. See [Installation](@ref) for instructions on pointing PETSc.jl at a local library.
-The examples below are given for CUDA. Doing this on AMD machines (HIP) will likely work the same biut will require a spevific extension to bve added.
+The examples below are given for CUDA. Doing this on AMD machines (HIP) will likely work the same but will require a specific extension to be added.
 
 ## Prerequisites
 

From 3c2271fc244470442428e93bc73bed4b373d077c Mon Sep 17 00:00:00 2001
From: Boris Kaus <boriskaus@gmail.com>
Date: Mon, 27 Apr 2026 20:49:59 +0000
Subject: [PATCH 20/39] optimizations

Co-authored-by: Copilot <copilot@github.com>
---
 src/dmda.jl | 85 ++++++++++++++++++++++++++++++++++++-----------------
 1 file changed, 58 insertions(+), 27 deletions(-)

diff --git a/src/dmda.jl b/src/dmda.jl
index c8e0f6c8..7bfc9330 100644
--- a/src/dmda.jl
+++ b/src/dmda.jl
@@ -275,45 +275,75 @@ function dmda_star_fd_coloring(petsclib::PetscLib, da::AbstractPetscDM{PetscLib}
     # ── COO triplets from 2-D STAR stencil ────────────────────────────────────
     # Ghost-local 0-based row/col indices (for MatSetPreallocationCOOLocal).
     # Both use DMDA ghost-local numbering: idx = d + ix_g*dof + iy_g*dof*nx_g.
-    row_coo_local     = CPetscInt[]
-    col_coo_local     = CPetscInt[]
-    local_row_per_coo = CPetscInt[]  # 0-based owned-local row (VecGetArray index)
-    color_per_coo     = CPetscInt[]  # 0-based color of each COO entry's column
+    #
+    # Pre-compute exact nnz_coo analytically so we can allocate once and fill
+    # by index rather than growing via push! (avoids O(nnz) realloc+copy work).
+    # Each owned node contributes dof² entries per STAR neighbor it has.
+    # Interior nodes have 5 neighbors (self + 4); boundary nodes have fewer.
+    nbr_left   = xs_da == 1  ? nx_own - 1 : nx_own   # columns with a left neighbor
+    nbr_right  = xe_da == mx ? nx_own - 1 : nx_own   # columns with a right neighbor
+    nbr_bottom = ys_da == 1  ? ny_own - 1 : ny_own   # rows with a bottom neighbor
+    nbr_top    = ye_da == my ? ny_own - 1 : ny_own   # rows with a top neighbor
+    total_nbr_pairs = nx_own * ny_own +               # self
+                      nbr_left   * ny_own +
+                      nbr_right  * ny_own +
+                      nbr_bottom * nx_own +
+                      nbr_top    * nx_own
+    nnz_coo = total_nbr_pairs * dof_per_node^2
+
+    row_coo_local     = Vector{CPetscInt}(undef, nnz_coo)
+    col_coo_local     = Vector{CPetscInt}(undef, nnz_coo)
+    local_row_per_coo = Vector{CPetscInt}(undef, nnz_coo)  # 0-based owned-local row
+    color_per_coo     = Vector{CPetscInt}(undef, nnz_coo)  # 0-based color of column
+
+    # Inner helper: write dof² entries for a (row-node, col-node) pair from index k on
+    # and return the next free index. Captures the COO arrays and grid sizes only.
+    @inline function fill_nbr!(k, ix_gh, iy_gh, ix_ow, iy_ow, nix_gh, njy_gh)
+        r_base = ix_gh  * dof_per_node + iy_gh  * dof_per_node * nx_g_da
+        c_base = nix_gh * dof_per_node + njy_gh * dof_per_node * nx_g_da
+        p_base = ix_ow  * dof_per_node + iy_ow  * nx_own       * dof_per_node
+        for d_row in 0:dof_per_node-1
+            r_local = CPetscInt(d_row + r_base)
+            p_owned = CPetscInt(d_row + p_base)
+            for d_col in 0:dof_per_node-1
+                @inbounds begin
+                    row_coo_local[k]     = r_local
+                    col_coo_local[k]     = CPetscInt(d_col + c_base)
+                    color_per_coo[k]     = CPetscInt(col_colors_mat[d_col+1, nix_gh+1, njy_gh+1])
+                    local_row_per_coo[k] = p_owned
+                end
+                k += 1
+            end
+        end
+        return k
+    end
 
+    k = 1
     for jj in ys_da:ye_da, ii in xs_da:xe_da
         ix_gh = ii - xsg_da   # 0-based ghost-x of this owned node
         iy_gh = jj - ysg_da   # 0-based ghost-y of this owned node
         ix_ow = ii - xs_da    # 0-based owned-x
         iy_ow = jj - ys_da    # 0-based owned-y
 
-        neighbors = Tuple{Int,Int}[(ii, jj)]
-        ii > 1  && push!(neighbors, (ii-1, jj))
-        ii < mx && push!(neighbors, (ii+1, jj))
-        jj > 1  && push!(neighbors, (ii, jj-1))
-        jj < my && push!(neighbors, (ii, jj+1))
-
-        for (ni, nj) in neighbors
-            nix_gh = ni - xsg_da
-            njy_gh = nj - ysg_da
-            for d_row in 0:dof_per_node-1, d_col in 0:dof_per_node-1
-                r_local = d_row + ix_gh  * dof_per_node + iy_gh  * dof_per_node * nx_g_da
-                c_local = d_col + nix_gh * dof_per_node + njy_gh * dof_per_node * nx_g_da
-                p_owned = d_row + ix_ow  * dof_per_node + iy_ow  * nx_own       * dof_per_node
-                push!(row_coo_local,     CPetscInt(r_local))
-                push!(col_coo_local,     CPetscInt(c_local))
-                push!(color_per_coo,     CPetscInt(col_colors_mat[d_col+1, nix_gh+1, njy_gh+1]))
-                push!(local_row_per_coo, CPetscInt(p_owned))
-            end
-        end
+        # self
+        k = fill_nbr!(k, ix_gh, iy_gh, ix_ow, iy_ow, ix_gh, iy_gh)
+        # left
+        ii > 1  && (k = fill_nbr!(k, ix_gh, iy_gh, ix_ow, iy_ow, ix_gh - 1, iy_gh))
+        # right
+        ii < mx && (k = fill_nbr!(k, ix_gh, iy_gh, ix_ow, iy_ow, ix_gh + 1, iy_gh))
+        # bottom
+        jj > 1  && (k = fill_nbr!(k, ix_gh, iy_gh, ix_ow, iy_ow, ix_gh, iy_gh - 1))
+        # top
+        jj < my && (k = fill_nbr!(k, ix_gh, iy_gh, ix_ow, iy_ow, ix_gh, iy_gh + 1))
     end
-    nnz_coo = length(row_coo_local)
 
     # ── Per-color index arrays ────────────────────────────────────────────────
     # perturb_cols[c]: 1-based owned-local column indices with color c-1.
     # col_colors_local uses the ghost-local layout, but VecGetArray returns only
     # owned DOFs re-indexed 1..n_local_dofs.  We convert owned-local → ghost-local
     # before looking up the color.
-    perturb_cols = [Int32[] for _ in 1:n_colors]
+    hint_cols = max(1, n_local_dofs ÷ n_colors)
+    perturb_cols = [sizehint!(Int32[], hint_cols) for _ in 1:n_colors]
     for p_local in 1:n_local_dofs
         p0      = p_local - 1
         d       = p0 % dof_per_node
@@ -325,8 +355,9 @@ function dmda_star_fd_coloring(petsclib::PetscLib, da::AbstractPetscDM{PetscLib}
         push!(perturb_cols[c], Int32(p_local))
     end
 
-    coo_idxs   = [Int32[] for _ in 1:n_colors]
-    local_rows = [Int32[] for _ in 1:n_colors]
+    hint_coo = max(1, nnz_coo ÷ n_colors)
+    coo_idxs   = [sizehint!(Int32[], hint_coo) for _ in 1:n_colors]
+    local_rows = [sizehint!(Int32[], hint_coo) for _ in 1:n_colors]
     for k in 1:nnz_coo
         c = Int(color_per_coo[k]) + 1
         push!(coo_idxs[c],   Int32(k))

From 9aa38fbf07d45656e46ea71c9e6db787ed67fb92 Mon Sep 17 00:00:00 2001
From: Boris Kaus <boriskaus@gmail.com>
Date: Tue, 28 Apr 2026 08:38:52 +0000
Subject: [PATCH 21/39] update docs with scalability results

---
 docs/src/man/gpu.md | 77 +++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 77 insertions(+)

diff --git a/docs/src/man/gpu.md b/docs/src/man/gpu.md
index fd3edc90..fddcbf23 100644
--- a/docs/src/man/gpu.md
+++ b/docs/src/man/gpu.md
@@ -129,3 +129,80 @@ To run it on a GPU:
 
 
 ## Performance 
+
+We have checked the performance of `examples/ex19.jl` by running it on a GPU and on 1 or 32 CPU cores of a Grace Hopper (GH200) machine, using the following options:
+
+```bash
+$mpiexec -n 1 julia --project ex19.jl -snes_monitor -snes_converged_reason -pc_type mg -mg_levels_ksp_type chebyshev -mg_levels_pc_type jacobi -ksp_monitor -log_view -mg_levels_esteig_ksp_max_it 20 -mg_levels_ksp_chebyshev_esteig 0,0.1,0,1.1 -mg_levels_ksp_max_it 3 -da_grid_x 513 -da_grid_y 513 -pc_mg_levels 6
+```
+We performed tests in which we doubled the resolution in `x` and `y` while increasing the number of multigrid levels such that the coarse grid level has the same size in all cases.
+
+The results of the full solve include booting up Julia and computing the coloring pattern (which doesn't scale well).
+
+Below we report results for the inner solve itself (KSPSolve) on a GPU (with 1 MPI rank, which is a requirement of PETSc), and compare them with the CPU results on 1 core and on 32 cores.
+
+
+**KSPSolve**:
+| Resolution | GPU time (s) | GPU (GFlop/s) | CPU-1 time (s) | CPU-1 (GFlop/s) | CPU-32 time (s) | CPU-32 (GFlop/s) |
+|---|---|---|---|---|---|---|
+| 513²   |  0.116 | 144.3 |   4.337 |  4.1 |  0.297 | 61.0 |
+| 1025²  |  0.299 | 249.5 |  19.10  |  3.9 |  1.196 | 61.7 |
+| 2049²  |  1.118 | 295.5 |  89.76  |  3.6 |  5.757 | 56.6 |
+| 4097²  |  4.540 | 312.4 | 422.2   |  3.3 | 28.30  | 49.4 |
+
+
+**SNESSolve**:
+| Resolution | GPU time (s) | GPU (GFlop/s) | CPU-1 time (s) | CPU-1 (GFlop/s) | CPU-32 time (s) | CPU-32 (GFlop/s) |
+|---|---|---|---|---|---|---|
+| 513²   |  3.645 |  6.7 |   8.041 | 3.2 |  2.118 | 12.3 |
+| 1025²  |  5.127 | 20.5 |  32.75  | 3.2 |  3.698 | 28.2 |
+| 2049²  | 11.37  | 39.7 | 144.1   | 3.1 | 10.57  | 42.3 |
+| 4097²  | 36.88  | 47.7 | 658.3   | 2.9 | 43.58  | 43.2 |
+
+From this it is clear that the `KSPSolve` itself is very efficient on the GPU (and clearly beats the CPU), but that there is quite some overhead when we compare it with `SNESSolve` where this difference is not so large anymore.
+
+
+Lets have a look in detail on whethervthe example actually runs on the GPU:
+
+***GPU Utilisation Evidence — ex19.jl on GH200 (SM90)***
+
+All GPU runs use a CUDA-enabled PETSc build (cuSPARSE, cuBLAS) with KernelAbstractions.jl providing the residual kernel. The PETSc `GPU %F` column (fraction of flops executed on GPU) and the host↔device transfer logs provide direct evidence of efficient GPU utilisation.
+
+*1. Flop fraction on GPU*
+
+Nearly all floating point work executes on the GPU across all resolutions and all major solver phases:
+
+| Event | GPU %F |
+|---|---|
+| KSPSolve | 100% |
+| PCApply | 100% |
+| PCSetUp | 98–99% |
+| SNESSolve | 99–100% |
+
+At the kernel level, all key GMRES and multigrid operations run fully on the GPU: `MatMult`, `MatResidual`, `VecMAXPY`, `VecMDot`, `VecAXPBYCZ`, `VecAYPX`, `VecPointwiseMult`, `VecNormalize` all report 100% GPU %F.
+
+*2. Sparse matrix storage and transfers*
+
+System matrices are assembled on the host and uploaded once per Newton iteration via `MatCUSPARSECopyTo`, then all SpMV operations run in cuSPARSE format. Only 3 GpuToCpu copies occur per solve (one per Newton iteration), confirming matrices remain GPU-resident throughout.
+
+| Resolution | Matrix upload size (MB) |
+|---|---|
+| 513²  |    495 |
+| 1025² |  1,990 |
+| 2049² |  7,950 |
+| 4097² | 31,800 |
+
+*3. Vector transfers*
+
+Vectors stay GPU-resident. The number of host↔device copies is small relative to the hundreds of thousands of vector operations performed:
+
+| Resolution | CpuToGpu count | CpuToGpu (MB) | GpuToCpu count | GpuToCpu (MB) |
+|---|---|---|---|---|
+| 513²  | 37 |    109 | 24 |    42 |
+| 1025² | 55 |    437 | 36 |   168 |
+| 2049² | 64 |  1,750 | 42 |   672 |
+| 4097² | 73 |  6,980 | 48 | 2,690 |
+
+*4. Residual evaluation (KernelAbstractions)*
+
+`MatFDColorApply` — which drives all residual evaluations for the finite-difference Jacobian — reports 0% GPU %F in PETSc's profiler. This is expected: the residual kernel is launched by Julia's CUDA.jl runtime (via KernelAbstractions) and its flops are invisible to PETSc's event system. GPU execution is confirmed indirectly by the GpuToCpu transfer pattern in `MatFDColorApply`: PETSc hands off perturbed vectors, the KA kernel evaluates the residual on the GPU, and the result is returned. On the GH200's unified memory architecture these transfers are intra-device and incur minimal latency.
\ No newline at end of file

From d9fe358fb52a7371cc1258c7f5f2f8aa8c52ae32 Mon Sep 17 00:00:00 2001
From: Boris Kaus <boriskaus@gmail.com>
Date: Tue, 28 Apr 2026 08:42:48 +0000
Subject: [PATCH 22/39] typos

---
 docs/src/man/gpu.md | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/docs/src/man/gpu.md b/docs/src/man/gpu.md
index fddcbf23..8b04190b 100644
--- a/docs/src/man/gpu.md
+++ b/docs/src/man/gpu.md
@@ -162,10 +162,11 @@ Below we report results for the inner solve itself (KSPSolve) on a GPU (with 1 M
 From this it is clear that the `KSPSolve` itself is very efficient on the GPU (and clearly beats the CPU), but that there is quite some overhead when we compare it with `SNESSolve` where this difference is not so large anymore.
 
 
-Lets have a look in detail on whethervthe example actually runs on the GPU:
 
 ***GPU Utilisation Evidence — ex19.jl on GH200 (SM90)***
 
+Let's take a detailed look at whether the example actually runs on the GPU:
+
 All GPU runs use a CUDA-enabled PETSc build (cuSPARSE, cuBLAS) with KernelAbstractions.jl providing the residual kernel. The PETSc `GPU %F` column (fraction of flops executed on GPU) and the host↔device transfer logs provide direct evidence of efficient GPU utilisation.
 
 *1. Flop fraction on GPU*

From a344d422c1c9118b41f2728649752d7b079cb6c9 Mon Sep 17 00:00:00 2001
From: Boris Kaus <61824822+boriskaus@users.noreply.github.com>
Date: Tue, 28 Apr 2026 11:58:03 +0200
Subject: [PATCH 23/39] Update src/dmda.jl

Co-authored-by: Valentin Churavy <v.churavy@gmail.com>
---
 src/dmda.jl | 1 +
 1 file changed, 1 insertion(+)

diff --git a/src/dmda.jl b/src/dmda.jl
index 7bfc9330..92caf412 100644
--- a/src/dmda.jl
+++ b/src/dmda.jl
@@ -205,6 +205,7 @@ function localinteriorlinearindex(da::AbstractPetscDM{PetscLib}) where PetscLib
     ind_local = LinearIndices(lower:upper)[:, l_inds][:]
     return ind_local
 end
+
 """
     dmda_star_fd_coloring(petsclib, da)
 

From 118c88dff8ad9ca52410c92937263295e78cd5ba Mon Sep 17 00:00:00 2001
From: Boris Kaus <boriskaus@gmail.com>
Date: Tue, 28 Apr 2026 12:21:50 +0000
Subject: [PATCH 24/39] remove string_wrappers & friends

---
 src/PETSc.jl                       |   5 --
 src/autowrapped/senums_wrappers.jl | 122 ++++++++++++++---------------
 src/string_wrappers.jl             |  30 -------
 src/string_wrappers_extra.jl       |   9 ---
 4 files changed, 61 insertions(+), 105 deletions(-)
 delete mode 100644 src/string_wrappers.jl
 delete mode 100644 src/string_wrappers_extra.jl

diff --git a/src/PETSc.jl b/src/PETSc.jl
index 2178d57a..5f4cc0e4 100644
--- a/src/PETSc.jl
+++ b/src/PETSc.jl
@@ -42,11 +42,6 @@ include("sys.jl")
 include("dmda.jl")          
 include("dmstag.jl")
 
-# String convenience wrappers for SetType functions
-include("string_wrappers.jl")       
-include("string_wrappers_extra.jl")
-
-
 include("audit.jl")
 
 
diff --git a/src/autowrapped/senums_wrappers.jl b/src/autowrapped/senums_wrappers.jl
index e2748ceb..e8dfe9ae 100644
--- a/src/autowrapped/senums_wrappers.jl
+++ b/src/autowrapped/senums_wrappers.jl
@@ -1,63 +1,63 @@
-# not quite sure yet how to deal with this
-PetscRegressorType=Ptr{Cchar}
-DMFieldType=Ptr{Cchar}
-DMPlexTransformType=Ptr{Cchar}
-PetscDrawType=Ptr{Cchar}
-PFType=Ptr{Cchar}
-DMAdaptorType=Ptr{Cchar}
-PetscFEType=Ptr{Cchar}
+# PETSc "type name" string aliases — all use Cstring so ccall handles AbstractString conversion automatically.
+PetscRegressorType=Cstring
+DMFieldType=Cstring
+DMPlexTransformType=Cstring
+PetscDrawType=Cstring
+PFType=Cstring
+DMAdaptorType=Cstring
+PetscFEType=Cstring
 VecType=Cstring
-VecTaggerType=Ptr{Cchar}
+VecTaggerType=Cstring
 MatType=Cstring
-MatSolverType=Ptr{Cchar}
-MatProductAlgorithm=Ptr{Cchar}
-MatOrderingType=Ptr{Cchar}
-MatColoringType=Ptr{Cchar}
-MatPartitioningType=Ptr{Cchar}
-MatMFFDType=Ptr{Cchar}
-SNESType=Ptr{Cchar}
-SNESLineSearchType=Ptr{Cchar}
-SNESMSType=Ptr{Cchar}
-DMType=Ptr{Cchar}
-KSPType=Ptr{Cchar}
-KSPGuessType=Ptr{Cchar}
-ISType=Ptr{Cchar}
-ISLocalToGlobalMappingType=Ptr{Cchar}
-PetscSectionSymType=Ptr{Cchar}
-TSType=Ptr{Cchar}
-TSTrajectoryType=Ptr{Cchar}
-TSSSPType=Ptr{Cchar}
-TSAdaptType=Ptr{Cchar}
-TSGLLEAdaptType=Ptr{Cchar}
-TSGLLEAcceptType=Ptr{Cchar}
-TSGLLEType=Ptr{Cchar}
-TSRKType=Ptr{Cchar}
-TSMPRKType=Ptr{Cchar}
-TSIRKType=Ptr{Cchar}
-TSGLEEType=Ptr{Cchar}
-TSARKIMEXType=Ptr{Cchar}
-TSDIRKType=Ptr{Cchar}
-TSRosWType=Ptr{Cchar}
-TSBasicSymplecticType=Ptr{Cchar}
-PetscSpaceType=Ptr{Cchar}
-MatCoarsenType=Ptr{Cchar}
-DMLabelType=Ptr{Cchar}
-PetscPartitionerType=Ptr{Cchar}
-PCType=Ptr{Cchar}
-PCGAMGType=Ptr{Cchar}
-PCGAMGClassicalType=Ptr{Cchar}
-PetscBenchType=Ptr{Cchar}
-PetscRandomType=Ptr{Cchar}
-TaoType=Ptr{Cchar}
-PetscViewerType=Ptr{Cchar}
-DMForestTopology=Ptr{Cchar}
-DMForestAdaptivityStrategy=Ptr{Cchar}
-CharacteristicType=Ptr{Cchar}
-PetscSFType=Ptr{Cchar}
-TaoLineSearchType=Ptr{Cchar}
-PetscDualSpaceType=Ptr{Cchar}
-AOType=Ptr{Cchar}
-PetscLimiterType=Ptr{Cchar}
-PetscFVType=Ptr{Cchar}
-PetscLogHandlerType=Ptr{Cchar}
-PetscDSType=Ptr{Cchar}
+MatSolverType=Cstring
+MatProductAlgorithm=Cstring
+MatOrderingType=Cstring
+MatColoringType=Cstring
+MatPartitioningType=Cstring
+MatMFFDType=Cstring
+SNESType=Cstring
+SNESLineSearchType=Cstring
+SNESMSType=Cstring
+DMType=Cstring
+KSPType=Cstring
+KSPGuessType=Cstring
+ISType=Cstring
+ISLocalToGlobalMappingType=Cstring
+PetscSectionSymType=Cstring
+TSType=Cstring
+TSTrajectoryType=Cstring
+TSSSPType=Cstring
+TSAdaptType=Cstring
+TSGLLEAdaptType=Cstring
+TSGLLEAcceptType=Cstring
+TSGLLEType=Cstring
+TSRKType=Cstring
+TSMPRKType=Cstring
+TSIRKType=Cstring
+TSGLEEType=Cstring
+TSARKIMEXType=Cstring
+TSDIRKType=Cstring
+TSRosWType=Cstring
+TSBasicSymplecticType=Cstring
+PetscSpaceType=Cstring
+MatCoarsenType=Cstring
+DMLabelType=Cstring
+PetscPartitionerType=Cstring
+PCType=Cstring
+PCGAMGType=Cstring
+PCGAMGClassicalType=Cstring
+PetscBenchType=Cstring
+PetscRandomType=Cstring
+TaoType=Cstring
+PetscViewerType=Cstring
+DMForestTopology=Cstring
+DMForestAdaptivityStrategy=Cstring
+CharacteristicType=Cstring
+PetscSFType=Cstring
+TaoLineSearchType=Cstring
+PetscDualSpaceType=Cstring
+AOType=Cstring
+PetscLimiterType=Cstring
+PetscFVType=Cstring
+PetscLogHandlerType=Cstring
+PetscDSType=Cstring
diff --git a/src/string_wrappers.jl b/src/string_wrappers.jl
deleted file mode 100644
index 3893d635..00000000
--- a/src/string_wrappers.jl
+++ /dev/null
@@ -1,30 +0,0 @@
-# Convenience overloads for PETSc Set*Type functions.
-# Each accepts AbstractString and converts to the Ptr{Cchar} the C API expects.
-# GC.@preserve keeps the String alive across the ccall inside the LibPETSc wrapper.
-
-function LibPETSc.MatSetType(petsclib, mat, type::AbstractString)
-    s = String(type)
-    GC.@preserve s LibPETSc.MatSetType(petsclib, mat, Base.unsafe_convert(Ptr{Cchar}, s))
-end
-
-function LibPETSc.VecSetType(petsclib, vec, type::AbstractString)
-    s = String(type)
-    GC.@preserve s LibPETSc.VecSetType(petsclib, vec, Base.unsafe_convert(Ptr{Cchar}, s))
-end
-
-function LibPETSc.KSPSetType(petsclib, ksp, type::AbstractString)
-    s = String(type)
-    GC.@preserve s LibPETSc.KSPSetType(petsclib, ksp, Base.unsafe_convert(Ptr{Cchar}, s))
-end
-
-function LibPETSc.SNESSetType(petsclib, snes, type::AbstractString)
-    s = String(type)
-    GC.@preserve s LibPETSc.SNESSetType(petsclib, snes, Base.unsafe_convert(Ptr{Cchar}, s))
-end
-
-function LibPETSc.DMSetType(petsclib, dm, type::AbstractString)
-    s = String(type)
-    GC.@preserve s LibPETSc.DMSetType(petsclib, dm, Base.unsafe_convert(Ptr{Cchar}, s))
-end
-
-# DMSetVecType and DMSetMatType accept AbstractString directly (VecType/MatType = Cstring).
diff --git a/src/string_wrappers_extra.jl b/src/string_wrappers_extra.jl
deleted file mode 100644
index 1c943c02..00000000
--- a/src/string_wrappers_extra.jl
+++ /dev/null
@@ -1,9 +0,0 @@
-function LibPETSc.TSSetType(petsclib, ts, type::AbstractString)
-    s = String(type)
-    GC.@preserve s LibPETSc.TSSetType(petsclib, ts, Base.unsafe_convert(Ptr{Cchar}, s))
-end
-
-function LibPETSc.TaoSetType(petsclib, tao, type::AbstractString)
-    s = String(type)
-    GC.@preserve s LibPETSc.TaoSetType(petsclib, tao, Base.unsafe_convert(Ptr{Cchar}, s))
-end

From 024f9d5c52042d096f068becc10aa1a596160ff9 Mon Sep 17 00:00:00 2001
From: Boris Kaus <boriskaus@gmail.com>
Date: Tue, 28 Apr 2026 13:01:16 +0000
Subject: [PATCH 25/39] undo changes

---
 src/PETSc.jl                       |   4 +
 src/autowrapped/senums_wrappers.jl | 122 ++++++++++++++---------------
 src/mat.jl                         |  11 ++-
 src/string_wrappers.jl             |  75 ++++++++++++++++++
 src/string_wrappers_extra.jl       |  27 +++++++
 5 files changed, 175 insertions(+), 64 deletions(-)
 create mode 100644 src/string_wrappers.jl
 create mode 100644 src/string_wrappers_extra.jl

diff --git a/src/PETSc.jl b/src/PETSc.jl
index 5f4cc0e4..ba5f084c 100644
--- a/src/PETSc.jl
+++ b/src/PETSc.jl
@@ -30,6 +30,10 @@ export dmda_star_fd_coloring
 
 using Libdl
 
+# String convenience wrappers for SetType functions
+include("string_wrappers.jl")
+include("string_wrappers_extra.jl")
+
 include("init.jl")
 include("vec.jl")       
 include("mat.jl")          
diff --git a/src/autowrapped/senums_wrappers.jl b/src/autowrapped/senums_wrappers.jl
index e8dfe9ae..e2748ceb 100644
--- a/src/autowrapped/senums_wrappers.jl
+++ b/src/autowrapped/senums_wrappers.jl
@@ -1,63 +1,63 @@
-# PETSc "type name" string aliases — all use Cstring so ccall handles AbstractString conversion automatically.
-PetscRegressorType=Cstring
-DMFieldType=Cstring
-DMPlexTransformType=Cstring
-PetscDrawType=Cstring
-PFType=Cstring
-DMAdaptorType=Cstring
-PetscFEType=Cstring
+# not quite sure yet how to deal with this
+PetscRegressorType=Ptr{Cchar}
+DMFieldType=Ptr{Cchar}
+DMPlexTransformType=Ptr{Cchar}
+PetscDrawType=Ptr{Cchar}
+PFType=Ptr{Cchar}
+DMAdaptorType=Ptr{Cchar}
+PetscFEType=Ptr{Cchar}
 VecType=Cstring
-VecTaggerType=Cstring
+VecTaggerType=Ptr{Cchar}
 MatType=Cstring
-MatSolverType=Cstring
-MatProductAlgorithm=Cstring
-MatOrderingType=Cstring
-MatColoringType=Cstring
-MatPartitioningType=Cstring
-MatMFFDType=Cstring
-SNESType=Cstring
-SNESLineSearchType=Cstring
-SNESMSType=Cstring
-DMType=Cstring
-KSPType=Cstring
-KSPGuessType=Cstring
-ISType=Cstring
-ISLocalToGlobalMappingType=Cstring
-PetscSectionSymType=Cstring
-TSType=Cstring
-TSTrajectoryType=Cstring
-TSSSPType=Cstring
-TSAdaptType=Cstring
-TSGLLEAdaptType=Cstring
-TSGLLEAcceptType=Cstring
-TSGLLEType=Cstring
-TSRKType=Cstring
-TSMPRKType=Cstring
-TSIRKType=Cstring
-TSGLEEType=Cstring
-TSARKIMEXType=Cstring
-TSDIRKType=Cstring
-TSRosWType=Cstring
-TSBasicSymplecticType=Cstring
-PetscSpaceType=Cstring
-MatCoarsenType=Cstring
-DMLabelType=Cstring
-PetscPartitionerType=Cstring
-PCType=Cstring
-PCGAMGType=Cstring
-PCGAMGClassicalType=Cstring
-PetscBenchType=Cstring
-PetscRandomType=Cstring
-TaoType=Cstring
-PetscViewerType=Cstring
-DMForestTopology=Cstring
-DMForestAdaptivityStrategy=Cstring
-CharacteristicType=Cstring
-PetscSFType=Cstring
-TaoLineSearchType=Cstring
-PetscDualSpaceType=Cstring
-AOType=Cstring
-PetscLimiterType=Cstring
-PetscFVType=Cstring
-PetscLogHandlerType=Cstring
-PetscDSType=Cstring
+MatSolverType=Ptr{Cchar}
+MatProductAlgorithm=Ptr{Cchar}
+MatOrderingType=Ptr{Cchar}
+MatColoringType=Ptr{Cchar}
+MatPartitioningType=Ptr{Cchar}
+MatMFFDType=Ptr{Cchar}
+SNESType=Ptr{Cchar}
+SNESLineSearchType=Ptr{Cchar}
+SNESMSType=Ptr{Cchar}
+DMType=Ptr{Cchar}
+KSPType=Ptr{Cchar}
+KSPGuessType=Ptr{Cchar}
+ISType=Ptr{Cchar}
+ISLocalToGlobalMappingType=Ptr{Cchar}
+PetscSectionSymType=Ptr{Cchar}
+TSType=Ptr{Cchar}
+TSTrajectoryType=Ptr{Cchar}
+TSSSPType=Ptr{Cchar}
+TSAdaptType=Ptr{Cchar}
+TSGLLEAdaptType=Ptr{Cchar}
+TSGLLEAcceptType=Ptr{Cchar}
+TSGLLEType=Ptr{Cchar}
+TSRKType=Ptr{Cchar}
+TSMPRKType=Ptr{Cchar}
+TSIRKType=Ptr{Cchar}
+TSGLEEType=Ptr{Cchar}
+TSARKIMEXType=Ptr{Cchar}
+TSDIRKType=Ptr{Cchar}
+TSRosWType=Ptr{Cchar}
+TSBasicSymplecticType=Ptr{Cchar}
+PetscSpaceType=Ptr{Cchar}
+MatCoarsenType=Ptr{Cchar}
+DMLabelType=Ptr{Cchar}
+PetscPartitionerType=Ptr{Cchar}
+PCType=Ptr{Cchar}
+PCGAMGType=Ptr{Cchar}
+PCGAMGClassicalType=Ptr{Cchar}
+PetscBenchType=Ptr{Cchar}
+PetscRandomType=Ptr{Cchar}
+TaoType=Ptr{Cchar}
+PetscViewerType=Ptr{Cchar}
+DMForestTopology=Ptr{Cchar}
+DMForestAdaptivityStrategy=Ptr{Cchar}
+CharacteristicType=Ptr{Cchar}
+PetscSFType=Ptr{Cchar}
+TaoLineSearchType=Ptr{Cchar}
+PetscDualSpaceType=Ptr{Cchar}
+AOType=Ptr{Cchar}
+PetscLimiterType=Ptr{Cchar}
+PetscFVType=Ptr{Cchar}
+PetscLogHandlerType=Ptr{Cchar}
+PetscDSType=Ptr{Cchar}
diff --git a/src/mat.jl b/src/mat.jl
index b6510d4a..456421ad 100644
--- a/src/mat.jl
+++ b/src/mat.jl
@@ -127,9 +127,14 @@ function MatSeqDense(
     @assert PetscScalar == petsclib.PetscScalar
     
     PetscInt = petsclib.PetscInt
-    mat = LibPETSc.MatCreateSeqDense(petsclib, comm, PetscInt(size(A, 1)), PetscInt(size(A, 2)), A[:])
-
-    finalizer(destroy, mat)
+    # PETSc stores the data pointer directly without copying, so we must keep the
+    # backing array alive for the entire lifetime of the PETSc Mat.  The finalizer
+    # closure captures `data`, making it reachable (and therefore uncollectable) for
+    # as long as `mat` is alive.
+    data = vec(A)
+    mat = LibPETSc.MatCreateSeqDense(petsclib, comm, PetscInt(size(A, 1)), PetscInt(size(A, 2)), data)
+
+    finalizer(m -> (destroy(m); data), mat)
     return mat
 end
 
diff --git a/src/string_wrappers.jl b/src/string_wrappers.jl
new file mode 100644
index 00000000..e911a9da
--- /dev/null
+++ b/src/string_wrappers.jl
@@ -0,0 +1,75 @@
+# Convenience overloads for PETSc Set*Type functions.
+# Each accepts AbstractString and converts to the Ptr{Cchar} the C API expects.
+# GC.@preserve keeps the String alive across the ccall inside the LibPETSc wrapper.
+
+"""
+    MatSetType(petsclib, mat, type::AbstractString)
+
+Set the matrix type. Accepts any `AbstractString`.
+
+# External Links
+$(_doc_external("Mat/MatSetType"))
+"""
+function LibPETSc.MatSetType(petsclib, mat, type::AbstractString)
+    s = String(type)
+    GC.@preserve s LibPETSc.MatSetType(petsclib, mat, Base.unsafe_convert(Ptr{Cchar}, s))
+    return nothing
+end
+
+"""
+    VecSetType(petsclib, vec, type::AbstractString)
+
+Set the vector type. Accepts any `AbstractString`.
+
+# External Links
+$(_doc_external("Vec/VecSetType"))
+"""
+function LibPETSc.VecSetType(petsclib, vec, type::AbstractString)
+    s = String(type)
+    GC.@preserve s LibPETSc.VecSetType(petsclib, vec, Base.unsafe_convert(Ptr{Cchar}, s))
+    return nothing
+end
+
+"""
+    KSPSetType(petsclib, ksp, type::AbstractString)
+
+Set the KSP solver type. Accepts any `AbstractString`.
+
+# External Links
+$(_doc_external("KSP/KSPSetType"))
+"""
+function LibPETSc.KSPSetType(petsclib, ksp, type::AbstractString)
+    s = String(type)
+    GC.@preserve s LibPETSc.KSPSetType(petsclib, ksp, Base.unsafe_convert(Ptr{Cchar}, s))
+    return nothing
+end
+
+"""
+    SNESSetType(petsclib, snes, type::AbstractString)
+
+Set the SNES nonlinear solver type. Accepts any `AbstractString`.
+
+# External Links
+$(_doc_external("SNES/SNESSetType"))
+"""
+function LibPETSc.SNESSetType(petsclib, snes, type::AbstractString)
+    s = String(type)
+    GC.@preserve s LibPETSc.SNESSetType(petsclib, snes, Base.unsafe_convert(Ptr{Cchar}, s))
+    return nothing
+end
+
+"""
+    DMSetType(petsclib, dm, type::AbstractString)
+
+Set the DM type. Accepts any `AbstractString`.
+
+# External Links
+$(_doc_external("DM/DMSetType"))
+"""
+function LibPETSc.DMSetType(petsclib, dm, type::AbstractString)
+    s = String(type)
+    GC.@preserve s LibPETSc.DMSetType(petsclib, dm, Base.unsafe_convert(Ptr{Cchar}, s))
+    return nothing
+end
+
+# DMSetVecType and DMSetMatType accept AbstractString directly (VecType/MatType = Cstring).
diff --git a/src/string_wrappers_extra.jl b/src/string_wrappers_extra.jl
new file mode 100644
index 00000000..ee6b855f
--- /dev/null
+++ b/src/string_wrappers_extra.jl
@@ -0,0 +1,27 @@
+"""
+    TSSetType(petsclib, ts, type::AbstractString)
+
+Set the TS time-stepping type. Accepts any `AbstractString`.
+
+# External Links
+$(_doc_external("TS/TSSetType"))
+"""
+function LibPETSc.TSSetType(petsclib, ts, type::AbstractString)
+    s = String(type)
+    GC.@preserve s LibPETSc.TSSetType(petsclib, ts, Base.unsafe_convert(Ptr{Cchar}, s))
+    return nothing
+end
+
+"""
+    TaoSetType(petsclib, tao, type::AbstractString)
+
+Set the Tao optimization solver type. Accepts any `AbstractString`.
+
+# External Links
+$(_doc_external("Tao/TaoSetType"))
+"""
+function LibPETSc.TaoSetType(petsclib, tao, type::AbstractString)
+    s = String(type)
+    GC.@preserve s LibPETSc.TaoSetType(petsclib, tao, Base.unsafe_convert(Ptr{Cchar}, s))
+    return nothing
+end

From c62ff6e76671ce2a0b23263676856487d5bf79d5 Mon Sep 17 00:00:00 2001
From: Boris Kaus <boriskaus@gmail.com>
Date: Tue, 28 Apr 2026 13:47:12 +0000
Subject: [PATCH 26/39] refactor: replace Ref{Any} hooks with type-based
 dispatch for GPU backends

Replace the three global `Ref{Any}` hook pattern (`_withlocalarray_device_hook`, `_get_petsc_arrays_hook`, `_restore_petsc_arrays_hook`) with type-based dispatch mechanism.

Adding a future AMD HIP backend requires only a new extension file in this way
---
 ext/PETScCUDAExt.jl | 201 +++++++++++++++--------------------------
 src/PETSc.jl        |   1 +
 src/vec.jl          | 216 +++++++++++++++++++++++++++++++-------------
 3 files changed, 229 insertions(+), 189 deletions(-)

diff --git a/ext/PETScCUDAExt.jl b/ext/PETScCUDAExt.jl
index 7bab66aa..347b0ddb 100644
--- a/ext/PETScCUDAExt.jl
+++ b/ext/PETScCUDAExt.jl
@@ -2,155 +2,104 @@ module PETScCUDAExt
 
 using PETSc
 using PETSc: LibPETSc, AbstractPetscVec
-using PETSc.LibPETSc: PetscMemType, PETSC_MEMTYPE_HOST
+using PETSc.LibPETSc: PETSC_MEMTYPE_DEVICE
 using CUDA
 
-# ── Internal: get one device-or-host array from a single Vec ─────────────────
-#
-# Uses VecGetArray{,Read,Write}AndMemType so PETSc tells us where the data is:
-#   PETSC_MEMTYPE_HOST   → return a plain Julia Vector (no copy)
-#   anything else (CUDA) → wrap the device pointer as a CuArray (no copy)
-#
-# The returned array has a finalizer that calls the matching VecRestoreArray*
-# so the caller just needs to finalize it when done, exactly like unsafe_localarray.
-#
-function _unsafe_localarray_device(
-    vec::AbstractPetscVec{PetscLib};
-    read::Bool = true,
-    write::Bool = true,
-) where {PetscLib}
-
-    if write && read
-        cpu_arr, mtype = LibPETSc.VecGetArrayAndMemType(PetscLib, vec)
-    elseif write
-        cpu_arr, mtype = LibPETSc.VecGetArrayWriteAndMemType(PetscLib, vec)
-    else
-        cpu_arr, mtype = LibPETSc.VecGetArrayReadAndMemType(PetscLib, vec)
-    end
+# ── CUDA memory backend ───────────────────────────────────────────────────────
 
-    if mtype === PETSC_MEMTYPE_HOST
-        # Data is on the host — attach a restore finalizer and return as-is.
-        finalizer(cpu_arr) do a
-            if write && read
-                LibPETSc.VecRestoreArrayAndMemType(PetscLib, vec, a)
-            elseif write
-                LibPETSc.VecRestoreArrayWriteAndMemType(PetscLib, vec, a)
-            else
-                LibPETSc.VecRestoreArrayReadAndMemType(PetscLib, vec, a)
-            end
-            return nothing
-        end
-        return cpu_arr
-    else
-        # Data is on the GPU — wrap the device pointer as a CuArray.
-        # cpu_arr holds the raw device pointer in a Julia Vector shell; we must
-        # keep it alive (captured in the finalizer) so the pointer stays valid.
-        T   = eltype(cpu_arr)
-        n   = length(cpu_arr)
-        ptr = reinterpret(CuPtr{T}, UInt(pointer(cpu_arr)))
-        dev_arr = CUDA.unsafe_wrap(CuArray, ptr, n; own = false)
+struct CUDABackend <: PETSc.AbstractPETScMemBackend end
 
-        finalizer(dev_arr) do _
-            if write && read
-                LibPETSc.VecRestoreArrayAndMemType(PetscLib, vec, cpu_arr)
-            elseif write
-                LibPETSc.VecRestoreArrayWriteAndMemType(PetscLib, vec, cpu_arr)
-            else
-                LibPETSc.VecRestoreArrayReadAndMemType(PetscLib, vec, cpu_arr)
-            end
-            return nothing
-        end
-        return dev_arr
-    end
-end
+PETSc._memtype_backend(::Val{PETSC_MEMTYPE_DEVICE}) = CUDABackend()
 
-# ── Public hook: register CUDA implementation for withlocalarray_device! ──────
-#
-# We cannot override PETSc.withlocalarray_device! with the same signature
-# during precompilation (Julia restriction).  Instead we register a closure
-# in __init__ that the base method dispatches to when the hook is non-nothing.
+# ── _wrap_localarray: device branch ──────────────────────────────────────────
 #
-function _cuda_withlocalarray_device_impl!(
-    f!,
-    vecs::NTuple{N};
-    read::Union{Bool, NTuple{N, Bool}}  = true,
-    write::Union{Bool, NTuple{N, Bool}} = true,
-) where {N}
-    read  isa NTuple{N, Bool} || (read  = ntuple(_ -> read,  N))
-    write isa NTuple{N, Bool} || (write = ntuple(_ -> write, N))
-
-    arrays = map(vecs, read, write) do v, r, w
-        _unsafe_localarray_device(v; read = r, write = w)
+# Called by `_unsafe_localarray_device` (defined in base vec.jl) when the Vec
+# is device-resident.  Wraps the device pointer as a CuArray (zero-copy) and
+# attaches a finalizer that calls the matching VecRestoreArray*AndMemType.
+# `cpu_arr` (the raw Julia Vector wrapping the device pointer) is captured in
+# the finalizer closure to keep it alive until the restore is done.
+
+function PETSc._wrap_localarray(
+    cpu_arr, ::CUDABackend, vec::AbstractPetscVec{PetscLib};
+    read::Bool, write::Bool,
+) where {PetscLib}
+    T   = eltype(cpu_arr)
+    n   = length(cpu_arr)
+    ptr = reinterpret(CuPtr{T}, UInt(pointer(cpu_arr)))
+    dev_arr = CUDA.unsafe_wrap(CuArray, ptr, n; own = false)
+    finalizer(dev_arr) do _
+        if write && read
+            LibPETSc.VecRestoreArrayAndMemType(PetscLib, vec, cpu_arr)
+        elseif write
+            LibPETSc.VecRestoreArrayWriteAndMemType(PetscLib, vec, cpu_arr)
+        else
+            LibPETSc.VecRestoreArrayReadAndMemType(PetscLib, vec, cpu_arr)
+        end
+        return nothing
     end
-
-    val = f!(arrays...)
-
-    map(Base.finalize, arrays)
-
-    return val
+    return dev_arr
 end
 
-# ── GPU-aware PETSc array helpers ─────────────────────────────────────────────
+# ── _get_petsc_arrays_impl: CUDA cases ───────────────────────────────────────
 #
-# CUDA implementations of PETSc.get_petsc_arrays / PETSc.restore_petsc_arrays.
-# Registered as hooks in __init__ so the base-module functions dispatch here
-# whenever CUDA.jl is loaded.
+# Two methods cover all GPU sub-cases:
 #
-# Three sub-cases handled by get:
-#   Both Vecs on GPU  → zero-copy CuArray wraps, fx_bounce = nothing
-#   lx on GPU only    → wrap lx zero-copy; allocate GPU scratch for fx (bounce)
-#   Both Vecs on CPU  → copy lx H2D; allocate GPU scratch for fx (bounce)
+#   CUDABackend × CUDABackend → both Vecs on device: zero-copy wrap, no bounce
+#   any other mix             → at least one Vec is host-resident:
+#                               lx is wrapped zero-copy if on device, or
+#                               copied H2D if on host;
+#                               fx always gets a fresh scratch CuArray (bounce)
+#                               so the kernel writes there and restore copies D2H.
 #
-# restore then D2H-copies the bounce buffer (if any) before calling
-# VecRestoreArray*AndMemType on both Vecs.
-
-function _cuda_get_petsc_arrays_impl(petsclib, g_fx, l_x)
-    T      = petsclib.PetscScalar
-    fx_arr, fx_mtype = LibPETSc.VecGetArrayAndMemType(petsclib, g_fx)
-    lx_arr, lx_mtype = LibPETSc.VecGetArrayReadAndMemType(petsclib, l_x)
+# HostBackend × HostBackend is handled entirely in base (vec.jl) and never
+# reaches these methods.
+
+# Both Vecs on the device: zero-copy wrap, no scratch needed.
+function PETSc._get_petsc_arrays_impl(
+    petsclib, g_fx, l_x, ::Type{T}, fx_arr, lx_arr,
+    ::CUDABackend, ::CUDABackend,
+) where {T}
+    fx = CUDA.unsafe_wrap(CuArray,
+        reinterpret(CuPtr{T}, UInt(pointer(fx_arr))), length(fx_arr))
+    lx = CUDA.unsafe_wrap(CuArray,
+        reinterpret(CuPtr{T}, UInt(pointer(lx_arr))), length(lx_arr))
+    return fx, lx, fx_arr, lx_arr, nothing
+end
 
-    if fx_mtype == LibPETSc.PETSC_MEMTYPE_DEVICE &&
-       lx_mtype == LibPETSc.PETSC_MEMTYPE_DEVICE
-        # Both on GPU: zero-copy wrap, no bounce needed.
-        fx = CUDA.unsafe_wrap(CuArray,
-            reinterpret(CuPtr{T}, UInt(pointer(fx_arr))), length(fx_arr))
-        lx = CUDA.unsafe_wrap(CuArray,
+# At least one Vec is host-resident (e.g. MG coarser levels, FD-coloring path).
+# Catch-all: less specific than (CUDABackend, CUDABackend), so Julia prefers
+# the method above when both are on the device.
+function PETSc._get_petsc_arrays_impl(
+    petsclib, g_fx, l_x, ::Type{T}, fx_arr, lx_arr,
+    fx_b::PETSc.AbstractPETScMemBackend, lx_b::PETSc.AbstractPETScMemBackend,
+) where {T}
+    lx_gpu = if lx_b isa CUDABackend
+        CUDA.unsafe_wrap(CuArray,
             reinterpret(CuPtr{T}, UInt(pointer(lx_arr))), length(lx_arr))
-        return fx, lx, fx_arr, lx_arr, nothing
     else
-        # At least one Vec is host-resident (e.g. freshly created coarser MG
-        # level, or FD-coloring CPU path).  Wrap or copy lx to GPU as needed,
-        # and allocate a GPU scratch buffer for fx so the kernel can write there;
-        # restore_petsc_arrays copies it back D2H after the kernel.
-        lx_gpu = if lx_mtype == LibPETSc.PETSC_MEMTYPE_DEVICE
-            CUDA.unsafe_wrap(CuArray,
-                reinterpret(CuPtr{T}, UInt(pointer(lx_arr))), length(lx_arr))
-        else
-            tmp = CuArray{T}(undef, length(lx_arr))
-            copyto!(tmp, lx_arr)        # H2D: send ghost input to GPU
-            tmp
-        end
-        fx_gpu = CuArray{T}(undef, length(fx_arr))
-        return fx_gpu, lx_gpu, fx_arr, lx_arr, fx_gpu
+        tmp = CuArray{T}(undef, length(lx_arr))
+        copyto!(tmp, lx_arr)    # H2D: send ghost input to GPU
+        tmp
     end
+    fx_gpu = CuArray{T}(undef, length(fx_arr))  # scratch buffer for residual
+    return fx_gpu, lx_gpu, fx_arr, lx_arr, fx_gpu
 end
 
-function _cuda_restore_petsc_arrays_impl(
-    petsclib, g_fx, l_x, fx, lx, fx_arr, lx_arr, fx_bounce,
+# ── _restore_petsc_arrays_impl: CUDA ─────────────────────────────────────────
+#
+# When fx is a CuArray (returned by the GPU _get_petsc_arrays_impl above):
+#   - if fx_bounce !== nothing, sync the device and copy the scratch D2H
+#   - call VecRestoreArray*AndMemType on both raw PETSc arrays
+
+function PETSc._restore_petsc_arrays_impl(
+    petsclib, g_fx, l_x, fx::CuArray, lx, fx_arr, lx_arr, fx_bounce,
 )
     if fx_bounce !== nothing
-        # D2H: copy GPU residual result back to the host PETSc array.
         CUDA.synchronize()
-        copyto!(fx_arr, fx_bounce)
+        copyto!(fx_arr, fx_bounce)  # D2H: copy residual back to host PETSc array
     end
     LibPETSc.VecRestoreArrayAndMemType(petsclib, g_fx, fx_arr)
     LibPETSc.VecRestoreArrayReadAndMemType(petsclib, l_x, lx_arr)
 end
 
-function __init__()
-    PETSc._withlocalarray_device_hook[] = _cuda_withlocalarray_device_impl!
-    PETSc._get_petsc_arrays_hook[]      = _cuda_get_petsc_arrays_impl
-    PETSc._restore_petsc_arrays_hook[]  = _cuda_restore_petsc_arrays_impl
-end
-
-end # module
+end # module PETScCUDAExt
diff --git a/src/PETSc.jl b/src/PETSc.jl
index ba5f084c..79187229 100644
--- a/src/PETSc.jl
+++ b/src/PETSc.jl
@@ -24,6 +24,7 @@ export LibPETSc
 export audit_petsc_file
 export set_petsclib
 export set_library!, unset_library!, library_info
+export AbstractPETScMemBackend, HostBackend
 export withlocalarray_device!
 export get_petsc_arrays, restore_petsc_arrays
 export dmda_star_fd_coloring
diff --git a/src/vec.jl b/src/vec.jl
index e989be1c..db00dab0 100644
--- a/src/vec.jl
+++ b/src/vec.jl
@@ -258,6 +258,84 @@ function unsafe_localarray(
 end
 
 
+# ── Memory backend type hierarchy ─────────────────────────────────────────────
+#
+# Extensions add their own backend singletons (e.g. `CUDABackend`) and overload
+# `_memtype_backend(::Val{PETSC_MEMTYPE_DEVICE})` to return them.  The base
+# package handles only `PETSC_MEMTYPE_HOST` → `HostBackend`.
+
+"""
+    AbstractPETScMemBackend
+
+Abstract supertype for PETSc memory backends.  The base package defines only
+[`HostBackend`](@ref).  GPU extensions add their own (e.g. `CUDABackend`).
+"""
+abstract type AbstractPETScMemBackend end
+
+"""
+    HostBackend <: AbstractPETScMemBackend
+
+Singleton dispatch type representing host (CPU) memory.
+"""
+struct HostBackend <: AbstractPETScMemBackend end
+
+"""
+    _memtype_backend(mtype::PetscMemType) → AbstractPETScMemBackend
+
+Convert a `PetscMemType` runtime enum value to a singleton dispatch type.
+GPU extensions overload `_memtype_backend(::Val{MT})` for their specific
+`PetscMemType` values (e.g. `PETSC_MEMTYPE_DEVICE` for CUDA).
+"""
+_memtype_backend(::Val{LibPETSc.PETSC_MEMTYPE_HOST}) = HostBackend()
+_memtype_backend(::Val{MT}) where {MT} =
+    error("No GPU backend loaded for PetscMemType $MT — load CUDA.jl, AMDGPU.jl, …")
+_memtype_backend(mt::LibPETSc.PetscMemType) = _memtype_backend(Val(mt))
+
+# ── Device-aware local array access ───────────────────────────────────────────
+#
+# `_unsafe_localarray_device` is the unified entry point: it calls
+# `VecGetArray*AndMemType`, converts the returned `PetscMemType` to a backend
+# singleton via `_memtype_backend`, and dispatches to `_wrap_localarray`.
+# GPU extensions add `_wrap_localarray` methods for their own backend types.
+
+function _unsafe_localarray_device(
+    vec::AbstractPetscVec{PetscLib};
+    read::Bool = true,
+    write::Bool = true,
+) where {PetscLib}
+    if write && read
+        cpu_arr, mtype = LibPETSc.VecGetArrayAndMemType(PetscLib, vec)
+    elseif write
+        cpu_arr, mtype = LibPETSc.VecGetArrayWriteAndMemType(PetscLib, vec)
+    else
+        cpu_arr, mtype = LibPETSc.VecGetArrayReadAndMemType(PetscLib, vec)
+    end
+    return _wrap_localarray(cpu_arr, _memtype_backend(mtype), vec; read, write)
+end
+
+function _wrap_localarray(
+    cpu_arr, ::HostBackend, vec::AbstractPetscVec{PetscLib};
+    read::Bool, write::Bool,
+) where {PetscLib}
+    finalizer(cpu_arr) do a
+        if write && read
+            LibPETSc.VecRestoreArrayAndMemType(PetscLib, vec, a)
+        elseif write
+            LibPETSc.VecRestoreArrayWriteAndMemType(PetscLib, vec, a)
+        else
+            LibPETSc.VecRestoreArrayReadAndMemType(PetscLib, vec, a)
+        end
+        return nothing
+    end
+    return cpu_arr
+end
+
+# Fallback: no backend loaded for this PetscMemType.
+function _wrap_localarray(cpu_arr, b::AbstractPETScMemBackend, vec; kw...)
+    error("_wrap_localarray not implemented for backend $(typeof(b)) — " *
+          "load the corresponding GPU package (e.g. CUDA.jl)")
+end
+
 """
     withlocalarray!(
         f!,
@@ -266,10 +344,13 @@ end
         write::Union{Bool, NTuple{N, Bool}} = true,
     )
 
-Convert `x` to an `Array{PetscScalar}` using [`unsafe_localarray`](@ref) and
-apply the function `f!`.
+Apply `f!` to host-side local array views of `vecs` via `VecGetArray`.  The
+arrays are always plain `Array`s regardless of where the Vec lives; PETSc will
+stage the data to the host if necessary.  Use `read=false` if write-only,
+`write=false` if read-only.
 
-Use `read=false` if the array is write-only; `write=false` if read-only.
+For GPU-aware access (returns a `CuArray` when the Vec lives on the device) use
+[`withlocalarray_device!`](@ref) instead.
 
 # Examples
 ```julia-repl
@@ -303,9 +384,7 @@ function withlocalarray!(
         unsafe_localarray(v; read = r, write = w)
     end
     val = f!(arrays...)
-    map(arrays) do array
-        Base.finalize(array)
-    end
+    map(Base.finalize, arrays)
     return val
 end
 withlocalarray!(f!, vecs...; kwargs...) = withlocalarray!(f!, vecs; kwargs...)
@@ -313,23 +392,27 @@ withlocalarray!(f!, vecs...; kwargs...) = withlocalarray!(f!, vecs; kwargs...)
 """
     withlocalarray_device!(f!, vecs...; read, write)
 
-Like [`withlocalarray!`](@ref) but returns a device array (e.g. `CuArray`) when
-the underlying PETSc vector lives on GPU (i.e. `PetscMemType` is not HOST).
-
-When CUDA.jl is loaded the `PETScCUDAExt` extension sets the
-`_withlocalarray_device_hook` global to wrap device pointers as `CuArray`s
-without any host↔device copy.  When CUDA.jl is not loaded this falls back
-to [`withlocalarray!`](@ref).
+Like [`withlocalarray!`](@ref) but uses `VecGetArray*AndMemType` and dispatches
+on the memory location of each Vec via [`_memtype_backend`](@ref).  When a GPU
+backend extension is loaded and a Vec lives on the device, `f!` receives a
+device array (e.g. `CuArray`) — zero-copy with no host↔device transfer.  When
+all Vecs are host-resident, this behaves identically to `withlocalarray!`.
 """
-const _withlocalarray_device_hook = Ref{Any}(nothing)
+function withlocalarray_device!(
+    f!,
+    vecs::NTuple{N, AbstractPetscVec};
+    read::Union{Bool, NTuple{N, Bool}} = true,
+    write::Union{Bool, NTuple{N, Bool}} = true,
+) where {N}
+    read isa NTuple{N, Bool} || (read = ntuple(_ -> read, N))
+    write isa NTuple{N, Bool} || (write = ntuple(_ -> write, N))
 
-function withlocalarray_device!(f!, vecs::NTuple{N, AbstractPetscVec}; kwargs...) where {N}
-    hook = _withlocalarray_device_hook[]
-    if hook !== nothing
-        return hook(f!, vecs; kwargs...)
-    else
-        return withlocalarray!(f!, vecs; kwargs...)
+    arrays = map(vecs, read, write) do v, r, w
+        _unsafe_localarray_device(v; read = r, write = w)
     end
+    val = f!(arrays...)
+    map(Base.finalize, arrays)
+    return val
 end
 withlocalarray_device!(f!, vecs...; kwargs...) = withlocalarray_device!(f!, vecs; kwargs...)
 
@@ -455,50 +538,56 @@ end
 
 # ── GPU-aware array access helpers ────────────────────────────────────────────
 #
-# `get_petsc_arrays` returns a pair of arrays (read-write and read-only) that
-# are ready to be passed to a compute kernel, together with raw PETSc handles
-# and an optional "bounce" buffer needed for the restore step.
-#
-# When PETScCUDAExt is loaded (i.e. CUDA.jl is in the environment and has been
-# imported) the hooks below are replaced with CUDA-aware implementations that
-# wrap device pointers as CuArrays — zero-copy when both Vecs are already on
-# the device, or with a H2D copy of `l_x` when it is host-resident.  A GPU
-# scratch buffer ("bounce") is allocated for `g_fx` when it is host-resident so
-# the kernel can write into GPU memory; `restore_petsc_arrays` then copies it
-# back D2H before calling VecRestoreArray.
+# `get_petsc_arrays` calls `VecGetArrayAndMemType` on both Vecs, converts the
+# returned `PetscMemType` values to backend singletons, and dispatches to
+# `_get_petsc_arrays_impl`.  The base package handles the pure-CPU case
+# (HostBackend × HostBackend).  GPU extensions add `_get_petsc_arrays_impl`
+# methods for their backend combinations and a matching
+# `_restore_petsc_arrays_impl` method dispatched by `restore_petsc_arrays`.
 #
-# On the plain CPU path the hooks are `nothing` and the functions fall back to
-# `unsafe_localarray` with finalizer-based cleanup.
-
-const _get_petsc_arrays_hook     = Ref{Any}(nothing)
-const _restore_petsc_arrays_hook = Ref{Any}(nothing)
+# Return tuple:  (fx, lx, fx_arr, lx_arr, fx_bounce)
+#   CPU:  fx, lx are plain Arrays with VecRestore finalizers;
+#         fx_arr = lx_arr = fx_bounce = nothing
+#   GPU:  fx, lx are device arrays; fx_arr, lx_arr are raw PETSc arrays
+#         (needed for restore); fx_bounce is a scratch device array or nothing.
 
 """
     get_petsc_arrays(petsclib, g_fx, l_x) -> (fx, lx, fx_arr, lx_arr, fx_bounce)
 
-Return arrays for `g_fx` (read-write) and `l_x` (read-only) that are suitable
-for passing to a compute kernel.
-
-When PETScCUDAExt is active and either Vec lives on the GPU the returned
-`fx`/`lx` are `CuArray`s (zero-copy if both Vecs are device-resident, or with
-a host-to-device copy of `l_x` when only `l_x` is on the device).  If `g_fx`
-is host-resident a GPU scratch buffer is returned as `fx_bounce`; its contents
-must be written back by `restore_petsc_arrays` after the kernel completes.
+Return arrays for `g_fx` (read-write) and `l_x` (read-only) suitable for
+passing to a compute kernel.  Dispatches on the memory location of each Vec
+via `_memtype_backend`.
 
-On the CPU path (no CUDA or all Vecs on host) `fx`/`lx` are plain `Array`s and
-`fx_arr = lx_arr = fx_bounce = nothing`.
+On the pure-CPU path (`HostBackend × HostBackend`) `fx`/`lx` are plain
+`Array`s and `fx_arr = lx_arr = fx_bounce = nothing`.  When a GPU backend
+extension is loaded and a Vec lives on the device the returned `fx`/`lx` are
+device arrays.  An optional bounce buffer `fx_bounce` is allocated when `g_fx`
+is host-resident; its contents must be written back by `restore_petsc_arrays`
+after the kernel completes.
 
 See also: [`restore_petsc_arrays`](@ref)
 """
 function get_petsc_arrays(petsclib, g_fx, l_x)
-    hook = _get_petsc_arrays_hook[]
-    if hook !== nothing
-        return hook(petsclib, g_fx, l_x)
+    T = petsclib.PetscScalar
+    fx_arr, fx_mtype = LibPETSc.VecGetArrayAndMemType(petsclib, g_fx)
+    lx_arr, lx_mtype = LibPETSc.VecGetArrayReadAndMemType(petsclib, l_x)
+    return _get_petsc_arrays_impl(
+        petsclib, g_fx, l_x, T, fx_arr, lx_arr,
+        _memtype_backend(fx_mtype), _memtype_backend(lx_mtype),
+    )
+end
+
+# CPU base case: attach VecRestore finalizers and return the arrays directly.
+function _get_petsc_arrays_impl(
+    petsclib, g_fx, l_x, ::Type, fx_arr, lx_arr, ::HostBackend, ::HostBackend,
+)
+    finalizer(fx_arr) do a
+        LibPETSc.VecRestoreArrayAndMemType(petsclib, g_fx, a)
     end
-    # CPU fallback: plain arrays, cleanup via finalizers
-    fx = unsafe_localarray(g_fx; read = true, write = true)
-    lx = unsafe_localarray(l_x;  read = true, write = false)
-    return fx, lx, nothing, nothing, nothing
+    finalizer(lx_arr) do a
+        LibPETSc.VecRestoreArrayReadAndMemType(petsclib, l_x, a)
+    end
+    return fx_arr, lx_arr, nothing, nothing, nothing
 end
 
 """
@@ -506,18 +595,19 @@ end
 
 Restore PETSc Vecs after a kernel launched via [`get_petsc_arrays`](@ref).
 
-On the CUDA path this optionally synchronises the device and copies the bounce
-buffer back to the host PETSc array before calling the matching
-`VecRestoreArray*AndMemType` pair.  On the CPU path it simply finalizes the
-plain arrays returned by `unsafe_localarray`.
+Dispatches to `_restore_petsc_arrays_impl`.  On the CPU path (`fx_arr`,
+`lx_arr`, `fx_bounce` all `nothing`) this simply finalizes `fx` and `lx`,
+triggering the registered `VecRestoreArray*AndMemType` finalizers.  GPU backend
+extensions add a `_restore_petsc_arrays_impl` method for their array types.
 """
 function restore_petsc_arrays(petsclib, g_fx, l_x, fx, lx, fx_arr, lx_arr, fx_bounce)
-    hook = _restore_petsc_arrays_hook[]
-    if hook !== nothing
-        hook(petsclib, g_fx, l_x, fx, lx, fx_arr, lx_arr, fx_bounce)
-        return
-    end
-    # CPU fallback: finalizers registered by unsafe_localarray do the restore
+    _restore_petsc_arrays_impl(petsclib, g_fx, l_x, fx, lx, fx_arr, lx_arr, fx_bounce)
+end
+
+# CPU base case: VecRestore finalizers on fx/lx do the work.
+function _restore_petsc_arrays_impl(
+    petsclib, g_fx, l_x, fx, lx, ::Nothing, ::Nothing, ::Nothing,
+)
     Base.finalize(fx)
     Base.finalize(lx)
 end

From cefba0e5a3df26397cf814645cd86c468f18df4e Mon Sep 17 00:00:00 2001
From: Boris Kaus <boriskaus@gmail.com>
Date: Tue, 28 Apr 2026 14:45:48 +0000
Subject: [PATCH 27/39] implement Valentin's suggestions

Co-authored-by: Copilot <copilot@github.com>
---
 examples/ex19.jl    |  18 +++++---
 ext/PETScCUDAExt.jl |   1 +
 src/PETSc.jl        |   2 +-
 src/vec.jl          | 109 +++++++++++++++++++++++++++++++-------------
 4 files changed, 90 insertions(+), 40 deletions(-)

diff --git a/examples/ex19.jl b/examples/ex19.jl
index 71651775..13ead70b 100644
--- a/examples/ex19.jl
+++ b/examples/ex19.jl
@@ -60,7 +60,7 @@
 =#
 
 # ── GPU switch ────────────────────────────────────────────────────────────────
-const useCUDA = true
+const useCUDA = false
 
 using MPI
 using PETSc
@@ -329,12 +329,16 @@ PETSc.withlocalarray!(x; read = false) do x_arr
     nx_own = xe - xs + 1;   ny_own = ye - ys + 1
     dx = one(_T) / (mx - 1)
     x_par = reshape(x_arr, 4, nx_own, ny_own)
-    for lj in 1:ny_own, li in 1:nx_own
-        ig = xs + li - 1
-        x_par[1, li, lj] = zero(_T)
-        x_par[2, li, lj] = zero(_T)
-        x_par[3, li, lj] = zero(_T)
-        x_par[4, li, lj] = user.grashof > 0 ? _T(ig - 1) * dx : zero(_T)
+    # Components u, v, ω start at zero; T is linear in x (or zero if grashof=0).
+    # Use broadcast-friendly assignment so this works for both Array and CuArray.
+    fill!(x_par, zero(_T))
+    if user.grashof > 0
+        # T = (ig - 1) * dx  where ig (1-based global x-index) = xs + li - 1
+        # Build on host then copy to the same device as x_arr.
+        t_cpu = _T.((xs - 1 : xs + nx_own - 2) .* dx)   # 1-D CPU, length nx_own
+        t_dev = similar(x_arr, nx_own)
+        copyto!(t_dev, t_cpu)
+        x_par[4, :, :] .= reshape(t_dev, nx_own, 1)
     end
 end
 
diff --git a/ext/PETScCUDAExt.jl b/ext/PETScCUDAExt.jl
index 347b0ddb..d7407d77 100644
--- a/ext/PETScCUDAExt.jl
+++ b/ext/PETScCUDAExt.jl
@@ -10,6 +10,7 @@ using CUDA
 struct CUDABackend <: PETSc.AbstractPETScMemBackend end
 
 PETSc._memtype_backend(::Val{PETSC_MEMTYPE_DEVICE}) = CUDABackend()
+PETSc._array_type(::CUDABackend) = CuArray
 
 # ── _wrap_localarray: device branch ──────────────────────────────────────────
 #
diff --git a/src/PETSc.jl b/src/PETSc.jl
index 79187229..c2bfe815 100644
--- a/src/PETSc.jl
+++ b/src/PETSc.jl
@@ -25,7 +25,7 @@ export audit_petsc_file
 export set_petsclib
 export set_library!, unset_library!, library_info
 export AbstractPETScMemBackend, HostBackend
-export withlocalarray_device!
+export determine_memtype
 export get_petsc_arrays, restore_petsc_arrays
 export dmda_star_fd_coloring
 
diff --git a/src/vec.jl b/src/vec.jl
index db00dab0..1f49b121 100644
--- a/src/vec.jl
+++ b/src/vec.jl
@@ -293,12 +293,16 @@ _memtype_backend(mt::LibPETSc.PetscMemType) = _memtype_backend(Val(mt))
 
 # ── Device-aware local array access ───────────────────────────────────────────
 #
-# `_unsafe_localarray_device` is the unified entry point: it calls
+# `_unsafe_localarray` is the unified entry point: it calls
 # `VecGetArray*AndMemType`, converts the returned `PetscMemType` to a backend
 # singleton via `_memtype_backend`, and dispatches to `_wrap_localarray`.
 # GPU extensions add `_wrap_localarray` methods for their own backend types.
+#
+# The typed overload `_unsafe_localarray(::Type{A}, vec; ...)` additionally
+# asserts that the returned array is of type `A`, giving a clear error when a
+# Vec is on an unexpected device.
 
-function _unsafe_localarray_device(
+function _unsafe_localarray(
     vec::AbstractPetscVec{PetscLib};
     read::Bool = true,
     write::Bool = true,
@@ -313,6 +317,21 @@ function _unsafe_localarray_device(
     return _wrap_localarray(cpu_arr, _memtype_backend(mtype), vec; read, write)
 end
 
+function _unsafe_localarray(
+    ::Type{A},
+    vec::AbstractPetscVec;
+    read::Bool = true,
+    write::Bool = true,
+) where {A <: AbstractArray}
+    arr = _unsafe_localarray(vec; read, write)
+    arr isa A && return arr
+    Base.finalize(arr)   # release the PETSc handle before throwing
+    throw(ArgumentError(
+        "expected array of type $A but Vec returned $(typeof(arr)). " *
+        "Check that the Vec lives on the expected device."
+    ))
+end
+
 function _wrap_localarray(
     cpu_arr, ::HostBackend, vec::AbstractPetscVec{PetscLib};
     read::Bool, write::Bool,
@@ -336,6 +355,34 @@ function _wrap_localarray(cpu_arr, b::AbstractPETScMemBackend, vec; kw...)
           "load the corresponding GPU package (e.g. CUDA.jl)")
 end
 
+"""
+    determine_memtype(vecs...) → Type{<:AbstractArray}
+
+Query the `PetscMemType` of each Vec and return the corresponding array type.
+Errors if the Vecs are on heterogeneous devices (different `PetscMemType`
+values), since a single `withlocalarray!` call cannot handle mixed backends.
+Returns `Vector` when all Vecs are host-resident.
+
+Extensions overload `_array_type(::AbstractPETScMemBackend)` to map backend
+singletons to concrete array types (e.g. `CUDABackend` → `CuArray`).
+"""
+function determine_memtype(vecs::AbstractPetscVec...)
+    backends = map(vecs) do v
+        PetscLib = typeof(v).parameters[1]
+        arr, mtype = LibPETSc.VecGetArrayReadAndMemType(PetscLib, v)
+        LibPETSc.VecRestoreArrayReadAndMemType(PetscLib, v, arr)
+        _memtype_backend(mtype)
+    end
+    allequal(typeof.(backends)) || throw(ArgumentError(
+        "Vecs are on heterogeneous devices: $(unique(typeof.(backends))). " *
+        "Use withlocalarray!(::Type{A}, ...) to handle each backend explicitly."
+    ))
+    return _array_type(first(backends))
+end
+
+_array_type(::HostBackend) = Vector
+# GPU extensions add: _array_type(::CUDABackend) = CuArray
+
 """
     withlocalarray!(
         f!,
@@ -343,14 +390,27 @@ end
         read::Union{Bool, NTuple{N, Bool}} = true,
         write::Union{Bool, NTuple{N, Bool}} = true,
     )
+    withlocalarray!(::Type{A}, f!, vecs...; read, write) where {A <: AbstractArray}
 
-Apply `f!` to host-side local array views of `vecs` via `VecGetArray`.  The
-arrays are always plain `Array`s regardless of where the Vec lives; PETSc will
-stage the data to the host if necessary.  Use `read=false` if write-only,
-`write=false` if read-only.
+Apply `f!` to local array views of `vecs`.
 
-For GPU-aware access (returns a `CuArray` when the Vec lives on the device) use
-[`withlocalarray_device!`](@ref) instead.
+Uses `VecGetArray*AndMemType` internally.  When a GPU backend extension (e.g.
+`PETScCUDAExt`) is loaded and a Vec lives on the device, `f!` receives a device
+array (e.g. `CuArray`) — zero-copy, no host↔device transfer.  When all Vecs
+are host-resident, `f!` receives plain `Array`s.
+
+The optional `::Type{A}` first argument asserts that every array returned from
+`VecGetArray*AndMemType` is of type `A`.  This is useful when the Vecs may be on
+different devices (e.g. some on host, some on device) and you need a type-stable
+code path: passing `CuArray` will error immediately if any Vec is host-resident,
+rather than silently returning a `Vector`.
+
+Use `read=false` if the array is write-only; `write=false` if read-only.
+
+!!! note
+    Operations inside `f!` must be compatible with the actual array type.
+    Scalar indexing is not supported on GPU arrays; use broadcasting or GPU
+    kernels instead.
 
 # Examples
 ```julia-repl
@@ -374,47 +434,32 @@ end
 function withlocalarray!(
     f!,
     vecs::NTuple{N, AbstractPetscVec};
-    read::Union{Bool, NTuple{N, Bool}} = true,
-    write::Union{Bool, NTuple{N, Bool}} = true,
+    kwargs...,
 ) where {N}
-    read isa NTuple{N, Bool} || (read = ntuple(_ -> read, N))
-    write isa NTuple{N, Bool} || (write = ntuple(_ -> write, N))
-
-    arrays = map(vecs, read, write) do v, r, w
-        unsafe_localarray(v; read = r, write = w)
-    end
-    val = f!(arrays...)
-    map(Base.finalize, arrays)
-    return val
+    A = determine_memtype(vecs...)
+    return withlocalarray!(A, f!, vecs; kwargs...)
 end
 withlocalarray!(f!, vecs...; kwargs...) = withlocalarray!(f!, vecs; kwargs...)
 
-"""
-    withlocalarray_device!(f!, vecs...; read, write)
-
-Like [`withlocalarray!`](@ref) but uses `VecGetArray*AndMemType` and dispatches
-on the memory location of each Vec via [`_memtype_backend`](@ref).  When a GPU
-backend extension is loaded and a Vec lives on the device, `f!` receives a
-device array (e.g. `CuArray`) — zero-copy with no host↔device transfer.  When
-all Vecs are host-resident, this behaves identically to `withlocalarray!`.
-"""
-function withlocalarray_device!(
+function withlocalarray!(
+    ::Type{A},
     f!,
     vecs::NTuple{N, AbstractPetscVec};
     read::Union{Bool, NTuple{N, Bool}} = true,
     write::Union{Bool, NTuple{N, Bool}} = true,
-) where {N}
+) where {A <: AbstractArray, N}
     read isa NTuple{N, Bool} || (read = ntuple(_ -> read, N))
     write isa NTuple{N, Bool} || (write = ntuple(_ -> write, N))
 
     arrays = map(vecs, read, write) do v, r, w
-        _unsafe_localarray_device(v; read = r, write = w)
+        _unsafe_localarray(A, v; read = r, write = w)
     end
     val = f!(arrays...)
     map(Base.finalize, arrays)
     return val
 end
-withlocalarray_device!(f!, vecs...; kwargs...) = withlocalarray_device!(f!, vecs; kwargs...)
+withlocalarray!(::Type{A}, f!, vecs...; kwargs...) where {A <: AbstractArray} =
+    withlocalarray!(A, f!, vecs; kwargs...)
 
 
 """

From e9f5e1d2e754ebdbd7e2bbd3a3b9ac867dc37b80 Mon Sep 17 00:00:00 2001
From: Boris Kaus <boriskaus@gmail.com>
Date: Tue, 28 Apr 2026 15:38:01 +0000
Subject: [PATCH 28/39] attempt to implement changes & add tests

---
 ext/PETScCUDAExt.jl |  2 +-
 src/dm.jl           | 12 +++++---
 src/vec.jl          | 33 +++++++++++++---------
 test/vec.jl         | 67 +++++++++++++++++++++++++++++++++++++++++++++
 4 files changed, 96 insertions(+), 18 deletions(-)

diff --git a/ext/PETScCUDAExt.jl b/ext/PETScCUDAExt.jl
index d7407d77..914446d2 100644
--- a/ext/PETScCUDAExt.jl
+++ b/ext/PETScCUDAExt.jl
@@ -10,7 +10,7 @@ using CUDA
 struct CUDABackend <: PETSc.AbstractPETScMemBackend end
 
 PETSc._memtype_backend(::Val{PETSC_MEMTYPE_DEVICE}) = CUDABackend()
-PETSc._array_type(::CUDABackend) = CuArray
+PETSc._array_type(::Val{LibPETSc.PETSC_MEMTYPE_DEVICE}) = CuArray
 
 # ── _wrap_localarray: device branch ──────────────────────────────────────────
 #
diff --git a/src/dm.jl b/src/dm.jl
index a7a9f692..bdb80151 100644
--- a/src/dm.jl
+++ b/src/dm.jl
@@ -6,15 +6,19 @@ function Base.show(io::IO, v::AbstractPetscDM{PetscLib}) where {PetscLib}
         print(io, "PETSc DM (null pointer)")
         return
     end
-    
-    # Try to get DM info, but handle uninitialized DMs gracefully
+    # DMGetType internally calls DMInitializePackage which queries the PETSc
+    # options database.  Calling it before PETSc is initialised causes a C-level
+    # SIGSEGV that cannot be caught with try/catch.
+    if !initialized(PetscLib)
+        print(io, "PETSc DM (PETSc not initialized)")
+        return
+    end
     try
         ty = LibPETSc.DMGetType(PetscLib, v)
         di = LibPETSc.DMGetDimension(PetscLib, v)
         print(io, "PETSc DM $ty object in $di dimensions")
     catch
-        # DM not fully initialized yet (type not set)
-        print(io, "PETSc DM (not yet initialized)")
+        print(io, "PETSc DM (type not set)")
     end
     return nothing
 end
diff --git a/src/vec.jl b/src/vec.jl
index 1f49b121..58da6629 100644
--- a/src/vec.jl
+++ b/src/vec.jl
@@ -9,15 +9,19 @@ function Base.show(io::IO, v::AbstractPetscVec{PetscLib}) where {PetscLib}
         print(io, "PETSc Vec (null pointer)")
         return
     end
-    
-    # Try to get vector info, but handle uninitialized vectors gracefully
+    # VecGetType internally calls VecInitializePackage which queries the PETSc
+    # options database.  Calling it before PETSc is initialised causes a C-level
+    # SIGSEGV that cannot be caught with try/catch.
+    if !initialized(PetscLib)
+        print(io, "PETSc Vec (PETSc not initialized)")
+        return
+    end
     try
         ty = LibPETSc.VecGetType(PetscLib, v)
         si = LibPETSc.VecGetSize(PetscLib, v)
         print(io, "PETSc $ty Vec; length=$si")
     catch
-        # Vector not fully initialized yet (type not set)
-        print(io, "PETSc Vec (not yet initialized)")
+        print(io, "PETSc Vec (type not set)")
     end
     return nothing
 end
@@ -363,25 +367,28 @@ Errors if the Vecs are on heterogeneous devices (different `PetscMemType`
 values), since a single `withlocalarray!` call cannot handle mixed backends.
 Returns `Vector` when all Vecs are host-resident.
 
-Extensions overload `_array_type(::AbstractPETScMemBackend)` to map backend
-singletons to concrete array types (e.g. `CUDABackend` → `CuArray`).
+Extensions overload `_array_type(::Val{MT})` for a `PetscMemType` enum value
+`MT` to register the corresponding array type (e.g. `PETSC_MEMTYPE_DEVICE` →
+`CuArray`).
 """
 function determine_memtype(vecs::AbstractPetscVec...)
-    backends = map(vecs) do v
+    mtypes = map(vecs) do v
         PetscLib = typeof(v).parameters[1]
         arr, mtype = LibPETSc.VecGetArrayReadAndMemType(PetscLib, v)
         LibPETSc.VecRestoreArrayReadAndMemType(PetscLib, v, arr)
-        _memtype_backend(mtype)
+        mtype
     end
-    allequal(typeof.(backends)) || throw(ArgumentError(
-        "Vecs are on heterogeneous devices: $(unique(typeof.(backends))). " *
+    allequal(mtypes) || throw(ArgumentError(
+        "Vecs are on heterogeneous devices: $(unique(mtypes)). " *
         "Use withlocalarray!(::Type{A}, ...) to handle each backend explicitly."
     ))
-    return _array_type(first(backends))
+    return _array_type(Val(first(mtypes)))
 end
 
-_array_type(::HostBackend) = Vector
-# GPU extensions add: _array_type(::CUDABackend) = CuArray
+_array_type(::Val{LibPETSc.PETSC_MEMTYPE_HOST}) = Vector
+_array_type(::Val{MT}) where {MT} =
+    error("No array type registered for PetscMemType $MT — load the corresponding GPU package (e.g. CUDA.jl)")
+# GPU extensions add: _array_type(::Val{LibPETSc.PETSC_MEMTYPE_DEVICE}) = CuArray
 
 """
     withlocalarray!(
diff --git a/test/vec.jl b/test/vec.jl
index b5f78a56..a5a9add2 100644
--- a/test/vec.jl
+++ b/test/vec.jl
@@ -292,6 +292,73 @@ end
         PETSc.destroy(petsc_x)
         PETSc.destroy(petsc_y)
 
+        PETSc.finalize(petsclib)
+    end
+end
+
+@testset "withlocalarray! typed (Array)" begin
+    for petsclib in PETSc.petsclibs
+        PETSc.initialize(petsclib)
+        PetscScalar = petsclib.PetscScalar
+        PetscInt    = petsclib.PetscInt
+        N           = PetscInt(10)
+        test_comm = Sys.iswindows() ? LibPETSc.PETSC_COMM_SELF : MPI.COMM_SELF
+        petsc_x = LibPETSc.VecCreateSeq(petsclib, test_comm, N)
+        petsc_y = LibPETSc.VecCreateSeq(petsclib, test_comm, N)
+
+        # determine_memtype returns Vector for CPU vecs
+        @test PETSc.determine_memtype(petsc_x) === Vector
+        @test PETSc.determine_memtype(petsc_x, petsc_y) === Vector
+
+        # typed single vec — write
+        PETSc.withlocalarray!(Vector, petsc_x; read = false, write = true) do x
+            @test x isa Vector
+            for i in eachindex(x)
+                x[i] = PetscScalar(i)
+            end
+        end
+        @test petsc_x[1:N] == PetscScalar.(1:N)
+
+        # typed two vecs as NTuple
+        PETSc.withlocalarray!(
+            Vector,
+            (petsc_x, petsc_y);
+            read = (false, false), write = (true, true),
+        ) do x, y
+            @test x isa Vector
+            @test y isa Vector
+            for i in eachindex(x)
+                x[i] = PetscScalar(i)
+                y[i] = PetscScalar(2i)
+            end
+        end
+        @test petsc_x[1:N] == PetscScalar.(1:N)
+        @test petsc_y[1:N] == PetscScalar.(2:2:2N)
+
+        # typed two vecs as splat
+        PETSc.withlocalarray!(
+            Vector, petsc_x, petsc_y;
+            read = (false, false), write = (true, true),
+        ) do x, y
+            @test x isa Vector
+            @test y isa Vector
+            for i in eachindex(x)
+                x[i] = PetscScalar(2i)
+                y[i] = PetscScalar(3i)
+            end
+        end
+        @test petsc_x[1:N] == PetscScalar.(2:2:2N)
+        @test petsc_y[1:N] == PetscScalar.(3:3:3N)
+
+        # wrong type raises ArgumentError (Matrix is not Vector)
+        @test_throws ArgumentError PETSc.withlocalarray!(
+            Matrix, petsc_x; read = true, write = false,
+        ) do x
+            nothing
+        end
+
+        PETSc.destroy(petsc_x)
+        PETSc.destroy(petsc_y)
         PETSc.finalize(petsclib)
     end
 end
\ No newline at end of file

From c15304b3d70c98660065597f4a6b62bb682cd72a Mon Sep 17 00:00:00 2001
From: Boris Kaus <boriskaus@gmail.com>
Date: Tue, 28 Apr 2026 16:21:17 +0000
Subject: [PATCH 29/39] attempt to fix the bugs

---
 ext/PETScCUDAExt.jl |  31 ++++++++++---
 src/vec.jl          | 103 ++++++++++++++++++++++++++++++++++++--------
 2 files changed, 110 insertions(+), 24 deletions(-)

diff --git a/ext/PETScCUDAExt.jl b/ext/PETScCUDAExt.jl
index 914446d2..4d9310f9 100644
--- a/ext/PETScCUDAExt.jl
+++ b/ext/PETScCUDAExt.jl
@@ -12,13 +12,32 @@ struct CUDABackend <: PETSc.AbstractPETScMemBackend end
 PETSc._memtype_backend(::Val{PETSC_MEMTYPE_DEVICE}) = CUDABackend()
 PETSc._array_type(::Val{LibPETSc.PETSC_MEMTYPE_DEVICE}) = CuArray
 
-# ── _wrap_localarray: device branch ──────────────────────────────────────────
+# ── No-finalizer acquire/release for withlocalarray! ─────────────────────────
+
+function PETSc._make_local_array(cpu_arr, ::CUDABackend)
+    T   = eltype(cpu_arr)
+    n   = length(cpu_arr)
+    ptr = reinterpret(CuPtr{T}, UInt(pointer(cpu_arr)))
+    return CUDA.unsafe_wrap(CuArray, ptr, n; own = false)
+end
+
+function PETSc._release_petsc_local_array(
+    cpu_arr, ::CUDABackend, vec::AbstractPetscVec{PLib}; read::Bool, write::Bool,
+) where {PLib}
+    if write && read
+        LibPETSc.VecRestoreArrayAndMemType(PLib, vec, cpu_arr)
+    elseif write
+        LibPETSc.VecRestoreArrayWriteAndMemType(PLib, vec, cpu_arr)
+    else
+        LibPETSc.VecRestoreArrayReadAndMemType(PLib, vec, cpu_arr)
+    end
+    return nothing
+end
+
+# ── _wrap_localarray: device branch (legacy, kept for backward compat) ────────
 #
-# Called by `_unsafe_localarray_device` (defined in base vec.jl) when the Vec
-# is device-resident.  Wraps the device pointer as a CuArray (zero-copy) and
-# attaches a finalizer that calls the matching VecRestoreArray*AndMemType.
-# `cpu_arr` (the raw Julia Vector wrapping the device pointer) is captured in
-# the finalizer closure to keep it alive until the restore is done.
+# No longer called by withlocalarray! (which uses _acquire/_release instead).
+# Retained in case external code calls _unsafe_localarray directly.
 
 function PETSc._wrap_localarray(
     cpu_arr, ::CUDABackend, vec::AbstractPetscVec{PetscLib};
diff --git a/src/vec.jl b/src/vec.jl
index 58da6629..eb1683d9 100644
--- a/src/vec.jl
+++ b/src/vec.jl
@@ -359,6 +359,66 @@ function _wrap_localarray(cpu_arr, b::AbstractPETScMemBackend, vec; kw...)
           "load the corresponding GPU package (e.g. CUDA.jl)")
 end
 
+# ── No-finalizer acquire/release ─────────────────────────────────────────────
+#
+# `withlocalarray!` uses these instead of the finalizer-based `_wrap_localarray`
+# to avoid a documented Julia pitfall: after `Base.finalize(x)` is called, if
+# `x` later becomes unreachable GC may invoke the finalizer *again*, leading to
+# a double VecRestore call on an already-freed Vec (→ SIGSEGV).
+# `try/finally` provides deterministic, single-execution cleanup.
+
+"""
+    _acquire_petsc_local_array(vec; read, write) -> (arr, cpu_arr, backend)
+
+Get the local array from `vec` via `VecGetArray*AndMemType` without
+registering a Julia finalizer.  Returns the user-visible array, the raw PETSc
+cpu_arr needed for restore, and the backend singleton.
+Extensions overload `_make_local_array(cpu_arr, backend)` to wrap the raw
+array for their device (e.g. `CUDABackend` → `CuArray`).
+"""
+function _acquire_petsc_local_array(
+    vec::AbstractPetscVec{PLib}; read::Bool, write::Bool,
+) where {PLib}
+    cpu_arr, mtype = if write && read
+        LibPETSc.VecGetArrayAndMemType(PLib, vec)
+    elseif write
+        LibPETSc.VecGetArrayWriteAndMemType(PLib, vec)
+    else
+        LibPETSc.VecGetArrayReadAndMemType(PLib, vec)
+    end
+    backend = _memtype_backend(mtype)
+    arr = _make_local_array(cpu_arr, backend)
+    return arr, cpu_arr, backend
+end
+
+# CPU: the raw PETSc array is already a Vector — return it directly.
+_make_local_array(cpu_arr, ::HostBackend) = cpu_arr
+_make_local_array(cpu_arr, b::AbstractPETScMemBackend) =
+    error("_make_local_array not implemented for backend $(typeof(b)) — " *
+          "load the corresponding GPU package (e.g. CUDA.jl)")
+
+"""
+    _release_petsc_local_array(cpu_arr, backend, vec; read, write)
+
+Restore a previously acquired local array.  Called in `finally` blocks by
+`withlocalarray!`.  Extensions overload this for GPU backends.
+"""
+function _release_petsc_local_array(
+    cpu_arr, ::HostBackend, vec::AbstractPetscVec{PLib}; read::Bool, write::Bool,
+) where {PLib}
+    if write && read
+        LibPETSc.VecRestoreArrayAndMemType(PLib, vec, cpu_arr)
+    elseif write
+        LibPETSc.VecRestoreArrayWriteAndMemType(PLib, vec, cpu_arr)
+    else
+        LibPETSc.VecRestoreArrayReadAndMemType(PLib, vec, cpu_arr)
+    end
+    return nothing
+end
+_release_petsc_local_array(cpu_arr, b::AbstractPETScMemBackend, vec; kw...) =
+    error("_release_petsc_local_array not implemented for backend $(typeof(b)) — " *
+          "load the corresponding GPU package (e.g. CUDA.jl)")
+
 """
     determine_memtype(vecs...) → Type{<:AbstractArray}
 
@@ -435,8 +495,6 @@ julia> withlocalarray!(
 end
 ```
 
-!!! note
-    `Base.finalize` is automatically called on the array.
 """
 function withlocalarray!(
     f!,
@@ -457,13 +515,27 @@ function withlocalarray!(
 ) where {A <: AbstractArray, N}
     read isa NTuple{N, Bool} || (read = ntuple(_ -> read, N))
     write isa NTuple{N, Bool} || (write = ntuple(_ -> write, N))
-
-    arrays = map(vecs, read, write) do v, r, w
-        _unsafe_localarray(A, v; read = r, write = w)
+    # Acquire all arrays first (no finalizers), then use try/finally for release.
+    # This avoids the Julia pitfall where Base.finalize + GC can both run the
+    # finalizer if the object becomes unreachable again (double-restore → crash).
+    acquired = map(vecs, read, write) do v, r, w
+        _acquire_petsc_local_array(v; read=r, write=w)
+    end
+    try
+        # Type check inside try so finally still releases on mismatch.
+        arrays = map(acquired) do (arr, cpu_arr, backend)
+            arr isa A || throw(ArgumentError(
+                "expected array of type $A but Vec returned $(typeof(arr)). " *
+                "Check that the Vec lives on the expected device."
+            ))
+            arr
+        end
+        return f!(arrays...)
+    finally
+        foreach(vecs, acquired, read, write) do v, (_, cpu_arr, backend), r, w
+            _release_petsc_local_array(cpu_arr, backend, v; read=r, write=w)
+        end
     end
-    val = f!(arrays...)
-    map(Base.finalize, arrays)
-    return val
 end
 withlocalarray!(::Type{A}, f!, vecs...; kwargs...) where {A <: AbstractArray} =
     withlocalarray!(A, f!, vecs; kwargs...)
@@ -629,16 +701,11 @@ function get_petsc_arrays(petsclib, g_fx, l_x)
     )
 end
 
-# CPU base case: attach VecRestore finalizers and return the arrays directly.
+# CPU base case: return arrays directly. restore_petsc_arrays calls VecRestore
+# explicitly — no finalizers to avoid the double-finalization crash.
 function _get_petsc_arrays_impl(
     petsclib, g_fx, l_x, ::Type, fx_arr, lx_arr, ::HostBackend, ::HostBackend,
 )
-    finalizer(fx_arr) do a
-        LibPETSc.VecRestoreArrayAndMemType(petsclib, g_fx, a)
-    end
-    finalizer(lx_arr) do a
-        LibPETSc.VecRestoreArrayReadAndMemType(petsclib, l_x, a)
-    end
     return fx_arr, lx_arr, nothing, nothing, nothing
 end
 
@@ -656,10 +723,10 @@ function restore_petsc_arrays(petsclib, g_fx, l_x, fx, lx, fx_arr, lx_arr, fx_bo
     _restore_petsc_arrays_impl(petsclib, g_fx, l_x, fx, lx, fx_arr, lx_arr, fx_bounce)
 end
 
-# CPU base case: VecRestore finalizers on fx/lx do the work.
+# CPU base case: call VecRestore directly (no finalizers).
 function _restore_petsc_arrays_impl(
     petsclib, g_fx, l_x, fx, lx, ::Nothing, ::Nothing, ::Nothing,
 )
-    Base.finalize(fx)
-    Base.finalize(lx)
+    LibPETSc.VecRestoreArrayAndMemType(petsclib, g_fx, fx)
+    LibPETSc.VecRestoreArrayReadAndMemType(petsclib, l_x, lx)
 end

From 1b9bb29fe0310279c11e1b35a421c69af52e7666 Mon Sep 17 00:00:00 2001
From: Boris Kaus <boriskaus@gmail.com>
Date: Tue, 28 Apr 2026 18:32:59 +0000
Subject: [PATCH 30/39] next attempt

---
 src/vec.jl | 46 +++++++++++-----------------------------------
 1 file changed, 11 insertions(+), 35 deletions(-)

diff --git a/src/vec.jl b/src/vec.jl
index eb1683d9..f3d977e4 100644
--- a/src/vec.jl
+++ b/src/vec.jl
@@ -440,7 +440,7 @@ function determine_memtype(vecs::AbstractPetscVec...)
     end
     allequal(mtypes) || throw(ArgumentError(
         "Vecs are on heterogeneous devices: $(unique(mtypes)). " *
-        "Use withlocalarray!(::Type{A}, ...) to handle each backend explicitly."
+        "Use withlocalarray!(f!, ::Type{A}, ...) to handle each backend explicitly."
     ))
     return _array_type(Val(first(mtypes)))
 end
@@ -461,37 +461,13 @@ _array_type(::Val{MT}) where {MT} =
 
 Apply `f!` to local array views of `vecs`.
 
-Uses `VecGetArray*AndMemType` internally.  When a GPU backend extension (e.g.
-`PETScCUDAExt`) is loaded and a Vec lives on the device, `f!` receives a device
-array (e.g. `CuArray`) — zero-copy, no host↔device transfer.  When all Vecs
-are host-resident, `f!` receives plain `Array`s.
+The optional `::Type{A}` second argument (after the do-block function) asserts
+that every array returned from `VecGetArray*AndMemType` is of type `A`.  Use
+it with do-block syntax:
 
-The optional `::Type{A}` first argument asserts that every array returned from
-`VecGetArray*AndMemType` is of type `A`.  This is useful when Vecs are known to
-be heterogeneous (e.g. some on host, some on device) and you need a type-stable
-code path: passing `CuArray` will error immediately if any Vec is host-resident,
-rather than silently returning a `Vector`.
-
-Use `read=false` if the array is write-only; `write=false` if read-only.
-
-!!! note
-    Operations inside `f!` must be compatible with the actual array type.
-    Scalar indexing is not supported on GPU arrays; use broadcasting or GPU
-    kernels instead.
-
-# Examples
-```julia-repl
-julia> withlocalarray!(x; write=true) do x
-   @. x .*= 2
-end
-
-julia> withlocalarray!(
-           x,
-           y;
-           read = (false, true),
-           write = (true, false)
-       ) do x, y
-   @. x .= 2 .+ y
+```julia
+withlocalarray!(Vector, petsc_x; write=true) do x
+    x .= 1
 end
 ```
 
@@ -502,13 +478,13 @@ function withlocalarray!(
     kwargs...,
 ) where {N}
     A = determine_memtype(vecs...)
-    return withlocalarray!(A, f!, vecs; kwargs...)
+    return withlocalarray!(f!, A, vecs; kwargs...)
 end
 withlocalarray!(f!, vecs...; kwargs...) = withlocalarray!(f!, vecs; kwargs...)
 
 function withlocalarray!(
-    ::Type{A},
     f!,
+    ::Type{A},
     vecs::NTuple{N, AbstractPetscVec};
     read::Union{Bool, NTuple{N, Bool}} = true,
     write::Union{Bool, NTuple{N, Bool}} = true,
@@ -537,8 +513,8 @@ function withlocalarray!(
         end
     end
 end
-withlocalarray!(::Type{A}, f!, vecs...; kwargs...) where {A <: AbstractArray} =
-    withlocalarray!(A, f!, vecs; kwargs...)
+withlocalarray!(f!, ::Type{A}, vecs...; kwargs...) where {A <: AbstractArray} =
+    withlocalarray!(f!, A, vecs; kwargs...)
 
 
 """

From 5508e8a61ed40679a97df60678470d6d0eb3dd4e Mon Sep 17 00:00:00 2001
From: Boris Kaus <boriskaus@gmail.com>
Date: Tue, 28 Apr 2026 18:40:34 +0000
Subject: [PATCH 31/39] vec: enum-based determine_memtype, _as_petsc_vec for
 VecPtr compat, try/finally cleanup

- determine_memtype now collects raw PetscMemType enum values and uses
  Val{MT} dispatch (_array_type) instead of backend singleton types,
  as requested by Valentin's review.
- Add _as_petsc_vec helper that converts any AbstractPetscVec to PetscVec
  (non-owning, wraps .ptr), fixing MethodError when VecPtr is passed to
  auto-generated *AndMemType ccall wrappers typed ::PetscVec.
  Applied in vec.jl and PETScCUDAExt.jl at every *AndMemType call site.
- withlocalarray! uses try/finally (no Base.finalize) to avoid double-
  execution of VecRestore* when Julia 1.12 concurrent GC races with
  explicit finalize calls.
- withlocalarray! takes f! as its first positional argument (ahead of
  ::Type{A}) for do-block compatibility.
---
 ext/PETScCUDAExt.jl | 18 ++++++++++--------
 src/vec.jl          | 41 +++++++++++++++++++++++++----------------
 2 files changed, 35 insertions(+), 24 deletions(-)

diff --git a/ext/PETScCUDAExt.jl b/ext/PETScCUDAExt.jl
index 4d9310f9..7eb6a5b8 100644
--- a/ext/PETScCUDAExt.jl
+++ b/ext/PETScCUDAExt.jl
@@ -24,12 +24,13 @@ end
 function PETSc._release_petsc_local_array(
     cpu_arr, ::CUDABackend, vec::AbstractPetscVec{PLib}; read::Bool, write::Bool,
 ) where {PLib}
+    pv = PETSc._as_petsc_vec(vec)
     if write && read
-        LibPETSc.VecRestoreArrayAndMemType(PLib, vec, cpu_arr)
+        LibPETSc.VecRestoreArrayAndMemType(PLib, pv, cpu_arr)
     elseif write
-        LibPETSc.VecRestoreArrayWriteAndMemType(PLib, vec, cpu_arr)
+        LibPETSc.VecRestoreArrayWriteAndMemType(PLib, pv, cpu_arr)
     else
-        LibPETSc.VecRestoreArrayReadAndMemType(PLib, vec, cpu_arr)
+        LibPETSc.VecRestoreArrayReadAndMemType(PLib, pv, cpu_arr)
     end
     return nothing
 end
@@ -47,13 +48,14 @@ function PETSc._wrap_localarray(
     n   = length(cpu_arr)
     ptr = reinterpret(CuPtr{T}, UInt(pointer(cpu_arr)))
     dev_arr = CUDA.unsafe_wrap(CuArray, ptr, n; own = false)
+    pv = PETSc._as_petsc_vec(vec)
     finalizer(dev_arr) do _
         if write && read
-            LibPETSc.VecRestoreArrayAndMemType(PetscLib, vec, cpu_arr)
+            LibPETSc.VecRestoreArrayAndMemType(PetscLib, pv, cpu_arr)
         elseif write
-            LibPETSc.VecRestoreArrayWriteAndMemType(PetscLib, vec, cpu_arr)
+            LibPETSc.VecRestoreArrayWriteAndMemType(PetscLib, pv, cpu_arr)
         else
-            LibPETSc.VecRestoreArrayReadAndMemType(PetscLib, vec, cpu_arr)
+            LibPETSc.VecRestoreArrayReadAndMemType(PetscLib, pv, cpu_arr)
         end
         return nothing
     end
@@ -118,8 +120,8 @@ function PETSc._restore_petsc_arrays_impl(
         CUDA.synchronize()
         copyto!(fx_arr, fx_bounce)  # D2H: copy residual back to host PETSc array
     end
-    LibPETSc.VecRestoreArrayAndMemType(petsclib, g_fx, fx_arr)
-    LibPETSc.VecRestoreArrayReadAndMemType(petsclib, l_x, lx_arr)
+    LibPETSc.VecRestoreArrayAndMemType(petsclib, PETSc._as_petsc_vec(g_fx), fx_arr)
+    LibPETSc.VecRestoreArrayReadAndMemType(petsclib, PETSc._as_petsc_vec(l_x), lx_arr)
 end
 
 end # module PETScCUDAExt
diff --git a/src/vec.jl b/src/vec.jl
index f3d977e4..2ee6c2f0 100644
--- a/src/vec.jl
+++ b/src/vec.jl
@@ -311,12 +311,13 @@ function _unsafe_localarray(
     read::Bool = true,
     write::Bool = true,
 ) where {PetscLib}
+    pv = _as_petsc_vec(vec)
     if write && read
-        cpu_arr, mtype = LibPETSc.VecGetArrayAndMemType(PetscLib, vec)
+        cpu_arr, mtype = LibPETSc.VecGetArrayAndMemType(PetscLib, pv)
     elseif write
-        cpu_arr, mtype = LibPETSc.VecGetArrayWriteAndMemType(PetscLib, vec)
+        cpu_arr, mtype = LibPETSc.VecGetArrayWriteAndMemType(PetscLib, pv)
     else
-        cpu_arr, mtype = LibPETSc.VecGetArrayReadAndMemType(PetscLib, vec)
+        cpu_arr, mtype = LibPETSc.VecGetArrayReadAndMemType(PetscLib, pv)
     end
     return _wrap_localarray(cpu_arr, _memtype_backend(mtype), vec; read, write)
 end
@@ -379,12 +380,13 @@ array for their device (e.g. `CUDABackend` → `CuArray`).
 function _acquire_petsc_local_array(
     vec::AbstractPetscVec{PLib}; read::Bool, write::Bool,
 ) where {PLib}
+    pv = _as_petsc_vec(vec)
     cpu_arr, mtype = if write && read
-        LibPETSc.VecGetArrayAndMemType(PLib, vec)
+        LibPETSc.VecGetArrayAndMemType(PLib, pv)
     elseif write
-        LibPETSc.VecGetArrayWriteAndMemType(PLib, vec)
+        LibPETSc.VecGetArrayWriteAndMemType(PLib, pv)
     else
-        LibPETSc.VecGetArrayReadAndMemType(PLib, vec)
+        LibPETSc.VecGetArrayReadAndMemType(PLib, pv)
     end
     backend = _memtype_backend(mtype)
     arr = _make_local_array(cpu_arr, backend)
@@ -406,12 +408,13 @@ Restore a previously acquired local array.  Called in `finally` blocks by
 function _release_petsc_local_array(
     cpu_arr, ::HostBackend, vec::AbstractPetscVec{PLib}; read::Bool, write::Bool,
 ) where {PLib}
+    pv = _as_petsc_vec(vec)
     if write && read
-        LibPETSc.VecRestoreArrayAndMemType(PLib, vec, cpu_arr)
+        LibPETSc.VecRestoreArrayAndMemType(PLib, pv, cpu_arr)
     elseif write
-        LibPETSc.VecRestoreArrayWriteAndMemType(PLib, vec, cpu_arr)
+        LibPETSc.VecRestoreArrayWriteAndMemType(PLib, pv, cpu_arr)
     else
-        LibPETSc.VecRestoreArrayReadAndMemType(PLib, vec, cpu_arr)
+        LibPETSc.VecRestoreArrayReadAndMemType(PLib, pv, cpu_arr)
     end
     return nothing
 end
@@ -419,8 +422,13 @@ _release_petsc_local_array(cpu_arr, b::AbstractPETScMemBackend, vec; kw...) =
     error("_release_petsc_local_array not implemented for backend $(typeof(b)) — " *
           "load the corresponding GPU package (e.g. CUDA.jl)")
 
+# The auto-generated *AndMemType wrappers are typed `x::PetscVec`, but
+# `AbstractPetscVec` also includes `VecPtr`.  Convert transparently.
+_as_petsc_vec(v::LibPETSc.PetscVec) = v
+_as_petsc_vec(v::AbstractPetscVec{PetscLib}) where {PetscLib} =
+    LibPETSc.PetscVec{PetscLib}(v.ptr)
+
 """
-    determine_memtype(vecs...) → Type{<:AbstractArray}
 
 Query the `PetscMemType` of each Vec and return the corresponding array type.
 Errors if the Vecs are on heterogeneous devices (different `PetscMemType`
@@ -434,8 +442,9 @@ Extensions overload `_array_type(::Val{MT})` for a `PetscMemType` enum value
 function determine_memtype(vecs::AbstractPetscVec...)
     mtypes = map(vecs) do v
         PetscLib = typeof(v).parameters[1]
-        arr, mtype = LibPETSc.VecGetArrayReadAndMemType(PetscLib, v)
-        LibPETSc.VecRestoreArrayReadAndMemType(PetscLib, v, arr)
+        pv = _as_petsc_vec(v)
+        arr, mtype = LibPETSc.VecGetArrayReadAndMemType(PetscLib, pv)
+        LibPETSc.VecRestoreArrayReadAndMemType(PetscLib, pv, arr)
         mtype
     end
     allequal(mtypes) || throw(ArgumentError(
@@ -669,8 +678,8 @@ See also: [`restore_petsc_arrays`](@ref)
 """
 function get_petsc_arrays(petsclib, g_fx, l_x)
     T = petsclib.PetscScalar
-    fx_arr, fx_mtype = LibPETSc.VecGetArrayAndMemType(petsclib, g_fx)
-    lx_arr, lx_mtype = LibPETSc.VecGetArrayReadAndMemType(petsclib, l_x)
+    fx_arr, fx_mtype = LibPETSc.VecGetArrayAndMemType(petsclib, _as_petsc_vec(g_fx))
+    lx_arr, lx_mtype = LibPETSc.VecGetArrayReadAndMemType(petsclib, _as_petsc_vec(l_x))
     return _get_petsc_arrays_impl(
         petsclib, g_fx, l_x, T, fx_arr, lx_arr,
         _memtype_backend(fx_mtype), _memtype_backend(lx_mtype),
@@ -703,6 +712,6 @@ end
 function _restore_petsc_arrays_impl(
     petsclib, g_fx, l_x, fx, lx, ::Nothing, ::Nothing, ::Nothing,
 )
-    LibPETSc.VecRestoreArrayAndMemType(petsclib, g_fx, fx)
-    LibPETSc.VecRestoreArrayReadAndMemType(petsclib, l_x, lx)
+    LibPETSc.VecRestoreArrayAndMemType(petsclib, _as_petsc_vec(g_fx), fx)
+    LibPETSc.VecRestoreArrayReadAndMemType(petsclib, _as_petsc_vec(l_x), lx)
 end

From 56ecf83aa3a9f12bd75b35904d51a2233e1b551a Mon Sep 17 00:00:00 2001
From: Boris Kaus <boriskaus@gmail.com>
Date: Tue, 28 Apr 2026 18:54:37 +0000
Subject: [PATCH 32/39] update name to avoid warning msg in test

---
 examples/ex51_implicit.jl | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/examples/ex51_implicit.jl b/examples/ex51_implicit.jl
index d5441f87..0f3180b4 100644
--- a/examples/ex51_implicit.jl
+++ b/examples/ex51_implicit.jl
@@ -34,7 +34,7 @@ using PETSc, MPI, Printf
 #
 #     solve_ex51_implicit(options = ["-ts_irk_nstages", "3"])
 
-mutable struct Ex51Context{PetscLib <: PETSc.LibPETSc.PetscLibType}
+mutable struct Ex51ImplicitContext{PetscLib <: PETSc.LibPETSc.PetscLibType}
     petsclib::PetscLib
 end
 
@@ -57,7 +57,7 @@ function ex51_rhs_ifunction!(
     # time derivative, and residual vectors arrive as raw `CVec` pointers. We
     # wrap them with `VecPtr(..., own = false)` so we can use PETSc.jl's array
     # helpers without taking ownership away from PETSc.
-    ctx = unsafe_pointer_to_objref(ctx_ptr)::Ex51Context
+    ctx = unsafe_pointer_to_objref(ctx_ptr)::Ex51ImplicitContext
     petsclib = ctx.petsclib
     # `own = false` since memory is managed by PETSc internally
     u = PETSc.VecPtr(petsclib, u_ptr, false)
@@ -146,7 +146,7 @@ function ex51_ijacobian!(
     B_ptr::PETSc.LibPETSc.CMat,
     ctx_ptr::Ptr{Cvoid},
 )::PETSc.LibPETSc.PetscErrorCode
-    ctx = unsafe_pointer_to_objref(ctx_ptr)::Ex51Context
+    ctx = unsafe_pointer_to_objref(ctx_ptr)::Ex51ImplicitContext
     petsclib = ctx.petsclib
     # `u` is borrowed from PETSc; do not take ownership.
     u = PETSc.VecPtr(petsclib, u_ptr, false)
@@ -308,7 +308,7 @@ function solve_ex51_implicit(;
     current_time = petsclib.PetscReal(NaN)
     error_norm = petsclib.PetscReal(NaN)
     solution = PetscScalar[]
-    ctx = Ex51Context(petsclib)
+    ctx = Ex51ImplicitContext(petsclib)
     petsc_options = PETSc.Options(petsclib; parsed_options...)
     pushed_options = false
 

From 12e4035c9cbaf4c908f2079f6f55594a1a0f5cff Mon Sep 17 00:00:00 2001
From: Boris Kaus <kaus@uni-mainz.de>
Date: Tue, 28 Apr 2026 21:01:42 +0200
Subject: [PATCH 33/39] remove log_view to have less clutter in tests

---
 examples/ex45.jl | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/examples/ex45.jl b/examples/ex45.jl
index 503f7006..8e0001e1 100644
--- a/examples/ex45.jl
+++ b/examples/ex45.jl
@@ -27,7 +27,7 @@ PETSc.set_library!(
 
 
 # Initialize PETSc
-PETSc.initialize(petsclib, log_view=true)
+PETSc.initialize(petsclib, log_view=false)
 
 function solve_ex45(N=7; da_grid_x=7, da_grid_y=7, da_grid_z=7, kwargs...)
     comm = MPI.COMM_WORLD

From 364432e4a3aca72fccdc6ca68c04860125ecb41d Mon Sep 17 00:00:00 2001
From: Boris Kaus <61824822+boriskaus@users.noreply.github.com>
Date: Tue, 28 Apr 2026 22:52:58 +0200
Subject: [PATCH 34/39] Update src/ts.jl

Co-authored-by: Valentin Churavy <v.churavy@gmail.com>
---
 src/ts.jl | 1 +
 1 file changed, 1 insertion(+)

diff --git a/src/ts.jl b/src/ts.jl
index c084e438..24e8643e 100644
--- a/src/ts.jl
+++ b/src/ts.jl
@@ -225,6 +225,7 @@ function LibPETSc.TSAdaptSetType(
 )
     s = String(type)
     GC.@preserve s LibPETSc.TSAdaptSetType(petsclib, adapt, Base.unsafe_convert(Ptr{Cchar}, s))
+    return nothing
 end
 
 """

From 5b86ae82b2476c8c6506643e3491d96c5366dc30 Mon Sep 17 00:00:00 2001
From: Boris Kaus <61824822+boriskaus@users.noreply.github.com>
Date: Tue, 28 Apr 2026 22:54:31 +0200
Subject: [PATCH 35/39] Update ext/PETScCUDAExt.jl

Co-authored-by: Valentin Churavy <v.churavy@gmail.com>
---
 ext/PETScCUDAExt.jl | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/ext/PETScCUDAExt.jl b/ext/PETScCUDAExt.jl
index 7eb6a5b8..942bb8f5 100644
--- a/ext/PETScCUDAExt.jl
+++ b/ext/PETScCUDAExt.jl
@@ -10,7 +10,7 @@ using CUDA
 struct CUDABackend <: PETSc.AbstractPETScMemBackend end
 
 PETSc._memtype_backend(::Val{PETSC_MEMTYPE_DEVICE}) = CUDABackend()
-PETSc._array_type(::Val{LibPETSc.PETSC_MEMTYPE_DEVICE}) = CuArray
+PETSc._array_type(::Val{PETSC_MEMTYPE_DEVICE}) = CuArray
 
 # ── No-finalizer acquire/release for withlocalarray! ─────────────────────────
 

From fea971a7026d4b9a3c3863fedccc9d74b18c1c3c Mon Sep 17 00:00:00 2001
From: Boris Kaus <kaus@uni-mainz.de>
Date: Fri, 15 May 2026 11:39:08 +0200
Subject: [PATCH 36/39] ex19: use withlocalarray! API, drop manual CUDA
 wrapping

---
 examples/ex19.jl | 136 +++++++++++++++++++++--------------------------
 1 file changed, 61 insertions(+), 75 deletions(-)

diff --git a/examples/ex19.jl b/examples/ex19.jl
index 13ead70b..4c42dc11 100644
--- a/examples/ex19.jl
+++ b/examples/ex19.jl
@@ -68,10 +68,6 @@ using KernelAbstractions
 
 if useCUDA
     using CUDA
-    import CUDA: CuArray, CuPtr, unsafe_wrap
-    const backend = CUDABackend()
-else
-    const backend = KernelAbstractions.CPU()
 end
 
 
@@ -217,14 +213,6 @@ end
 # Captures from module scope: useCUDA, backend, CuArray, CuPtr,
 #   scatter_perturb_kernel!, fd_accumulate_kernel!, KernelAbstractions.
 #
-function maybe_wrap_device(arr, mtype, n, ::Type{T}, useCUDA) where T
-    if useCUDA && mtype == LibPETSc.PETSC_MEMTYPE_DEVICE
-        return unsafe_wrap(CuArray, CuPtr{T}(UInt64(pointer(arr))), n)
-    else
-        return arr
-    end
-end
-
 function fd_coloring_jac!(
     petsclib,
     snes,
@@ -232,7 +220,6 @@ function fd_coloring_jac!(
     f0_vec, f1_vec, x_pert_vec,
     val_dev  :: AbstractVector{T},
     n_colors    :: Int,
-    n_local_dofs :: Int,
     perturb_cols_dev,
     coo_idxs_dev,
     local_rows_dev,
@@ -240,34 +227,31 @@ function fd_coloring_jac!(
     inv_h :: T,
 ) where T
     LibPETSc.SNESComputeFunction(petsclib, snes, g_x, f0_vec)
-    f0_arr, f0_mtype = LibPETSc.VecGetArrayReadAndMemType(petsclib, f0_vec)
-    f0_dev = maybe_wrap_device(f0_arr, f0_mtype, n_local_dofs, T, useCUDA)
-
-    for c in 1:n_colors
-        isempty(perturb_cols_dev[c]) && continue
-
-        LibPETSc.VecCopy(petsclib, g_x, x_pert_vec)
-        xp_arr, xp_mtype = LibPETSc.VecGetArrayAndMemType(petsclib, x_pert_vec)
-        xp_dev = maybe_wrap_device(xp_arr, xp_mtype, n_local_dofs, T, useCUDA)
-        scatter_perturb_kernel!(backend, 64)(
-            xp_dev, perturb_cols_dev[c], h_eps;
-            ndrange = length(perturb_cols_dev[c]))
-        KernelAbstractions.synchronize(backend)
-        LibPETSc.VecRestoreArrayAndMemType(petsclib, x_pert_vec, xp_arr)
-
-        LibPETSc.SNESComputeFunction(petsclib, snes, x_pert_vec, f1_vec)
-
-        f1_arr, f1_mtype = LibPETSc.VecGetArrayReadAndMemType(petsclib, f1_vec)
-        f1_dev = maybe_wrap_device(f1_arr, f1_mtype, n_local_dofs, T, useCUDA)
-        fd_accumulate_kernel!(backend, 64)(
-            val_dev, f0_dev, f1_dev,
-            coo_idxs_dev[c], local_rows_dev[c], inv_h;
-            ndrange = length(coo_idxs_dev[c]))
-        KernelAbstractions.synchronize(backend)
-        LibPETSc.VecRestoreArrayReadAndMemType(petsclib, f1_vec, f1_arr)
+    PETSc.withlocalarray!(f0_vec; read=true, write=false) do f0
+        for c in 1:n_colors
+            isempty(perturb_cols_dev[c]) && continue
+
+            LibPETSc.VecCopy(petsclib, g_x, x_pert_vec)
+            PETSc.withlocalarray!(x_pert_vec; read=true, write=true) do xp
+                kb = KernelAbstractions.get_backend(xp)
+                scatter_perturb_kernel!(kb, 64)(
+                    xp, perturb_cols_dev[c], h_eps;
+                    ndrange = length(perturb_cols_dev[c]))
+                KernelAbstractions.synchronize(kb)
+            end
+
+            LibPETSc.SNESComputeFunction(petsclib, snes, x_pert_vec, f1_vec)
+
+            PETSc.withlocalarray!(f1_vec; read=true, write=false) do f1
+                kb = KernelAbstractions.get_backend(f1)
+                fd_accumulate_kernel!(kb, 64)(
+                    val_dev, f0, f1,
+                    coo_idxs_dev[c], local_rows_dev[c], inv_h;
+                    ndrange = length(coo_idxs_dev[c]))
+                KernelAbstractions.synchronize(kb)
+            end
+        end
     end
-
-    LibPETSc.VecRestoreArrayReadAndMemType(petsclib, f0_vec, f0_arr)
     return nothing
 end
 
@@ -351,14 +335,6 @@ PETSc.setfunction!(snes, r) do g_fx, snes, g_x
     l_x = PETSc.DMLocalVec(da)
     PETSc.dm_global_to_local!(g_x, l_x, da, PETSc.INSERT_VALUES)
 
-    # Get arrays for the output (g_fx) and ghost-padded input (l_x) Vecs.
-    # On GPU, returns CuArray wrappers (zero-copy when both Vecs are device-
-    # resident) together with the raw PETSc handles needed for the restore call.
-    # On CPU, returns plain Array views backed by VecGetArray.
-    # fx_bounce is a GPU scratch buffer used when g_fx is host-resident; it is
-    # copied back D2H by restore_petsc_arrays after the kernel completes.
-    fx, lx, fx_arr, lx_arr, fx_bounce = PETSc.get_petsc_arrays(petsclib, g_fx, l_x)
-
     corners       = PETSc.getcorners(da)
     ghost_corners = PETSc.getghostcorners(da)
 
@@ -369,14 +345,7 @@ PETSc.setfunction!(snes, r) do g_fx, snes, g_x
 
     nx_own = xe  - xs  + 1;  ny_own = ye  - ys  + 1
     nx_g   = xeg - xsg + 1;  ny_g   = yeg - ysg + 1
-
-    # Plain [dof, x, y] arrays — no OffsetArray, safe for KA on GPU
-    x_par = reshape(lx, 4, nx_g,   ny_g)
-    f_par = reshape(fx, 4, nx_own, ny_own)
-
-    # Ghost offset: ghost-array index for owned start = 1 + ox (0 at domain wall)
-    ox = xs - xsg
-    oy = ys - ysg
+    ox = xs - xsg;  oy = ys - ysg
 
     # Recompute grid metrics from the DM so this callback is correct on every
     # MG level (coarsen/refine changes mx/my; capturing outer-scope values
@@ -385,20 +354,26 @@ PETSc.setfunction!(snes, r) do g_fx, snes, g_x
     mx_    = Int(info_.global_size[1])
     my_    = Int(info_.global_size[2])
     dhx_   = _T(mx_ - 1);   dhy_   = _T(my_ - 1)
-    hx_    = one(_T) / dhx_; hy_   = one(_T) / dhy_
+    hx_    = one(_T) / dhx_; hy_    = one(_T) / dhy_
     hydhx_ = hy_ * dhx_;    hxdhy_ = hx_ * dhy_
     lid_   = _T(1) / dhx_    # lidvelocity = 1/(mx-1)
 
-    cavity_residual_kernel!(backend, 64)(
-        f_par, x_par,
-        dhx_, dhy_, hx_, hy_, hydhx_, hxdhy_,
-        user.grashof, user.prandtl, lid_,
-        mx_, my_, xs, ys, ox, oy;
-        ndrange = (nx_own, ny_own),
-    )
-    KernelAbstractions.synchronize(backend)
+    # withlocalarray! handles CPU/GPU dispatch: returns Vector on host, CuArray
+    # on device.  Both vecs must be on the same device (guaranteed per MG level).
+    PETSc.withlocalarray!(g_fx, l_x; read=(true, true), write=(true, false)) do fx, lx
+        kern  = KernelAbstractions.get_backend(fx)
+        x_par = reshape(lx, 4, nx_g,   ny_g)
+        f_par = reshape(fx, 4, nx_own, ny_own)
+        cavity_residual_kernel!(kern, 64)(
+            f_par, x_par,
+            dhx_, dhy_, hx_, hy_, hydhx_, hxdhy_,
+            user.grashof, user.prandtl, lid_,
+            mx_, my_, xs, ys, ox, oy;
+            ndrange = (nx_own, ny_own),
+        )
+        KernelAbstractions.synchronize(kern)
+    end
 
-    PETSc.restore_petsc_arrays(petsclib, g_fx, l_x, fx, lx, fx_arr, lx_arr, fx_bounce)
     PETSc.destroy(l_x)
     return PetscInt(0)
 end
@@ -476,7 +451,7 @@ PETSc.setjacobian!(snes, J) do Jmat, actual_snes, g_x
     fd_coloring_jac!(
         petsclib, actual_snes, g_x,
         f0_vec, f1_vec, x_pert_vec, val_dev,
-        n_colors, n_local_dofs,
+        n_colors,
         perturb_cols_dev, coo_idxs_dev, local_rows_dev,
         h_eps, inv_h,
     )
@@ -516,8 +491,17 @@ if MPI.Comm_rank(comm) == 0
 end
 
 # ── Cleanup ───────────────────────────────────────────────────────────────────
-# Run a full GC now so any lingering VecRestoreArray finalizers from
-# withlocalarray! run while PETSc is still valid, then barrier all ranks.
+# Destroy the PETSc options database stored on the SNES explicitly.  Its GC
+# finalizer calls PetscOptionsDestroy (which touches MPI internally); if GC
+# runs it after PETSc/MPI are finalized the process crashes.  Destroying here
+# while PETSc is still active is always safe.
+if !isnothing(snes.opts)
+    PETSc.destroy(snes.opts)
+    snes.opts = nothing
+end
+
+# Run a full GC so any remaining PETSc-object finalizers fire while PETSc is
+# still active.
 GC.gc(true)
 MPI.Barrier(comm)
 
@@ -533,11 +517,13 @@ PETSc.destroy(r)
 PETSc.destroy(da)
 PETSc.finalize(petsclib)
 
-# On macOS ARM64 with MPICH ch4:ofi, MPICH's C atexit handler crashes during
-# process teardown (SIGSEGV in libfabric/OFI cleanup).  Using quick_exit(0)
-# after explicitly finalizing PETSc and MPI bypasses all C atexit() handlers
-# (while still running at_quick_exit() handlers) and avoids the crash.
-# All MPI communication is already complete at this point.
 MPI.Barrier(comm)
 MPI.Finalize()
-ccall(:quick_exit, Cvoid, (Cint,), 0)
\ No newline at end of file
+
+# On macOS ARM64 with MPICH ch4:ofi, MPICH's C atexit handler crashes during
+# process teardown (SIGSEGV in libfabric/OFI cleanup).  quick_exit(0) bypasses
+# all C atexit() handlers and avoids the crash.  Skip when running interactively
+# (e.g. include("ex19.jl") in the REPL) so the Julia session isn't killed.
+if !isinteractive()
+    ccall(:quick_exit, Cvoid, (Cint,), 0)
+end
\ No newline at end of file

From e97aed543324c92f88ab0c0cebe618798a59607d Mon Sep 17 00:00:00 2001
From: Boris Kaus <kaus@uni-mainz.de>
Date: Fri, 15 May 2026 11:43:57 +0200
Subject: [PATCH 37/39] =?UTF-8?q?ext/vec:=20rename=20CUDABackend=E2=86=92C?=
 =?UTF-8?q?UDAMemBackend,=20drop=20=5F=20prefix=20from=20internal=20functi?=
 =?UTF-8?q?ons?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 ext/PETScCUDAExt.jl |  38 +++++++--------
 src/vec.jl          | 112 ++++++++++++++++++++++----------------------
 2 files changed, 75 insertions(+), 75 deletions(-)

diff --git a/ext/PETScCUDAExt.jl b/ext/PETScCUDAExt.jl
index 942bb8f5..74bb1f00 100644
--- a/ext/PETScCUDAExt.jl
+++ b/ext/PETScCUDAExt.jl
@@ -7,24 +7,24 @@ using CUDA
 
 # ── CUDA memory backend ───────────────────────────────────────────────────────
 
-struct CUDABackend <: PETSc.AbstractPETScMemBackend end
+struct CUDAMemBackend <: PETSc.AbstractPETScMemBackend end
 
-PETSc._memtype_backend(::Val{PETSC_MEMTYPE_DEVICE}) = CUDABackend()
-PETSc._array_type(::Val{PETSC_MEMTYPE_DEVICE}) = CuArray
+PETSc.memtype_backend(::Val{PETSC_MEMTYPE_DEVICE}) = CUDAMemBackend()
+PETSc.array_type(::Val{PETSC_MEMTYPE_DEVICE}) = CuArray
 
 # ── No-finalizer acquire/release for withlocalarray! ─────────────────────────
 
-function PETSc._make_local_array(cpu_arr, ::CUDABackend)
+function PETSc.make_local_array(cpu_arr, ::CUDAMemBackend)
     T   = eltype(cpu_arr)
     n   = length(cpu_arr)
     ptr = reinterpret(CuPtr{T}, UInt(pointer(cpu_arr)))
     return CUDA.unsafe_wrap(CuArray, ptr, n; own = false)
 end
 
-function PETSc._release_petsc_local_array(
-    cpu_arr, ::CUDABackend, vec::AbstractPetscVec{PLib}; read::Bool, write::Bool,
+function PETSc.release_petsc_local_array(
+    cpu_arr, ::CUDAMemBackend, vec::AbstractPetscVec{PLib}; read::Bool, write::Bool,
 ) where {PLib}
-    pv = PETSc._as_petsc_vec(vec)
+    pv = PETSc.as_petsc_vec(vec)
     if write && read
         LibPETSc.VecRestoreArrayAndMemType(PLib, pv, cpu_arr)
     elseif write
@@ -40,15 +40,15 @@ end
 # No longer called by withlocalarray! (which uses _acquire/_release instead).
 # Retained in case external code calls _unsafe_localarray directly.
 
-function PETSc._wrap_localarray(
-    cpu_arr, ::CUDABackend, vec::AbstractPetscVec{PetscLib};
+function PETSc.wrap_localarray(
+    cpu_arr, ::CUDAMemBackend, vec::AbstractPetscVec{PetscLib};
     read::Bool, write::Bool,
 ) where {PetscLib}
     T   = eltype(cpu_arr)
     n   = length(cpu_arr)
     ptr = reinterpret(CuPtr{T}, UInt(pointer(cpu_arr)))
     dev_arr = CUDA.unsafe_wrap(CuArray, ptr, n; own = false)
-    pv = PETSc._as_petsc_vec(vec)
+    pv = PETSc.as_petsc_vec(vec)
     finalizer(dev_arr) do _
         if write && read
             LibPETSc.VecRestoreArrayAndMemType(PetscLib, pv, cpu_arr)
@@ -66,7 +66,7 @@ end
 #
 # Two methods cover all GPU sub-cases:
 #
-#   CUDABackend × CUDABackend → both Vecs on device: zero-copy wrap, no bounce
+#   CUDAMemBackend × CUDAMemBackend → both Vecs on device: zero-copy wrap, no bounce
 #   any other mix             → at least one Vec is host-resident:
 #                               lx is wrapped zero-copy if on device, or
 #                               copied H2D if on host;
@@ -77,9 +77,9 @@ end
 # reaches these methods.
 
 # Both Vecs on the device: zero-copy wrap, no scratch needed.
-function PETSc._get_petsc_arrays_impl(
+function PETSc.get_petsc_arrays_impl(
     petsclib, g_fx, l_x, ::Type{T}, fx_arr, lx_arr,
-    ::CUDABackend, ::CUDABackend,
+    ::CUDAMemBackend, ::CUDAMemBackend,
 ) where {T}
     fx = CUDA.unsafe_wrap(CuArray,
         reinterpret(CuPtr{T}, UInt(pointer(fx_arr))), length(fx_arr))
@@ -89,13 +89,13 @@ function PETSc._get_petsc_arrays_impl(
 end
 
 # At least one Vec is host-resident (e.g. MG coarser levels, FD-coloring path).
-# Catch-all: less specific than (CUDABackend, CUDABackend), so Julia prefers
+# Catch-all: less specific than (CUDAMemBackend, CUDAMemBackend), so Julia prefers
 # the method above when both are on the device.
-function PETSc._get_petsc_arrays_impl(
+function PETSc.get_petsc_arrays_impl(
     petsclib, g_fx, l_x, ::Type{T}, fx_arr, lx_arr,
     fx_b::PETSc.AbstractPETScMemBackend, lx_b::PETSc.AbstractPETScMemBackend,
 ) where {T}
-    lx_gpu = if lx_b isa CUDABackend
+    lx_gpu = if lx_b isa CUDAMemBackend
         CUDA.unsafe_wrap(CuArray,
             reinterpret(CuPtr{T}, UInt(pointer(lx_arr))), length(lx_arr))
     else
@@ -113,15 +113,15 @@ end
 #   - if fx_bounce !== nothing, sync the device and copy the scratch D2H
 #   - call VecRestoreArray*AndMemType on both raw PETSc arrays
 
-function PETSc._restore_petsc_arrays_impl(
+function PETSc.restore_petsc_arrays_impl(
     petsclib, g_fx, l_x, fx::CuArray, lx, fx_arr, lx_arr, fx_bounce,
 )
     if fx_bounce !== nothing
         CUDA.synchronize()
         copyto!(fx_arr, fx_bounce)  # D2H: copy residual back to host PETSc array
     end
-    LibPETSc.VecRestoreArrayAndMemType(petsclib, PETSc._as_petsc_vec(g_fx), fx_arr)
-    LibPETSc.VecRestoreArrayReadAndMemType(petsclib, PETSc._as_petsc_vec(l_x), lx_arr)
+    LibPETSc.VecRestoreArrayAndMemType(petsclib, PETSc.as_petsc_vec(g_fx), fx_arr)
+    LibPETSc.VecRestoreArrayReadAndMemType(petsclib, PETSc.as_petsc_vec(l_x), lx_arr)
 end
 
 end # module PETScCUDAExt
diff --git a/src/vec.jl b/src/vec.jl
index 2ee6c2f0..433fe5d5 100644
--- a/src/vec.jl
+++ b/src/vec.jl
@@ -264,15 +264,15 @@ end
 
 # ── Memory backend type hierarchy ─────────────────────────────────────────────
 #
-# Extensions add their own backend singletons (e.g. `CUDABackend`) and overload
-# `_memtype_backend(::Val{PETSC_MEMTYPE_DEVICE})` to return them.  The base
+# Extensions add their own backend singletons (e.g. `CUDAMemBackend`) and overload
+# `memtype_backend(::Val{PETSC_MEMTYPE_DEVICE})` to return them.  The base
 # package handles only `PETSC_MEMTYPE_HOST` → `HostBackend`.
 
 """
     AbstractPETScMemBackend
 
 Abstract supertype for PETSc memory backends.  The base package defines only
-[`HostBackend`](@ref).  GPU extensions add their own (e.g. `CUDABackend`).
+[`HostBackend`](@ref).  GPU extensions add their own (e.g. `CUDAMemBackend`).
 """
 abstract type AbstractPETScMemBackend end
 
@@ -284,23 +284,23 @@ Singleton dispatch type representing host (CPU) memory.
 struct HostBackend <: AbstractPETScMemBackend end
 
 """
-    _memtype_backend(mtype::PetscMemType) → AbstractPETScMemBackend
+    memtype_backend(mtype::PetscMemType) → AbstractPETScMemBackend
 
 Convert a `PetscMemType` runtime enum value to a singleton dispatch type.
-GPU extensions overload `_memtype_backend(::Val{MT})` for their specific
+GPU extensions overload `memtype_backend(::Val{MT})` for their specific
 `PetscMemType` values (e.g. `PETSC_MEMTYPE_DEVICE` for CUDA).
 """
-_memtype_backend(::Val{LibPETSc.PETSC_MEMTYPE_HOST}) = HostBackend()
-_memtype_backend(::Val{MT}) where {MT} =
+memtype_backend(::Val{LibPETSc.PETSC_MEMTYPE_HOST}) = HostBackend()
+memtype_backend(::Val{MT}) where {MT} =
     error("No GPU backend loaded for PetscMemType $MT — load CUDA.jl, AMDGPU.jl, …")
-_memtype_backend(mt::LibPETSc.PetscMemType) = _memtype_backend(Val(mt))
+memtype_backend(mt::LibPETSc.PetscMemType) = memtype_backend(Val(mt))
 
 # ── Device-aware local array access ───────────────────────────────────────────
 #
 # `_unsafe_localarray` is the unified entry point: it calls
 # `VecGetArray*AndMemType`, converts the returned `PetscMemType` to a backend
-# singleton via `_memtype_backend`, and dispatches to `_wrap_localarray`.
-# GPU extensions add `_wrap_localarray` methods for their own backend types.
+# singleton via `memtype_backend`, and dispatches to `wrap_localarray`.
+# GPU extensions add `wrap_localarray` methods for their own backend types.
 #
 # The typed overload `_unsafe_localarray(::Type{A}, vec; ...)` additionally
 # asserts that the returned array is of type `A`, giving a clear error when a
@@ -311,7 +311,7 @@ function _unsafe_localarray(
     read::Bool = true,
     write::Bool = true,
 ) where {PetscLib}
-    pv = _as_petsc_vec(vec)
+    pv = as_petsc_vec(vec)
     if write && read
         cpu_arr, mtype = LibPETSc.VecGetArrayAndMemType(PetscLib, pv)
     elseif write
@@ -319,7 +319,7 @@ function _unsafe_localarray(
     else
         cpu_arr, mtype = LibPETSc.VecGetArrayReadAndMemType(PetscLib, pv)
     end
-    return _wrap_localarray(cpu_arr, _memtype_backend(mtype), vec; read, write)
+    return wrap_localarray(cpu_arr, memtype_backend(mtype), vec; read, write)
 end
 
 function _unsafe_localarray(
@@ -337,7 +337,7 @@ function _unsafe_localarray(
     ))
 end
 
-function _wrap_localarray(
+function wrap_localarray(
     cpu_arr, ::HostBackend, vec::AbstractPetscVec{PetscLib};
     read::Bool, write::Bool,
 ) where {PetscLib}
@@ -355,32 +355,32 @@ function _wrap_localarray(
 end
 
 # Fallback: no backend loaded for this PetscMemType.
-function _wrap_localarray(cpu_arr, b::AbstractPETScMemBackend, vec; kw...)
-    error("_wrap_localarray not implemented for backend $(typeof(b)) — " *
+function wrap_localarray(cpu_arr, b::AbstractPETScMemBackend, vec; kw...)
+    error("wrap_localarray not implemented for backend $(typeof(b)) — " *
           "load the corresponding GPU package (e.g. CUDA.jl)")
 end
 
 # ── No-finalizer acquire/release ─────────────────────────────────────────────
 #
-# `withlocalarray!` uses these instead of the finalizer-based `_wrap_localarray`
+# `withlocalarray!` uses these instead of the finalizer-based `wrap_localarray`
 # to avoid a documented Julia pitfall: after `Base.finalize(x)` is called, if
 # `x` later becomes unreachable GC may invoke the finalizer *again*, leading to
 # a double VecRestore call on an already-freed Vec (→ SIGSEGV).
 # `try/finally` provides deterministic, single-execution cleanup.
 
 """
-    _acquire_petsc_local_array(vec; read, write) -> (arr, cpu_arr, backend)
+    acquire_petsc_local_array(vec; read, write) -> (arr, cpu_arr, backend)
 
 Get the local array from `vec` via `VecGetArray*AndMemType` without
 registering a Julia finalizer.  Returns the user-visible array, the raw PETSc
 cpu_arr needed for restore, and the backend singleton.
-Extensions overload `_make_local_array(cpu_arr, backend)` to wrap the raw
-array for their device (e.g. `CUDABackend` → `CuArray`).
+Extensions overload `make_local_array(cpu_arr, backend)` to wrap the raw
+array for their device (e.g. `CUDAMemBackend` → `CuArray`).
 """
-function _acquire_petsc_local_array(
+function acquire_petsc_local_array(
     vec::AbstractPetscVec{PLib}; read::Bool, write::Bool,
 ) where {PLib}
-    pv = _as_petsc_vec(vec)
+    pv = as_petsc_vec(vec)
     cpu_arr, mtype = if write && read
         LibPETSc.VecGetArrayAndMemType(PLib, pv)
     elseif write
@@ -388,27 +388,27 @@ function _acquire_petsc_local_array(
     else
         LibPETSc.VecGetArrayReadAndMemType(PLib, pv)
     end
-    backend = _memtype_backend(mtype)
-    arr = _make_local_array(cpu_arr, backend)
+    backend = memtype_backend(mtype)
+    arr = make_local_array(cpu_arr, backend)
     return arr, cpu_arr, backend
 end
 
 # CPU: the raw PETSc array is already a Vector — return it directly.
-_make_local_array(cpu_arr, ::HostBackend) = cpu_arr
-_make_local_array(cpu_arr, b::AbstractPETScMemBackend) =
-    error("_make_local_array not implemented for backend $(typeof(b)) — " *
+make_local_array(cpu_arr, ::HostBackend) = cpu_arr
+make_local_array(_, b::AbstractPETScMemBackend) =
+    error("make_local_array not implemented for backend $(typeof(b)) — " *
           "load the corresponding GPU package (e.g. CUDA.jl)")
 
 """
-    _release_petsc_local_array(cpu_arr, backend, vec; read, write)
+    release_petsc_local_array(cpu_arr, backend, vec; read, write)
 
 Restore a previously acquired local array.  Called in `finally` blocks by
 `withlocalarray!`.  Extensions overload this for GPU backends.
 """
-function _release_petsc_local_array(
+function release_petsc_local_array(
     cpu_arr, ::HostBackend, vec::AbstractPetscVec{PLib}; read::Bool, write::Bool,
 ) where {PLib}
-    pv = _as_petsc_vec(vec)
+    pv = as_petsc_vec(vec)
     if write && read
         LibPETSc.VecRestoreArrayAndMemType(PLib, pv, cpu_arr)
     elseif write
@@ -418,14 +418,14 @@ function _release_petsc_local_array(
     end
     return nothing
 end
-_release_petsc_local_array(cpu_arr, b::AbstractPETScMemBackend, vec; kw...) =
-    error("_release_petsc_local_array not implemented for backend $(typeof(b)) — " *
+release_petsc_local_array(cpu_arr, b::AbstractPETScMemBackend, vec; kw...) =
+    error("release_petsc_local_array not implemented for backend $(typeof(b)) — " *
           "load the corresponding GPU package (e.g. CUDA.jl)")
 
 # The auto-generated *AndMemType wrappers are typed `x::PetscVec`, but
 # `AbstractPetscVec` also includes `VecPtr`.  Convert transparently.
-_as_petsc_vec(v::LibPETSc.PetscVec) = v
-_as_petsc_vec(v::AbstractPetscVec{PetscLib}) where {PetscLib} =
+as_petsc_vec(v::LibPETSc.PetscVec) = v
+as_petsc_vec(v::AbstractPetscVec{PetscLib}) where {PetscLib} =
     LibPETSc.PetscVec{PetscLib}(v.ptr)
 
 """
@@ -435,14 +435,14 @@ Errors if the Vecs are on heterogeneous devices (different `PetscMemType`
 values), since a single `withlocalarray!` call cannot handle mixed backends.
 Returns `Vector` when all Vecs are host-resident.
 
-Extensions overload `_array_type(::Val{MT})` for a `PetscMemType` enum value
+Extensions overload `array_type(::Val{MT})` for a `PetscMemType` enum value
 `MT` to register the corresponding array type (e.g. `PETSC_MEMTYPE_DEVICE` →
 `CuArray`).
 """
 function determine_memtype(vecs::AbstractPetscVec...)
     mtypes = map(vecs) do v
         PetscLib = typeof(v).parameters[1]
-        pv = _as_petsc_vec(v)
+        pv = as_petsc_vec(v)
         arr, mtype = LibPETSc.VecGetArrayReadAndMemType(PetscLib, pv)
         LibPETSc.VecRestoreArrayReadAndMemType(PetscLib, pv, arr)
         mtype
@@ -451,13 +451,13 @@ function determine_memtype(vecs::AbstractPetscVec...)
         "Vecs are on heterogeneous devices: $(unique(mtypes)). " *
         "Use withlocalarray!(f!, ::Type{A}, ...) to handle each backend explicitly."
     ))
-    return _array_type(Val(first(mtypes)))
+    return array_type(Val(first(mtypes)))
 end
 
-_array_type(::Val{LibPETSc.PETSC_MEMTYPE_HOST}) = Vector
-_array_type(::Val{MT}) where {MT} =
+array_type(::Val{LibPETSc.PETSC_MEMTYPE_HOST}) = Vector
+array_type(::Val{MT}) where {MT} =
     error("No array type registered for PetscMemType $MT — load the corresponding GPU package (e.g. CUDA.jl)")
-# GPU extensions add: _array_type(::Val{LibPETSc.PETSC_MEMTYPE_DEVICE}) = CuArray
+# GPU extensions add: array_type(::Val{LibPETSc.PETSC_MEMTYPE_DEVICE}) = CuArray
 
 """
     withlocalarray!(
@@ -504,7 +504,7 @@ function withlocalarray!(
     # This avoids the Julia pitfall where Base.finalize + GC can both run the
     # finalizer if the object becomes unreachable again (double-restore → crash).
     acquired = map(vecs, read, write) do v, r, w
-        _acquire_petsc_local_array(v; read=r, write=w)
+        acquire_petsc_local_array(v; read=r, write=w)
     end
     try
         # Type check inside try so finally still releases on mismatch.
@@ -518,7 +518,7 @@ function withlocalarray!(
         return f!(arrays...)
     finally
         foreach(vecs, acquired, read, write) do v, (_, cpu_arr, backend), r, w
-            _release_petsc_local_array(cpu_arr, backend, v; read=r, write=w)
+            release_petsc_local_array(cpu_arr, backend, v; read=r, write=w)
         end
     end
 end
@@ -649,10 +649,10 @@ end
 #
 # `get_petsc_arrays` calls `VecGetArrayAndMemType` on both Vecs, converts the
 # returned `PetscMemType` values to backend singletons, and dispatches to
-# `_get_petsc_arrays_impl`.  The base package handles the pure-CPU case
-# (HostBackend × HostBackend).  GPU extensions add `_get_petsc_arrays_impl`
+# `get_petsc_arrays_impl`.  The base package handles the pure-CPU case
+# (HostBackend × HostBackend).  GPU extensions add `get_petsc_arrays_impl`
 # methods for their backend combinations and a matching
-# `_restore_petsc_arrays_impl` method dispatched by `restore_petsc_arrays`.
+# `restore_petsc_arrays_impl` method dispatched by `restore_petsc_arrays`.
 #
 # Return tuple:  (fx, lx, fx_arr, lx_arr, fx_bounce)
 #   CPU:  fx, lx are plain Arrays with VecRestore finalizers;
@@ -665,7 +665,7 @@ end
 
 Return arrays for `g_fx` (read-write) and `l_x` (read-only) suitable for
 passing to a compute kernel.  Dispatches on the memory location of each Vec
-via `_memtype_backend`.
+via `memtype_backend`.
 
 On the pure-CPU path (`HostBackend × HostBackend`) `fx`/`lx` are plain
 `Array`s and `fx_arr = lx_arr = fx_bounce = nothing`.  When a GPU backend
@@ -678,17 +678,17 @@ See also: [`restore_petsc_arrays`](@ref)
 """
 function get_petsc_arrays(petsclib, g_fx, l_x)
     T = petsclib.PetscScalar
-    fx_arr, fx_mtype = LibPETSc.VecGetArrayAndMemType(petsclib, _as_petsc_vec(g_fx))
-    lx_arr, lx_mtype = LibPETSc.VecGetArrayReadAndMemType(petsclib, _as_petsc_vec(l_x))
-    return _get_petsc_arrays_impl(
+    fx_arr, fx_mtype = LibPETSc.VecGetArrayAndMemType(petsclib, as_petsc_vec(g_fx))
+    lx_arr, lx_mtype = LibPETSc.VecGetArrayReadAndMemType(petsclib, as_petsc_vec(l_x))
+    return get_petsc_arrays_impl(
         petsclib, g_fx, l_x, T, fx_arr, lx_arr,
-        _memtype_backend(fx_mtype), _memtype_backend(lx_mtype),
+        memtype_backend(fx_mtype), memtype_backend(lx_mtype),
     )
 end
 
 # CPU base case: return arrays directly. restore_petsc_arrays calls VecRestore
 # explicitly — no finalizers to avoid the double-finalization crash.
-function _get_petsc_arrays_impl(
+function get_petsc_arrays_impl(
     petsclib, g_fx, l_x, ::Type, fx_arr, lx_arr, ::HostBackend, ::HostBackend,
 )
     return fx_arr, lx_arr, nothing, nothing, nothing
@@ -699,19 +699,19 @@ end
 
 Restore PETSc Vecs after a kernel launched via [`get_petsc_arrays`](@ref).
 
-Dispatches to `_restore_petsc_arrays_impl`.  On the CPU path (`fx_arr`,
+Dispatches to `restore_petsc_arrays_impl`.  On the CPU path (`fx_arr`,
 `lx_arr`, `fx_bounce` all `nothing`) this simply finalizes `fx` and `lx`,
 triggering the registered `VecRestoreArray*AndMemType` finalizers.  GPU backend
-extensions add a `_restore_petsc_arrays_impl` method for their array types.
+extensions add a `restore_petsc_arrays_impl` method for their array types.
 """
 function restore_petsc_arrays(petsclib, g_fx, l_x, fx, lx, fx_arr, lx_arr, fx_bounce)
-    _restore_petsc_arrays_impl(petsclib, g_fx, l_x, fx, lx, fx_arr, lx_arr, fx_bounce)
+    restore_petsc_arrays_impl(petsclib, g_fx, l_x, fx, lx, fx_arr, lx_arr, fx_bounce)
 end
 
 # CPU base case: call VecRestore directly (no finalizers).
-function _restore_petsc_arrays_impl(
+function restore_petsc_arrays_impl(
     petsclib, g_fx, l_x, fx, lx, ::Nothing, ::Nothing, ::Nothing,
 )
-    LibPETSc.VecRestoreArrayAndMemType(petsclib, _as_petsc_vec(g_fx), fx)
-    LibPETSc.VecRestoreArrayReadAndMemType(petsclib, _as_petsc_vec(l_x), lx)
+    LibPETSc.VecRestoreArrayAndMemType(petsclib, as_petsc_vec(g_fx), fx)
+    LibPETSc.VecRestoreArrayReadAndMemType(petsclib, as_petsc_vec(l_x), lx)
 end

From 25b745725f4457550231369489339e105c35d3ec Mon Sep 17 00:00:00 2001
From: Boris Kaus <kaus@uni-mainz.de>
Date: Fri, 15 May 2026 14:12:46 +0200
Subject: [PATCH 38/39] address remaining comments by Valentin

---
 docs/src/man/gpu.md | 94 +++++++++++++++++++++------------------------
 ext/PETScCUDAExt.jl | 12 +++---
 src/vec.jl          |  2 +-
 test/runtests.jl    |  9 +++--
 4 files changed, 56 insertions(+), 61 deletions(-)

diff --git a/docs/src/man/gpu.md b/docs/src/man/gpu.md
index 8b04190b..dc417d11 100644
--- a/docs/src/man/gpu.md
+++ b/docs/src/man/gpu.md
@@ -1,11 +1,11 @@
 # GPU Support (CUDA + KernelAbstractions)
 
-Julia has outstanding support for GPUs as it compiles machine code for the particular devices. Importantly, all modern GPUs are supported, which implies that it is quite straightforward to write GPU kernels in Julia, for example using packages such as [KernelAbstractions](https://github.com/JuliaGPU/KernelAbstractions.jl).
+Julia has outstanding support for GPUs as it compiles machine code for the particular devices. Importantly, all modern GPUs are supported, which implies that it is quite straightforward to write GPU kernels in Julia, for example using packages such as [KernelAbstractions](https://github.com/JuliaGPU/KernelAbstractions.jl), with no need to write new code when you use AMD instead of NVIDIA GPUs.
 
 PETSc has also added GPU support in recent years, and PETSc vector and matrix objects, along with many of the solvers, can be moved to the GPU.
 
-GPU support in PETSc.jl requires a **locally built PETSc** with CUDA or HIP enabled — the precompiled `PETSc_jll` binaries do not include GPU support. See [Installation](@ref) for instructions on pointing PETSc.jl at a local library.
-The examples below are given for CUDA. Doing this on AMD machines (HIP) will likely work the same but will require a specific extension to be added.
+GPU support in PETSc.jl requires a **locally built PETSc** with CUDA or HIP enabled — the precompiled `PETSc_jll` binaries do not include GPU support. See [Installation](@ref) for instructions on pointing PETSc.jl to a local library.
+The examples below are given for CUDA. Doing this on AMD machines (HIP) will likely work the same, but will require a specific extension to be added.
 
 ## Prerequisites
 
@@ -33,82 +33,65 @@ PETSc manages where vector data lives (host or device). The extension inspects t
 
 ## Public API
 
-### `withlocalarray_device!`
+### `withlocalarray!`
 
-Callback-based access to the underlying array of one or more Vecs, returning a `CuArray` when the data is on the device:
-
-```julia
-withlocalarray_device!(f!, vecs...; read=true, write=true)
-```
+`withlocalarray!` gives callback-based access to the underlying array of one or more Vecs.
+When CUDA.jl is loaded, it automatically returns a `CuArray` for device-resident Vecs and a plain `Vector` for host-resident Vecs:
 
 ```julia
 using PETSc, CUDA, KernelAbstractions
 
-withlocalarray_device!(my_vec; read=false, write=true) do arr
-    # arr is a CuArray if the Vec lives on the GPU, plain Array otherwise
+# single Vec
+withlocalarray!(my_vec; write=true) do arr
+    # arr is CuArray on GPU, Vector on CPU
     fill!(arr, 42)
 end
-```
 
-For multiple Vecs, pass keyword tuples to control read/write access per Vec:
-
-```julia
-withlocalarray_device!(
-    (x_vec, f_vec);
-    read  = (true,  false),
-    write = (false, true),
-) do x_arr, f_arr
-    my_kernel!(backend)(f_arr, x_arr; ndrange = length(f_arr))
-    KernelAbstractions.synchronize(backend)
+# two Vecs — backend selected from the array type at runtime
+withlocalarray!(g_fx, l_x; read=(true, true), write=(true, false)) do fx, lx
+    kern = KernelAbstractions.get_backend(fx)
+    my_kernel!(kern, 256)(fx, lx; ndrange = length(fx))
+    KernelAbstractions.synchronize(kern)
 end
 ```
 
-### `get_petsc_arrays` / `restore_petsc_arrays`
-
-Lower-level paired get/restore for the residual function pattern, where you need both a global output Vec and a local (ghost-padded) input Vec:
-
-```julia
-fx, lx, fx_arr, lx_arr, fx_bounce = get_petsc_arrays(petsclib, g_fx, l_x)
-# launch kernel writing into fx, reading from lx
-restore_petsc_arrays(petsclib, g_fx, l_x, fx, lx, fx_arr, lx_arr, fx_bounce)
-```
-
-- When both Vecs are on the GPU, `fx` and `lx` are zero-copy `CuArray` wrappers.
-- When `l_x` is host-resident (e.g. on a coarser MG level), the data is copied host→device before the kernel and the result is copied device→host by `restore_petsc_arrays`.
-- On a CPU-only path (CUDA.jl not loaded, or all Vecs on host), `fx`/`lx` are plain `Array`s with no copies.
+The array type (`CuArray` or `Vector`) is determined automatically from the `PetscMemType`
+of each Vec, so the same code works on both CPU and GPU.  On coarser multigrid levels where
+Vecs may be host-resident even in a CUDA run, the CPU path is taken transparently.
 
 ## Writing portable kernels with KernelAbstractions
 
-Select the backend at the top of your script based on the `useCUDA` flag:
+`using CUDA` is sufficient — loading it activates `PETScCUDAExt` automatically and no
+extra imports are needed.  Write kernels with `@kernel` and select the backend at
+runtime via `KernelAbstractions.get_backend`:
 
 ```julia
-using KernelAbstractions
-using CUDA
-import CUDA: CuArray, CuPtr, unsafe_wrap
-
-const backend = CUDABackend()   # or CPU() for a CPU run
-```
-
-Write kernels with `@kernel` so the same code runs on both backends:
+using PETSc, CUDA, KernelAbstractions
 
-```julia
 @kernel function my_kernel!(out, inp)
     i = @index(Global)
     out[i] = inp[i] * 2
 end
 
-# launch:
-my_kernel!(backend, 256)(out_arr, inp_arr; ndrange = length(out_arr))
-KernelAbstractions.synchronize(backend)
+withlocalarray!(out_vec, inp_vec; read=(true, true), write=(true, false)) do out, inp
+    kern = KernelAbstractions.get_backend(out)
+    my_kernel!(kern, 256)(out, inp; ndrange = length(out))
+    KernelAbstractions.synchronize(kern)
+end
 ```
 
+The same kernel runs on CPU (when Vecs are host-resident) and GPU (when Vecs are device-resident) without any code changes.
+
 ## Example
 
 [`examples/ex19.jl`](https://github.com/JuliaParallel/PETSc.jl/blob/main/examples/ex19.jl) is a full 2D driven-cavity example (velocity–vorticity–temperature) that demonstrates:
 
 - Switching between CPU and GPU with a single `useCUDA` flag.
 - FD coloring-based Jacobian assembly running entirely on-device.
-- `get_petsc_arrays` / `restore_petsc_arrays` in the residual callback.
+- `withlocalarray!` in the residual callback for transparent CPU/GPU dispatch.
+  
+- Using [KernelAbstractions](https://github.com/JuliaGPU/KernelAbstractions.jl) to run kernels on various flavors of GPUs or CPUs.
+  
 - Multigrid preconditioning with coarser levels falling back to a CPU Jacobian.
 
 To run it on a GPU:
@@ -128,6 +111,17 @@ To run it on a GPU:
 > Check with `grep sizeof_PetscInt $PETSC_DIR/$PETSC_ARCH/include/petscconf.h`.
 
 
+## Profiling
+
+To profile GPU activity, wrap the solve call with `CUDA.@profile`:
+
+```julia
+using CUDA
+CUDA.@profile PETSc.solve!(x, snes)
+```
+
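+By default `CUDA.@profile` uses CUDA.jl's integrated profiler and prints a summary of host-side and device-side activity. For a full timeline, start Julia under an external profiler such as NSight Systems (`nsys profile julia ...`, which also displays NVTX ranges) and wrap the region of interest in `CUDA.@profile external=true` so that only that region is recorded.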
+
 ## Performance 
 
 We have checked the performance of `examples/ex19.jl` by running it on GPU and on 1 or 32 CPU's of a Grace-Hopper 200 machine, using the following options:
@@ -206,4 +200,4 @@ Vectors stay GPU-resident. The number of host↔device copies is small relative
 
 *4. Residual evaluation (KernelAbstractions)*
 
-`MatFDColorApply` — which drives all residual evaluations for the finite-difference Jacobian — reports 0% GPU %F in PETSc's profiler. This is expected: the residual kernel is launched by Julia's CUDA.jl runtime (via KernelAbstractions) and its flops are invisible to PETSc's event system. GPU execution is confirmed indirectly by the GpuToCpu transfer pattern in `MatFDColorApply`: PETSc hands off perturbed vectors, the KA kernel evaluates the residual on the GPU, and the result is returned. On the GH200's unified memory architecture these transfers are intra-device and incur minimal latency.
\ No newline at end of file
+`MatFDColorApply`, which performs the residual evaluations for the finite-difference Jacobian, reports 0% GPU %F in PETSc's profiler. This is expected: the residual kernel is launched by Julia's CUDA.jl runtime (via KernelAbstractions) and its flops are invisible to PETSc's event system. GPU execution is confirmed indirectly by the GpuToCpu transfer pattern in `MatFDColorApply`: PETSc hands off perturbed vectors, the KA kernel evaluates the residual on the GPU, and the result is returned.
\ No newline at end of file
diff --git a/ext/PETScCUDAExt.jl b/ext/PETScCUDAExt.jl
index 74bb1f00..1571cb86 100644
--- a/ext/PETScCUDAExt.jl
+++ b/ext/PETScCUDAExt.jl
@@ -35,10 +35,10 @@ function PETSc.release_petsc_local_array(
     return nothing
 end
 
-# ── _wrap_localarray: device branch (legacy, kept for backward compat) ────────
+# ── wrap_localarray: device branch (legacy, kept for backward compat) ─────────
 #
-# No longer called by withlocalarray! (which uses _acquire/_release instead).
-# Retained in case external code calls _unsafe_localarray directly.
+# No longer called by withlocalarray! (which uses acquire/release instead).
+# Retained in case external code calls unsafe_localarray directly.
 
 function PETSc.wrap_localarray(
     cpu_arr, ::CUDAMemBackend, vec::AbstractPetscVec{PetscLib};
@@ -62,7 +62,7 @@ function PETSc.wrap_localarray(
     return dev_arr
 end
 
-# ── _get_petsc_arrays_impl: CUDA cases ───────────────────────────────────────
+# ── get_petsc_arrays_impl: CUDA cases ────────────────────────────────────────
 #
 # Two methods cover all GPU sub-cases:
 #
@@ -107,9 +107,9 @@ function PETSc.get_petsc_arrays_impl(
     return fx_gpu, lx_gpu, fx_arr, lx_arr, fx_gpu
 end
 
-# ── _restore_petsc_arrays_impl: CUDA ─────────────────────────────────────────
+# ── restore_petsc_arrays_impl: CUDA ──────────────────────────────────────────
 #
-# When fx is a CuArray (returned by the GPU _get_petsc_arrays_impl above):
+# When fx is a CuArray (returned by the GPU get_petsc_arrays_impl above):
 #   - if fx_bounce !== nothing, sync the device and copy the scratch D2H
 #   - call VecRestoreArray*AndMemType on both raw PETSc arrays
 
diff --git a/src/vec.jl b/src/vec.jl
index 433fe5d5..2b91c348 100644
--- a/src/vec.jl
+++ b/src/vec.jl
@@ -362,7 +362,7 @@ end
 
 # ── No-finalizer acquire/release ─────────────────────────────────────────────
 #
-# `withlocalarray!` uses these instead of the finalizer-based `wrap_localarray`
+# `withlocalarray!` uses these instead of the finalizer-based `unsafe_localarray`
 # to avoid a documented Julia pitfall: after `Base.finalize(x)` is called, if
 # `x` later becomes unreachable GC may invoke the finalizer *again*, leading to
 # a double VecRestore call on an already-freed Vec (→ SIGSEGV).
diff --git a/test/runtests.jl b/test/runtests.jl
index 4e87c216..2c1d3a90 100644
--- a/test/runtests.jl
+++ b/test/runtests.jl
@@ -7,13 +7,14 @@ Pkg.instantiate()
 
 # When set_library! has been used, petsc_library is a path string and PETSc_jll
 # is not loaded.  Only import JLL-specific symbols when using the default binaries.
-const _using_custom_lib = PETSc.petsclibs[1].petsc_library isa AbstractString
+const USING_CUSTOM_LIB = PETSc.petsclibs[1].petsc_library isa AbstractString
 
-if _using_custom_lib
-    @info "Testing PETSc.jl with custom library" path=PETSc.petsclibs[1].petsc_library
+import MPIPreferences  # always import to ensure local MPI API is configured
+
+if USING_CUSTOM_LIB
+    @info "Testing PETSc.jl with custom library" path=PETSc.petsclibs[1].petsc_library MPIPreferences.binary MPIPreferences.abi
 else
     using PETSc_jll
-    import MPIPreferences
     @info "Testing PETSc.jl with" MPIPreferences.binary MPIPreferences.abi PETSc_jll.host_platform
 end
 

From 0c83cea8ab213260012f8fdd2b73961f613b4e0b Mon Sep 17 00:00:00 2001
From: Boris Kaus <kaus@uni-mainz.de>
Date: Fri, 15 May 2026 14:48:56 +0200
Subject: [PATCH 39/39] test ex19 as well

---
 examples/ex19.jl    |  1 -
 ext/PETScCUDAExt.jl |  2 +-
 src/vec.jl          | 40 +++++++++++++++++-----------------------
 3 files changed, 18 insertions(+), 25 deletions(-)

diff --git a/examples/ex19.jl b/examples/ex19.jl
index 4c42dc11..11539c5b 100644
--- a/examples/ex19.jl
+++ b/examples/ex19.jl
@@ -1,4 +1,3 @@
-# EXCLUDE FROM TESTING
 #=
   ex19.jl — 2D Driven Cavity: velocity–vorticity–temperature formulation
   Port of PETSc snes/tutorials/ex19.c
diff --git a/ext/PETScCUDAExt.jl b/ext/PETScCUDAExt.jl
index 1571cb86..9082f737 100644
--- a/ext/PETScCUDAExt.jl
+++ b/ext/PETScCUDAExt.jl
@@ -73,7 +73,7 @@ end
 #                               fx always gets a fresh scratch CuArray (bounce)
 #                               so the kernel writes there and restore copies D2H.
 #
-# HostBackend × HostBackend is handled entirely in base (vec.jl) and never
+# nothing × nothing (host) is handled entirely in base (vec.jl) and never
 # reaches these methods.
 
 # Both Vecs on the device: zero-copy wrap, no scratch needed.
diff --git a/src/vec.jl b/src/vec.jl
index 2b91c348..1d920fb2 100644
--- a/src/vec.jl
+++ b/src/vec.jl
@@ -264,33 +264,27 @@ end
 
 # ── Memory backend type hierarchy ─────────────────────────────────────────────
 #
-# Extensions add their own backend singletons (e.g. `CUDAMemBackend`) and overload
-# `memtype_backend(::Val{PETSC_MEMTYPE_DEVICE})` to return them.  The base
-# package handles only `PETSC_MEMTYPE_HOST` → `HostBackend`.
+# `memtype_backend` maps a `PetscMemType` to a dispatch tag.
+# Host memory maps to `nothing` (no KA backend — avoids confusion with
+# KernelAbstractions.CPU()).  GPU extensions return their own singleton
+# (e.g. `CUDAMemBackend`) by overloading `memtype_backend(::Val{MT})`.
 
 """
     AbstractPETScMemBackend
 
-Abstract supertype for PETSc memory backends.  The base package defines only
-[`HostBackend`](@ref).  GPU extensions add their own (e.g. `CUDAMemBackend`).
+Abstract supertype for GPU memory backends used by PETSc extensions.
+Host memory is represented by `nothing`, not a subtype of this.
+GPU extensions define their own concrete subtype (e.g. `CUDAMemBackend`).
 """
 abstract type AbstractPETScMemBackend end
 
 """
-    HostBackend <: AbstractPETScMemBackend
+    memtype_backend(mtype::PetscMemType) → Nothing | AbstractPETScMemBackend
 
-Singleton dispatch type representing host (CPU) memory.
+Convert a `PetscMemType` to a dispatch tag.  Returns `nothing` for host memory;
+GPU extensions return their own singleton for device memory.
 """
-struct HostBackend <: AbstractPETScMemBackend end
-
-"""
-    memtype_backend(mtype::PetscMemType) → AbstractPETScMemBackend
-
-Convert a `PetscMemType` runtime enum value to a singleton dispatch type.
-GPU extensions overload `memtype_backend(::Val{MT})` for their specific
-`PetscMemType` values (e.g. `PETSC_MEMTYPE_DEVICE` for CUDA).
-"""
-memtype_backend(::Val{LibPETSc.PETSC_MEMTYPE_HOST}) = HostBackend()
+memtype_backend(::Val{LibPETSc.PETSC_MEMTYPE_HOST}) = nothing
 memtype_backend(::Val{MT}) where {MT} =
     error("No GPU backend loaded for PetscMemType $MT — load CUDA.jl, AMDGPU.jl, …")
 memtype_backend(mt::LibPETSc.PetscMemType) = memtype_backend(Val(mt))
@@ -338,7 +332,7 @@ function _unsafe_localarray(
 end
 
 function wrap_localarray(
-    cpu_arr, ::HostBackend, vec::AbstractPetscVec{PetscLib};
+    cpu_arr, ::Nothing, vec::AbstractPetscVec{PetscLib};
     read::Bool, write::Bool,
 ) where {PetscLib}
     finalizer(cpu_arr) do a
@@ -394,7 +388,7 @@ function acquire_petsc_local_array(
 end
 
 # CPU: the raw PETSc array is already a Vector — return it directly.
-make_local_array(cpu_arr, ::HostBackend) = cpu_arr
+make_local_array(cpu_arr, ::Nothing) = cpu_arr
 make_local_array(_, b::AbstractPETScMemBackend) =
     error("make_local_array not implemented for backend $(typeof(b)) — " *
           "load the corresponding GPU package (e.g. CUDA.jl)")
@@ -406,7 +400,7 @@ Restore a previously acquired local array.  Called in `finally` blocks by
 `withlocalarray!`.  Extensions overload this for GPU backends.
 """
 function release_petsc_local_array(
-    cpu_arr, ::HostBackend, vec::AbstractPetscVec{PLib}; read::Bool, write::Bool,
+    cpu_arr, ::Nothing, vec::AbstractPetscVec{PLib}; read::Bool, write::Bool,
 ) where {PLib}
     pv = as_petsc_vec(vec)
     if write && read
@@ -650,7 +644,7 @@ end
 # `get_petsc_arrays` calls `VecGetArrayAndMemType` on both Vecs, converts the
 # returned `PetscMemType` values to backend singletons, and dispatches to
 # `get_petsc_arrays_impl`.  The base package handles the pure-CPU case
-# (HostBackend × HostBackend).  GPU extensions add `get_petsc_arrays_impl`
+# (host × host, i.e. both backends are `nothing`).  GPU extensions add `get_petsc_arrays_impl`
 # methods for their backend combinations and a matching
 # `restore_petsc_arrays_impl` method dispatched by `restore_petsc_arrays`.
 #
@@ -667,7 +661,7 @@ Return arrays for `g_fx` (read-write) and `l_x` (read-only) suitable for
 passing to a compute kernel.  Dispatches on the memory location of each Vec
 via `memtype_backend`.
 
-On the pure-CPU path (`HostBackend × HostBackend`) `fx`/`lx` are plain
+On the pure-CPU path (host × host, i.e. both backends are `nothing`) `fx`/`lx` are plain
 `Array`s and `fx_arr = lx_arr = fx_bounce = nothing`.  When a GPU backend
 extension is loaded and a Vec lives on the device the returned `fx`/`lx` are
 device arrays.  An optional bounce buffer `fx_bounce` is allocated when `g_fx`
@@ -689,7 +683,7 @@ end
 # CPU base case: return arrays directly. restore_petsc_arrays calls VecRestore
 # explicitly — no finalizers to avoid the double-finalization crash.
 function get_petsc_arrays_impl(
-    petsclib, g_fx, l_x, ::Type, fx_arr, lx_arr, ::HostBackend, ::HostBackend,
+    petsclib, g_fx, l_x, ::Type, fx_arr, lx_arr, ::Nothing, ::Nothing,
 )
     return fx_arr, lx_arr, nothing, nothing, nothing
 end