diff --git a/Project.toml b/Project.toml
index 20894395..1369e621 100644
--- a/Project.toml
+++ b/Project.toml
@@ -1,6 +1,6 @@
 name = "PETSc"
 uuid = "ace2c81b-2b5f-4b1e-a30d-d662738edfe0"
-version = "0.4.9"
+version = "0.4.10-DEV"
 authors = ["Boris Kaus <kaus@uni-mainz.de>", "Viral B. Shah <virals@gmail.com>", "Valentin Churavy <v.churavy@gmail.com>", "Erik Schnetter <eschnetter@perimeterinstitute.ca>", "Jeremy E. Kozdon <jeremy@kozdon.net>", "Simon Byrne <simonbyrne@gmail.com>"]
 
 [deps]
@@ -18,6 +18,7 @@ Statistics = "10745b16-79ce-11e8-11f9-7d13ad32a3b2"
 UnicodePlots = "b8865327-cd53-5732-bb35-84acbb429228"
 
 [compat]
+CUDA = "5"
 ForwardDiff = "0.10, 1"
 Libdl = "^1.10"
 LinearAlgebra = "^1.10"
@@ -32,8 +33,15 @@ Statistics = "^1.10"
 UnicodePlots = "3.0"
 julia = "^1.10"
 
+[weakdeps]
+CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba"
+
+[extensions]
+PETScCUDAExt = "CUDA"
+
 [extras]
 CairoMakie = "13f3f980-e62b-5c42-98c6-ff1f3baf88f0"
+KernelAbstractions = "63c18a36-062a-441e-b654-da1e3ab1ce7c"
 ForwardDiff = "f6369f11-7733-5829-9624-2563aa707210"
 Plots = "91a5bcdd-55d7-5caf-9e0b-520d859cae80"
 Printf = "de0858da-6303-5e67-8744-51eddeeeb8d7"
@@ -43,4 +51,4 @@ Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40"
 UnicodePlots = "b8865327-cd53-5732-bb35-84acbb429228"
 
 [targets]
-test = ["ForwardDiff", "UnicodePlots", "Test", "Plots", "SparseDiffTools", "Printf", "Random", "CairoMakie"]
+test = ["ForwardDiff", "UnicodePlots", "Test", "Plots", "SparseDiffTools", "Printf", "Random", "CairoMakie", "KernelAbstractions"]
diff --git a/docs/make.jl b/docs/make.jl
index 17f7c94f..d9fcc08d 100644
--- a/docs/make.jl
+++ b/docs/make.jl
@@ -49,6 +49,7 @@ makedocs(;
         ],
         "Utilities" => "man/utilities.md",
         "Running on HPC Systems" => "man/hpc.md",
+        "GPU Support (CUDA)" => "man/gpu.md",
         "FAQ"  => "man/FAQ.md",
         "Contributing"  => "man/contributing.md",
         "Funding" => "man/funding.md",
diff --git a/docs/src/man/gpu.md b/docs/src/man/gpu.md
new file mode 100644
index 00000000..dc417d11
--- /dev/null
+++ b/docs/src/man/gpu.md
@@ -0,0 +1,203 @@
+# GPU Support (CUDA + KernelAbstractions)
+
+Julia has outstanding support for GPUs as it compiles machine code for the particular devices. Importantly, all modern GPUs are supported, which implies that it is quite straightforward to write GPU kernels in Julia, for example using packages such as [KernelAbstractions](https://github.com/JuliaGPU/KernelAbstractions.jl) with no need to write new code when you use AMD instead of NVIDIA GPU's
+
+PETSc has also added GPU support in recent years, and PETSc vector and matrix objects, along with many of the solvers, can be moved to the GPU.
+
+GPU support in PETSc.jl requires a **locally built PETSc** with CUDA or HIP enabled — the precompiled `PETSc_jll` binaries do not include GPU support. See [Installation](@ref) for instructions on pointing PETSc.jl to a local library.
+The example below are given for CUDA. Doing this on AMD machines (HIP) will likely work the same, but will require a specific extension to be added.
+
+## Prerequisites
+
+1. **PETSc built with CUDA** — configure with `--with-cuda=1` (and the matching `--with-cuda-dir`). Confirm with:
+   ```bash
+   grep -i cuda $PETSC_DIR/$PETSC_ARCH/include/petscconf.h
+   # should show: #define PETSC_HAVE_CUDA 1
+   ```
+
+2. **CUDA.jl** in your Julia environment:
+   ```julia
+   pkg> add CUDA
+   ```
+
+3. **KernelAbstractions.jl** if you want to write portable GPU/CPU kernels:
+   ```julia
+   pkg> add KernelAbstractions
+   ```
+
+## How it works
+
+When CUDA.jl is loaded alongside PETSc.jl, the `PETScCUDAExt` extension is activated automatically. It registers CUDA-aware implementations for the functions below via Julia's package extension mechanism — no extra configuration is needed.
+
+PETSc manages where vector data lives (host or device). The extension inspects the `PetscMemType` returned by `VecGetArray*AndMemType` calls and either wraps the device pointer as a `CuArray` (zero-copy) or allocates a bounce buffer if the data needs to move between host and device.
+
+## Public API
+
+### `withlocalarray!`
+
+`withlocalarray!` gives callback-based access to the underlying array of one or more Vecs.
+When CUDA.jl is loaded, it automatically returns a `CuArray` for device-resident Vecs and a plain `Vector` for host-resident Vecs:
+
+```julia
+using PETSc, CUDA, KernelAbstractions
+
+# single Vec
+withlocalarray!(my_vec; write=true) do arr
+    # arr is CuArray on GPU, Vector on CPU
+    fill!(arr, 42)
+end
+
+# two Vecs — backend selected from the array type at runtime
+withlocalarray!(g_fx, l_x; read=(true, true), write=(true, false)) do fx, lx
+    kern = KernelAbstractions.get_backend(fx)
+    my_kernel!(kern, 256)(fx, lx; ndrange = length(fx))
+    KernelAbstractions.synchronize(kern)
+end
+```
+
+The array type (`CuArray` or `Vector`) is determined automatically from the `PetscMemType`
+of each Vec, so the same code works on both CPU and GPU.  On coarser multigrid levels where
+Vecs may be host-resident even in a CUDA run, the CPU path is taken transparently.
+
+## Writing portable kernels with KernelAbstractions
+
+`using CUDA` is sufficient — loading it activates `PETScCUDAExt` automatically and no
+extra imports are needed.  Write kernels with `@kernel` and select the backend at
+runtime via `KernelAbstractions.get_backend`:
+
+```julia
+using PETSc, CUDA, KernelAbstractions
+
+@kernel function my_kernel!(out, inp)
+    i = @index(Global)
+    out[i] = inp[i] * 2
+end
+
+withlocalarray!(out_vec, inp_vec; read=(true, true), write=(true, false)) do out, inp
+    kern = KernelAbstractions.get_backend(out)
+    my_kernel!(kern, 256)(out, inp; ndrange = length(out))
+    KernelAbstractions.synchronize(kern)
+end
+```
+
+The same kernel runs on CPU (when Vecs are host-resident) and GPU (when Vecs are device-resident) without any code changes.
+
+## Example
+
+[`examples/ex19.jl`](https://github.com/JuliaParallel/PETSc.jl/blob/main/examples/ex19.jl) is a full 2D driven-cavity example (velocity–vorticity–temperature) that demonstrates:
+
+- Switching between CPU and GPU with a single `useCUDA` flag.
+- FD coloring-based Jacobian assembly running entirely on-device.
+- `withlocalarray!` in the residual callback for transparent CPU/GPU dispatch.
+  
+- Using [KernelAbstractions](https://github.com/JuliaGPU/KernelAbstractions.jl) to run kernels on various flavors of GPUs or CPUs.
+  
+- Multigrid preconditioning with coarser levels falling back to a CPU Jacobian.
+
+To run it on a GPU:
+
+1. Set `const useCUDA = true` near the top of `ex19.jl`.
+2. Ensure your local PETSc build has CUDA support and is linked via `PETSc.set_library!`.
+3. Launch:
+   ```bash
+   julia --project ex19.jl -da_grid_x 256 -da_grid_y 256 \
+       -pc_type mg -pc_mg_levels 4 \
+       -mg_levels_ksp_type chebyshev -mg_levels_pc_type jacobi \
+       -snes_monitor -ksp_monitor
+   ```
+
+> [!NOTE]
+> The `PetscInt` type in your `LocalPreferences.toml` must match the PETSc build.
+> Check with `grep sizeof_PetscInt $PETSC_DIR/$PETSC_ARCH/include/petscconf.h`.
+
+
+## Profiling
+
+To profile GPU activity, wrap the solve call with `CUDA.@profile`:
+
+```julia
+using CUDA
+CUDA.@profile PETSc.solve!(x, snes)
+```
+
+This records a trace compatible with NSight Systems (`nsys profile`) and NVTX. For a quick check of kernel timing, `CUDA.@profile external=true` launches the process under `nsys` automatically.
+
+## Performance 
+
+We have checked the performance of `examples/ex19.jl` by running it on GPU and on 1 or 32 CPU's of a Grace-Hopper 200 machine, using the following options:
+
+```bash
+$mpiexec -n 1 julia --project ex19.jl -snes_monitor -snes_converged_reason -snes_monitor   -pc_type mg  -mg_levels_ksp_type chebyshev -mg_levels_pc_type jacobi -ksp_monitor -log_view  -mg_levels_esteig_ksp_max_it 20  -mg_levels_esteig_ksp_max_it 20 -mg_levels_ksp_chebyshev_esteig 0,0.1,0,1.1 -mg_levels_ksp_max_it 3  -da_grid_x 513 -da_grid_y 513 -pc_mg_levels 6 
+```
+We performed tests in which we doubled the resolution in `x` and `y` while increasing the number of multigrid levels such that the coarse grid level has the same size in all cases.
+
+The results of the full solve include booting up Julia and computing the coloring pattern (which doesn't scale well).
+
+Below we report results for the inner solve itself (KSPSolve) on a GPU (with 1 MPI rank, which is a requirement of PETSc), and compare that with the results on CPU on 1 core and on 32 cores
+
+
+**KSPSolve**:
+| Resolution | GPU time (s) | GPU (GFlop/s) | CPU-1 time (s) | CPU-1 (GFlop/s) | CPU-32 time (s) | CPU-32 (GFlop/s) |
+|---|---|---|---|---|---|---|
+| 513²   |  0.116 | 144.3 |   4.337 |  4.1 |  0.297 | 61.0 |
+| 1025²  |  0.299 | 249.5 |  19.10  |  3.9 |  1.196 | 61.7 |
+| 2049²  |  1.118 | 295.5 |  89.76  |  3.6 |  5.757 | 56.6 |
+| 4097²  |  4.540 | 312.4 | 422.2   |  3.3 | 28.30  | 49.4 |
+
+
+**SNESSolve**:
+| Resolution | GPU time (s) | GPU (GFlop/s) | CPU-1 time (s) | CPU-1 (GFlop/s) | CPU-32 time (s) | CPU-32 (GFlop/s) |
+|---|---|---|---|---|---|---|
+| 513²   |  3.645 |  6.7 |   8.041 | 3.2 |  2.118 | 12.3 |
+| 1025²  |  5.127 | 20.5 |  32.75  | 3.2 |  3.698 | 28.2 |
+| 2049²  | 11.37  | 39.7 | 144.1   | 3.1 | 10.57  | 42.3 |
+| 4097²  | 36.88  | 47.7 | 658.3   | 2.9 | 43.58  | 43.2 |
+
+From this it is clear that the `KSPSolve` itself is very efficient on the GPU (and clearly beats the CPU), but that there is quite some overhead when we compare it with `SNESSolve` where this difference is not so large anymore.
+
+
+
+***GPU Utilisation Evidence — ex19.jl on GH200 (SM90)***
+
+Lets have a look in detail on whether the example actually runs on the GPU:
+
+All GPU runs use a CUDA-enabled PETSc build (cuSPARSE, cuBLAS) with KernelAbstractions.jl providing the residual kernel. The PETSc `GPU %F` column (fraction of flops executed on GPU) and the host↔device transfer logs provide direct evidence of efficient GPU utilisation.
+
+*1. Flop fraction on GPU*
+
+Nearly all floating point work executes on the GPU across all resolutions and all major solver phases:
+
+| Event | GPU %F |
+|---|---|
+| KSPSolve | 100% |
+| PCApply | 100% |
+| PCSetUp | 98–99% |
+| SNESSolve | 99–100% |
+
+At the kernel level, all key GMRES and multigrid operations run fully on the GPU: `MatMult`, `MatResidual`, `VecMAXPY`, `VecMDot`, `VecAXPBYCZ`, `VecAYPX`, `VecPointwiseMult`, `VecNormalize` all report 100% GPU %F.
+
+*2. Sparse matrix storage and transfers*
+
+System matrices are assembled on the host and uploaded once per Newton iteration via `MatCUSPARSECopyTo`, then all SpMV operations run in cuSPARSE format. Only 3 GpuToCpu copies occur per solve (one per Newton iteration), confirming matrices remain GPU-resident throughout.
+
+| Resolution | Matrix upload size (MB) |
+|---|---|
+| 513²  |    495 |
+| 1025² |  1,990 |
+| 2049² |  7,950 |
+| 4097² | 31,800 |
+
+*3. Vector transfers*
+
+Vectors stay GPU-resident. The number of host↔device copies is small relative to the hundreds of thousands of vector operations performed:
+
+| Resolution | CpuToGpu count | CpuToGpu (MB) | GpuToCpu count | GpuToCpu (MB) |
+|---|---|---|---|---|
+| 513²  | 37 |    109 | 24 |    42 |
+| 1025² | 55 |    437 | 36 |   168 |
+| 2049² | 64 |  1,750 | 42 |   672 |
+| 4097² | 73 |  6,980 | 48 | 2,690 |
+
+*4. Residual evaluation (KernelAbstractions)*
+
+`MatFDColorApply` performs residual evaluations for the finite-difference Jacobian — reports 0% GPU %F in PETSc's profiler. This is expected: the residual kernel is launched by Julia's CUDA.jl runtime (via KernelAbstractions) and its flops are invisible to PETSc's event system. GPU execution is confirmed indirectly by the GpuToCpu transfer pattern in `MatFDColorApply`: PETSc hands off perturbed vectors, the KA kernel evaluates the residual on the GPU, and the result is returned.
\ No newline at end of file
diff --git a/examples/ex19.jl b/examples/ex19.jl
new file mode 100644
index 00000000..11539c5b
--- /dev/null
+++ b/examples/ex19.jl
@@ -0,0 +1,528 @@
+#=
+  ex19.jl — 2D Driven Cavity: velocity–vorticity–temperature formulation
+  Port of PETSc snes/tutorials/ex19.c
+
+  Solves on the unit square [0,1]²:
+    −∇²u  − ∂ω/∂y               = 0
+    −∇²v  + ∂ω/∂x               = 0
+    −∇²ω  + ∇·(uω, vω) − Gr·∂T/∂x = 0
+    −∇²T  + Pr·∇·(uT, vT)       = 0
+
+  4 DOFs per node: (u, v, ω, T).  Convective terms are upwinded.
+
+  Boundary conditions:
+    All walls:   u = v = 0  (no-slip),  except top lid: u = lidvelocity
+    Left:        T = 0  (cold, Dirichlet)
+    Right:       T = 1  (hot, Dirichlet, only when Gr > 0)
+    Top/bottom:  ∂T/∂n = 0  (insulated, Neumann)
+    ω:           derived from the no-slip condition at each wall
+
+  Usage (from the examples/ directory):
+    # Basic run (4×4 default grid)
+    julia --project ex19.jl
+
+    # Larger grid with SNES convergence output
+    julia --project ex19.jl -snes_monitor -snes_converged_reason -da_grid_x 129 -da_grid_y 129
+
+    # With PETSc performance log
+    julia --project ex19.jl -da_grid_x 129 -da_grid_y 129 -log_view
+
+    # Multigrid preconditioner (3 levels, Chebyshev+Jacobi smoothers)
+    julia --project ex19.jl -da_grid_x 125 -da_grid_y 125 \\
+        -pc_type mg -pc_mg_levels 3 \\
+        -mg_levels_ksp_type chebyshev -mg_levels_pc_type jacobi \\
+        -snes_monitor -ksp_monitor
+
+    # MPI parallel (4 ranks)
+    mpiexec -n 4 julia --project ex19.jl \\
+        -da_grid_x 256 -da_grid_y 256 \\
+        -pc_type mg -pc_mg_levels 4 \\
+        -mg_levels_ksp_type chebyshev -mg_levels_pc_type jacobi
+
+    # GPU (CUDA) — set  useCUDA = true  at the top of the file, then:
+    julia --project ex19.jl -da_grid_x 256 -da_grid_y 256 \\
+        -pc_type mg -pc_mg_levels 4 \\
+        -mg_levels_ksp_type chebyshev -mg_levels_pc_type jacobi
+
+  Jacobian strategy:
+    Fine-grid level: manual FD coloring via PETSc.dmda_star_fd_coloring +
+      MatSetPreallocationCOOLocal + MatSetValuesCOO.  On GPU the entire
+      perturb → F(x+h) → accumulate loop runs on-device with no host copies.
+    Coarser MG levels: fall back to SNESComputeJacobianDefaultColor
+      (correct for each level's DM, CPU-only).
+
+  Requires: LocalPreferences.toml in examples/ with PetscInt = "Int32" matching
+    the PETSc build (check with: grep sizeof_PetscInt petscconf.h).
+
+  GPU usage: set  useCUDA = true  at the top of this file.
+    Requires PETSc built with --with-cuda, and CUDA.jl in the environment.
+=#
+
+# ── GPU switch ────────────────────────────────────────────────────────────────
+const useCUDA = false
+
+using MPI
+using PETSc
+using KernelAbstractions
+
+if useCUDA
+    using CUDA
+end
+
+
+# ── Physical parameters ──────────────────────────────────────────────────────
+Base.@kwdef mutable struct AppCtx{T}
+    lidvelocity :: T = one(T)
+    prandtl     :: T = one(T)
+    grashof     :: T = one(T)
+end
+
+# ── Residual kernel ───────────────────────────────────────────────────────────
+#
+# Iterates over the locally-owned grid: (li, lj) ∈ 1..nx_own × 1..ny_own.
+#
+# Array layout (plain 1-based, no OffsetArray — GPU-compatible):
+#   x_par[dof, xi, xj]   ghost input,  xi = li + ox, xj = lj + oy
+#   f_par[dof, li, lj]   owned output
+#
+# ox = xs - xsg, oy = ys - ysg  (ghost offsets; 0 at a domain boundary, 1 otherwise)
+#
+# Corner priority matches the C code: left/right walls processed last, so they
+# take precedence over bottom/top at corners.  Achieved here by checking i==1
+# and i==mx before j==1 and j==my.
+#
+@kernel function cavity_residual_kernel!(
+    f_par,
+    x_par,
+    dhx, dhy,           # mx-1, my-1
+    hx,  hy,            # 1/dhx, 1/dhy
+    hydhx, hxdhy,       # metric factors for Laplacian scaling
+    grashof, prandtl, lid,
+    mx :: Int, my :: Int,
+    xs :: Int, ys :: Int,
+    ox :: Int, oy :: Int,
+)
+    li, lj = @index(Global, NTuple)
+
+    i  = xs + li - 1   # global 1-based x-coordinate
+    j  = ys + lj - 1   # global 1-based y-coordinate
+    xi = li + ox        # ghost-array x-index for this point
+    xj = lj + oy        # ghost-array y-index for this point
+
+    # ── Boundary conditions ───────────────────────────────────────────────────
+
+    if i == 1                           # left wall — cold (T = 0)
+        f_par[1, li, lj] = x_par[1, xi,   xj  ]
+        f_par[2, li, lj] = x_par[2, xi,   xj  ]
+        f_par[3, li, lj] = x_par[3, xi,   xj  ] -
+            (x_par[2, xi+1, xj] - x_par[2, xi, xj]) * dhx
+        f_par[4, li, lj] = x_par[4, xi,   xj  ]
+
+    elseif i == mx                      # right wall — hot (T = 1 when Gr > 0)
+        f_par[1, li, lj] = x_par[1, xi,   xj  ]
+        f_par[2, li, lj] = x_par[2, xi,   xj  ]
+        f_par[3, li, lj] = x_par[3, xi,   xj  ] -
+            (x_par[2, xi, xj] - x_par[2, xi-1, xj]) * dhx
+        f_par[4, li, lj] = x_par[4, xi,   xj  ] -
+            (grashof > zero(grashof) ? one(grashof) : zero(grashof))
+
+    elseif j == 1                       # bottom wall — no-slip, insulated
+        f_par[1, li, lj] = x_par[1, xi,   xj  ]
+        f_par[2, li, lj] = x_par[2, xi,   xj  ]
+        f_par[3, li, lj] = x_par[3, xi,   xj  ] +
+            (x_par[1, xi, xj+1] - x_par[1, xi, xj]) * dhy
+        f_par[4, li, lj] = x_par[4, xi,   xj  ] - x_par[4, xi, xj+1]
+
+    elseif j == my                      # top wall — moving lid, insulated
+        f_par[1, li, lj] = x_par[1, xi,   xj  ] - lid
+        f_par[2, li, lj] = x_par[2, xi,   xj  ]
+        f_par[3, li, lj] = x_par[3, xi,   xj  ] +
+            (x_par[1, xi, xj] - x_par[1, xi, xj-1]) * dhy
+        f_par[4, li, lj] = x_par[4, xi,   xj  ] - x_par[4, xi, xj-1]
+
+    else                                # ── interior point ───────────────────
+
+        # Upwind split of advecting velocities
+        vx  = x_par[1, xi, xj];  avx = abs(vx)
+        vxp = oftype(vx, 0.5) * (vx + avx)   # max(vx, 0)
+        vxm = oftype(vx, 0.5) * (vx - avx)   # min(vx, 0)
+
+        vy  = x_par[2, xi, xj];  avy = abs(vy)
+        vyp = oftype(vy, 0.5) * (vy + avy)
+        vym = oftype(vy, 0.5) * (vy - avy)
+
+        # u-equation:  −∇²u − ∂ω/∂y = 0
+        cu  = x_par[1, xi, xj]
+        uxx = (2cu - x_par[1, xi-1, xj] - x_par[1, xi+1, xj]) * hydhx
+        uyy = (2cu - x_par[1, xi, xj-1] - x_par[1, xi, xj+1]) * hxdhy
+        f_par[1, li, lj] = uxx + uyy -
+            oftype(cu, 0.5) * (x_par[3, xi, xj+1] - x_par[3, xi, xj-1]) * hx
+
+        # v-equation:  −∇²v + ∂ω/∂x = 0
+        cv  = x_par[2, xi, xj]
+        vxx = (2cv - x_par[2, xi-1, xj] - x_par[2, xi+1, xj]) * hydhx
+        vyy = (2cv - x_par[2, xi, xj-1] - x_par[2, xi, xj+1]) * hxdhy
+        f_par[2, li, lj] = vxx + vyy +
+            oftype(cv, 0.5) * (x_par[3, xi+1, xj] - x_par[3, xi-1, xj]) * hy
+
+        # ω-equation:  −∇²ω + ∇·(uω, vω) − Gr·∂T/∂x = 0
+        cω  = x_par[3, xi, xj]
+        wxx = (2cω - x_par[3, xi-1, xj] - x_par[3, xi+1, xj]) * hydhx
+        wyy = (2cω - x_par[3, xi, xj-1] - x_par[3, xi, xj+1]) * hxdhy
+        f_par[3, li, lj] = wxx + wyy +
+            (vxp * (cω - x_par[3, xi-1, xj]) + vxm * (x_par[3, xi+1, xj] - cω)) * hy +
+            (vyp * (cω - x_par[3, xi, xj-1]) + vym * (x_par[3, xi, xj+1] - cω)) * hx -
+            oftype(cω, 0.5) * grashof * (x_par[4, xi+1, xj] - x_par[4, xi-1, xj]) * hy
+
+        # T-equation:  −∇²T + Pr·∇·(uT, vT) = 0
+        cT  = x_par[4, xi, xj]
+        txx = (2cT - x_par[4, xi-1, xj] - x_par[4, xi+1, xj]) * hydhx
+        tyy = (2cT - x_par[4, xi, xj-1] - x_par[4, xi, xj+1]) * hxdhy
+        f_par[4, li, lj] = txx + tyy + prandtl * (
+            (vxp * (cT - x_par[4, xi-1, xj]) + vxm * (x_par[4, xi+1, xj] - cT)) * hy +
+            (vyp * (cT - x_par[4, xi, xj-1]) + vym * (x_par[4, xi, xj+1] - cT)) * hx)
+    end
+end
+
+# ── FD-coloring GPU helper kernels ────────────────────────────────────────────
+#
+# scatter_perturb_kernel!: add h to selected entries of a vector.
+#   cols[k] is the 1-based Julia index to perturb.
+#
+@kernel function scatter_perturb_kernel!(x, cols, h)
+    k = @index(Global)
+    @inbounds x[cols[k]] += h
+end
+
+# fd_accumulate_kernel!: write (f1 − f0)/h into val at COO indices.
+#   coo_idxs[k] and row_idxs[k] are 1-based Julia indices.
+#
+@kernel function fd_accumulate_kernel!(val, f0, f1, coo_idxs, row_idxs, inv_h)
+    k = @index(Global)
+    @inbounds val[coo_idxs[k]] = (f1[row_idxs[k]] - f0[row_idxs[k]]) * inv_h
+end
+
+# ── FD-coloring Jacobian fill ─────────────────────────────────────────────────
+#
+# Fills val_dev[k] with the forward-difference Jacobian value for COO slot k.
+# Loops over colors: scatter +h onto the owned columns of that color, evaluate
+# F(x + h·eₖ), then accumulate (F1 − F0)/h into val_dev at the matching slots.
+# Does NOT call MatSetValuesCOO or MatAssembly; those remain with the caller.
+#
+# Captures from module scope: useCUDA, backend, CuArray, CuPtr,
+#   scatter_perturb_kernel!, fd_accumulate_kernel!, KernelAbstractions.
+#
+function fd_coloring_jac!(
+    petsclib,
+    snes,
+    g_x,
+    f0_vec, f1_vec, x_pert_vec,
+    val_dev  :: AbstractVector{T},
+    n_colors    :: Int,
+    perturb_cols_dev,
+    coo_idxs_dev,
+    local_rows_dev,
+    h_eps :: T,
+    inv_h :: T,
+) where T
+    LibPETSc.SNESComputeFunction(petsclib, snes, g_x, f0_vec)
+    PETSc.withlocalarray!(f0_vec; read=true, write=false) do f0
+        for c in 1:n_colors
+            isempty(perturb_cols_dev[c]) && continue
+
+            LibPETSc.VecCopy(petsclib, g_x, x_pert_vec)
+            PETSc.withlocalarray!(x_pert_vec; read=true, write=true) do xp
+                kb = KernelAbstractions.get_backend(xp)
+                scatter_perturb_kernel!(kb, 64)(
+                    xp, perturb_cols_dev[c], h_eps;
+                    ndrange = length(perturb_cols_dev[c]))
+                KernelAbstractions.synchronize(kb)
+            end
+
+            LibPETSc.SNESComputeFunction(petsclib, snes, x_pert_vec, f1_vec)
+
+            PETSc.withlocalarray!(f1_vec; read=true, write=false) do f1
+                kb = KernelAbstractions.get_backend(f1)
+                fd_accumulate_kernel!(kb, 64)(
+                    val_dev, f0, f1,
+                    coo_idxs_dev[c], local_rows_dev[c], inv_h;
+                    ndrange = length(coo_idxs_dev[c]))
+                KernelAbstractions.synchronize(kb)
+            end
+        end
+    end
+    return nothing
+end
+
+# ── Setup ─────────────────────────────────────────────────────────────────────
+opts     = isinteractive() ? NamedTuple() : PETSc.parse_options(filter(a -> a != "-log_view", ARGS))
+log_view = "-log_view" in ARGS
+
+petsclib = PETSc.getlib(; PetscScalar = Float64, PetscInt = Int32)
+PETSc.initialize(petsclib; log_view)
+
+_T        = Float64
+PetscInt = petsclib.PetscInt
+comm     = MPI.COMM_WORLD
+
+# DMDA: 4×4 default (matches ex19.c); override via -da_grid_x / -da_grid_y
+da = PETSc.DMDA(
+    petsclib, comm,
+    (PETSc.DM_BOUNDARY_NONE, PETSc.DM_BOUNDARY_NONE),
+    (4, 4),
+    4,   # DOFs per node: (u, v, ω, T)
+    1,   # stencil width
+    PETSc.DMDA_STENCIL_STAR;
+    opts...,
+)
+
+# GPU vecs and GPU matrix enable a fully GPU-resident FD coloring path via
+# COO preallocation.  No host↔device bouncing in residual or Jacobian.
+if useCUDA
+    LibPETSc.DMSetVecType(petsclib, da, "cuda")
+    LibPETSc.DMSetMatType(petsclib, da, "aijcusparse")
+end
+
+snes = PETSc.SNES(petsclib, comm; opts...)
+PETSc.setDM!(snes, da)
+
+# Actual grid size after setfromoptions (may differ from the 4×4 default)
+info = PETSc.getinfo(da)
+mx   = Int(info.global_size[1])
+my   = Int(info.global_size[2])
+
+user = AppCtx{_T}(
+    lidvelocity = _T(1) / (mx - 1),
+    prandtl     = _T(1),
+    grashof     = _T(1),
+)
+
+# Precomputed grid metrics
+dhx   = _T(mx - 1);   dhy   = _T(my - 1)
+hx    = one(_T) / dhx; hy    = one(_T) / dhy
+hydhx = hy * dhx;    hxdhy = hx * dhy
+
+# ── Initial condition: u = v = ω = 0, T linear in x ─────────────────────────
+x = PETSc.DMGlobalVec(da)
+
+PETSc.withlocalarray!(x; read = false) do x_arr
+    corners = PETSc.getcorners(da)
+    xs = corners.lower[1];  ys = corners.lower[2]
+    xe = corners.upper[1];  ye = corners.upper[2]
+    nx_own = xe - xs + 1;   ny_own = ye - ys + 1
+    dx = one(_T) / (mx - 1)
+    x_par = reshape(x_arr, 4, nx_own, ny_own)
+    # Components u, v, ω start at zero; T is linear in x (or zero if grashof=0).
+    # Use broadcast-friendly assignment so this works for both Array and CuArray.
+    fill!(x_par, zero(_T))
+    if user.grashof > 0
+        # T = (ig - 1) * dx  where ig (1-based global x-index) = xs + li - 1
+        # Build on host then copy to the same device as x_arr.
+        t_cpu = _T.((xs - 1 : xs + nx_own - 2) .* dx)   # 1-D CPU, length nx_own
+        t_dev = similar(x_arr, nx_own)
+        copyto!(t_dev, t_cpu)
+        x_par[4, :, :] .= reshape(t_dev, nx_own, 1)
+    end
+end
+
+# ── Residual callback ─────────────────────────────────────────────────────────
+r = similar(x)
+
+PETSc.setfunction!(snes, r) do g_fx, snes, g_x
+    da = PETSc.getDM(snes)
+
+    l_x = PETSc.DMLocalVec(da)
+    PETSc.dm_global_to_local!(g_x, l_x, da, PETSc.INSERT_VALUES)
+
+    corners       = PETSc.getcorners(da)
+    ghost_corners = PETSc.getghostcorners(da)
+
+    xs  = corners.lower[1];        ys  = corners.lower[2]
+    xe  = corners.upper[1];        ye  = corners.upper[2]
+    xsg = ghost_corners.lower[1];  ysg = ghost_corners.lower[2]
+    xeg = ghost_corners.upper[1];  yeg = ghost_corners.upper[2]
+
+    nx_own = xe  - xs  + 1;  ny_own = ye  - ys  + 1
+    nx_g   = xeg - xsg + 1;  ny_g   = yeg - ysg + 1
+    ox = xs - xsg;  oy = ys - ysg
+
+    # Recompute grid metrics from the DM so this callback is correct on every
+    # MG level (coarsen/refine changes mx/my; capturing outer-scope values
+    # would give wrong stencil weights and lid velocity on coarse grids).
+    info_  = PETSc.getinfo(da)
+    mx_    = Int(info_.global_size[1])
+    my_    = Int(info_.global_size[2])
+    dhx_   = _T(mx_ - 1);   dhy_   = _T(my_ - 1)
+    hx_    = one(_T) / dhx_; hy_    = one(_T) / dhy_
+    hydhx_ = hy_ * dhx_;    hxdhy_ = hx_ * dhy_
+    lid_   = _T(1) / dhx_    # lidvelocity = 1/(mx-1)
+
+    # withlocalarray! handles CPU/GPU dispatch: returns Vector on host, CuArray
+    # on device.  Both vecs must be on the same device (guaranteed per MG level).
+    PETSc.withlocalarray!(g_fx, l_x; read=(true, true), write=(true, false)) do fx, lx
+        kern  = KernelAbstractions.get_backend(fx)
+        x_par = reshape(lx, 4, nx_g,   ny_g)
+        f_par = reshape(fx, 4, nx_own, ny_own)
+        cavity_residual_kernel!(kern, 64)(
+            f_par, x_par,
+            dhx_, dhy_, hx_, hy_, hydhx_, hxdhy_,
+            user.grashof, user.prandtl, lid_,
+            mx_, my_, xs, ys, ox, oy;
+            ndrange = (nx_own, ny_own),
+        )
+        KernelAbstractions.synchronize(kern)
+    end
+
+    PETSc.destroy(l_x)
+    return PetscInt(0)
+end
+
+# ── Jacobian: manual FD coloring with GPU-efficient COO matrix assembly ───────
+#
+# Setup (one-time):
+#   dmda_star_fd_coloring  — builds IS_COLORING_LOCAL coloring and analytically
+#     derives the STAR-stencil COO (row, col) triplets; returns per-color index
+#     arrays for the owned columns and COO slots to fill.
+#   MatSetPreallocationCOOLocal  — registers the COO pattern (enables on-device
+#     scatter on GPU; avoids per-entry hash-table lookups on CPU).
+#
+# Each Newton step (fd_coloring_jac!):
+#   For each color: copy x → x_pert, scatter +h to owned cols of that color
+#   (GPU kernel), evaluate F(x_pert), accumulate (F1−F0)/h into val[] (GPU
+#   kernel).  Then MatSetValuesCOO assembles J from val[] in one call.
+#
+# NOTE: For MG, coarser levels fall back to SNESComputeJacobianDefaultColor
+#       (correct, but CPU-only FD coloring for those levels).
+
+# ── Coloring + COO index setup ────────────────────────────────────────────────
+# Builds the IS_COLORING_LOCAL coloring for da's STAR stencil, ghost-local COO
+# (row, col) pairs, and per-color owned-column / COO-entry index arrays.
+# 2-D DMDA STAR stencil only; see PETSc.dmda_star_fd_coloring for 3-D notes.
+coloring      = PETSc.dmda_star_fd_coloring(petsclib, da)
+n_colors      = coloring.n_colors
+n_local_dofs  = coloring.n_local_dofs
+nnz_coo       = coloring.nnz_coo
+row_coo_local = coloring.row_coo_local
+col_coo_local = coloring.col_coo_local
+
+# ── Create J ─────────────────────────────────────────────────────────────────
+J = LibPETSc.DMCreateMatrix(petsclib, da)
+# Register the COO pattern on both CPU and GPU.  This allows MatSetValuesCOO
+# to be used for assembly in both cases, avoiding per-entry hash-table lookups
+# that MatSetValuesLocal incurs.  On GPU it also enables device-side scatter.
+LibPETSc.MatSetPreallocationCOOLocal(petsclib, J, LibPETSc.PetscCount(nnz_coo), row_coo_local, col_coo_local)
+
+# ── Per-color index arrays for the FD loop ───────────────────────────────────
+perturb_cols_1b = coloring.perturb_cols
+coo_idxs_1b     = coloring.coo_idxs
+local_rows_1b   = coloring.local_rows
+
+if useCUDA
+    perturb_cols_dev = [CuArray(v) for v in perturb_cols_1b]
+    coo_idxs_dev     = [CuArray(v) for v in coo_idxs_1b]
+    local_rows_dev   = [CuArray(v) for v in local_rows_1b]
+    val_dev          = CUDA.zeros(_T, nnz_coo)
+else
+    perturb_cols_dev = perturb_cols_1b
+    coo_idxs_dev     = coo_idxs_1b
+    local_rows_dev   = local_rows_1b
+    val_dev          = zeros(_T, nnz_coo)
+end
+
+# ── Scratch vectors for the FD loop ────────────────────────────────────────
+x_pert_vec = LibPETSc.VecDuplicate(petsclib, x)
+f0_vec     = LibPETSc.VecDuplicate(petsclib, x)
+f1_vec     = LibPETSc.VecDuplicate(petsclib, x)
+h_eps      = _T(sqrt(eps(_T)))
+inv_h      = _T(1) / h_eps
+
+# ── Jacobian callback ────────────────────────────────────────────────────────
+PETSc.setjacobian!(snes, J) do Jmat, actual_snes, g_x
+    # For MG: if this is a coarser level (different DM than the fine-grid da),
+    # fall back to PETSc's built-in FD coloring (correct for that level's DM).
+    da_level = PETSc.getDM(actual_snes)
+    if da_level.ptr != da.ptr
+        LibPETSc.SNESComputeJacobianDefaultColor(petsclib, actual_snes, g_x, Jmat, Jmat, C_NULL)
+        return PetscInt(0)
+    end
+
+    # ── FD coloring: fill val_dev with Jacobian entries ────────────────────────
+    fd_coloring_jac!(
+        petsclib, actual_snes, g_x,
+        f0_vec, f1_vec, x_pert_vec, val_dev,
+        n_colors,
+        perturb_cols_dev, coo_idxs_dev, local_rows_dev,
+        h_eps, inv_h,
+    )
+
+    # ── Assemble J via COO ─────────────────────────────────────────────────────
+    # On GPU: pass a raw device pointer so cuSPARSE scatters directly on device.
+    # On CPU: pass the Vector{T} directly (uses the vector overload).
+    # Both paths use the COO pattern registered by MatSetPreallocationCOOLocal.
+    if useCUDA
+        LibPETSc.MatSetValuesCOO(petsclib, Jmat, Ptr{_T}(UInt64(pointer(val_dev))), LibPETSc.INSERT_VALUES)
+    else
+        LibPETSc.MatSetValuesCOO(petsclib, Jmat, val_dev, LibPETSc.INSERT_VALUES)
+    end
+    LibPETSc.MatAssemblyBegin(petsclib, Jmat, LibPETSc.MAT_FINAL_ASSEMBLY)
+    LibPETSc.MatAssemblyEnd(petsclib, Jmat, LibPETSc.MAT_FINAL_ASSEMBLY)
+    if useCUDA
+        # Force GPU→CPU sync so both copies are valid.
+        # MatBindToCPU(PETSC_TRUE) triggers MatSeqAIJCUSPARSECopyFromGPU when
+        # offloadmask==PETSC_OFFLOAD_GPU, making the CPU CSR correct.
+        # MatBindToCPU(PETSC_FALSE) then releases the CPU-only restriction,
+        # leaving offloadmask==PETSC_OFFLOAD_BOTH so that:
+        #   MatGetDiagonal (Jacobi smoother in MG)  → reads CPU copy ✓
+        #   MatPtAP (Galerkin coarse-op formation)  → uses GPU copy  ✓
+        LibPETSc.MatBindToCPU(petsclib, Jmat, LibPETSc.PETSC_TRUE)
+        LibPETSc.MatBindToCPU(petsclib, Jmat, LibPETSc.PETSC_FALSE)
+    end
+    return PetscInt(0)
+end
+
+# ── Solve ─────────────────────────────────────────────────────────────────────
+@show Threads.nthreads()
+PETSc.solve!(x, snes)
+
+if MPI.Comm_rank(comm) == 0
+    its = LibPETSc.SNESGetIterationNumber(petsclib, snes)
+    println("SNES converged in $its Newton iterations.")
+end
+
+# ── Cleanup ───────────────────────────────────────────────────────────────────
+# Destroy the PETSc options database stored on the SNES explicitly.  Its GC
+# finalizer calls PetscOptionsDestroy (which touches MPI internally); if GC
+# runs it after PETSc/MPI are finalized the process crashes.  Destroying here
+# while PETSc is still active is always safe.
+if !isnothing(snes.opts)
+    PETSc.destroy(snes.opts)
+    snes.opts = nothing
+end
+
+# Run a full GC so any remaining PETSc-object finalizers fire while PETSc is
+# still active.
+GC.gc(true)
+MPI.Barrier(comm)
+
+# SNES holds internal PETSc references to J, da, and r — destroy it first so
+# those reference counts are decremented before we explicitly free the objects.
+PETSc.destroy(snes)
+PETSc.destroy(J)
+PETSc.destroy(x_pert_vec)
+PETSc.destroy(f0_vec)
+PETSc.destroy(f1_vec)
+PETSc.destroy(x)
+PETSc.destroy(r)
+PETSc.destroy(da)
+PETSc.finalize(petsclib)
+
+MPI.Barrier(comm)
+MPI.Finalize()
+
+# On macOS ARM64 with MPICH ch4:ofi, MPICH's C atexit handler crashes during
+# process teardown (SIGSEGV in libfabric/OFI cleanup).  quick_exit(0) bypasses
+# all C atexit() handlers and avoids the crash.  Skip when running interactively
+# (e.g. include("ex19.jl") in the REPL) so the Julia session isn't killed.
+if !isinteractive()
+    ccall(:quick_exit, Cvoid, (Cint,), 0)
+end
\ No newline at end of file
diff --git a/examples/ex45.jl b/examples/ex45.jl
index 503f7006..8e0001e1 100644
--- a/examples/ex45.jl
+++ b/examples/ex45.jl
@@ -27,7 +27,7 @@ PETSc.set_library!(
 
 
 # Initialize PETSc
-PETSc.initialize(petsclib, log_view=true)
+PETSc.initialize(petsclib, log_view=false)
 
 function solve_ex45(N=7; da_grid_x=7, da_grid_y=7, da_grid_z=7, kwargs...)
     comm = MPI.COMM_WORLD
diff --git a/examples/ex51_implicit.jl b/examples/ex51_implicit.jl
index d5441f87..0f3180b4 100644
--- a/examples/ex51_implicit.jl
+++ b/examples/ex51_implicit.jl
@@ -34,7 +34,7 @@ using PETSc, MPI, Printf
 #
 #     solve_ex51_implicit(options = ["-ts_irk_nstages", "3"])
 
-mutable struct Ex51Context{PetscLib <: PETSc.LibPETSc.PetscLibType}
+mutable struct Ex51ImplicitContext{PetscLib <: PETSc.LibPETSc.PetscLibType}
     petsclib::PetscLib
 end
 
@@ -57,7 +57,7 @@ function ex51_rhs_ifunction!(
     # time derivative, and residual vectors arrive as raw `CVec` pointers. We
     # wrap them with `VecPtr(..., own = false)` so we can use PETSc.jl's array
     # helpers without taking ownership away from PETSc.
-    ctx = unsafe_pointer_to_objref(ctx_ptr)::Ex51Context
+    ctx = unsafe_pointer_to_objref(ctx_ptr)::Ex51ImplicitContext
     petsclib = ctx.petsclib
     # `own = false` since memory is managed by PETSc internally
     u = PETSc.VecPtr(petsclib, u_ptr, false)
@@ -146,7 +146,7 @@ function ex51_ijacobian!(
     B_ptr::PETSc.LibPETSc.CMat,
     ctx_ptr::Ptr{Cvoid},
 )::PETSc.LibPETSc.PetscErrorCode
-    ctx = unsafe_pointer_to_objref(ctx_ptr)::Ex51Context
+    ctx = unsafe_pointer_to_objref(ctx_ptr)::Ex51ImplicitContext
     petsclib = ctx.petsclib
     # `u` is borrowed from PETSc; do not take ownership.
     u = PETSc.VecPtr(petsclib, u_ptr, false)
@@ -308,7 +308,7 @@ function solve_ex51_implicit(;
     current_time = petsclib.PetscReal(NaN)
     error_norm = petsclib.PetscReal(NaN)
     solution = PetscScalar[]
-    ctx = Ex51Context(petsclib)
+    ctx = Ex51ImplicitContext(petsclib)
     petsc_options = PETSc.Options(petsclib; parsed_options...)
     pushed_options = false
 
diff --git a/ext/PETScCUDAExt.jl b/ext/PETScCUDAExt.jl
new file mode 100644
index 00000000..9082f737
--- /dev/null
+++ b/ext/PETScCUDAExt.jl
@@ -0,0 +1,127 @@
+module PETScCUDAExt
+
+using PETSc
+using PETSc: LibPETSc, AbstractPetscVec
+using PETSc.LibPETSc: PETSC_MEMTYPE_DEVICE
+using CUDA
+
+# ── CUDA memory backend ───────────────────────────────────────────────────────
+
+struct CUDAMemBackend <: PETSc.AbstractPETScMemBackend end
+
+PETSc.memtype_backend(::Val{PETSC_MEMTYPE_DEVICE}) = CUDAMemBackend()
+PETSc.array_type(::Val{PETSC_MEMTYPE_DEVICE}) = CuArray
+
+# ── No-finalizer acquire/release for withlocalarray! ─────────────────────────
+
+function PETSc.make_local_array(cpu_arr, ::CUDAMemBackend)
+    T   = eltype(cpu_arr)
+    n   = length(cpu_arr)
+    ptr = reinterpret(CuPtr{T}, UInt(pointer(cpu_arr)))
+    return CUDA.unsafe_wrap(CuArray, ptr, n; own = false)
+end
+
+function PETSc.release_petsc_local_array(
+    cpu_arr, ::CUDAMemBackend, vec::AbstractPetscVec{PLib}; read::Bool, write::Bool,
+) where {PLib}
+    pv = PETSc.as_petsc_vec(vec)
+    if write && read
+        LibPETSc.VecRestoreArrayAndMemType(PLib, pv, cpu_arr)
+    elseif write
+        LibPETSc.VecRestoreArrayWriteAndMemType(PLib, pv, cpu_arr)
+    else
+        LibPETSc.VecRestoreArrayReadAndMemType(PLib, pv, cpu_arr)
+    end
+    return nothing
+end
+
+# ── wrap_localarray: device branch (legacy, kept for backward compat) ─────────
+#
+# No longer called by withlocalarray! (which uses acquire/release instead).
+# Retained in case external code calls unsafe_localarray directly.
+
+function PETSc.wrap_localarray(
+    cpu_arr, ::CUDAMemBackend, vec::AbstractPetscVec{PetscLib};
+    read::Bool, write::Bool,
+) where {PetscLib}
+    T   = eltype(cpu_arr)
+    n   = length(cpu_arr)
+    ptr = reinterpret(CuPtr{T}, UInt(pointer(cpu_arr)))
+    dev_arr = CUDA.unsafe_wrap(CuArray, ptr, n; own = false)
+    pv = PETSc.as_petsc_vec(vec)
+    finalizer(dev_arr) do _
+        if write && read
+            LibPETSc.VecRestoreArrayAndMemType(PetscLib, pv, cpu_arr)
+        elseif write
+            LibPETSc.VecRestoreArrayWriteAndMemType(PetscLib, pv, cpu_arr)
+        else
+            LibPETSc.VecRestoreArrayReadAndMemType(PetscLib, pv, cpu_arr)
+        end
+        return nothing
+    end
+    return dev_arr
+end
+
+# ── get_petsc_arrays_impl: CUDA cases ────────────────────────────────────────
+#
+# Two methods cover all GPU sub-cases:
+#
+#   CUDAMemBackend × CUDAMemBackend → both Vecs on device: zero-copy wrap, no bounce
+#   any other mix             → at least one Vec is host-resident:
+#                               lx is wrapped zero-copy if on device, or
+#                               copied H2D if on host;
+#                               fx always gets a fresh scratch CuArray (bounce)
+#                               so the kernel writes there and restore copies D2H.
+#
+# nothing × nothing (host) is handled entirely in base (vec.jl) and never
+# reaches these methods.
+
+# Both Vecs on the device: zero-copy wrap, no scratch needed.
+function PETSc.get_petsc_arrays_impl(
+    petsclib, g_fx, l_x, ::Type{T}, fx_arr, lx_arr,
+    ::CUDAMemBackend, ::CUDAMemBackend,
+) where {T}
+    fx = CUDA.unsafe_wrap(CuArray,
+        reinterpret(CuPtr{T}, UInt(pointer(fx_arr))), length(fx_arr))
+    lx = CUDA.unsafe_wrap(CuArray,
+        reinterpret(CuPtr{T}, UInt(pointer(lx_arr))), length(lx_arr))
+    return fx, lx, fx_arr, lx_arr, nothing
+end
+
+# At least one Vec is host-resident (e.g. MG coarser levels, FD-coloring path).
+# Catch-all: less specific than (CUDAMemBackend, CUDAMemBackend), so Julia prefers
+# the method above when both are on the device.
+function PETSc.get_petsc_arrays_impl(
+    petsclib, g_fx, l_x, ::Type{T}, fx_arr, lx_arr,
+    fx_b::PETSc.AbstractPETScMemBackend, lx_b::PETSc.AbstractPETScMemBackend,
+) where {T}
+    lx_gpu = if lx_b isa CUDAMemBackend
+        CUDA.unsafe_wrap(CuArray,
+            reinterpret(CuPtr{T}, UInt(pointer(lx_arr))), length(lx_arr))
+    else
+        tmp = CuArray{T}(undef, length(lx_arr))
+        copyto!(tmp, lx_arr)    # H2D: send ghost input to GPU
+        tmp
+    end
+    fx_gpu = CuArray{T}(undef, length(fx_arr))  # scratch buffer for residual
+    return fx_gpu, lx_gpu, fx_arr, lx_arr, fx_gpu
+end
+
+# ── restore_petsc_arrays_impl: CUDA ──────────────────────────────────────────
+#
+# When fx is a CuArray (returned by the GPU get_petsc_arrays_impl above):
+#   - if fx_bounce !== nothing, sync the device and copy the scratch D2H
+#   - call VecRestoreArray*AndMemType on both raw PETSc arrays
+
+function PETSc.restore_petsc_arrays_impl(
+    petsclib, g_fx, l_x, fx::CuArray, lx, fx_arr, lx_arr, fx_bounce,
+)
+    if fx_bounce !== nothing
+        CUDA.synchronize()
+        copyto!(fx_arr, fx_bounce)  # D2H: copy residual back to host PETSc array
+    end
+    LibPETSc.VecRestoreArrayAndMemType(petsclib, PETSc.as_petsc_vec(g_fx), fx_arr)
+    LibPETSc.VecRestoreArrayReadAndMemType(petsclib, PETSc.as_petsc_vec(l_x), lx_arr)
+end
+
+end # module PETScCUDAExt
diff --git a/src/PETSc.jl b/src/PETSc.jl
index 97b613bb..c2bfe815 100644
--- a/src/PETSc.jl
+++ b/src/PETSc.jl
@@ -24,9 +24,17 @@ export LibPETSc
 export audit_petsc_file
 export set_petsclib
 export set_library!, unset_library!, library_info
+export AbstractPETScMemBackend, HostBackend
+export determine_memtype
+export get_petsc_arrays, restore_petsc_arrays
+export dmda_star_fd_coloring
 
 using Libdl
 
+# String convenience wrappers for SetType functions
+include("string_wrappers.jl")
+include("string_wrappers_extra.jl")
+
 include("init.jl")
 include("vec.jl")       
 include("mat.jl")          
@@ -39,11 +47,6 @@ include("sys.jl")
 include("dmda.jl")          
 include("dmstag.jl")
 
-# String convenience wrappers for SetType functions
-include("string_wrappers.jl")       
-include("string_wrappers_extra.jl")
-
-
 include("audit.jl")
 
 
diff --git a/src/autowrapped/DM_wrappers.jl b/src/autowrapped/DM_wrappers.jl
index a1aeb59e..2ab00b21 100644
--- a/src/autowrapped/DM_wrappers.jl
+++ b/src/autowrapped/DM_wrappers.jl
@@ -2618,14 +2618,14 @@ See also:
 # External Links
 $(_doc_external("DM/DMSetVecType"))
 """
-function DMSetVecType(petsclib::PetscLibType, dm::PetscDM, ctype::VecType) end
+function DMSetVecType(petsclib::PetscLibType, dm::PetscDM, ctype::Union{Cstring, AbstractString}) end
 
-@for_petsc function DMSetVecType(petsclib::$UnionPetscLib, dm::PetscDM, ctype::VecType )
+@for_petsc function DMSetVecType(petsclib::$UnionPetscLib, dm::PetscDM, ctype::Union{Cstring, AbstractString} )
 
     @chk ccall(
                (:DMSetVecType, $petsc_library),
                PetscErrorCode,
-               (CDM, VecType),
+               (CDM, Cstring),
                dm, ctype,
               )
 
@@ -2773,14 +2773,14 @@ See also:
 # External Links
 $(_doc_external("DM/DMSetMatType"))
 """
-function DMSetMatType(petsclib::PetscLibType, dm::PetscDM, ctype::MatType) end
+function DMSetMatType(petsclib::PetscLibType, dm::PetscDM, ctype::Union{Cstring, AbstractString}) end
 
-@for_petsc function DMSetMatType(petsclib::$UnionPetscLib, dm::PetscDM, ctype::MatType )
+@for_petsc function DMSetMatType(petsclib::$UnionPetscLib, dm::PetscDM, ctype::Union{Cstring, AbstractString} )
 
     @chk ccall(
                (:DMSetMatType, $petsc_library),
                PetscErrorCode,
-               (CDM, MatType),
+               (CDM, Cstring),
                dm, ctype,
               )
 
diff --git a/src/autowrapped/ISaddons_wrappers.jl b/src/autowrapped/ISaddons_wrappers.jl
index 7b22049e..a4645084 100644
--- a/src/autowrapped/ISaddons_wrappers.jl
+++ b/src/autowrapped/ISaddons_wrappers.jl
@@ -1309,12 +1309,12 @@ $(_doc_external("Vec/ISColoringDestroy"))
 function ISColoringDestroy(petsclib::PetscLibType, iscoloring::ISColoring) end
 
 @for_petsc function ISColoringDestroy(petsclib::$UnionPetscLib, iscoloring::ISColoring )
-
+    iscoloring_ref = Ref(iscoloring)
     @chk ccall(
                (:ISColoringDestroy, $petsc_library),
                PetscErrorCode,
                (Ptr{ISColoring},),
-               iscoloring,
+               iscoloring_ref,
               )
 
 
@@ -1387,7 +1387,7 @@ function ISColoringView(petsclib::PetscLibType, iscoloring::ISColoring, viewer::
 end 
 
 """
-	n::PetscInt,nc::PetscInt = ISColoringGetColors(petsclib::PetscLibType,iscoloring::ISColoring, colors::ISColoringValue) 
+	n::PetscInt, nc::PetscInt, colors::Vector{UInt16} = ISColoringGetColors(petsclib::PetscLibType, iscoloring::ISColoring)
 Returns an array with the color for each local node
 
 Not Collective
@@ -1396,9 +1396,9 @@ Input Parameter:
 - `iscoloring` - the coloring context
 
 Output Parameters:
-- `n`      - number of nodes
+- `n`      - number of nodes (DOFs)
 - `nc`     - number of colors
-- `colors` - color for each node
+- `colors` - copy of the color array (one `UInt16` entry per DOF)
 
 Level: advanced
 
@@ -1407,23 +1407,30 @@ Level: advanced
 # External Links
 $(_doc_external("Vec/ISColoringGetColors"))
 """
-function ISColoringGetColors(petsclib::PetscLibType, iscoloring::ISColoring, colors::ISColoringValue) end
+function ISColoringGetColors(petsclib::PetscLibType, iscoloring::ISColoring) end
 
-@for_petsc function ISColoringGetColors(petsclib::$UnionPetscLib, iscoloring::ISColoring, colors::ISColoringValue )
-	n_ = Ref{$PetscInt}()
-	nc_ = Ref{$PetscInt}()
+@for_petsc function ISColoringGetColors(petsclib::$UnionPetscLib, iscoloring::ISColoring)
+    n_  = Ref{$PetscInt}()
+    nc_ = Ref{$PetscInt}()
+    # PETSc returns a pointer into its own storage; we must not free it.
+    colors_ptr_ = Ref{ISColoringValue}(C_NULL)
 
     @chk ccall(
                (:ISColoringGetColors, $petsc_library),
                PetscErrorCode,
-               (ISColoring, Ptr{$PetscInt}, Ptr{$PetscInt}, ISColoringValue),
-               iscoloring, n_, nc_, colors,
+               (ISColoring, Ptr{$PetscInt}, Ptr{$PetscInt}, Ptr{ISColoringValue}),
+               iscoloring, n_, nc_, colors_ptr_,
               )
 
-	n = n_[]
-	nc = nc_[]
+    n  = n_[]
+    nc = nc_[]
+    # ISColoringValue is a PETSc opaque pointer alias for `unsigned short *`.
+    # Reinterpret as Ptr{UInt16} and copy to a Julia-owned Vector so the
+    # caller can safely use the data after ISColoringDestroy.
+    colors_raw = Ptr{UInt16}(UInt(colors_ptr_[]))
+    colors = copy(unsafe_wrap(Vector{UInt16}, colors_raw, Int(n); own = false))
 
-	return n,nc
+    return n, nc, colors
 end 
 
 """
diff --git a/src/autowrapped/Mat_wrappers.jl b/src/autowrapped/Mat_wrappers.jl
index 7f726f56..3be1ef05 100644
--- a/src/autowrapped/Mat_wrappers.jl
+++ b/src/autowrapped/Mat_wrappers.jl
@@ -19698,6 +19698,7 @@ Level: beginner
 $(_doc_external("Mat/MatSetValuesCOO"))
 """
 function MatSetValuesCOO(petsclib::PetscLibType, A::PetscMat, coo_v::Vector{PetscScalar}, imode::InsertMode) end
+function MatSetValuesCOO(petsclib::PetscLibType, A::PetscMat, coo_v::Ptr{PetscScalar}, imode::InsertMode) end
 
 @for_petsc function MatSetValuesCOO(petsclib::$UnionPetscLib, A::PetscMat, coo_v::Vector{$PetscScalar}, imode::InsertMode )
 
@@ -19712,6 +19713,19 @@ function MatSetValuesCOO(petsclib::PetscLibType, A::PetscMat, coo_v::Vector{Pets
 	return nothing
 end 
 
+@for_petsc function MatSetValuesCOO(petsclib::$UnionPetscLib, A::PetscMat, coo_v::Ptr{$PetscScalar}, imode::InsertMode )
+
+    @chk ccall(
+               (:MatSetValuesCOO, $petsc_library),
+               PetscErrorCode,
+               (CMat, Ptr{$PetscScalar}, InsertMode),
+               A, coo_v, imode,
+              )
+
+
+	return nothing
+end 
+
 """
 	MatSetBindingPropagates(petsclib::PetscLibType,A::PetscMat, flg::PetscBool) 
 Sets whether the state of being bound to the CPU for a GPU matrix type propagates to child and some other associated objects
diff --git a/src/autowrapped/SNES_wrappers.jl b/src/autowrapped/SNES_wrappers.jl
index 1f3d04cd..105796e0 100644
--- a/src/autowrapped/SNES_wrappers.jl
+++ b/src/autowrapped/SNES_wrappers.jl
@@ -5118,7 +5118,7 @@ function SNESComputeJacobianDefault(petsclib::PetscLibType, snes::PetscSNES, x1:
                (:SNESComputeJacobianDefault, $petsc_library),
                PetscErrorCode,
                (CSNES, CVec, CMat, CMat, Ptr{Cvoid}),
-               snes, x1, J, B, ctx,
+               snes, x1, J, B, C_NULL,
               )
 
 
@@ -5396,9 +5396,9 @@ Options Database Keys:
 # External Links
 $(_doc_external("SNES/SNESComputeJacobianDefaultColor"))
 """
-function SNESComputeJacobianDefaultColor(petsclib::PetscLibType, snes::PetscSNES, x1::PetscVec, J::PetscMat, B::PetscMat, ctx::Cvoid) end
+function SNESComputeJacobianDefaultColor(petsclib::PetscLibType, snes::PetscSNES, x1::PetscVec, J::PetscMat, B::PetscMat, ctx::Ptr{Cvoid}) end
 
-@for_petsc function SNESComputeJacobianDefaultColor(petsclib::$UnionPetscLib, snes::PetscSNES, x1::PetscVec, J::PetscMat, B::PetscMat, ctx::Cvoid )
+@for_petsc function SNESComputeJacobianDefaultColor(petsclib::$UnionPetscLib, snes::PetscSNES, x1::PetscVec, J::PetscMat, B::PetscMat, ctx::Ptr{Cvoid})
 
     @chk ccall(
                (:SNESComputeJacobianDefaultColor, $petsc_library),
diff --git a/src/autowrapped/Vec_wrappers.jl b/src/autowrapped/Vec_wrappers.jl
index 12edff86..624b92ee 100644
--- a/src/autowrapped/Vec_wrappers.jl
+++ b/src/autowrapped/Vec_wrappers.jl
@@ -1340,10 +1340,10 @@ function VecGetArrayAndMemType(petsclib::PetscLibType, x::PetscVec) end
               )
 
 	a = unsafe_wrap(Array, a_[], VecGetLocalSize(petsclib, x); own = false)
-	mtype = unsafe_string(mtype_[])
+	mtype = mtype_[]
 
 	return a,mtype
-end 
+end
 
 """
 	VecRestoreArrayAndMemType(petsclib::PetscLibType,x::PetscVec, a::Vector{PetscScalar}) 
@@ -1414,10 +1414,10 @@ function VecGetArrayReadAndMemType(petsclib::PetscLibType, x::PetscVec) end
               )
 
 	a = unsafe_wrap(Array, a_[], VecGetLocalSize(petsclib, x); own = false)
-	mtype = unsafe_string(mtype_[])
+	mtype = mtype_[]
 
 	return a,mtype
-end 
+end
 
 """
 	VecRestoreArrayReadAndMemType(petsclib::PetscLibType,x::PetscVec, a::Vector{PetscScalar}) 
@@ -1487,10 +1487,10 @@ function VecGetArrayWriteAndMemType(petsclib::PetscLibType, x::PetscVec) end
               )
 
 	a = unsafe_wrap(Array, a_[], VecGetLocalSize(petsclib, x); own = false)
-	mtype = unsafe_string(mtype_[])
+	mtype = mtype_[]
 
 	return a,mtype
-end 
+end
 
 """
 	VecRestoreArrayWriteAndMemType(petsclib::PetscLibType,x::PetscVec, a::Vector{PetscScalar}) 
diff --git a/src/autowrapped/petsc_library.jl b/src/autowrapped/petsc_library.jl
index 7a9151ee..070a3f12 100644
--- a/src/autowrapped/petsc_library.jl
+++ b/src/autowrapped/petsc_library.jl
@@ -388,6 +388,7 @@ include("PetscDraw_wrappers.jl")
 include("PetscRegressor_wrappers.jl")
 include("PF_wrappers.jl")
 include("IS_wrappers.jl")
+# include("PC_wrappers.jl")  # excluded: PC type in ccall signatures needs fixing
 include("TS_wrappers.jl")
 include("AO_wrappers.jl")
 include("Tao_wrappers.jl")
diff --git a/src/autowrapped/senums_wrappers.jl b/src/autowrapped/senums_wrappers.jl
index 251b2fd3..e2748ceb 100644
--- a/src/autowrapped/senums_wrappers.jl
+++ b/src/autowrapped/senums_wrappers.jl
@@ -6,9 +6,9 @@ PetscDrawType=Ptr{Cchar}
 PFType=Ptr{Cchar}
 DMAdaptorType=Ptr{Cchar}
 PetscFEType=Ptr{Cchar}
-VecType=Ptr{Cchar}
+VecType=Cstring
 VecTaggerType=Ptr{Cchar}
-MatType=Ptr{Cchar}
+MatType=Cstring
 MatSolverType=Ptr{Cchar}
 MatProductAlgorithm=Ptr{Cchar}
 MatOrderingType=Ptr{Cchar}
diff --git a/src/dm.jl b/src/dm.jl
index 49561639..bdb80151 100644
--- a/src/dm.jl
+++ b/src/dm.jl
@@ -6,15 +6,19 @@ function Base.show(io::IO, v::AbstractPetscDM{PetscLib}) where {PetscLib}
         print(io, "PETSc DM (null pointer)")
         return
     end
-    
-    # Try to get DM info, but handle uninitialized DMs gracefully
+    # DMGetType internally calls DMInitializePackage which queries the PETSc
+    # options database.  Calling it before PETSc is initialised causes a C-level
+    # SIGSEGV that cannot be caught with try/catch.
+    if !initialized(PetscLib)
+        print(io, "PETSc DM (PETSc not initialized)")
+        return
+    end
     try
         ty = LibPETSc.DMGetType(PetscLib, v)
         di = LibPETSc.DMGetDimension(PetscLib, v)
         print(io, "PETSc DM $ty object in $di dimensions")
     catch
-        # DM not fully initialized yet (type not set)
-        print(io, "PETSc DM (not yet initialized)")
+        print(io, "PETSc DM (type not set)")
     end
     return nothing
 end
@@ -445,3 +449,4 @@ function MatAIJ(da::AbstractPetscDM{PetscLib}) where {PetscLib}
     J = LibPETSc.DMCreateMatrix(getlib(PetscLib), da)
     return J
 end
+
diff --git a/src/dmda.jl b/src/dmda.jl
index 96fc9c8c..92caf412 100644
--- a/src/dmda.jl
+++ b/src/dmda.jl
@@ -204,4 +204,175 @@ function localinteriorlinearindex(da::AbstractPetscDM{PetscLib}) where PetscLib
     upper = CartesianIndex(ndofs(da), ghost_corners.upper)
     ind_local = LinearIndices(lower:upper)[:, l_inds][:]
     return ind_local
+end
+
+"""
+    dmda_star_fd_coloring(petsclib, da)
+
+Build all data needed for manual FD coloring of a **2-D** DMDA with a STAR
+stencil, using `IS_COLORING_LOCAL` and ghost-local COO indexing.
+
+Specifically, this function:
+1. Creates an `IS_COLORING_LOCAL` `ISColoring` via `DMCreateColoring` and
+   extracts the per-DOF color vector (ghost-local layout).
+2. Enumerates all STAR-stencil (row, col) pairs for every owned node and
+   records their ghost-local 0-based indices and colors.
+3. Builds per-color index arrays (`perturb_cols`, `coo_idxs`, `local_rows`)
+   ready for use in an FD coloring Newton loop.
+
+Returns a `NamedTuple`:
+- `n_colors`      — number of colors
+- `n_local_dofs`  — number of locally owned DOFs (owned nodes × dof/node)
+- `nnz_coo`       — total number of COO entries
+- `row_coo_local` — ghost-local 0-based row indices (`Vector{PetscInt}`)
+- `col_coo_local` — ghost-local 0-based column indices (`Vector{PetscInt}`)
+- `perturb_cols`  — `perturb_cols[c]`: 1-based owned-local column indices
+                    with color `c-1`; used to scatter `+h` perturbations.
+- `coo_idxs`      — `coo_idxs[c]`: 1-based COO entry indices for color `c-1`
+- `local_rows`    — `local_rows[c]`: corresponding 1-based owned-local
+                    residual-row indices; used to read `(f1-f0)/h`.
+
+!!! note "2-D DMDA STAR stencil only"
+    Neighbor enumeration covers only `±x` and `±y` directions.  The ghost-local
+    flat-index formula is `d + ix*dof + iy*dof*nx_g`.  For a 3-D DMDA:
+    - add `±z` neighbors guarded by `kk > 1` / `kk < mz`,
+    - extend the flat-index formula with `+ iz*dof*nx_g*ny_g`,
+    - reshape `col_colors_mat` to `(dof, nx_g, ny_g, nz_g)`,
+    - decode `z_owned` in the `perturb_cols` loop.
+"""
+function dmda_star_fd_coloring(petsclib::PetscLib, da::AbstractPetscDM{PetscLib}) where PetscLib
+    CPetscInt = petsclib.PetscInt
+
+    # ── ISColoring ────────────────────────────────────────────────────────────
+    # IS_COLORING_LOCAL covers owned + ghost DOFs; ghost colors are consistent
+    # with the owning rank so no extra MPI communication is needed here.
+    iscoloring = LibPETSc.DMCreateColoring(petsclib, da, LibPETSc.IS_COLORING_LOCAL)
+    _, nc_pi, col_colors_local = LibPETSc.ISColoringGetColors(petsclib, iscoloring)
+    n_colors = Int(nc_pi)
+    LibPETSc.ISColoringDestroy(petsclib, iscoloring)
+
+    # ── DMDA geometry ─────────────────────────────────────────────────────────
+    info          = getinfo(da)
+    mx            = Int(info.global_size[1])
+    my            = Int(info.global_size[2])
+    dof_per_node  = Int(info.dof)
+    corners       = getcorners(da)
+    ghost_corners = getghostcorners(da)
+    xs_da  = corners.lower[1];       ys_da  = corners.lower[2]
+    xe_da  = corners.upper[1];       ye_da  = corners.upper[2]
+    xsg_da = ghost_corners.lower[1]; ysg_da = ghost_corners.lower[2]
+    xeg_da = ghost_corners.upper[1]; yeg_da = ghost_corners.upper[2]
+    nx_g_da = xeg_da - xsg_da + 1
+    ny_g_da = yeg_da - ysg_da + 1
+    nx_own  = xe_da  - xs_da  + 1
+    ny_own  = ye_da  - ys_da  + 1
+    ox_coo  = xs_da  - xsg_da   # ghost offset in x (grid nodes)
+    oy_coo  = ys_da  - ysg_da   # ghost offset in y (grid nodes)
+    n_local_dofs = nx_own * ny_own * dof_per_node
+
+    # col_colors_mat[d+1, ix_ghost+1, iy_ghost+1] → color of that ghost DOF
+    col_colors_mat = reshape(col_colors_local, dof_per_node, nx_g_da, ny_g_da)
+
+    # ── COO triplets from 2-D STAR stencil ────────────────────────────────────
+    # Ghost-local 0-based row/col indices (for MatSetPreallocationCOOLocal).
+    # Both use DMDA ghost-local numbering: idx = d + ix_g*dof + iy_g*dof*nx_g.
+    #
+    # Pre-compute exact nnz_coo analytically so we can allocate once and fill
+    # by index rather than growing via push! (avoids O(nnz) realloc+copy work).
+    # Each owned node contributes dof² entries per STAR neighbor it has.
+    # Interior nodes have 5 neighbors (self + 4); boundary nodes have fewer.
+    nbr_left   = xs_da == 1  ? nx_own - 1 : nx_own   # columns with a left neighbor
+    nbr_right  = xe_da == mx ? nx_own - 1 : nx_own   # columns with a right neighbor
+    nbr_bottom = ys_da == 1  ? ny_own - 1 : ny_own   # rows with a bottom neighbor
+    nbr_top    = ye_da == my ? ny_own - 1 : ny_own   # rows with a top neighbor
+    total_nbr_pairs = nx_own * ny_own +               # self
+                      nbr_left   * ny_own +
+                      nbr_right  * ny_own +
+                      nbr_bottom * nx_own +
+                      nbr_top    * nx_own
+    nnz_coo = total_nbr_pairs * dof_per_node^2
+
+    row_coo_local     = Vector{CPetscInt}(undef, nnz_coo)
+    col_coo_local     = Vector{CPetscInt}(undef, nnz_coo)
+    local_row_per_coo = Vector{CPetscInt}(undef, nnz_coo)  # 0-based owned-local row
+    color_per_coo     = Vector{CPetscInt}(undef, nnz_coo)  # 0-based color of column
+
+    # Inner helper: write dof² entries for a (row-node, col-node) pair.
+    # Captures ix_gh, iy_gh, ix_ow, iy_ow from the outer scope.
+    @inline function fill_nbr!(k, ix_gh, iy_gh, ix_ow, iy_ow, nix_gh, njy_gh)
+        r_base = ix_gh  * dof_per_node + iy_gh  * dof_per_node * nx_g_da
+        c_base = nix_gh * dof_per_node + njy_gh * dof_per_node * nx_g_da
+        p_base = ix_ow  * dof_per_node + iy_ow  * nx_own       * dof_per_node
+        for d_row in 0:dof_per_node-1
+            r_local = CPetscInt(d_row + r_base)
+            p_owned = CPetscInt(d_row + p_base)
+            for d_col in 0:dof_per_node-1
+                @inbounds begin
+                    row_coo_local[k]     = r_local
+                    col_coo_local[k]     = CPetscInt(d_col + c_base)
+                    color_per_coo[k]     = CPetscInt(col_colors_mat[d_col+1, nix_gh+1, njy_gh+1])
+                    local_row_per_coo[k] = p_owned
+                end
+                k += 1
+            end
+        end
+        return k
+    end
+
+    k = 1
+    for jj in ys_da:ye_da, ii in xs_da:xe_da
+        ix_gh = ii - xsg_da   # 0-based ghost-x of this owned node
+        iy_gh = jj - ysg_da   # 0-based ghost-y of this owned node
+        ix_ow = ii - xs_da    # 0-based owned-x
+        iy_ow = jj - ys_da    # 0-based owned-y
+
+        # self
+        k = fill_nbr!(k, ix_gh, iy_gh, ix_ow, iy_ow, ix_gh, iy_gh)
+        # left
+        ii > 1  && (k = fill_nbr!(k, ix_gh, iy_gh, ix_ow, iy_ow, ix_gh - 1, iy_gh))
+        # right
+        ii < mx && (k = fill_nbr!(k, ix_gh, iy_gh, ix_ow, iy_ow, ix_gh + 1, iy_gh))
+        # bottom
+        jj > 1  && (k = fill_nbr!(k, ix_gh, iy_gh, ix_ow, iy_ow, ix_gh, iy_gh - 1))
+        # top
+        jj < my && (k = fill_nbr!(k, ix_gh, iy_gh, ix_ow, iy_ow, ix_gh, iy_gh + 1))
+    end
+
+    # ── Per-color index arrays ────────────────────────────────────────────────
+    # perturb_cols[c]: 1-based owned-local column indices with color c-1.
+    # col_colors_local uses the ghost-local layout, but VecGetArray returns only
+    # owned DOFs re-indexed 1..n_local_dofs.  We convert owned-local → ghost-local
+    # before looking up the color.
+    hint_cols = max(1, n_local_dofs ÷ n_colors)
+    perturb_cols = [sizehint!(Int32[], hint_cols) for _ in 1:n_colors]
+    for p_local in 1:n_local_dofs
+        p0      = p_local - 1
+        d       = p0 % dof_per_node
+        x_owned = (p0 ÷ dof_per_node) % nx_own
+        y_owned = (p0 ÷ dof_per_node) ÷ nx_own
+        k_ghost = d + (x_owned + ox_coo) * dof_per_node +
+                      (y_owned + oy_coo) * dof_per_node * nx_g_da + 1
+        c = Int(col_colors_local[k_ghost]) + 1
+        push!(perturb_cols[c], Int32(p_local))
+    end
+
+    hint_coo = max(1, nnz_coo ÷ n_colors)
+    coo_idxs   = [sizehint!(Int32[], hint_coo) for _ in 1:n_colors]
+    local_rows = [sizehint!(Int32[], hint_coo) for _ in 1:n_colors]
+    for k in 1:nnz_coo
+        c = Int(color_per_coo[k]) + 1
+        push!(coo_idxs[c],   Int32(k))
+        push!(local_rows[c], Int32(local_row_per_coo[k] + 1))
+    end
+
+    return (;
+        n_colors,
+        n_local_dofs,
+        nnz_coo,
+        row_coo_local,
+        col_coo_local,
+        perturb_cols,
+        coo_idxs,
+        local_rows,
+    )
 end
\ No newline at end of file
diff --git a/src/mat.jl b/src/mat.jl
index b6510d4a..456421ad 100644
--- a/src/mat.jl
+++ b/src/mat.jl
@@ -127,9 +127,14 @@ function MatSeqDense(
     @assert PetscScalar == petsclib.PetscScalar
     
     PetscInt = petsclib.PetscInt
-    mat = LibPETSc.MatCreateSeqDense(petsclib, comm, PetscInt(size(A, 1)), PetscInt(size(A, 2)), A[:])
-
-    finalizer(destroy, mat)
+    # PETSc stores the data pointer directly without copying, so we must keep the
+    # backing array alive for the entire lifetime of the PETSc Mat.  The finalizer
+    # closure captures `data`, making it reachable (and therefore uncollectable) for
+    # as long as `mat` is alive.
+    data = vec(A)
+    mat = LibPETSc.MatCreateSeqDense(petsclib, comm, PetscInt(size(A, 1)), PetscInt(size(A, 2)), data)
+
+    finalizer(m -> (destroy(m); data), mat)
     return mat
 end
 
diff --git a/src/snes.jl b/src/snes.jl
index f63f9209..e1b2fb1d 100644
--- a/src/snes.jl
+++ b/src/snes.jl
@@ -87,19 +87,22 @@ setfunction!(snes::AbstractPetscSNES, rhs!, vec) = setfunction!(rhs!, snes, vec)
 # Wrapper for calls to setfunction!
 mutable struct Fn_SNESSetFunction{PetscLib} end
 function (w::Fn_SNESSetFunction{PetscLib})(
-    ::CSNES,
+    actual_snes_ptr::CSNES,
     r_x::CVec,
     r_fx::CVec,
     snes_ptr::Ptr{Cvoid},
 ) where {PetscLib}
     snes = unsafe_pointer_to_objref(snes_ptr)
+    # Wrap the actual C SNES for the current MG level so that getDM() inside
+    # the callback returns the correct DM (matches the pattern in Fn_KSPComputeRHS).
+    actual_snes = PetscSNES{PetscLib}(actual_snes_ptr, getlib(PetscLib).age)
     x  = PetscVec{PetscLib}(r_x)
     fx = PetscVec{PetscLib}(r_fx)
 
-    if Base.applicable(snes.f!, fx, snes, x, snes.user_ctx)
-        return snes.f!(fx, snes, x, snes.user_ctx)
+    if Base.applicable(snes.f!, fx, actual_snes, x, snes.user_ctx)
+        return snes.f!(fx, actual_snes, x, snes.user_ctx)
     else
-        return snes.f!(fx, snes, x)
+        return snes.f!(fx, actual_snes, x)
     end
 end
 
@@ -161,13 +164,14 @@ setjacobian!(snes::AbstractPetscSNES, updateJ!, J, PJ = J) =
 # Wrapper for calls to setjacobian!
 mutable struct Fn_SNESSetJacobian{PetscLib} end
 function (w::Fn_SNESSetJacobian{PetscLib})(
-    ::CSNES,
+    actual_snes_ptr::CSNES,
     r_x::CVec,
     r_A::CMat,
     r_P::CMat,
     snes_ptr::Ptr{Cvoid},
 ) where {PetscLib}
     snes = unsafe_pointer_to_objref(snes_ptr)
+    actual_snes = PetscSNES{PetscLib}(actual_snes_ptr, getlib(PetscLib).age)
     x = PetscVec{PetscLib}(r_x)
     A = PetscMat{PetscLib}(r_A)
     P = PetscMat{PetscLib}(r_P)
@@ -175,16 +179,16 @@ function (w::Fn_SNESSetJacobian{PetscLib})(
     same_mat = (P.ptr == A.ptr)
 
     if same_mat
-        if Base.applicable(snes.updateJ!, A, snes, x, snes.user_ctx)
-            return snes.updateJ!(A, snes, x, snes.user_ctx)
+        if Base.applicable(snes.updateJ!, A, actual_snes, x, snes.user_ctx)
+            return snes.updateJ!(A, actual_snes, x, snes.user_ctx)
         else
-            return snes.updateJ!(A, snes, x)
+            return snes.updateJ!(A, actual_snes, x)
         end
     else
-        if Base.applicable(snes.updateJ!, A, P, snes, x, snes.user_ctx)
-            return snes.updateJ!(A, P, snes, x, snes.user_ctx)
+        if Base.applicable(snes.updateJ!, A, P, actual_snes, x, snes.user_ctx)
+            return snes.updateJ!(A, P, actual_snes, x, snes.user_ctx)
         else
-            return snes.updateJ!(A, P, snes, x)
+            return snes.updateJ!(A, P, actual_snes, x)
         end
     end
 end
@@ -243,6 +247,10 @@ is garbage collected, but can be called explicitly to free resources immediately
 $(_doc_external("SNES/SNESDestroy"))
 """
 function destroy(snes::AbstractPetscSNES{PetscLib}) where {PetscLib}
+    if !isnothing(snes.opts)
+        destroy(snes.opts)
+        snes.opts = nothing
+    end
     if !(finalized(PetscLib)) && snes.ptr != C_NULL
         LibPETSc.SNESDestroy(PetscLib, snes)
     end
diff --git a/src/string_wrappers.jl b/src/string_wrappers.jl
index 9d6c3106..e911a9da 100644
--- a/src/string_wrappers.jl
+++ b/src/string_wrappers.jl
@@ -1,95 +1,75 @@
-# Convenience wrappers for PETSc SetType functions that accept Julia strings
-# instead of C string pointers
-#
-# These wrappers are defined in the parent PETSc module and delegate to
-# LibPETSc functions with automatic string-to-pointer conversion.
+# Convenience overloads for PETSc Set*Type functions.
+# Each accepts AbstractString and converts to the Ptr{Cchar} the C API expects.
+# GC.@preserve keeps the String alive across the ccall inside the LibPETSc wrapper.
 
 """
-    MatSetType(petsclib, mat, type::String)
+    MatSetType(petsclib, mat, type::AbstractString)
 
-Convenience wrapper for setting matrix type using a Julia string.
+Set the matrix type. Accepts any `AbstractString`.
 
-# Example
-```julia
-mat = LibPETSc.MatCreate(petsclib, LibPETSc.PETSC_COMM_SELF)
-LibPETSc.MatSetType(petsclib, mat, "seqaij")
-```
+# External Links
+$(_doc_external("Mat/MatSetType"))
 """
-function LibPETSc.MatSetType(petsclib::LibPETSc.PetscLibType, mat, type::String)
-    c_str = Vector{UInt8}(type * "\0")
-    ptr = Base.unsafe_convert(Ptr{Int8}, pointer(c_str))
-    LibPETSc.MatSetType(petsclib, mat, ptr)
+function LibPETSc.MatSetType(petsclib, mat, type::AbstractString)
+    s = String(type)
+    GC.@preserve s LibPETSc.MatSetType(petsclib, mat, Base.unsafe_convert(Ptr{Cchar}, s))
     return nothing
 end
 
 """
-    VecSetType(petsclib, vec, type::String)
+    VecSetType(petsclib, vec, type::AbstractString)
 
-Convenience wrapper for setting vector type using a Julia string.
+Set the vector type. Accepts any `AbstractString`.
 
-# Example
-```julia
-vec = LibPETSc.VecCreate(petsclib, LibPETSc.PETSC_COMM_SELF)
-LibPETSc.VecSetType(petsclib, vec, "seq")
-```
+# External Links
+$(_doc_external("Vec/VecSetType"))
 """
-function LibPETSc.VecSetType(petsclib::LibPETSc.PetscLibType, vec, type::String)
-    c_str = Vector{UInt8}(type * "\0")
-    ptr = Base.unsafe_convert(Ptr{Int8}, pointer(c_str))
-    LibPETSc.VecSetType(petsclib, vec, ptr)
+function LibPETSc.VecSetType(petsclib, vec, type::AbstractString)
+    s = String(type)
+    GC.@preserve s LibPETSc.VecSetType(petsclib, vec, Base.unsafe_convert(Ptr{Cchar}, s))
     return nothing
 end
 
 """
-    KSPSetType(petsclib, ksp, type::String)
+    KSPSetType(petsclib, ksp, type::AbstractString)
 
-Convenience wrapper for setting KSP solver type using a Julia string.
+Set the KSP solver type. Accepts any `AbstractString`.
 
-# Example
-```julia
-ksp = LibPETSc.KSPCreate(petsclib, LibPETSc.PETSC_COMM_SELF)
-LibPETSc.KSPSetType(petsclib, ksp, "gmres")
-```
+# External Links
+$(_doc_external("KSP/KSPSetType"))
 """
-function LibPETSc.KSPSetType(petsclib::LibPETSc.PetscLibType, ksp, type::String)
-    c_str = Vector{UInt8}(type * "\0")
-    ptr = Base.unsafe_convert(Ptr{Int8}, pointer(c_str))
-    LibPETSc.KSPSetType(petsclib, ksp, ptr)
+function LibPETSc.KSPSetType(petsclib, ksp, type::AbstractString)
+    s = String(type)
+    GC.@preserve s LibPETSc.KSPSetType(petsclib, ksp, Base.unsafe_convert(Ptr{Cchar}, s))
     return nothing
 end
 
 """
-    SNESSetType(petsclib, snes, type::String)
+    SNESSetType(petsclib, snes, type::AbstractString)
 
-Convenience wrapper for setting SNES solver type using a Julia string.
+Set the SNES nonlinear solver type. Accepts any `AbstractString`.
 
-# Example
-```julia
-snes = LibPETSc.SNESCreate(petsclib, LibPETSc.PETSC_COMM_SELF)
-LibPETSc.SNESSetType(petsclib, snes, "newtonls")
-```
+# External Links
+$(_doc_external("SNES/SNESSetType"))
 """
-function LibPETSc.SNESSetType(petsclib::LibPETSc.PetscLibType, snes, type::String)
-    c_str = Vector{UInt8}(type * "\0")
-    ptr = Base.unsafe_convert(Ptr{Int8}, pointer(c_str))
-    LibPETSc.SNESSetType(petsclib, snes, ptr)
+function LibPETSc.SNESSetType(petsclib, snes, type::AbstractString)
+    s = String(type)
+    GC.@preserve s LibPETSc.SNESSetType(petsclib, snes, Base.unsafe_convert(Ptr{Cchar}, s))
     return nothing
 end
 
 """
-    DMSetType(petsclib, dm, type::String)
+    DMSetType(petsclib, dm, type::AbstractString)
 
-Convenience wrapper for setting DM type using a Julia string.
+Set the DM type. Accepts any `AbstractString`.
 
-# Example
-```julia
-dm = LibPETSc.DMCreate(petsclib, LibPETSc.PETSC_COMM_SELF)
-LibPETSc.DMSetType(petsclib, dm, "da")
-```
+# External Links
+$(_doc_external("DM/DMSetType"))
 """
-function LibPETSc.DMSetType(petsclib::LibPETSc.PetscLibType, dm, type::String)
-    c_str = Vector{UInt8}(type * "\0")
-    ptr = Base.unsafe_convert(Ptr{Int8}, pointer(c_str))
-    LibPETSc.DMSetType(petsclib, dm, ptr)
+function LibPETSc.DMSetType(petsclib, dm, type::AbstractString)
+    s = String(type)
+    GC.@preserve s LibPETSc.DMSetType(petsclib, dm, Base.unsafe_convert(Ptr{Cchar}, s))
     return nothing
 end
+
+# DMSetVecType and DMSetMatType accept AbstractString directly (VecType/MatType = Cstring).
diff --git a/src/string_wrappers_extra.jl b/src/string_wrappers_extra.jl
index 2a28baef..ee6b855f 100644
--- a/src/string_wrappers_extra.jl
+++ b/src/string_wrappers_extra.jl
@@ -1,35 +1,27 @@
 """
-    TSSetType(petsclib, ts, type::String)
+    TSSetType(petsclib, ts, type::AbstractString)
 
-Convenience wrapper for setting TS (time-stepping) type using a Julia string.
+Set the TS time-stepping type. Accepts any `AbstractString`.
 
-# Example
-```julia
-ts = LibPETSc.TSCreate(petsclib, LibPETSc.PETSC_COMM_SELF)
-LibPETSc.TSSetType(petsclib, ts, "bdf")
-```
+# External Links
+$(_doc_external("TS/TSSetType"))
 """
-function LibPETSc.TSSetType(petsclib::LibPETSc.PetscLibType, ts, type::String)
-    c_str = Vector{UInt8}(type * "\0")
-    ptr = Base.unsafe_convert(Ptr{Int8}, pointer(c_str))
-    LibPETSc.TSSetType(petsclib, ts, ptr)
+function LibPETSc.TSSetType(petsclib, ts, type::AbstractString)
+    s = String(type)
+    GC.@preserve s LibPETSc.TSSetType(petsclib, ts, Base.unsafe_convert(Ptr{Cchar}, s))
     return nothing
 end
 
 """
-    TaoSetType(petsclib, tao, type::String)
+    TaoSetType(petsclib, tao, type::AbstractString)
 
-Convenience wrapper for setting Tao solver type using a Julia string.
+Set the Tao optimization solver type. Accepts any `AbstractString`.
 
-# Example
-```julia
-tao = LibPETSc.TaoCreate(petsclib)
-LibPETSc.TaoSetType(petsclib, tao, "lmvm")
-```
+# External Links
+$(_doc_external("Tao/TaoSetType"))
 """
-function LibPETSc.TaoSetType(petsclib::LibPETSc.PetscLibType, tao, type::String)
-    c_str = Vector{UInt8}(type * "\0")
-    ptr = Base.unsafe_convert(Ptr{Int8}, pointer(c_str))
-    LibPETSc.TaoSetType(petsclib, tao, ptr)
+function LibPETSc.TaoSetType(petsclib, tao, type::AbstractString)
+    s = String(type)
+    GC.@preserve s LibPETSc.TaoSetType(petsclib, tao, Base.unsafe_convert(Ptr{Cchar}, s))
     return nothing
 end
diff --git a/src/ts.jl b/src/ts.jl
index e972f021..24e8643e 100644
--- a/src/ts.jl
+++ b/src/ts.jl
@@ -221,11 +221,10 @@ string such as `"none"` or `"basic"`.
 function LibPETSc.TSAdaptSetType(
     petsclib::LibPETSc.PetscLibType,
     adapt::LibPETSc.TSAdapt,
-    type::String,
+    type::AbstractString,
 )
-    c_str = Vector{UInt8}(type * "\0")
-    ptr = Base.unsafe_convert(Ptr{Cchar}, pointer(c_str))
-    LibPETSc.TSAdaptSetType(petsclib, adapt, ptr)
+    s = String(type)
+    GC.@preserve s LibPETSc.TSAdaptSetType(petsclib, adapt, Base.unsafe_convert(Ptr{Cchar}, s))
     return nothing
 end
 
diff --git a/src/vec.jl b/src/vec.jl
index 89a9eab4..1d920fb2 100644
--- a/src/vec.jl
+++ b/src/vec.jl
@@ -9,15 +9,19 @@ function Base.show(io::IO, v::AbstractPetscVec{PetscLib}) where {PetscLib}
         print(io, "PETSc Vec (null pointer)")
         return
     end
-    
-    # Try to get vector info, but handle uninitialized vectors gracefully
+    # VecGetType internally calls VecInitializePackage which queries the PETSc
+    # options database.  Calling it before PETSc is initialised causes a C-level
+    # SIGSEGV that cannot be caught with try/catch.
+    if !initialized(PetscLib)
+        print(io, "PETSc Vec (PETSc not initialized)")
+        return
+    end
     try
         ty = LibPETSc.VecGetType(PetscLib, v)
         si = LibPETSc.VecGetSize(PetscLib, v)
         print(io, "PETSc $ty Vec; length=$si")
     catch
-        # Vector not fully initialized yet (type not set)
-        print(io, "PETSc Vec (not yet initialized)")
+        print(io, "PETSc Vec (type not set)")
     end
     return nothing
 end
@@ -258,6 +262,197 @@ function unsafe_localarray(
 end
 
 
+# ── Memory backend type hierarchy ─────────────────────────────────────────────
+#
+# `memtype_backend` maps a `PetscMemType` to a dispatch tag.
+# Host memory returns `nothing` (no KA backend — avoids confusion with
+# KernelAbstractions.CPU()).  GPU extensions return their own singleton
+# (e.g. `CUDAMemBackend`) by overloading `memtype_backend(::Val{MT})`.
+
+"""
+    AbstractPETScMemBackend
+
+Abstract supertype for GPU memory backends used by PETSc extensions.
+Host memory is represented by `nothing`, not a subtype of this.
+GPU extensions define their own concrete subtype (e.g. `CUDAMemBackend`).
+"""
+abstract type AbstractPETScMemBackend end
+
+"""
+    memtype_backend(mtype::PetscMemType) → Nothing | AbstractPETScMemBackend
+
+Convert a `PetscMemType` to a dispatch tag.  Returns `nothing` for host memory;
+GPU extensions return their own singleton for device memory.
+"""
+memtype_backend(::Val{LibPETSc.PETSC_MEMTYPE_HOST}) = nothing
+memtype_backend(::Val{MT}) where {MT} =
+    error("No GPU backend loaded for PetscMemType $MT — load CUDA.jl, AMDGPU.jl, …")
+memtype_backend(mt::LibPETSc.PetscMemType) = memtype_backend(Val(mt))
+
+# ── Device-aware local array access ───────────────────────────────────────────
+#
+# `_unsafe_localarray` is the unified entry point: it calls
+# `VecGetArray*AndMemType`, converts the returned `PetscMemType` to a backend
+# singleton via `memtype_backend`, and dispatches to `wrap_localarray`.
+# GPU extensions add `wrap_localarray` methods for their own backend types.
+#
+# The typed overload `_unsafe_localarray(::Type{A}, vec; ...)` additionally
+# asserts that the returned array is of type `A`, giving a clear error when a
+# Vec is on an unexpected device.
+
+function _unsafe_localarray(
+    vec::AbstractPetscVec{PetscLib};
+    read::Bool = true,
+    write::Bool = true,
+) where {PetscLib}
+    pv = as_petsc_vec(vec)
+    if write && read
+        cpu_arr, mtype = LibPETSc.VecGetArrayAndMemType(PetscLib, pv)
+    elseif write
+        cpu_arr, mtype = LibPETSc.VecGetArrayWriteAndMemType(PetscLib, pv)
+    else
+        cpu_arr, mtype = LibPETSc.VecGetArrayReadAndMemType(PetscLib, pv)
+    end
+    return wrap_localarray(cpu_arr, memtype_backend(mtype), vec; read, write)
+end
+
+function _unsafe_localarray(
+    ::Type{A},
+    vec::AbstractPetscVec;
+    read::Bool = true,
+    write::Bool = true,
+) where {A <: AbstractArray}
+    arr = _unsafe_localarray(vec; read, write)
+    arr isa A && return arr
+    Base.finalize(arr)   # release the PETSc handle before throwing
+    throw(ArgumentError(
+        "expected array of type $A but Vec returned $(typeof(arr)). " *
+        "Check that the Vec lives on the expected device."
+    ))
+end
+
+function wrap_localarray(
+    cpu_arr, ::Nothing, vec::AbstractPetscVec{PetscLib};
+    read::Bool, write::Bool,
+) where {PetscLib}
+    finalizer(cpu_arr) do a
+        if write && read
+            LibPETSc.VecRestoreArrayAndMemType(PetscLib, vec, a)
+        elseif write
+            LibPETSc.VecRestoreArrayWriteAndMemType(PetscLib, vec, a)
+        else
+            LibPETSc.VecRestoreArrayReadAndMemType(PetscLib, vec, a)
+        end
+        return nothing
+    end
+    return cpu_arr
+end
+
+# Fallback: no backend loaded for this PetscMemType.
+function wrap_localarray(cpu_arr, b::AbstractPETScMemBackend, vec; kw...)
+    error("wrap_localarray not implemented for backend $(typeof(b)) — " *
+          "load the corresponding GPU package (e.g. CUDA.jl)")
+end
+
+# ── No-finalizer acquire/release ─────────────────────────────────────────────
+#
+# `withlocalarray!` uses these instead of the finalizer-based `unsafe_localarray`
+# to avoid a documented Julia pitfall: after `Base.finalize(x)` is called, if
+# `x` later becomes unreachable GC may invoke the finalizer *again*, leading to
+# a double VecRestore call on an already-freed Vec (→ SIGSEGV).
+# `try/finally` provides deterministic, single-execution cleanup.
+
+"""
+    acquire_petsc_local_array(vec; read, write) -> (arr, cpu_arr, backend)
+
+Get the local array from `vec` via `VecGetArray*AndMemType` without
+registering a Julia finalizer.  Returns the user-visible array, the raw PETSc
+cpu_arr needed for restore, and the backend singleton.
+Extensions overload `make_local_array(cpu_arr, backend)` to wrap the raw
+array for their device (e.g. `CUDAMemBackend` → `CuArray`).
+"""
+function acquire_petsc_local_array(
+    vec::AbstractPetscVec{PLib}; read::Bool, write::Bool,
+) where {PLib}
+    pv = as_petsc_vec(vec)
+    cpu_arr, mtype = if write && read
+        LibPETSc.VecGetArrayAndMemType(PLib, pv)
+    elseif write
+        LibPETSc.VecGetArrayWriteAndMemType(PLib, pv)
+    else
+        LibPETSc.VecGetArrayReadAndMemType(PLib, pv)
+    end
+    backend = memtype_backend(mtype)
+    arr = make_local_array(cpu_arr, backend)
+    return arr, cpu_arr, backend
+end
+
+# CPU: the raw PETSc array is already a Vector — return it directly.
+make_local_array(cpu_arr, ::Nothing) = cpu_arr
+make_local_array(_, b::AbstractPETScMemBackend) =
+    error("make_local_array not implemented for backend $(typeof(b)) — " *
+          "load the corresponding GPU package (e.g. CUDA.jl)")
+
+"""
+    release_petsc_local_array(cpu_arr, backend, vec; read, write)
+
+Restore a previously acquired local array.  Called in `finally` blocks by
+`withlocalarray!`.  Extensions overload this for GPU backends.
+"""
+function release_petsc_local_array(
+    cpu_arr, ::Nothing, vec::AbstractPetscVec{PLib}; read::Bool, write::Bool,
+) where {PLib}
+    pv = as_petsc_vec(vec)
+    if write && read
+        LibPETSc.VecRestoreArrayAndMemType(PLib, pv, cpu_arr)
+    elseif write
+        LibPETSc.VecRestoreArrayWriteAndMemType(PLib, pv, cpu_arr)
+    else
+        LibPETSc.VecRestoreArrayReadAndMemType(PLib, pv, cpu_arr)
+    end
+    return nothing
+end
+release_petsc_local_array(cpu_arr, b::AbstractPETScMemBackend, vec; kw...) =
+    error("release_petsc_local_array not implemented for backend $(typeof(b)) — " *
+          "load the corresponding GPU package (e.g. CUDA.jl)")
+
+# The auto-generated *AndMemType wrappers are typed `x::PetscVec`, but
+# `AbstractPetscVec` also includes `VecPtr`.  Convert transparently.
+as_petsc_vec(v::LibPETSc.PetscVec) = v
+as_petsc_vec(v::AbstractPetscVec{PetscLib}) where {PetscLib} =
+    LibPETSc.PetscVec{PetscLib}(v.ptr)
+
+"""
+
+Query the `PetscMemType` of each Vec and return the corresponding array type.
+Errors if the Vecs are on heterogeneous devices (different `PetscMemType`
+values), since a single `withlocalarray!` call cannot handle mixed backends.
+Returns `Vector` when all Vecs are host-resident.
+
+Extensions overload `array_type(::Val{MT})` for a `PetscMemType` enum value
+`MT` to register the corresponding array type (e.g. `PETSC_MEMTYPE_DEVICE` →
+`CuArray`).
+"""
+function determine_memtype(vecs::AbstractPetscVec...)
+    mtypes = map(vecs) do v
+        PetscLib = typeof(v).parameters[1]
+        pv = as_petsc_vec(v)
+        arr, mtype = LibPETSc.VecGetArrayReadAndMemType(PetscLib, pv)
+        LibPETSc.VecRestoreArrayReadAndMemType(PetscLib, pv, arr)
+        mtype
+    end
+    allequal(mtypes) || throw(ArgumentError(
+        "Vecs are on heterogeneous devices: $(unique(mtypes)). " *
+        "Use withlocalarray!(f!, ::Type{A}, ...) to handle each backend explicitly."
+    ))
+    return array_type(Val(first(mtypes)))
+end
+
+array_type(::Val{LibPETSc.PETSC_MEMTYPE_HOST}) = Vector
+array_type(::Val{MT}) where {MT} =
+    error("No array type registered for PetscMemType $MT — load the corresponding GPU package (e.g. CUDA.jl)")
+# GPU extensions add: array_type(::Val{LibPETSc.PETSC_MEMTYPE_DEVICE}) = CuArray
+
 """
     withlocalarray!(
         f!,
@@ -265,50 +460,64 @@ end
         read::Union{Bool, NTuple{N, Bool}} = true,
         write::Union{Bool, NTuple{N, Bool}} = true,
     )
+    withlocalarray!(::Type{A}, f!, vecs...; read, write) where {A <: AbstractArray}
 
-Convert `x` to an `Array{PetscScalar}` using [`unsafe_localarray`](@ref) and
-apply the function `f!`.
+Apply `f!` to local array views of `vecs`.
 
-Use `read=false` if the array is write-only; `write=false` if read-only.
+The optional `::Type{A}` second argument (after the do-block function) asserts
+that every array returned from `VecGetArray*AndMemType` is of type `A`.  Use
+it with do-block syntax:
 
-# Examples
-```julia-repl
-julia> withlocalarray!(x; write=true) do x
-   @. x .*= 2
-end
-
-julia> withlocalarray!(
-           x,
-           y;
-           read = (false, true),
-           write = (true, false)
-       ) do x, y
-   @. x .= 2 .+ y
+```julia
+withlocalarray!(Vector, petsc_x; write=true) do x
+    x .= 1
 end
 ```
 
-!!! note
-    `Base.finalize` is automatically called on the array.
 """
 function withlocalarray!(
     f!,
     vecs::NTuple{N, AbstractPetscVec};
+    kwargs...,
+) where {N}
+    A = determine_memtype(vecs...)
+    return withlocalarray!(f!, A, vecs; kwargs...)
+end
+withlocalarray!(f!, vecs...; kwargs...) = withlocalarray!(f!, vecs; kwargs...)
+
+function withlocalarray!(
+    f!,
+    ::Type{A},
+    vecs::NTuple{N, AbstractPetscVec};
     read::Union{Bool, NTuple{N, Bool}} = true,
     write::Union{Bool, NTuple{N, Bool}} = true,
-) where {N}
+) where {A <: AbstractArray, N}
     read isa NTuple{N, Bool} || (read = ntuple(_ -> read, N))
     write isa NTuple{N, Bool} || (write = ntuple(_ -> write, N))
-
-    arrays = map(vecs, read, write) do v, r, w
-        unsafe_localarray(v; read = r, write = w)
+    # Acquire all arrays first (no finalizers), then use try/finally for release.
+    # This avoids the Julia pitfall where Base.finalize + GC can both run the
+    # finalizer if the object becomes unreachable again (double-restore → crash).
+    acquired = map(vecs, read, write) do v, r, w
+        acquire_petsc_local_array(v; read=r, write=w)
     end
-    val = f!(arrays...)
-    map(arrays) do array
-        Base.finalize(array)
+    try
+        # Type check inside try so finally still releases on mismatch.
+        arrays = map(acquired) do (arr, cpu_arr, backend)
+            arr isa A || throw(ArgumentError(
+                "expected array of type $A but Vec returned $(typeof(arr)). " *
+                "Check that the Vec lives on the expected device."
+            ))
+            arr
+        end
+        return f!(arrays...)
+    finally
+        foreach(vecs, acquired, read, write) do v, (_, cpu_arr, backend), r, w
+            release_petsc_local_array(cpu_arr, backend, v; read=r, write=w)
+        end
     end
-    return val
 end
-withlocalarray!(f!, vecs...; kwargs...) = withlocalarray!(f!, vecs; kwargs...)
+withlocalarray!(f!, ::Type{A}, vecs...; kwargs...) where {A <: AbstractArray} =
+    withlocalarray!(f!, A, vecs; kwargs...)
 
 
 """
@@ -429,3 +638,74 @@ function LinearAlgebra.norm(
     r_val = LibPETSc.VecNorm(PetscLib, v, normtype)
     return r_val
 end
+
+# ── GPU-aware array access helpers ────────────────────────────────────────────
+#
+# `get_petsc_arrays` calls `VecGetArrayAndMemType` on both Vecs, converts the
+# returned `PetscMemType` values to backend singletons, and dispatches to
+# `get_petsc_arrays_impl`.  The base package handles the pure-CPU case
+# (host × host (both backends nothing)).  GPU extensions add `get_petsc_arrays_impl`
+# methods for their backend combinations and a matching
+# `restore_petsc_arrays_impl` method dispatched by `restore_petsc_arrays`.
+#
+# Return tuple:  (fx, lx, fx_arr, lx_arr, fx_bounce)
+#   CPU:  fx, lx are plain Arrays with VecRestore finalizers;
+#         fx_arr = lx_arr = fx_bounce = nothing
+#   GPU:  fx, lx are device arrays; fx_arr, lx_arr are raw PETSc arrays
+#         (needed for restore); fx_bounce is a scratch device array or nothing.
+
+"""
+    get_petsc_arrays(petsclib, g_fx, l_x) -> (fx, lx, fx_arr, lx_arr, fx_bounce)
+
+Return arrays for `g_fx` (read-write) and `l_x` (read-only) suitable for
+passing to a compute kernel.  Dispatches on the memory location of each Vec
+via `memtype_backend`.
+
+On the pure-CPU path (`host × host (both backends nothing)`) `fx`/`lx` are plain
+`Array`s and `fx_arr = lx_arr = fx_bounce = nothing`.  When a GPU backend
+extension is loaded and a Vec lives on the device the returned `fx`/`lx` are
+device arrays.  An optional bounce buffer `fx_bounce` is allocated when `g_fx`
+is host-resident; its contents must be written back by `restore_petsc_arrays`
+after the kernel completes.
+
+See also: [`restore_petsc_arrays`](@ref)
+"""
+function get_petsc_arrays(petsclib, g_fx, l_x)
+    T = petsclib.PetscScalar
+    fx_arr, fx_mtype = LibPETSc.VecGetArrayAndMemType(petsclib, as_petsc_vec(g_fx))
+    lx_arr, lx_mtype = LibPETSc.VecGetArrayReadAndMemType(petsclib, as_petsc_vec(l_x))
+    return get_petsc_arrays_impl(
+        petsclib, g_fx, l_x, T, fx_arr, lx_arr,
+        memtype_backend(fx_mtype), memtype_backend(lx_mtype),
+    )
+end
+
+# CPU base case: return arrays directly. restore_petsc_arrays calls VecRestore
+# explicitly — no finalizers to avoid the double-finalization crash.
+function get_petsc_arrays_impl(
+    petsclib, g_fx, l_x, ::Type, fx_arr, lx_arr, ::Nothing, ::Nothing,
+)
+    return fx_arr, lx_arr, nothing, nothing, nothing
+end
+
+"""
+    restore_petsc_arrays(petsclib, g_fx, l_x, fx, lx, fx_arr, lx_arr, fx_bounce)
+
+Restore PETSc Vecs after a kernel launched via [`get_petsc_arrays`](@ref).
+
+Dispatches to `restore_petsc_arrays_impl`.  On the CPU path (`fx_arr`,
+`lx_arr`, `fx_bounce` all `nothing`) this simply finalizes `fx` and `lx`,
+triggering the registered `VecRestoreArray*AndMemType` finalizers.  GPU backend
+extensions add a `restore_petsc_arrays_impl` method for their array types.
+"""
+function restore_petsc_arrays(petsclib, g_fx, l_x, fx, lx, fx_arr, lx_arr, fx_bounce)
+    restore_petsc_arrays_impl(petsclib, g_fx, l_x, fx, lx, fx_arr, lx_arr, fx_bounce)
+end
+
+# CPU base case: call VecRestore directly (no finalizers).
+function restore_petsc_arrays_impl(
+    petsclib, g_fx, l_x, fx, lx, ::Nothing, ::Nothing, ::Nothing,
+)
+    LibPETSc.VecRestoreArrayAndMemType(petsclib, as_petsc_vec(g_fx), fx)
+    LibPETSc.VecRestoreArrayReadAndMemType(petsclib, as_petsc_vec(l_x), lx)
+end
diff --git a/test/runtests.jl b/test/runtests.jl
index 19f69e55..2c1d3a90 100644
--- a/test/runtests.jl
+++ b/test/runtests.jl
@@ -1,12 +1,22 @@
 using Test
 using MPI: MPI, mpiexec
-using PETSc, PETSc_jll, Pkg
+using PETSc, Pkg
 
 # Make sure that all dependencies are installed also on a clean system
 Pkg.instantiate()
 
-import MPIPreferences
-@info "Testing PETSc.jl with" MPIPreferences.binary MPIPreferences.abi PETSc_jll.host_platform
+# When set_library! has been used, petsc_library is a path string and PETSc_jll
+# is not loaded.  Only import JLL-specific symbols when using the default binaries.
+const USING_CUSTOM_LIB = PETSc.petsclibs[1].petsc_library isa AbstractString
+
+import MPIPreferences  # always import to ensure local MPI API is configured
+
+if USING_CUSTOM_LIB
+    @info "Testing PETSc.jl with custom library" path=PETSc.petsclibs[1].petsc_library MPIPreferences.binary MPIPreferences.abi
+else
+    using PETSc_jll
+    @info "Testing PETSc.jl with" MPIPreferences.binary MPIPreferences.abi PETSc_jll.host_platform
+end
 
 # Do the MPI tests first so we do not have mpi running inside MPI
 mpi_tests = ("mpivec.jl", "mpimat.jl", "ksp.jl", "dmstag.jl")
diff --git a/test/vec.jl b/test/vec.jl
index b5f78a56..a5a9add2 100644
--- a/test/vec.jl
+++ b/test/vec.jl
@@ -292,6 +292,73 @@ end
         PETSc.destroy(petsc_x)
         PETSc.destroy(petsc_y)
 
+        PETSc.finalize(petsclib)
+    end
+end
+
+@testset "withlocalarray! typed (Array)" begin
+    for petsclib in PETSc.petsclibs
+        PETSc.initialize(petsclib)
+        PetscScalar = petsclib.PetscScalar
+        PetscInt    = petsclib.PetscInt
+        N           = PetscInt(10)
+        test_comm = Sys.iswindows() ? LibPETSc.PETSC_COMM_SELF : MPI.COMM_SELF
+        petsc_x = LibPETSc.VecCreateSeq(petsclib, test_comm, N)
+        petsc_y = LibPETSc.VecCreateSeq(petsclib, test_comm, N)
+
+        # determine_memtype returns Vector for CPU vecs
+        @test PETSc.determine_memtype(petsc_x) === Vector
+        @test PETSc.determine_memtype(petsc_x, petsc_y) === Vector
+
+        # typed single vec — write
+        PETSc.withlocalarray!(Vector, petsc_x; read = false, write = true) do x
+            @test x isa Vector
+            for i in eachindex(x)
+                x[i] = PetscScalar(i)
+            end
+        end
+        @test petsc_x[1:N] == PetscScalar.(1:N)
+
+        # typed two vecs as NTuple
+        PETSc.withlocalarray!(
+            Vector,
+            (petsc_x, petsc_y);
+            read = (false, false), write = (true, true),
+        ) do x, y
+            @test x isa Vector
+            @test y isa Vector
+            for i in eachindex(x)
+                x[i] = PetscScalar(i)
+                y[i] = PetscScalar(2i)
+            end
+        end
+        @test petsc_x[1:N] == PetscScalar.(1:N)
+        @test petsc_y[1:N] == PetscScalar.(2:2:2N)
+
+        # typed two vecs as splat
+        PETSc.withlocalarray!(
+            Vector, petsc_x, petsc_y;
+            read = (false, false), write = (true, true),
+        ) do x, y
+            @test x isa Vector
+            @test y isa Vector
+            for i in eachindex(x)
+                x[i] = PetscScalar(2i)
+                y[i] = PetscScalar(3i)
+            end
+        end
+        @test petsc_x[1:N] == PetscScalar.(2:2:2N)
+        @test petsc_y[1:N] == PetscScalar.(3:3:3N)
+
+        # wrong type raises ArgumentError (Matrix is not Vector)
+        @test_throws ArgumentError PETSc.withlocalarray!(
+            Matrix, petsc_x; read = true, write = false,
+        ) do x
+            nothing
+        end
+
+        PETSc.destroy(petsc_x)
+        PETSc.destroy(petsc_y)
         PETSc.finalize(petsclib)
     end
 end
\ No newline at end of file