diff --git a/docs/make.jl b/docs/make.jl
index ce7588be..17f7c94f 100644
--- a/docs/make.jl
+++ b/docs/make.jl
@@ -12,6 +12,7 @@ makedocs(;
     ),
     pages=[
         "Home" => "index.md",
+        "Installation" => "man/installation.md",
         "Getting Started" => "man/getting_started.md",
         "High-level interface" => Any[
             "Vec" =>  "man/vec.md",
@@ -47,6 +48,7 @@ makedocs(;
             "AO (Application Ordering)" =>  "man/ao_lowlevel.md",
         ],
         "Utilities" => "man/utilities.md",
+        "Running on HPC Systems" => "man/hpc.md",
         "FAQ"  => "man/FAQ.md",
         "Contributing"  => "man/contributing.md",
         "Funding" => "man/funding.md",
diff --git a/docs/src/assets/img/weak_scaling_ex45_LUMI.png b/docs/src/assets/img/weak_scaling_ex45_LUMI.png
new file mode 100644
index 00000000..48d6b21e
Binary files /dev/null and b/docs/src/assets/img/weak_scaling_ex45_LUMI.png differ
diff --git a/docs/src/man/getting_started.md b/docs/src/man/getting_started.md
index bb0c799a..6e42a01e 100644
--- a/docs/src/man/getting_started.md
+++ b/docs/src/man/getting_started.md
@@ -1,55 +1,10 @@
 # Getting started
 
+- [1. Solving a linear system of equations](#1-solving-a-linear-system-of-equations)
+- [2. Nonlinear example](#2-nonlinear-example)
+- [3. Next steps](#3-next-steps)
 
-- [Getting started](#getting-started)
-    - [1a. Installation using pre-built libraries](#1a-installation-using-pre-built-libraries)
-    - [1b. Installation using a custom PETSc build](#1b-installation-using-a-custom-petsc-build)
-    - [2. Solving a linear system of equations](#2-solving-a-linear-system-of-equations)
-    - [3. Nonlinear example](#3-nonlinear-example)
-    - [4. Next steps](#4-next-steps)
-
-### 1a. Installation using pre-built libraries 
-The easiest way to install the package is: 
-```julia
-julia> ]
-(@v1.12) pkg> add PETSc
-```
-which will install a pre-built PETSc library (`PETSc_jll`) as well as `MPI.jl` on your system. This will work both in serial and in parallel on your machine.
-
-!!! warning "Windows Users"
-    The prebuild binaries currently do not work on Windows as we had to build `PETSc_jll` without MPI due to compatibility issues with `MicrosoftMPI_jll`.
-    
-    **Windows users are therefore advised to install the [Windows Subsystem for Linux](https://en.wikipedia.org/wiki/Windows_Subsystem_for_Linux) (WSL) and run PETSc.jl from within WSL.** This will provide full functionality with both serial and parallel (MPI) support.
-
-### 1b. Installation using a custom PETSc build
-Sometimes, you may be interested in a PETSc installation that comes with additional external packages, or that you compiled yourself. Ensure the library is compiled as a **dynamic** (not static) library.
-
-Use `set_library!` to configure the path once — it is stored in `LocalPreferences.toml` and no environment variables are needed afterwards:
-
-```julia
-using PETSc
-PETSc.set_library!(
-    "/path/to/custom/libpetsc.so";
-    PetscScalar = Float64,
-    PetscInt    = Int64,
-)
-# Restart Julia — PETSc_jll is not loaded and your library is used automatically.
-```
-
-To revert to the bundled binaries: `PETSc.unset_library!()`.
-
-For a one-off session without changing persistent settings, use `set_petsclib` directly:
-
-```julia
-petsclib = PETSc.set_petsclib("/path/to/custom/libpetsc.so";
-                              PetscScalar=Float64, PetscInt=Int64)
-PETSc.initialize(petsclib, log_view=true)
-# ... your code ...
-PETSc.finalize(petsclib)
-```
-
-
-### 2. Solving a linear system of equations
+### 1. Solving a linear system of equations
 
 Lets consider the following elliptic equation:
 ```math
@@ -202,7 +157,7 @@ This is important information that helps PETSc to distribute the problem over se
 PETSc uses the `DM` infrastructure for such cases. `DMDA` is for (collocated) finite differences, `DMStag` for staggered finite differences, `DMPlex` for finite element/finite volume discretisations and `DMForest` for adaptive mesh refinement problems.
 If possible, use this infrastructure as it simplifies your life. Have a look at the [examples](https://github.com/JuliaParallel/PETSc.jl/tree/main/examples) or [tests](https://github.com/JuliaParallel/PETSc.jl/tree/main/test) of `PETSc.jl`.
 
-### 3. Nonlinear example
+### 2. Nonlinear example
 Let's solve the coupled system of nonlinear equations: 
 ```math
 \begin{aligned}
@@ -285,8 +240,8 @@ julia> PETSc.finalize(petsclib)
 ```
 
 
-### 4. Next steps 
-Now that you have the basics, ypu can start playing with some more complete examples.
+### 3. Next steps 
+Now that you have the basics, you can start playing with some more complete examples.
 Here some suggestions:
 1. [laplacian.jl](https://github.com/JuliaParallel/PETSc.jl/blob/main/examples/laplacian.jl) - simple (non-parallel) laplacian example using Julia sparse matrixes
 2. [dmda_laplacian.jl](https://github.com/JuliaParallel/PETSc.jl/blob/main/examples/dmda_laplacian.jl) - 2D laplacian example with Dirichlet boundary conditions using the `DMDA` framework. Examples are given on how to run it with various (multigrid) solvers.
@@ -300,4 +255,4 @@ Working through those examples should give you a fair idea of how to use PETSc.
 
 If you have further questions, please have a look at the [test](https://github.com/JuliaParallel/PETSc.jl/tree/main/test) directory; this is run automatically every time we make a new commit, and we do our best to keep it working.
 
-Furthermore, the left menu gives additional instructions on how to use the low-level interface.
\ No newline at end of file
+Furthermore, the left menu gives additional instructions on how to use the low-level interface.
diff --git a/docs/src/man/hpc.md b/docs/src/man/hpc.md
new file mode 100644
index 00000000..bd860e87
--- /dev/null
+++ b/docs/src/man/hpc.md
@@ -0,0 +1,319 @@
+# Running on HPC Systems
+
+PETSc.jl can be used on HPC clusters in two main configurations: using precompiled binaries via MPITrampoline, or by pointing to a locally installed PETSc build.
+
+## 1. Use precompiled binaries via MPITrampoline
+
+The main reason that it is challenging to run applications on HPC systems is that MPI is implemented in a different way by different vendors. This will change in the future as there is now the MPI ABI (application Binary Interface) and our precompiled PETSc binaries are already compatible with that.
+
+Yet, until all MPI implementations fully support this, we recommend using [MPITrampoline](https://github.com/eschnett/MPITrampoline) instead, which is a MPI wrapper layer that lets MPI-linked binaries be redirected to any system MPI at runtime. The `PETSc_jll` binaries distributed with PETSc.jl are built against MPITrampoline, which means they can be used on clusters by simply configuring `MPI.jl` to use the system MPI.
+Doing this requires you to compile a small code on the HPC system that is linked versus the local MPI implementation.
+
+Here step-by-step instructions (for Linux, as that is what essentially all HPC systems use):
+
+#### 1.1 Install MPIwrapper 
+
+* Download [MPIwrapper](https://github.com/eschnett/MPIwrapper): 
+```bash
+git clone https://github.com/eschnett/MPIwrapper.git 
+cd MPIwrapper
+```
+
+* Install it after making sure that `mpiexec` points to the one you want (you may have to load some modules, depending on your system):
+```bash
+cmake -S . -B build -DMPIEXEC_EXECUTABLE=/full/path/to/mpiexec -DCMAKE_BUILD_TYPE=RelWithDebInfo -DCMAKE_INSTALL_PREFIX=$HOME/mpiwrapper
+cmake --build build
+cmake --install build
+```
+> [!IMPORTANT]  
+> You need to specify the full path to `mpiexec` (or equivalent, such as `srun` or `mpirun`, depending oin your system) and not just the name. If you don't know that, you can determine this with
+> `which mpiexec`
+ 
+At this stage, `MPIwrapper` is installed in `$HOME/mpiwrapper`
+
+#### 1.2 Set the correct wrappers
+Next, you need to specify these environmental variables:
+```
+export MPITRAMPOLINE_LIB=$HOME/mpiwrapper/lib64/libmpiwrapper.so
+export MPITRAMPOLINE_MPIEXEC=$HOME/MPIwrapper/mpiwrapper/bin/mpiwrapperexec 
+```
+Depending on the system it may be called `lib` instead of `lib64` (check!).
+
+
+#### 1.3 Install the `MPI` and `MPIPreferences` packages:
+Install packages the usual way:
+```julia
+julia
+julia> ]
+pkg>add MPI, MPIPreferences
+```
+
+Set the preference to use `MPItrampoline`:
+```julia
+julia> using MPIPreferences; MPIPreferences.use_jll_binary("MPItrampoline_jll")
+┌ Info: MPIPreferences unchanged
+└   binary = "MPItrampoline_jll"
+```
+
+Load `MPI` and verify it is the correct one:
+```julia
+julia> using MPI
+julia> MPI.Get_library_version()
+"MPIwrapper 2.10.3, using MPIABI 2.9.0, wrapping:\nOpen MPI v4.1.4, package: Open MPI boris@Pluton Distribution, ident: 4.1.4, repo rev: v4.1.4, May 26, 2022"
+```
+After this, restart julia (this only needs to be done once, next time all is fine).
+
+#### 1.4 Test `MPI`:
+
+If you want you can run a test case with:
+```julia
+julia> using MPI
+julia> mpiexec(cmd -> run(`$cmd -n 3 echo hello world`));
+hello world
+hello world
+hello world
+```
+
+#### 1.5 Install and use `PETSc.jl`:
+Now install `PETSc.jl`:
+```julia
+julia> using MPI,PETSc
+```
+At this stage you can use PETSc.jl as normal — no changes to your script needed:
+```julia
+using PETSc, MPI
+petsclib = PETSc.getlib(; PetscScalar = Float64, PetscInt = Int64)
+PETSc.initialize(petsclib)
+# ...
+```
+
+1. Launch via the cluster's MPI:
+```bash
+mpiexec -n 128 julia --project myScript.jl
+```
+This is the easiest path for most clusters and requires no custom PETSc compilation.
+
+## 2. Use a locally installed PETSc build
+
+If you need a PETSc build with specific options (external packages, GPU support, custom BLAS, etc.), you can point `PETSc.jl` directly to your local installation.
+The local PETSc must be compiled as a **shared library** (`--with-shared-libraries=1`) and linked against the **same MPI** that `MPI.jl` is configured to use (i.e. the one that you should use on your HPC machine).
+
+#### 2.1 Link `PETSc` to the local library  
+
+Use `set_library!` to configure the path once — it is stored in `LocalPreferences.toml` and no environment variables are needed afterwards:
+
+```julia
+using PETSc
+PETSc.set_library!(
+    "/path/to/custom/libpetsc.so";
+    PetscScalar = Float64,
+    PetscInt    = Int64,
+)
+# Restart Julia — PETSc_jll is not loaded and your library is used automatically.
+```
+
+#### 2.2 Link `MPI.jl` to the local mpi
+You *must* use the same MPI implementation as the one versus which you compiled PETSc (otherwise you'll get heaps of problems).
+
+Check with:
+```julia
+julia> using MPI
+
+julia> MPI.Get_library_version()
+```
+
+
+
+### 3. Typical HPC job script
+
+A typical slurm submissions script to run PETSc code using `MPITrampoline` binaries can look like:
+
+```bash
+#!/bin/bash
+#SBATCH --nodes=8
+#SBATCH --ntasks-per-node=128
+
+
+export MPITRAMPOLINE_LIB=/users/kausbori/mpiwrapper/lib64/libmpiwrapper.so
+export JULIA_CPU_TARGET="generic"
+
+srun julia --project ex45.jl \
+  -N 257 \
+  -ksp_type cg \
+  -pc_type mg \
+  -pc_mg_levels 5 \
+  -mg_levels_ksp_type chebyshev \
+  -mg_levels_pc_type sor \
+  -ksp_rtol 1e-8 \
+  -ksp_view \
+  -pc_mg_log \
+  -log_view
+```
+
+#### Precompile code on compute nodes
+On some machines, it can be useful to precompile `PETSc.jl` on a single core rather than having many processors trying to do the same thing simultaneously.
+
+This is an example on how this can be done by getting access on an interactive node using slurm. Note that julia was installed in the home directory using `juliaup`.
+```bash
+salloc --nodes=1 --ntasks=1 --time=00:30:00 \
+    --partition=standard --account=your_project_number
+
+# Once on the compute node:
+module --force purge
+module load CrayEnv
+module load craype
+module load gcc-native/13.2
+module load cray-mpich/8.1.32
+
+export HOME=/users/kausbori
+export JULIA_DEPOT_PATH=/users/kausbori/.julia
+export MPITRAMPOLINE_LIB=/users/kausbori/mpiwrapper/lib64/libmpiwrapper.so
+export JULIA_CPU_TARGET="generic"
+
+JULIA=/users/kausbori/.julia/juliaup/julia-1.12.6+0.x64.linux.gnu/bin/julia
+
+# Clear stale cache
+rm -rf /users/kausbori/.julia/compiled/v1.12/PETSc/
+rm -rf /users/kausbori/.julia/compiled/v1.12/PETSc_jll/
+rm -rf /users/kausbori/.julia/compiled/v1.12/MPI/
+
+# Precompile
+$JULIA --project=/users/kausbori/PETSc_jl_scalability \
+    -e 'using PETSc; println("OK")'
+```
+
+---
+
+## Performance and Weak Scalability
+
+This section summarises weak scalability results for a 3D Laplacian benchmark ([`ex45.jl`](https://github.com/JuliaParallel/PETSc.jl/blob/main/examples/ex45.jl)) using a CG solver with geometric multigrid preconditioning (`-pc_type mg`). The problem size is scaled proportionally with the number of MPI ranks so that the work per rank stays constant.
+
+We have performed these simulations on LUMI-C (Finnland) and provide job submission scripts (`submit_scaling.sh`, `job.sh`), along with a parsing file that collects and summarizes the results (`parse_scaling.jl`). 
+All files are uploaded under [PETSc.jl/examples/scalability_tests](PETSc.jl/examples/scalability_tests), and can be started with
+```bash
+$ ./submit_scaling.sh 512 1025 1025 1025 6 16
+```
+which will start a $1025 \times 1025 \times 1015$ simulation on 512 cores, with 6 multigrid levels and the coarse grid solver being solved on 16 cores.
+
+We consider 3 cases:
+- Using `PETSc.jl` with MPITrampoline linked to the local MPI ("PETSc.jl"). This is generally the easiest setup
+- Using `PETSc.jl` linked to our locally build PETSc version ("local lib")
+- Using a C-compiled version of `ex45_julia.c`  ("native")
+
+#### Results
+Here the collected results (all simulations, except the largest ones, where done 3 times; I only show the fastest ones here). 
+
+| JobID | Ntasks | MG | NC | Grid | Backend | SolveTime (s) | KSPSolve (s) | TotGFlops | GFlops/s | KSP | L2 error | Max error | Residual | Converged |
+|-------|--------|----|----|------|---------|--------------|-------------|-----------|----------|-----|----------|-----------|----------|-----------|
+| 17684772 | 64 | 5 | 16 | 513³ | PETSc.jl | 50.800 | 27.108 | 335.60 | 6.00 | 9 | 4.4372e-06 | 1.2557e-05 | 3.5610e-05 | ✅ |
+| 17769825 | 64 | 5 | 16 | 513³ | local lib | 51.032 | 27.006 | 335.60 | 6.00 | 9 | 4.4372e-06 | 1.2557e-05 | 3.5610e-05 | ✅ |
+| 17767475 | 64 | 5 | 16 | 513³ | native | 38.770 | 26.917 | 337.90 | 8.50 | 9 | 4.4372e-06 | 1.2557e-05 | 3.5610e-05 | ✅ |
+| 17724458 | 512 | 6 | 16 | 1025³ | PETSc.jl | 61.715 | 32.482 | 2878.00 | 37.52 | 10 | 1.1093e-06 | 3.1236e-06 | 9.6257e-05 | ✅ |
+| 17769827 | 512 | 6 | 16 | 1025³ | local lib | 58.254 | 32.033 | 2878.00 | 40.54 | 10 | 1.1093e-06 | 3.1236e-06 | 9.6257e-05 | ✅ |
+| 17767644 | 512 | 6 | 16 | 1025³ | native | 42.700 | 30.122 | 2896.00 | 65.64 | 10 | 1.1093e-06 | 3.1236e-06 | 9.6257e-05 | ✅ |
+| 17768041 | 512 | 6 | 16 | 1025³ | native | 42.924 | 30.307 | 2896.00 | 65.34 | 10 | 1.1093e-06 | 3.1236e-06 | 9.6257e-05 | ✅ |
+| 17658195 | 4096 | 7 | 16 | 2049³ | PETSc.jl | 72.949 | 34.332 | 21390.00 | 258.00 | 9 | 2.7738e-07 | 7.8741e-07 | 1.9254e-04 | ✅ |
+| 17769830 | 4096 | 7 | 16 | 2049³ | local lib | 68.095 | 34.220 | 21390.00 | 270.00 | 9 | 2.7738e-07 | 7.8741e-07 | 1.9254e-04 | ✅ |
+| 17767610 | 4096 | 7 | 16 | 2049³ | native | 41.345 | 28.214 | 21540.00 | 506.10 | 9 | 2.7738e-07 | 7.8741e-07 | 1.9254e-04 | ✅ |
+| 17726872 | 32768 | 8 | 16 | 4097³ | PETSc.jl | 142.540 | 63.300 | 171000.00 | 746.60 | 9 | 6.9301e-08 | 1.9765e-07 | 6.6391e-04 | ✅ |
+| 17767541 | 32768 | 8 | 16 | 4097³ | native | 69.282 | 50.858 | 172200.00 | 2386.00 | 9 | 6.9301e-08 | 1.9765e-07 | 6.6391e-04 | ✅ |
+17769918  |  32768  |   8 |   16 |  4097³ | local lib |    135.402  |     73.108 |  171000.00   |   263.20 |    9 |   6.9301e-08  |  1.9765e-07 |   6.6391e-04 | ✅ |
+---
+In this table `KSPSolve` indicates the time spend in the solver (taken from the log) and `SolveTime` the time for the full solution.
+
+Below, we break this for each of the cases:
+
+#### a) PETSc.jl with precompiled `PETSc_jll` binaries
+
+the results of using PETSc.jl with precompiled `jll` libraries are:
+
+| Ntasks | Grid | DOFs/core | KSPSolve (s) | SolveTime (s) | Efficiency | Converged |
+|--------|------|-----------|-------------|--------------|------------|-----------|
+| 64 | 513³ | 2,109,464 | 27.916 | 55.843 | 100.0% | ✅ |
+| 64 | 513³ | 2,109,464 | 27.108 | 50.800 | 103.0% | ✅ |
+| 64 | 513³ | 2,109,464 | 28.121 | 56.352 | 99.3% | ✅ |
+| 512 | 1025³ | 2,103,302 | 33.670 | 66.789 | 82.9% | ✅ |
+| 512 | 1025³ | 2,103,302 | 32.482 | 61.715 | 85.9% | ✅ |
+| 512 | 1025³ | 2,103,302 | 34.236 | 65.473 | 81.5% | ✅ |
+| 4096 | 2049³ | 2,100,226 | 34.332 | 72.949 | 81.3% | ✅ |
+| 4096 | 2049³ | 2,100,226 | 44.244 | 79.557 | 63.1% | ✅ |
+| 32768 | 4097³ | 2,098,688 | 63.300 | 142.540 | 44.1% | ✅ |
+
+Please note that in the way we run this on LUMI-C, the 64 core case is on a single node and does not have inter-node communication. Despite this, weak scalability is pretty good (one could also argue to use the 512 core case as reference as this includes communication, in which case it would even be better).
+There is some variability in the timing when repeating the same run (a few %).
+
+#### b) PETSc.jl linked against a local PETSc build (system MPI)
+
+| Ntasks | Grid | DOFs/core | KSPSolve (s) | SolveTime (s) | Efficiency | Converged |
+|--------|------|-----------|-------------|--------------|------------|-----------|
+| 64 | 513³ | 2,109,464 | 27.065 | 51.123 | 100.0% | ✅ |
+| 64 | 513³ | 2,109,464 | 27.006 | 51.032 | 100.2% | ✅ |
+| 64 | 513³ | 2,109,464 | 27.551 | 56.452 | 98.2% | ✅ |
+| 512 | 1025³ | 2,103,302 | 34.456 | 61.041 | 78.5% | ✅ |
+| 512 | 1025³ | 2,103,302 | 32.033 | 58.254 | 84.5% | ✅ |
+| 512 | 1025³ | 2,103,302 | 30.708 | 61.971 | 88.1% | ✅ |
+| 4096 | 2049³ | 2,100,226 | 38.414 | 69.384 | 70.5% | ✅ |
+| 4096 | 2049³ | 2,100,226 | 35.292 | 68.905 | 76.7% | ✅ |
+| 4096 | 2049³ | 2,100,226 | 34.220 | 68.095 | 79.1% | ✅ |
+| 32768 | 4097³ | 2,098,688 | 73.108  | 135.402  | 37.0% | ✅ |
+
+
+#### c) Native C build (`ex45_julia.c`)
+
+The equivalent PETSc C example compiled natively, serving as the baseline.
+
+
+| Ntasks | Grid | DOFs/core | KSPSolve (s) | SolveTime (s) | Efficiency | Converged |
+|--------|------|-----------|-------------|--------------|------------|-----------|
+| 64 | 513³ | 2,109,464 | 26.917 | 38.770 | 100.0% | ✅ |
+| 64 | 513³ | 2,109,464 | 27.155 | 39.011 | 99.1% | ✅ |
+| 64 | 513³ | 2,109,464 | 26.854 | 39.065 | 100.2% | ✅ |
+| 512 | 1025³ | 2,103,302 | 31.492 | 44.062 | 85.5% | ✅ |
+| 512 | 1025³ | 2,103,302 | 30.122 | 42.700 | 89.4% | ✅ |
+| 512 | 1025³ | 2,103,302 | 30.607 | 43.331 | 87.9% | ✅ |
+| 512 | 1025³ | 2,103,302 | 30.307 | 42.924 | 88.8% | ✅ |
+| 4096 | 2049³ | 2,100,226 | 30.694 | 46.659 | 87.7% | ✅ |
+| 4096 | 2049³ | 2,100,226 | 28.214 | 41.345 | 95.4% | ✅ |
+| 4096 | 2049³ | 2,100,226 | 28.498 | 42.759 | 94.5% | ✅ |
+| 32768 | 4097³ | 2,098,688 | 50.858 | 69.282 | 52.9% | ✅ |
+
+
+#### d) Backend Comparison 
+
+If we compare the case on 4096 cores with a 2049³ grid, for 3D Poisson, 64 nodes × 64 tasks/node, MG levels = 7, coarse ranks = 16 we get:
+
+| Metric | native C | PETSc.jl (`_jll`) | local lib |
+|--------|----------|-------------------|-----------|
+| **Runs** | 3 | 2 | 3 |
+| **SolveTime mean (s)** | 43.6 | 76.3 | 68.8 |
+| **SolveTime range (s)** | 41.3–46.7 | 72.9–79.6 | 68.1–69.4 |
+| **KSPSolve mean (s)** | 29.1 | 39.3 | 35.9 |
+| **KSPSolve range (s)** | 28.2–30.7 | 34.3–44.2 | 34.2–38.4 |
+| **KSP iterations** | 9 | 9 | 9 |
+| **L2 error** | 2.7738e-07 | 2.7738e-07 | 2.7738e-07 |
+| **Converged** | ✅ | ✅ | ✅ |
+
+The relative performance is thus:
+
+| Backend | SolveTime overhead | KSPSolve overhead |
+|---------|--------------------|-------------------|
+| native C | — (reference) | — (reference) |
+| PETSc.jl (best run) | +21.5% | +1.6% |
+| PETSc.jl (worst run) | +32.5% | +30.9% |
+| local lib | +56.9% | +50.8% |
+
+From this we can conclude:
+- **Numerical results are identical** across all backends — same iterations, same L2/Max error.
+- **KSPSolve gap vs native** is ~6s for local lib and ~10s for `_jll` — much of this is MG setup overhead inside the solver
+- **SolveTime gap vs native** (~25s for local lib, ~32s for `_jll`) includes Julia setup: mesh construction, matrix assembly, MG hierarchy setup
+- **local lib** is faster than `_jll` at 4096 tasks (35.9s vs 39.3s mean KSPSolve) — likely because it uses the system Cray libsci BLAS directly without MPItrampoline overhead.
+- There is a (significant) drop in performance for high core counts. this is likely related to the multigrid setup and coarse grid solvers and not to PETSc.jl as we see a similar behavior for the native compilation as well.
+
+Overall it shows that there is overhead in using the Julia interface to `PETSc.jl` compared to native C-code. 
+On the other hand, this is done in a higher level language and can thus seemlessly integrate with the full Julia ecosystem. It should also be kept in mind that the residual routines etc. are all written in julia. 
+
+As a start one can configure `PETSc.jl` to use MPITrampoline, which doesn't require any PETSc installation and should thus work on any machine. For the last bit of performance one can configure a native PETSc installation and use that instead.
+
+#### Weak scalability plot
+Results can be summarized in the plot below:
+![weak_scalability_LUMI](../assets/img/weak_scaling_ex45_LUMI.png)
\ No newline at end of file
diff --git a/docs/src/man/installation.md b/docs/src/man/installation.md
new file mode 100644
index 00000000..e12d659b
--- /dev/null
+++ b/docs/src/man/installation.md
@@ -0,0 +1,49 @@
+# Installation
+
+## Using pre-built libraries
+
+The easiest way to install the package is:
+```julia
+julia> ]
+(@v1.12) pkg> add PETSc
+```
+which will install a pre-built PETSc library (`PETSc_jll`) as well as `MPI.jl` on your system. This will work both in serial and in parallel on your machine.
+
+!!! warning "Windows Users"
+    The prebuild binaries currently do not work on Windows as we had to build `PETSc_jll` without MPI due to compatibility issues with `MicrosoftMPI_jll`.
+
+    **Windows users are therefore advised to install the [Windows Subsystem for Linux](https://en.wikipedia.org/wiki/Windows_Subsystem_for_Linux) (WSL) and run PETSc.jl from within WSL.** This will provide full functionality with both serial and parallel (MPI) support.
+
+## Using a custom PETSc build
+
+Sometimes, you may be interested in a PETSc installation that comes with additional external packages, or that you compiled yourself. Ensure the library is compiled as a **dynamic** (not static) library.
+
+Use `set_library!` to configure the path once — it is stored in `LocalPreferences.toml` and no environment variables are needed afterwards:
+
+```julia
+using PETSc
+PETSc.set_library!(
+    "/path/to/custom/libpetsc.so";
+    PetscScalar = Float64,
+    PetscInt    = Int64,
+)
+# Restart Julia — PETSc_jll is not loaded and your library is used automatically.
+```
+
+To revert to the bundled binaries: `PETSc.unset_library!()`.
+
+To check what is currently configured: `PETSc.library_info()`.
+
+For a one-off session without changing persistent settings, use `set_petsclib` directly:
+
+```julia
+petsclib = PETSc.set_petsclib("/path/to/custom/libpetsc.so";
+                              PetscScalar=Float64, PetscInt=Int64)
+PETSc.initialize(petsclib, log_view=true)
+# ... your code ...
+PETSc.finalize(petsclib)
+```
+
+## HPC systems
+
+On many high-performance clusters, you will need to use the cluster's MPI installation. There are a number of options to do so - see the [Running on HPC Systems](hpc.md) page for details.
diff --git a/examples/ex45.jl b/examples/ex45.jl
index 7d29ac57..503f7006 100644
--- a/examples/ex45.jl
+++ b/examples/ex45.jl
@@ -7,21 +7,32 @@ exact(x, y, z) = sin(2π * x) * sin(2π * y) * sin(2π * z)
 # The discrete operator is -∇², so forcing = -∇²(exact)
 # For exact(x,y,z) = sin(2πx)sin(2πy)sin(2πz), we have ∇²u = -3(2π)²sin(2πx)sin(2πy)sin(2πz)
 # So forcing = -∇²u = 12π²sin(2πx)sin(2πy)sin(2πz)
-forcing(x, y, z) = 12 * π^2 * sin(2π * x) * sin(2π * y)  * sin(2π * z)
+forcing(x, y, z) = 12 * π^2 * sin(2π * x) * sin(2π * y) * sin(2π * z)
 
 # Get the PETSc lib with our chosen `PetscScalar` type
 petsclib = PETSc.getlib(; PetscScalar = Float64)
 
+# If we want to use a custom PETSc library, we can set it with `PETSc.set_library!` before calling `PETSc.getlib`. For example:
+#=
+using PETSc
+PETSc.set_library!(
+    "/path/to/custom/libpetsc.so";
+    PetscScalar = Float64,
+    PetscInt    = Int64,
+)
+# Restart Julia — PETSc_jll is not loaded and your library is used automatically.
+=#
+# This creates a LocalPreferences.toml file. 
+#petsclib = PETSc.getlib(; PetscScalar = Float64, PetscInt = Int64)
+
+
 # Initialize PETSc
-PETSc.initialize(petsclib)
+PETSc.initialize(petsclib, log_view=true)
 
 function solve_ex45(N=7; da_grid_x=7, da_grid_y=7, da_grid_z=7, kwargs...)
     comm = MPI.COMM_WORLD
 
-    # Parse command-line options and merge with keyword arguments passed
-    # programmatically. Keyword args take precedence over CLI.
-    cli_opts = PETSc.parse_options(ARGS)           # NamedTuple
-    # Build a flexible Dict{Symbol,Any} to allow merging CLI and kwargs
+    cli_opts = PETSc.parse_options(ARGS)
     cli = Dict{Symbol,Any}()
     for (k, v) in pairs(cli_opts)
         cli[k] = v
@@ -30,11 +41,6 @@ function solve_ex45(N=7; da_grid_x=7, da_grid_y=7, da_grid_z=7, kwargs...)
         cli[k] = v
     end
 
-    # Default RHS: continuous analytic forcing (scaled by cell volume)
-    # We keep CLI/kwargs simple — any specific forcing-mode option was
-    # removed: continuous mode is now the only behavior.
-
-    # Determine N (keyword/arg default overridden by CLI/kwargs)
     function _as_int(x, default)
         if x === nothing
             return default
@@ -45,76 +51,81 @@ function solve_ex45(N=7; da_grid_x=7, da_grid_y=7, da_grid_z=7, kwargs...)
         end
     end
 
-    N = _as_int(get(cli, :N, N), N)
-    dim = _as_int(get(cli, :dim, 3), 3)
+    N   = _as_int(get(cli, :N,   N),   N)
+    dim = _as_int(get(cli, :dim, 3),   3)
+    Nx  = _as_int(get(cli, :Nx,  N),   N)
+    Ny  = _as_int(get(cli, :Ny,  N),   N)
+    Nz  = _as_int(get(cli, :Nz,  N),   N)
 
-    # Re-pack merged options into a NamedTuple for keyword splatting
     opts = (; [(k => cli[k]) for k in keys(cli)]...)
 
-    # Create a 3D DMDA (defaults 7x7x7 like the C example)
-    boundary = Tuple(fill(PETSc.DM_BOUNDARY_NONE,dim))
-    stencil = PETSc.DMDA_STENCIL_STAR
-    global_size = Tuple(fill(N,dim))
-    # Pass parsed opts into DMDA so other DM options are applied
+    boundary    = Tuple(fill(PETSc.DM_BOUNDARY_NONE, dim))
+    stencil     = PETSc.DMDA_STENCIL_STAR
+    global_size = dim == 3 ? (Nx, Ny, Nz) : (Nx, Ny, 1)
     da = PETSc.DMDA(petsclib, comm, boundary, global_size, 1, 1, stencil; opts...)
 
-    # Print fine grid resolution at start (only on rank 0)
     final_grid = PETSc.getinfo(da).global_size
     if MPI.Comm_rank(comm) == 0
         @printf("Solving on %d × %d × %d grid\n", final_grid[1], final_grid[2], final_grid[3])
     end
 
-    # Create KSP tied to DM and pass parsed PETSc options so
-    # command-line flags (e.g. -ksp_monitor) are applied like in `ex50.jl`
     ksp = PETSc.KSP(da; opts...)
 
-    # Set compute operators (matrix assembly) and RHS
     PETSc.setcomputeoperators!(ksp) do J, jac, ksp
-        dm = PETSc.getDM(ksp)
+        dm      = PETSc.getDM(ksp)
         corners = PETSc.getcorners(dm)
-        info = PETSc.getinfo(dm)
-        N = info.global_size
+        info    = PETSc.getinfo(dm)
+        N       = info.global_size
+
+        # Grid spacings (uniform)
+        Hx, Hy, Hz = 1.0 ./ (N .- 1)
+
+        # Scale entire system by Hx*Hy*Hz (= h³ for uniform grid) so that
+        # matrix entries are O(h) ~ O(1/N) rather than O(1/h²) ~ O(N²).
+        # This keeps MG smoothers well-conditioned regardless of resolution.
+        #
+        # Scaled operator: (Hx*Hy*Hz) * (-∇²_h)
+        # For uniform h: scaled diagonal  = 6h³/h² = 6h
+        #                scaled off-diag  = -h³/h² = -h
+        # All entries O(h) → matrix is well-scaled for MG.
+        scale = Hx * Hy * Hz
+
+        # Scaled coefficients
+        cx = scale / (Hx * Hx)   # = Hy*Hz/Hx
+        cy = scale / (Hy * Hy)   # = Hx*Hz/Hy
+        cz = scale / (Hz * Hz)   # = Hx*Hy/Hz
+        diag3d = 2.0 * (cx + cy + cz)
+        diag2d = 2.0 * (cx + cy)
 
-        Hx,Hy,Hz = 1.0 ./ (N .- 1)
-        HxHydHz  = Hx * Hy / Hz
-        HxHzdHy  = Hx * Hz / Hy
-        HyHzdHx  = Hy * Hz / Hx
-
-        HxdHy = Hx / Hy
-        HydHx = Hy / Hx
         for k in corners.lower[3]:corners.upper[3]
             for j in corners.lower[2]:corners.upper[2]
                 for i in corners.lower[1]:corners.upper[1]
                     idx = CartesianIndex(i, j, k)
-                    # boundary check (global indices start at 1 here)
-                    is_boundary = (i == 1 || j == 1 || k == 1 || i == N[1] || j == N[2] || k == N[3])
-                    if dim==3
+                    is_boundary = (i == 1 || j == 1 || k == 1 ||
+                                   i == N[1] || j == N[2] || k == N[3])
+                    if dim == 3
                         if is_boundary
-                            # Dirichlet BC: set row to identity
                             jac[idx, idx] = 1.0
                         else
-                            jac[idx, idx + CartesianIndex(0,-1,0)] = -HxHzdHy
-                            jac[idx, idx + CartesianIndex(-1,0,0)] = -HyHzdHx
-                            jac[idx, idx] = 2.0 * (HxHydHz + HxHzdHy + HyHzdHx)
-                            jac[idx, idx + CartesianIndex(1,0,0)] = -HyHzdHx
-                            jac[idx, idx + CartesianIndex(0,1,0)] = -HxHzdHy
-                            jac[idx, idx + CartesianIndex(0,0,-1)] = -HxHydHz
-                            jac[idx, idx + CartesianIndex(0,0,1)] = -HxHydHz
+                            jac[idx, idx + CartesianIndex(-1,  0,  0)] = -cx
+                            jac[idx, idx + CartesianIndex( 1,  0,  0)] = -cx
+                            jac[idx, idx + CartesianIndex( 0, -1,  0)] = -cy
+                            jac[idx, idx + CartesianIndex( 0,  1,  0)] = -cy
+                            jac[idx, idx + CartesianIndex( 0,  0, -1)] = -cz
+                            jac[idx, idx + CartesianIndex( 0,  0,  1)] = -cz
+                            jac[idx, idx]                               =  diag3d
                         end
-                    elseif dim==2
+                    elseif dim == 2
                         if is_boundary
-                            # Dirichlet BC: set row to identity
                             jac[idx, idx] = 1.0
                         else
-                            # Interior point - full 5-point stencil
-                            jac[idx, idx + CartesianIndex(0, -1, 0)] = -HxdHy
-                            jac[idx, idx + CartesianIndex(-1, 0, 0)] = -HydHx
-                            jac[idx, idx] = 2.0 * (HxdHy + HydHx)
-                            jac[idx, idx + CartesianIndex(1, 0, 0)] = -HydHx
-                            jac[idx, idx + CartesianIndex(0, 1, 0)] = -HxdHy
+                            jac[idx, idx + CartesianIndex(-1, 0, 0)] = -cx
+                            jac[idx, idx + CartesianIndex( 1, 0, 0)] = -cx
+                            jac[idx, idx + CartesianIndex( 0,-1, 0)] = -cy
+                            jac[idx, idx + CartesianIndex( 0, 1, 0)] = -cy
+                            jac[idx, idx]                             =  diag2d
                         end
                     end
-
                 end
             end
         end
@@ -123,84 +134,78 @@ function solve_ex45(N=7; da_grid_x=7, da_grid_y=7, da_grid_z=7, kwargs...)
     end
 
     PETSc.setcomputerhs!(ksp) do b_vec, ksp
-        dm = PETSc.getDM(ksp)
+        dm      = PETSc.getDM(ksp)
         corners = PETSc.getcorners(dm)
-        info = PETSc.getinfo(dm)
-        N = info.global_size
+        info    = PETSc.getinfo(dm)
+        N       = info.global_size
 
-        Hx,Hy,Hz = 1.0 ./ (N .- 1)
-        HxHydHz  = Hx * Hy / Hz
-        HxHzdHy  = Hx * Hz / Hy
-        HyHzdHx  = Hy * Hz / Hx
+        Hx, Hy, Hz = 1.0 ./ (N .- 1)
+        scale = Hx * Hy * Hz   # same scaling as matrix
 
         g_x = range(0, length = N[1], stop = 1)
         g_y = range(0, length = N[2], stop = 1)
         g_z = range(0, length = N[3], stop = 1)
-    
+
         l_x = g_x[(corners.lower[1]):(corners.upper[1])]
         l_y = g_y[(corners.lower[2]):(corners.upper[2])]
         l_z = g_z[(corners.lower[3]):(corners.upper[3])]
-    
-        PETSc.withlocalarray!(b_vec; read=false) do b
-            # reshape into local block (xm, ym, zm)
-            N = corners.size;
-            #nx = Int(corners.size[1]); ny = Int(corners.size[2]); nz = Int(corners.size[3])
 
-            if dim==3
-                #is_boundary = (x == 1 || y == 1 || x == global_size[1] || y == global_size[2])
-                b3 = reshape(b, N...)
-                for kk = 1:N[3], jj = 1:N[2], ii = 1:N[1]
-                    # global indices
-                    x,y,z = l_x[ii], l_y[jj], l_z[kk]
+        PETSc.withlocalarray!(b_vec; read=false) do b
+            sz = corners.size
 
+            if dim == 3
+                b3 = reshape(b, sz...)
+                for kk = 1:sz[3], jj = 1:sz[2], ii = 1:sz[1]
+                    x, y, z = l_x[ii], l_y[jj], l_z[kk]
                     gi = corners.lower[1] + (ii - 1)
                     gj = corners.lower[2] + (jj - 1)
                     gk = corners.lower[3] + (kk - 1)
-                    if gi == 1 || gj == 1 || gk == 1 || gi == N[1] || gj == N[2] || gk == N[3]
-                        # Dirichlet BC: RHS is analytic boundary value
+                    if gi == 1 || gj == 1 || gk == 1 ||
+                       gi == N[1] || gj == N[2] || gk == N[3]
+                        # Dirichlet BC: identity row in matrix → RHS = exact value
                         b3[ii, jj, kk] = exact(x, y, z)
                     else
-                        # Continuous forcing (default): analytic -∇² evaluated
-                        # at the point and scaled by the cell volume so the
-                        # RHS uses the same quadrature scaling as the matrix.
-                        vol_cell = Hx * Hy * Hz
-                        b3[ii, jj, kk] = forcing(x, y, z) * vol_cell
+                        # Scale forcing by same factor as matrix
+                        b3[ii, jj, kk] = forcing(x, y, z) * scale
+                    end
+                end
+            elseif dim == 2
+                b2 = reshape(b, sz...)
+                for jj = 1:sz[2], ii = 1:sz[1]
+                    x, y = l_x[ii], l_y[jj]
+                    gi = corners.lower[1] + (ii - 1)
+                    gj = corners.lower[2] + (jj - 1)
+                    if gi == 1 || gj == 1 || gi == N[1] || gj == N[2]
+                        b2[ii, jj, 1] = exact(x, y, 0.0)
+                    else
+                        b2[ii, jj, 1] = forcing(x, y, 0.0) * scale
                     end
                 end
-            else
-
             end
-
         end
         PETSc.assemble!(b_vec)
         return 0
     end
 
-    # Time the solve so callers can inspect runtime
     solve_time = @elapsed PETSc.solve!(ksp)
-
-    # Number of outer KSP iterations
-    niter = LibPETSc.KSPGetIterationNumber(petsclib, ksp)
-
-    # Final (fine) grid resolution
+    niter      = LibPETSc.KSPGetIterationNumber(petsclib, ksp)
     final_grid = PETSc.getinfo(da).global_size
 
-    # Get the residual norm
-    x = LibPETSc.KSPGetSolution(petsclib, ksp)
+    x    = LibPETSc.KSPGetSolution(petsclib, ksp)
     norm = LibPETSc.KSPGetResidualNorm(petsclib, ksp)
 
-    # Compute L2 and max error against analytic solution `exact(x,y,z)`
-    info = PETSc.getinfo(da)
+    # Compute L2 and max error against analytic solution
+    info  = PETSc.getinfo(da)
     Nglob = info.global_size
-    Hx,Hy,Hz = 1.0 ./ (Nglob .- 1)
+    Hx, Hy, Hz = 1.0 ./ (Nglob .- 1)
     vol = Hx * Hy * Hz
 
     local_err2 = 0.0
-    local_max = 0.0
+    local_max  = 0.0
     PETSc.withlocalarray!(x; read=true) do xu
         corners = PETSc.getcorners(da)
-        dims = corners.size
-        uas = reshape(xu, dims...)
+        dims    = corners.size
+        uas     = reshape(xu, dims...)
         for kk = 1:dims[3], jj = 1:dims[2], ii = 1:dims[1]
             gi = corners.lower[1] + (ii - 1)
             gj = corners.lower[2] + (jj - 1)
@@ -209,15 +214,15 @@ function solve_ex45(N=7; da_grid_x=7, da_grid_y=7, da_grid_z=7, kwargs...)
             yc = (gj - 1) * Hy
             zc = (gk - 1) * Hz
             u_num = uas[ii, jj, kk]
-            u_ex = exact(xc, yc, zc)
-            err = u_num - u_ex
+            u_ex  = exact(xc, yc, zc)
+            err   = u_num - u_ex
             local_err2 += err^2 * vol
-            local_max = max(local_max, abs(err))
+            local_max   = max(local_max, abs(err))
         end
     end
 
     global_err2 = MPI.Allreduce(local_err2, MPI.SUM, comm)
-    global_max = MPI.Allreduce(local_max, MPI.MAX, comm)
+    global_max  = MPI.Allreduce(local_max,  MPI.MAX, comm)
     L2 = sqrt(global_err2)
 
     if MPI.Comm_rank(MPI.COMM_WORLD) == 0
@@ -227,21 +232,20 @@ function solve_ex45(N=7; da_grid_x=7, da_grid_y=7, da_grid_z=7, kwargs...)
         @printf("Solve time: %.6f seconds\n", solve_time)
     end
 
-    # Clean up
     PETSc.destroy(ksp)
-    return (;norm,final_grid,niter,solve_time,L2,global_max)
+    return (; norm, final_grid, niter, solve_time, L2, global_max)
 end
 
 if !isinteractive() && abspath(PROGRAM_FILE) == @__FILE__
     res = solve_ex45()
-    # res is a NamedTuple: (norm, final_grid, niter, solve_time, L2, max)
     if MPI.Comm_rank(MPI.COMM_WORLD) == 0
         @printf("Final residual norm %g\n", res.norm)
-        @printf("Fine grid resolution: %d × %d × %d\n", res.final_grid[1], res.final_grid[2], res.final_grid[3])
+        @printf("Fine grid resolution: %d × %d × %d\n",
+                res.final_grid[1], res.final_grid[2], res.final_grid[3])
         @printf("Outer KSP iterations: %d\n", res.niter)
         @printf("Solve time: %.6f seconds\n", res.solve_time)
         @printf("L2 error: %.6e\n", res.L2)
         @printf("Max error: %.6e\n", res.global_max)
     end
     PETSc.finalize(petsclib)
-end
+end
\ No newline at end of file
diff --git a/examples/scalability_tests/ex45_julia.c b/examples/scalability_tests/ex45_julia.c
new file mode 100644
index 00000000..1fc15215
--- /dev/null
+++ b/examples/scalability_tests/ex45_julia.c
@@ -0,0 +1,264 @@
+/*
+  Laplacian in 3D with manufactured solution.
+
+  PDE:  -Laplacian u = 12*pi^2 * sin(2*pi*x) * sin(2*pi*y) * sin(2*pi*z)
+  on (0,1)^3 with Dirichlet BCs:
+        u = sin(2*pi*x) * sin(2*pi*y) * sin(2*pi*z)  on all boundaries
+
+  Exact solution: u = sin(2*pi*x) * sin(2*pi*y) * sin(2*pi*z)
+
+  Matrix is scaled by h^3 (= Hx*Hy*Hz) so entries are O(h) rather than O(1/h^2),
+  matching the scaling used in ex45.jl.
+
+  Grid size is controlled by -Nx, -Ny, -Nz (matching ex45.jl convention).
+
+  Output is formatted to match ex45.jl so that parse_scaling.jl works unchanged.
+*/
+
+static char help[] = "Solves 3D Laplacian (manufactured solution) using multigrid.\n\n";
+
+#include <petscksp.h>
+#include <petscdm.h>
+#include <petscdmda.h>
+#include <math.h>
+
+/* Context passed to callbacks */
+typedef struct {
+  PetscInt Nx, Ny, Nz;
+} AppCtx;
+
+extern PetscErrorCode ComputeMatrix(KSP, Mat, Mat, void *);
+extern PetscErrorCode ComputeRHS(KSP, Vec, void *);
+extern PetscErrorCode ComputeInitialGuess(KSP, Vec, void *);
+
+static inline PetscReal exact_sol(PetscReal x, PetscReal y, PetscReal z)
+{
+  return sin(2.0 * PETSC_PI * x) * sin(2.0 * PETSC_PI * y) * sin(2.0 * PETSC_PI * z);
+}
+
+static inline PetscReal forcing(PetscReal x, PetscReal y, PetscReal z)
+{
+  return 12.0 * PETSC_PI * PETSC_PI *
+         sin(2.0 * PETSC_PI * x) * sin(2.0 * PETSC_PI * y) * sin(2.0 * PETSC_PI * z);
+}
+
+int main(int argc, char **argv)
+{
+  KSP        ksp;
+  DM         da;
+  Vec        x, b, r;
+  Mat        A;
+  AppCtx     ctx;
+  PetscReal  norm, solve_time, L2, maxerr;
+  PetscInt   niter;
+  PetscLogDouble t0, t1;
+
+  PetscFunctionBeginUser;
+  PetscCall(PetscInitialize(&argc, &argv, (char *)0, help));
+
+  /* Default grid size matches ex45.jl default */
+  ctx.Nx = 7; ctx.Ny = 7; ctx.Nz = 7;
+  PetscCall(PetscOptionsGetInt(NULL, NULL, "-Nx", &ctx.Nx, NULL));
+  PetscCall(PetscOptionsGetInt(NULL, NULL, "-Ny", &ctx.Ny, NULL));
+  PetscCall(PetscOptionsGetInt(NULL, NULL, "-Nz", &ctx.Nz, NULL));
+
+  /* Print grid info matching ex45.jl output */
+  PetscCall(PetscPrintf(PETSC_COMM_WORLD, "Solving on %d × %d × %d grid\n",
+                        (int)ctx.Nx, (int)ctx.Ny, (int)ctx.Nz));
+
+  PetscCall(KSPCreate(PETSC_COMM_WORLD, &ksp));
+  PetscCall(DMDACreate3d(PETSC_COMM_WORLD,
+                         DM_BOUNDARY_NONE, DM_BOUNDARY_NONE, DM_BOUNDARY_NONE,
+                         DMDA_STENCIL_STAR,
+                         ctx.Nx, ctx.Ny, ctx.Nz,
+                         PETSC_DECIDE, PETSC_DECIDE, PETSC_DECIDE,
+                         1, 1, NULL, NULL, NULL, &da));
+  PetscCall(DMSetFromOptions(da));
+  PetscCall(DMSetUp(da));
+  PetscCall(KSPSetDM(ksp, da));
+  PetscCall(KSPSetComputeInitialGuess(ksp, ComputeInitialGuess, &ctx));
+  PetscCall(KSPSetComputeRHS(ksp, ComputeRHS, &ctx));
+  PetscCall(KSPSetComputeOperators(ksp, ComputeMatrix, &ctx));
+  PetscCall(DMDestroy(&da));
+  PetscCall(KSPSetFromOptions(ksp));
+
+  /* Time the solve */
+  PetscCall(PetscTime(&t0));
+  PetscCall(KSPSolve(ksp, NULL, NULL));
+  PetscCall(PetscTime(&t1));
+  solve_time = (PetscReal)(t1 - t0);
+
+  PetscCall(KSPGetSolution(ksp, &x));
+  PetscCall(KSPGetRhs(ksp, &b));
+  PetscCall(KSPGetIterationNumber(ksp, &niter));
+  PetscCall(KSPGetResidualNorm(ksp, &norm));
+
+  /* Compute explicit residual norm */
+  PetscCall(VecDuplicate(b, &r));
+  PetscCall(KSPGetOperators(ksp, &A, NULL));
+  PetscCall(MatMult(A, x, r));
+  PetscCall(VecAXPY(r, -1.0, b));
+  PetscReal resnorm;
+  PetscCall(VecNorm(r, NORM_2, &resnorm));
+
+  /* Compute L2 and max error against exact solution */
+  {
+    DM          dm;
+    DMDALocalInfo info;
+    Vec         x_local;
+    PetscScalar ***xu;
+    PetscReal   local_err2 = 0.0, local_max = 0.0;
+    PetscReal   Hx, Hy, Hz, vol;
+    PetscInt    i, j, k;
+
+    PetscCall(KSPGetDM(ksp, &dm));
+    PetscCall(DMDAGetLocalInfo(dm, &info));
+    Hx  = 1.0 / (PetscReal)(info.mx - 1);
+    Hy  = 1.0 / (PetscReal)(info.my - 1);
+    Hz  = 1.0 / (PetscReal)(info.mz - 1);
+    vol = Hx * Hy * Hz;
+
+    PetscCall(DMGetLocalVector(dm, &x_local));
+    PetscCall(DMGlobalToLocalBegin(dm, x, INSERT_VALUES, x_local));
+    PetscCall(DMGlobalToLocalEnd(dm, x, INSERT_VALUES, x_local));
+    PetscCall(DMDAVecGetArrayRead(dm, x_local, &xu));
+
+    for (k = info.zs; k < info.zs + info.zm; k++) {
+      for (j = info.ys; j < info.ys + info.ym; j++) {
+        for (i = info.xs; i < info.xs + info.xm; i++) {
+          PetscReal xc   = i * Hx;
+          PetscReal yc   = j * Hy;
+          PetscReal zc   = k * Hz;
+          PetscReal u_ex = exact_sol(xc, yc, zc);
+          PetscReal err  = (PetscReal)xu[k][j][i] - u_ex;
+          local_err2 += err * err * vol;
+          if (PetscAbsReal(err) > local_max) local_max = PetscAbsReal(err);
+        }
+      }
+    }
+
+    PetscCall(DMDAVecRestoreArrayRead(dm, x_local, &xu));
+    PetscCall(DMRestoreLocalVector(dm, &x_local));
+
+    PetscCall(MPI_Allreduce(&local_err2, &L2,     1, MPIU_REAL, MPIU_SUM, PETSC_COMM_WORLD));
+    PetscCall(MPI_Allreduce(&local_max,  &maxerr, 1, MPIU_REAL, MPIU_MAX, PETSC_COMM_WORLD));
+    L2 = PetscSqrtReal(L2);
+  }
+
+  /* Output exactly matching ex45.jl format for parse_scaling.jl */
+  PetscCall(PetscPrintf(PETSC_COMM_WORLD, "L2 error: %e\n",       (double)L2));
+  PetscCall(PetscPrintf(PETSC_COMM_WORLD, "Max error: %e\n",      (double)maxerr));
+  PetscCall(PetscPrintf(PETSC_COMM_WORLD, "Residual norm %g\n",   (double)resnorm));
+  PetscCall(PetscPrintf(PETSC_COMM_WORLD, "Solve time: %f seconds\n", (double)solve_time));
+  PetscCall(PetscPrintf(PETSC_COMM_WORLD, "Final residual norm %g\n", (double)norm));
+  PetscCall(PetscPrintf(PETSC_COMM_WORLD, "Fine grid resolution: %d × %d × %d\n",
+                        (int)ctx.Nx, (int)ctx.Ny, (int)ctx.Nz));
+  PetscCall(PetscPrintf(PETSC_COMM_WORLD, "Outer KSP iterations: %d\n", (int)niter));
+  PetscCall(PetscPrintf(PETSC_COMM_WORLD, "Solve time: %f seconds\n",   (double)solve_time));
+  PetscCall(PetscPrintf(PETSC_COMM_WORLD, "L2 error: %e\n",             (double)L2));
+  PetscCall(PetscPrintf(PETSC_COMM_WORLD, "Max error: %e\n",            (double)maxerr));
+
+  PetscCall(VecDestroy(&r));
+  PetscCall(KSPDestroy(&ksp));
+  PetscCall(PetscFinalize());
+  return 0;
+}
+
+PetscErrorCode ComputeInitialGuess(KSP ksp, Vec b, void *ctx)
+{
+  PetscFunctionBeginUser;
+  PetscCall(VecSet(b, 0));
+  PetscFunctionReturn(PETSC_SUCCESS);
+}
+
+PetscErrorCode ComputeRHS(KSP ksp, Vec b, void *ctx)
+{
+  AppCtx            *user = (AppCtx *)ctx;
+  DM                 dm;
+  DMDALocalInfo      info;
+  PetscScalar      ***barray;
+  PetscInt           i, j, k;
+  PetscReal          Hx, Hy, Hz, scale;
+
+  PetscFunctionBeginUser;
+  PetscCall(KSPGetDM(ksp, &dm));
+  PetscCall(DMDAGetLocalInfo(dm, &info));
+
+  Hx    = 1.0 / (PetscReal)(info.mx - 1);
+  Hy    = 1.0 / (PetscReal)(info.my - 1);
+  Hz    = 1.0 / (PetscReal)(info.mz - 1);
+  scale = Hx * Hy * Hz;   /* same h^3 scaling as ex45.jl */
+
+  PetscCall(DMDAVecGetArray(dm, b, &barray));
+  for (k = info.zs; k < info.zs + info.zm; k++) {
+    for (j = info.ys; j < info.ys + info.ym; j++) {
+      for (i = info.xs; i < info.xs + info.xm; i++) {
+        PetscReal x = i * Hx;
+        PetscReal y = j * Hy;
+        PetscReal z = k * Hz;
+        if (i == 0 || j == 0 || k == 0 ||
+            i == info.mx - 1 || j == info.my - 1 || k == info.mz - 1) {
+          /* Dirichlet BC: identity row -> RHS = exact value */
+          barray[k][j][i] = exact_sol(x, y, z);
+        } else {
+          /* Interior: scale forcing by h^3 */
+          barray[k][j][i] = forcing(x, y, z) * scale;
+        }
+      }
+    }
+  }
+  PetscCall(DMDAVecRestoreArray(dm, b, &barray));
+  PetscFunctionReturn(PETSC_SUCCESS);
+}
+
+PetscErrorCode ComputeMatrix(KSP ksp, Mat jac, Mat B, void *ctx)
+{
+  DM            da;
+  DMDALocalInfo info;
+  PetscInt      i, j, k;
+  PetscReal     Hx, Hy, Hz, scale;
+  PetscReal     cx, cy, cz, diag;
+  PetscScalar   v[7];
+  MatStencil    row, col[7];
+
+  PetscFunctionBeginUser;
+  PetscCall(KSPGetDM(ksp, &da));
+  PetscCall(DMDAGetLocalInfo(da, &info));
+
+  Hx    = 1.0 / (PetscReal)(info.mx - 1);
+  Hy    = 1.0 / (PetscReal)(info.my - 1);
+  Hz    = 1.0 / (PetscReal)(info.mz - 1);
+  scale = Hx * Hy * Hz;   /* h^3 scaling matching ex45.jl */
+
+  /* Scaled FD coefficients: scale * (1/h^2) = h^3/h^2 = h */
+  cx   = scale / (Hx * Hx);   /* = Hy*Hz/Hx */
+  cy   = scale / (Hy * Hy);   /* = Hx*Hz/Hy */
+  cz   = scale / (Hz * Hz);   /* = Hx*Hy/Hz */
+  diag = 2.0 * (cx + cy + cz);
+
+  for (k = info.zs; k < info.zs + info.zm; k++) {
+    for (j = info.ys; j < info.ys + info.ym; j++) {
+      for (i = info.xs; i < info.xs + info.xm; i++) {
+        row.i = i; row.j = j; row.k = k;
+        if (i == 0 || j == 0 || k == 0 ||
+            i == info.mx - 1 || j == info.my - 1 || k == info.mz - 1) {
+          /* Identity row for Dirichlet BC */
+          v[0] = 1.0;
+          PetscCall(MatSetValuesStencil(B, 1, &row, 1, &row, v, INSERT_VALUES));
+        } else {
+          v[0] = -cz; col[0].i = i;   col[0].j = j;   col[0].k = k-1;
+          v[1] = -cy; col[1].i = i;   col[1].j = j-1; col[1].k = k;
+          v[2] = -cx; col[2].i = i-1; col[2].j = j;   col[2].k = k;
+          v[3] =  diag; col[3].i = i; col[3].j = j;   col[3].k = k;
+          v[4] = -cx; col[4].i = i+1; col[4].j = j;   col[4].k = k;
+          v[5] = -cy; col[5].i = i;   col[5].j = j+1; col[5].k = k;
+          v[6] = -cz; col[6].i = i;   col[6].j = j;   col[6].k = k+1;
+          PetscCall(MatSetValuesStencil(B, 1, &row, 7, col, v, INSERT_VALUES));
+        }
+      }
+    }
+  }
+  PetscCall(MatAssemblyBegin(B, MAT_FINAL_ASSEMBLY));
+  PetscCall(MatAssemblyEnd(B, MAT_FINAL_ASSEMBLY));
+  PetscFunctionReturn(PETSC_SUCCESS);
+}
\ No newline at end of file
diff --git a/examples/scalability_tests/job.sh b/examples/scalability_tests/job.sh
new file mode 100644
index 00000000..5591b73d
--- /dev/null
+++ b/examples/scalability_tests/job.sh
@@ -0,0 +1,77 @@
+#!/bin/bash -l
+#SBATCH --job-name=scaling
+#SBATCH --partition=standard
+#SBATCH --cpus-per-task=1
+#SBATCH --exclusive
+#SBATCH --mem=200000M
+#SBATCH --time=0-00:10:00
+#SBATCH --account=??? # add your account here
+# --ntasks, --nodes, --ntasks-per-node, --output passed by submit_scaling.sh
+
+module --force purge
+module load CrayEnv
+module load craype
+module load gcc-native/13.2
+module load cray-mpich/8.1.32
+
+export HOME=/users/kausbori
+export JULIA_DEPOT_PATH=/users/kausbori/.julia
+export MPITRAMPOLINE_LIB=/users/kausbori/mpiwrapper/lib64/libmpiwrapper.so
+export JULIA_PKG_PRECOMPILE_AUTO=0
+export JULIA_NUM_THREADS=1
+export JULIA_CPU_TARGET="generic"
+
+export MPICH_OFI_STARTUP_CONNECT=1
+export MPICH_OFI_NIC_POLICY=NUMA
+export FI_CXI_DEFAULT_CQ_SIZE=131072
+export MPICH_OFI_STARTUP_CONNECT=0
+export FI_CXI_DISABLE_CQ_HUGETLB=1
+export MPICH_MAX_THREAD_SAFETY=single
+export LD_LIBRARY_PATH=/opt/cray/pe/mpich/8.1.32/ofi/cray/17.0/lib:$LD_LIBRARY_PATH
+
+JULIA=/users/kausbori/.julia/juliaup/julia-1.12.6+0.x64.linux.gnu/bin/julia
+
+echo "=== Scaling run: ntasks=${NTASKS}, Nx=${NX}, Ny=${NY}, Nz=${NZ}, mg_levels=${NMGLEVELS}, ncoarse=${NCOARSE}, reduction_factor=${REDUCTION_FACTOR} ==="
+echo "=== Running on ${SLURM_NNODES} nodes, ${SLURM_NTASKS} tasks, $(( SLURM_NTASKS / SLURM_NNODES )) tasks/node ==="
+
+echo "=== MPI diagnostic ==="
+srun -n 1 $JULIA --project=/users/kausbori/PETSc_jl_scalability -e '
+using MPI
+MPI.Init()
+println("MPI library: ", MPI.libmpi)
+println("MPI version: ", MPI.MPI_LIBRARY_VERSION_STRING)
+println("hostname: ", gethostname())
+'
+echo "=== End MPI diagnostic ==="
+
+echo "MPI run started at $(date)"
+srun -n ${NTASKS} $JULIA --heap-size-hint=1G \
+    --project=/users/kausbori/PETSc_jl_scalability ex45.jl \
+  -Nx ${NX} \
+  -Ny ${NY} \
+  -Nz ${NZ} \
+  -pc_mg_levels ${NMGLEVELS} \
+  -ksp_type cg \
+  -pc_type mg \
+  -mg_levels_ksp_type chebyshev \
+  -mg_levels_pc_type sor \
+  -ksp_rtol 1e-8 \
+  -mg_coarse_ksp_type preonly \
+  -mg_coarse_pc_type telescope \
+  -mg_coarse_telescope_reduction_factor ${REDUCTION_FACTOR} \
+  -mg_coarse_telescope_pc_type lu \
+  -mg_coarse_telescope_pc_factor_mat_solver_type superlu_dist \
+  -pc_mg_log \
+  -malloc_view \
+  -memory_view \
+  -ksp_converged_reason \
+  -ksp_monitor_true_residual \
+  -log_view \
+  2>&1
+echo "MPI run finished at $(date)"
+
+echo "=== Memory usage ==="
+sacct -j ${SLURM_JOB_ID} --format=JobID,MaxRSS,AveRSS,NNodes,NCPUs --units=G
+
+echo "exit: $?"
+echo "Job finished at $(date)"
\ No newline at end of file
diff --git a/examples/scalability_tests/makefile b/examples/scalability_tests/makefile
new file mode 100644
index 00000000..7677bfde
--- /dev/null
+++ b/examples/scalability_tests/makefile
@@ -0,0 +1,24 @@
+-include ../../../../petscdir.mk
+
+LOCDIR           = src/ksp/ksp/tutorials/
+MANSEC           = KSP
+CLEANFILES       = rhs.vtk solution.vtk bench_pcsetup
+NP               = 1
+DIRS             = network amrex
+
+include ${PETSC_DIR}/lib/petsc/conf/variables
+include ${PETSC_DIR}/lib/petsc/conf/rules
+
+testex100: ex100.PETSc
+	-@OMPI_MCA_mpi_warn_on_fork=0 ${MPIEXEC} -n 1 ./ex100 -test  > ex100_1.tmp 2>&1; \
+	 if (${DIFF} output/ex100_1.testout ex100_1.tmp > /dev/null 2>&1) then \
+           echo "C/C++ Python example src/ksp/ksp/tutorials/ex100 run successfully with 1 MPI process"; \
+	 else \
+           echo "Possible error running C/C++ Python src/ksp/ksp/tutorials/ex100 with 1 MPI process"; \
+           echo "See https://petsc.org/release/faq/";\
+           cat ex100_1.tmp;\
+           touch ${PETSC_DIR}/check_error;\
+         fi; \
+         ${RM} -f ex100_1.tmp
+
+include ${PETSC_DIR}/lib/petsc/conf/test
diff --git a/examples/scalability_tests/parse_scaling.jl b/examples/scalability_tests/parse_scaling.jl
new file mode 100644
index 00000000..4dc4b427
--- /dev/null
+++ b/examples/scalability_tests/parse_scaling.jl
@@ -0,0 +1,247 @@
+# EXCLUDE FROM TESTING
+#!/usr/bin/env julia
+# parse_scaling.jl
+# Usage: julia parse_scaling.jl scaling_*.out
+
+using Printf
+
+struct ScalingResult
+    jobid            :: String
+    ntasks           :: Int
+    nx               :: Int
+    ny               :: Int
+    nz               :: Int
+    mg_levels        :: Int
+    ncoarse          :: Int
+    solve_time       :: Float64   # application "Solve time:" field
+    ksp_solve_time   :: Float64   # PETSc KSPSolve event time (for weak scaling)
+    ksp_iter         :: Int
+    l2_error         :: Float64
+    max_error        :: Float64
+    residual         :: Float64
+    total_gflops     :: Float64
+    gflops_s         :: Float64
+    converged_reason :: String
+    backend          :: String    # "native" or "PETSc.jl"
+end
+
+function parse_file(filename::String)
+    m = match(r"scaling([a-zA-Z]*)_(\d+)_n(\d+)_nx(\d+)_ny(\d+)_nz(\d+)_mg(\d+)_nc(\d+)\.out", basename(filename))
+    if isnothing(m)
+        @warn "Filename $filename does not match expected pattern, skipping"
+        return nothing
+    end
+    variant   = m[1]   # e.g. "", "Galerkin", "NATIVE"
+    jobid     = m[2]
+    ntasks    = parse(Int, m[3])
+    nx        = parse(Int, m[4])
+    ny        = parse(Int, m[5])
+    nz        = parse(Int, m[6])
+    mg_levels = parse(Int, m[7])
+    ncoarse   = parse(Int, m[8])
+    backend = if uppercase(variant) == "NATIVE"
+        "native"
+    elseif uppercase(variant) == "LOCALLIB"
+        "local lib"
+    else
+        "PETSc.jl"
+    end
+
+    solve_time       = NaN
+    ksp_solve_time   = NaN
+    ksp_iter         = -1
+    l2_error         = NaN
+    max_error        = NaN
+    residual         = NaN
+    total_gflops     = NaN
+    gflops_s         = NaN
+    converged_reason = "UNKNOWN"
+
+    in_petsc_summary = false
+    in_ksp_monitor   = false
+
+    for line in eachline(filename)
+
+        # Detect PETSc performance summary block
+        if occursin("PETSc Performance Summary", line)
+            in_petsc_summary = true
+        end
+
+        # Detect KSP monitor lines (leading spaces + digit + " KSP")
+        if match(r"^\s+\d+ KSP", line) !== nothing
+            in_ksp_monitor = true
+        end
+
+        # KSP monitor block ends at the "Linear solve" line
+        if occursin("Linear solve", line) && occursin("due to", line)
+            in_ksp_monitor = false
+            m2 = match(r"Linear solve (\w+) due to (\S+)", line)
+            isnothing(m2) || (converged_reason = string(m2[1] == "converged" ? "OK " : "FAIL ") * m2[2])
+            continue
+        end
+
+        # Skip KSP monitor lines entirely
+        in_ksp_monitor && continue
+
+        # ---- application output (before PETSc summary) ----
+        if !in_petsc_summary
+
+            if occursin("Outer KSP iterations:", line)
+                m2 = match(r"Outer KSP iterations:\s+(\d+)", line)
+                isnothing(m2) || (ksp_iter = parse(Int, m2[1]))
+
+            elseif occursin("Final residual norm", line)
+                m2 = match(r"Final residual norm\s+([\d.e+\-]+)", line)
+                isnothing(m2) || (residual = parse(Float64, m2[1]))
+
+            elseif occursin("Solve time:", line)
+                m2 = match(r"Solve time:\s+([\d.e+\-]+)", line)
+                isnothing(m2) || isnan(solve_time) && (solve_time = parse(Float64, m2[1]))
+
+            elseif occursin("L2 error:", line)
+                m2 = match(r"L2 error:\s+([\d.e+\-]+)", line)
+                isnothing(m2) || isnan(l2_error) && (l2_error = parse(Float64, m2[1]))
+
+            elseif occursin("Max error:", line)
+                m2 = match(r"Max error:\s+([\d.e+\-]+)", line)
+                isnothing(m2) || isnan(max_error) && (max_error = parse(Float64, m2[1]))
+            end
+
+        # ---- PETSc performance summary block ----
+        else
+            # KSPSolve event line: extract the Max time (column 3 after the counts)
+            # Format: "KSPSolve   count ratio  maxtime ratio  ..."
+            if match(r"^\s*KSPSolve\s", line) !== nothing && isnan(ksp_solve_time)
+                # fields: Name Count Ratio MaxTime Ratio ...
+                vals = split(strip(line))
+                # vals[1]=KSPSolve, vals[2]=count, vals[3]=ratio, vals[4]=maxtime
+                if length(vals) >= 4
+                    t = tryparse(Float64, vals[4])
+                    isnothing(t) || (ksp_solve_time = t)
+                end
+            end
+
+            if match(r"^\s*Flops:\s", line) !== nothing
+                vals = split(strip(replace(line, r"^\s*Flops:\s*" => "")))
+                length(vals) >= 4 && (total_gflops = parse(Float64, vals[4]) / 1e9)
+
+            elseif match(r"^\s*Flops/sec:\s", line) !== nothing
+                vals = split(strip(replace(line, r"^\s*Flops/sec:\s*" => "")))
+                length(vals) >= 4 && (gflops_s = parse(Float64, vals[4]) / 1e9)
+            end
+        end
+    end
+
+    return ScalingResult(jobid, ntasks, nx, ny, nz, mg_levels, ncoarse,
+                         solve_time, ksp_solve_time, ksp_iter,
+                         l2_error, max_error, residual,
+                         total_gflops, gflops_s,
+                         converged_reason, backend)
+end
+
+function ndofs(r::ScalingResult)
+    return r.nx * r.ny * r.nz
+end
+
+function print_table(results::Vector{ScalingResult})
+    sort!(results, by = r -> (r.ntasks, r.backend))
+
+    println()
+    @printf("%-12s  %6s  %4s  %4s  %15s  %9s  %10s  %11s  %10s  %10s  %4s  %12s  %12s  %12s  %s\n",
+            "JobID", "Ntasks", "MG", "NC", "Grid", "Backend",
+            "SolveTime", "KSPSolveTime", "TotGFlops",
+            "GFlops/s", "KSP", "L2_error", "Max_error", "Residual", "Converged")
+    println(repeat("-", 190))
+
+    for r in results
+        grid  = @sprintf("%dx%dx%d", r.nx, r.ny, r.nz)
+        ksp_t = isnan(r.ksp_solve_time) ? "        N/A" : @sprintf("%11.3f", r.ksp_solve_time)
+        @printf("%-12s  %6d  %4d  %4d  %15s  %9s  %10.3f  %s  %10.2f  %10.2f  %4d  %12.4e  %12.4e  %12.4e  %s\n",
+                r.jobid, r.ntasks, r.mg_levels, r.ncoarse, grid, r.backend,
+                r.solve_time, ksp_t, r.total_gflops, r.gflops_s,
+                r.ksp_iter, r.l2_error, r.max_error, r.residual,
+                r.converged_reason)
+    end
+    println()
+end
+
+function print_convergence(results::Vector{ScalingResult})
+    iso = filter(r -> r.nx == r.ny == r.nz, results)
+    sort!(iso, by = r -> r.nx)
+
+    if length(iso) >= 2
+        println("Spatial convergence rate - isotropic runs (should be ~2.0 for second-order FD):")
+        println(repeat("-", 65))
+        for i in 2:length(iso)
+            r_fine   = iso[i]
+            r_coarse = iso[i-1]
+            h_fine   = 1.0 / (r_fine.nx   - 1)
+            h_coarse = 1.0 / (r_coarse.nx - 1)
+            if !isnan(r_fine.l2_error) && !isnan(r_coarse.l2_error)
+                rate = log(r_coarse.l2_error / r_fine.l2_error) / log(h_coarse / h_fine)
+                @printf("  %dx%dx%d -> %dx%dx%d  L2: %.4e->%.4e  rate= %.2f  [%s]\n",
+                        r_coarse.nx, r_coarse.ny, r_coarse.nz,
+                        r_fine.nx,   r_fine.ny,   r_fine.nz,
+                        r_coarse.l2_error, r_fine.l2_error, rate, r_fine.backend)
+            end
+        end
+        println()
+    end
+end
+
+function print_scaling(results::Vector{ScalingResult})
+    # Print separate weak scaling tables per backend
+    backends = unique(r.backend for r in results)
+    for backend in sort(backends)
+        subset = filter(r -> r.backend == backend, results)
+        sorted = sort(subset, by = r -> r.ntasks)
+        isempty(sorted) && continue
+
+        ref    = sorted[1]
+        t0_ksp   = ref.ksp_solve_time
+        t0_solve = ref.solve_time
+        use_ksp  = !isnan(t0_ksp)
+        t0       = use_ksp ? t0_ksp : t0_solve
+
+        label = use_ksp ? "KSPSolve time" : "Solve time (fallback)"
+        println("Weak scaling efficiency [$backend] based on $label (relative to smallest ntasks):")
+        println(repeat("-", 85))
+        @printf("  %-6s  %-15s  %8s  %11s  %10s  %10s  %s\n",
+                "Ntasks", "Grid", "DOFs/core", "KSPSolve(s)", "SolveTime", "Efficiency", "Converged")
+        println(repeat("-", 85))
+        for r in sorted
+            t_ksp = isnan(r.ksp_solve_time) ? NaN : r.ksp_solve_time
+            t_use = use_ksp ? t_ksp : r.solve_time
+            eff   = isnan(t_use) ? NaN : t0 / t_use * 100
+            dpc   = ndofs(r) / r.ntasks
+            grid_str = @sprintf("%dx%dx%d", r.nx, r.ny, r.nz)
+            ksp_str  = isnan(t_ksp) ? "        N/A" : @sprintf("%11.3f", t_ksp)
+            eff_str  = isnan(eff)   ? "       N/A" : @sprintf("%9.1f%%", eff)
+            @printf("  %-6d  %-15s  %8.0f  %s  %10.3f  %s  %s\n",
+                    r.ntasks, grid_str, dpc, ksp_str, r.solve_time, eff_str, r.converged_reason)
+        end
+        println()
+    end
+end
+
+function main()
+    if isempty(ARGS)
+        println("Usage: julia parse_scaling.jl scaling_*.out")
+        exit(1)
+    end
+
+    results = ScalingResult[]
+    for f in ARGS
+        isfile(f) || (@warn "File not found: $f, skipping"; continue)
+        r = parse_file(f)
+        isnothing(r) || push!(results, r)
+    end
+
+    isempty(results) && (println("No valid files found."); exit(1))
+
+    print_table(results)
+    print_convergence(results)
+    print_scaling(results)
+end
+
+main()
\ No newline at end of file
diff --git a/examples/scalability_tests/submit_scaling.sh b/examples/scalability_tests/submit_scaling.sh
new file mode 100644
index 00000000..403d5c4d
--- /dev/null
+++ b/examples/scalability_tests/submit_scaling.sh
@@ -0,0 +1,49 @@
+#!/bin/bash
+# submit_scaling.sh
+# Usage: ./submit_scaling.sh <ntasks> <Nx> <Ny> <Nz> <mg_levels> [ncoarse]
+#
+# Arguments:
+#   ntasks     - total MPI tasks (default: 8)
+#   Nx/Ny/Nz   - global grid size in each dimension (default: 129)
+#   mg_levels  - number of multigrid levels (default: 4)
+#   ncoarse    - number of ranks for coarse solve (default: 16)
+#
+# Coarse grid size guidance:
+#   mg_levels=4, fine=129^3  -> coarse ~16^3 (~4096 DOF)
+#   mg_levels=5, fine=257^3  -> coarse ~17^3 (~4913 DOF)
+#
+# Examples:
+#   ./submit_scaling.sh 64  257 257 257 5 16
+#   ./submit_scaling.sh 512 1025 1025 1025 6 16
+#   ./submit_scaling.sh 4096 2049 2049 2049 7 16
+
+NTASKS=${1:-8}
+NX=${2:-129}
+NY=${3:-129}
+NZ=${4:-129}
+NMGLEVELS=${5:-4}
+NCOARSE=${6:-16}
+
+if [ ${NCOARSE} -gt ${NTASKS} ]; then
+  ec
+  ho "ERROR: NCOARSE=${NCOARSE} > NTASKS=${NTASKS}"
+  exit 1
+fi
+
+REDUCTION_FACTOR=$((NTASKS / NCOARSE))
+
+NTASKS_PER_NODE=64
+NNODES=$(( (NTASKS + NTASKS_PER_NODE - 1) / NTASKS_PER_NODE ))
+ACTUAL_NTASKS_PER_NODE=$(( NTASKS < NTASKS_PER_NODE ? NTASKS : NTASKS_PER_NODE ))
+
+echo "Submitting: ntasks=${NTASKS}, nodes=${NNODES}, ntasks-per-node=${ACTUAL_NTASKS_PER_NODE}"
+echo "            Nx=${NX}, Ny=${NY}, Nz=${NZ}, mg_levels=${NMGLEVELS}"
+echo "            ncoarse=${NCOARSE}, reduction_factor=${REDUCTION_FACTOR}"
+
+sbatch \
+  --ntasks=${NTASKS} \
+  --nodes=${NNODES} \
+  --ntasks-per-node=${ACTUAL_NTASKS_PER_NODE} \
+  --output="scaling_%j_n${NTASKS}_nx${NX}_ny${NY}_nz${NZ}_mg${NMGLEVELS}_nc${NCOARSE}.out" \
+  --export=ALL,NTASKS=${NTASKS},NX=${NX},NY=${NY},NZ=${NZ},NMGLEVELS=${NMGLEVELS},NCOARSE=${NCOARSE},REDUCTION_FACTOR=${REDUCTION_FACTOR} \
+  job.sh
\ No newline at end of file
diff --git a/examples/scalability_tests/weak_scaling_plot.jl b/examples/scalability_tests/weak_scaling_plot.jl
new file mode 100644
index 00000000..12475dd0
--- /dev/null
+++ b/examples/scalability_tests/weak_scaling_plot.jl
@@ -0,0 +1,99 @@
+# EXCLUDE FROM TESTING
+using GLMakie
+using Statistics
+
+# All KSPSolve times per (ntasks, backend)
+# Multiple runs where available
+
+data = Dict(
+    :native => Dict(
+        64   => [26.917, 27.155, 26.854],
+        512  => [31.492, 30.122, 30.607, 30.307],
+        4096 => [30.694, 28.214, 28.498],
+        32768 => [50.858],
+    ),
+    :petscjl => Dict(
+        64   => [27.916, 27.108, 28.121],
+        512  => [33.670, 32.482, 34.236],
+        4096 => [34.332, 44.244],
+        32768 => [63.300],
+    ),
+    :locallib => Dict(
+        64   => [27.065, 27.006, 27.551],
+        512  => [34.456, 32.033, 30.708],
+        4096 => [38.414, 35.292, 34.220],
+        32768 =>[73.108],   
+    ),
+)
+
+ntasks = [64, 512, 4096, 32768]
+
+function stats(d, backend)
+    means = [isempty(d[backend][n]) ? NaN : mean(d[backend][n]) for n in ntasks]
+    mins  = [isempty(d[backend][n]) ? NaN : minimum(d[backend][n]) for n in ntasks]
+    maxs  = [isempty(d[backend][n]) ? NaN : maximum(d[backend][n]) for n in ntasks]
+    return means, mins, maxs
+end
+
+mean_native,   min_native,   max_native   = stats(data, :native)
+mean_petscjl,  min_petscjl,  max_petscjl  = stats(data, :petscjl)
+mean_locallib, min_locallib, max_locallib = stats(data, :locallib)
+
+fig = Figure(size = (900, 600), fontsize = 14)
+
+ax = Axis(fig[1, 1],
+    title  = "Weak Scaling on LUMI-C | KSPSolve Time (3D Poisson, 64 tasks/node), ex45.jl",
+    xlabel = "Number of MPI tasks, [number of nodes], (global grid size)",
+    ylabel = "Solution time (KSPSolve) (s)",
+    xscale = log2,
+    xticks = (ntasks, ["64\n[1]\n(513³)", "512\n[8]\n(1025³)", "4096\n[64]\n(2049³)", "32768\n[512]\n(4097³)"]),
+    xminorticksvisible = false,
+    yminorticksvisible = false,
+    limits = (nothing, nothing, 0, 75),
+)
+
+col_native   = :black
+col_petscjl  = :dodgerblue
+col_locallib = :tomato
+
+# Shaded min-max bands (only where data exists)
+# native: all 4 points
+band!(ax, ntasks, min_native,   max_native,   color = (col_native,   0.10))
+# petscjl: all 4 points
+band!(ax, ntasks, min_petscjl,  max_petscjl,  color = (col_petscjl,  0.15))
+# locallib: all 4 points
+band!(ax, ntasks[1:4], min_locallib[1:4], max_locallib[1:4], color = (col_locallib, 0.15))
+
+# Mean lines
+lines!(ax, ntasks,       mean_native,              color = col_native,   linewidth = 2.5, label = "Native C")
+lines!(ax, ntasks,       mean_petscjl,             color = col_petscjl,  linewidth = 2.5, label = "PETSc.jl (_jll)")
+lines!(ax, ntasks,  mean_locallib[1:4],       color = col_locallib, linewidth = 2.5, label = "Local lib")
+# Dashed extension hint for local lib pending
+scatter!(ax, [32768], [NaN], color = col_locallib, markersize = 14, marker = :diamond,
+         strokewidth = 1.5, strokecolor = :white)  # placeholder, won't render NaN
+
+# Individual run scatter points
+for (backend, col, mkr) in [(:native, col_native, :circle), (:petscjl, col_petscjl, :rect), (:locallib, col_locallib, :diamond)]
+    for n in ntasks
+        vals = data[backend][n]
+        isempty(vals) && continue
+        scatter!(ax, fill(n, length(vals)), vals, color = col, markersize = 9, marker = mkr)
+    end
+end
+
+# Mean markers (larger, white outline)
+scatter!(ax, ntasks,      mean_native,        color = col_native,   markersize = 14, marker = :circle,  strokewidth = 1.5, strokecolor = :white)
+scatter!(ax, ntasks,      mean_petscjl,       color = col_petscjl,  markersize = 14, marker = :rect,    strokewidth = 1.5, strokecolor = :white)
+scatter!(ax, ntasks,  mean_locallib,       color = col_locallib, markersize = 14, marker = :diamond, strokewidth = 1.5, strokecolor = :white)
+
+# Ideal flat line based on native baseline
+hlines!(ax, [mean_native[1]], color = :gray, linewidth = 1.5, linestyle = :dash, label = "Ideal (flat)")
+
+axislegend(ax, position = :lt, framevisible = true, labelsize = 13)
+
+Label(fig[2, 1],
+    "Shaded bands show min–max range across repeated runs. Large markers show mean.",
+    fontsize = 11, color = :gray50, tellwidth = false)
+
+save("weak_scaling.png", fig, px_per_unit = 2)
+println("Saved weak_scaling.png")