diff --git a/pyproject.toml b/pyproject.toml
index 90341f1..f215317 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -8,7 +8,7 @@ version = "1.0.0"
 description = "SAtom – Sensitivity Analysis using the Kolmogorov-Smirnov 2-sample test (TOM method)"
 readme = "README.md"
 license = {text = "BSD-2-Clause"}
-requires-python = ">=3.9"
+requires-python = ">=3.10"
 authors = [
     {name = "Torben Østergård", email = "tostergard@build.aau.dk"},
     {name = "Markus Schaffer", email = "msch@build.aau.dk"},
@@ -20,7 +20,6 @@ classifiers = [
     "License :: OSI Approved :: BSD License",
     "Operating System :: OS Independent",
     "Programming Language :: Python :: 3",
-    "Programming Language :: Python :: 3.9",
     "Programming Language :: Python :: 3.10",
     "Programming Language :: Python :: 3.11",
     "Programming Language :: Python :: 3.12",
@@ -33,9 +32,14 @@ dependencies = [
 ]
 
 [project.optional-dependencies]
+fast = [
+    "numba>=0.60.0",
+]
 dev = [
     "pytest>=7.0",
     "pytest-cov",
+    "pandas",
+    "openpyxl",
 ]
 
 [project.urls]
diff --git a/src/satom/_core.py b/src/satom/_core.py
index 874b6c1..0dac715 100644
--- a/src/satom/_core.py
+++ b/src/satom/_core.py
@@ -25,11 +25,122 @@
 import numpy as np
 from numpy.typing import ArrayLike
 
+try:
+    from numba import njit
+
+    _HAS_NUMBA = True
+except ModuleNotFoundError:  # pragma: no cover
+    _HAS_NUMBA = False
+
+    def njit(*args, **kwargs):  # noqa: D103 – stub
+        """Identity decorator when Numba is not installed."""
+        if args and callable(args[0]):
+            return args[0]
+        return lambda f: f
+
+
+# =========================================================================
+# Numba-accelerated KS row computation
+# =========================================================================
+
+
+@njit(cache=True)
+def _ks_row_mode01(mask, orderMat, uniformCDF, N, nIn):
+    """KS row for SScompare 0 or 1 (one-sample vs uniform).
+
+    Fuses indexing + cumsum + max into a single pass per column,
+    eliminating temporary array allocations.
+    """
+    ks_row = np.empty(nIn, dtype=np.float64)
+    n2 = 0.0
+    for i in range(N):
+        if mask[i]:
+            n2 += 1.0
+    for col in range(nIn):
+        running_sum = 0.0
+        max_d = 0.0
+        for row in range(N):
+            idx = orderMat[row, col]
+            if mask[idx]:
+                running_sum += 1.0
+            cdf2 = running_sum / n2
+            d = abs(uniformCDF[row] - cdf2)
+            if d > max_d:
+                max_d = d
+        ks_row[col] = max_d
+    return ks_row
+
+
+@njit(cache=True)
+def _ks_row_mode2(maskB, maskN, orderMat, N, nIn):
+    """KS row for SScompare 2 (behavioural vs non-behavioural).
+
+    Fuses both CDFs + max into a single pass per column.
+    """
+    ks_row = np.empty(nIn, dtype=np.float64)
+    n1 = 0.0
+    n2 = 0.0
+    for i in range(N):
+        if maskB[i]:
+            n1 += 1.0
+        if maskN[i]:
+            n2 += 1.0
+    for col in range(nIn):
+        sum1 = 0.0
+        sum2 = 0.0
+        max_d = 0.0
+        for row in range(N):
+            idx = orderMat[row, col]
+            if maskB[idx]:
+                sum1 += 1.0
+            if maskN[idx]:
+                sum2 += 1.0
+            d = abs(sum1 / n1 - sum2 / n2)
+            if d > max_d:
+                max_d = d
+        ks_row[col] = max_d
+    return ks_row
+
+
+@njit(cache=True)
+def _build_mask(randStart, YsortedIdx, N, nOut, subsetSize):
+    """Build the behavioural mask for one iteration (Numba-accelerated).
+
+    Parameters
+    ----------
+    randStart : 1-D int array of length nOut, 1-based start positions.
+    YsortedIdx : 2-D int array (nOut, N), 0-based original row indices
+                 sorted by each output.
+    N, nOut, subsetSize : int
+
+    Returns
+    -------
+    maskB : bool array of length N
+    remaining : int – number of True values in maskB
+    """
+    maskB = np.ones(N, dtype=np.bool_)
+    for idxY in range(nOut):
+        idxStart = randStart[idxY]  # 1-based
+        maskTmp = np.zeros(N, dtype=np.bool_)
+        for k in range(subsetSize + 1):
+            sortedPos = (idxStart - 1 + k) % N
+            origIdx = YsortedIdx[idxY, sortedPos]
+            maskTmp[origIdx] = True
+        for i in range(N):
+            if not maskTmp[i]:
+                maskB[i] = False
+    remaining = 0
+    for i in range(N):
+        if maskB[i]:
+            remaining += 1
+    return maskB, remaining
+
 
 # =========================================================================
 # Helper: Switch matplotlib backend
 # =========================================================================
 
+
 def _switch_backend(gui: str) -> None:
     """Force-switch the matplotlib backend, reimporting pyplot."""
     global plt
@@ -49,6 +160,7 @@ def SAtom(
     SScompare: int = 0,
     X_label: Sequence[str] | None = None,
     seed: int = 42,
+    use_numba: bool = True,
 ) -> tuple[np.ndarray, np.ndarray]:
     """TOM sensitivity analysis using the Kolmogorov-Smirnov 2-sample test.
 
@@ -81,6 +193,10 @@ def SAtom(
         Labels for each input variable in *X*.  Default: None (auto-generated).
     seed : int
         Random seed for reproducibility.  Default: 42
+    use_numba : bool
+        If ``True`` (default), use Numba JIT-compiled inner loop for
+        faster KS computation.  Set to ``False`` to use the pure-NumPy
+        fallback (useful for benchmarking or environments without Numba).
 
     Returns
     -------
@@ -136,6 +252,15 @@ def SAtom(
     ):
         raise ValueError("checkInterval must be a non-negative integer.")
 
+    # Resolve Numba availability
+    if use_numba and not _HAS_NUMBA:
+        warnings.warn(
+            "Numba is not installed – falling back to pure-NumPy mode. "
+            "Install numba (pip install satom[fast]) for ~5-10× speedup.",
+            stacklevel=2,
+        )
+        use_numba = False
+
     # Subset data (double precision throughout)
     X = X[:N].copy()
     Y = Y[:N].copy()
@@ -185,15 +310,12 @@ def SAtom(
 
     subsetSize = max(1, int(np.floor(0.5 ** (1.0 / nOut) * N)))
 
-    # MATLAB: indexVector = (1:N)' → 1-based
-    indexVector = np.arange(1, N + 1, dtype=np.float64)
-
     # Precompute sorted index arrays for each output
-    YtempSorted: list[np.ndarray] = []
+    # YsortedIdx[idxY, k] = 0-based original row at position k when
+    # output idxY is sorted ascending
+    YsortedIdx = np.empty((nOut, N), dtype=np.intp)
     for idxY in range(nOut):
-        augmented = np.column_stack([Y, indexVector])
-        order = np.argsort(augmented[:, idxY], kind="mergesort")
-        YtempSorted.append(augmented[order])
+        YsortedIdx[idxY, :] = np.argsort(Y[:, idxY], kind="mergesort")
 
     # Precompute sorted indices for each input column
     orderMat = np.empty((N, nIn), dtype=np.uint32)
@@ -205,9 +327,10 @@ def SAtom(
     if SScompare in (0, 1):
         uniformCDF = np.arange(1, N + 1, dtype=np.float64) / float(N)
 
-    # Cache-aware chunk size (8 bytes per float64)
-    cacheBytes = 32e6
-    chunkSize = max(1, int(np.floor(cacheBytes / (8 * N * 2))))
+    # Cache-aware chunk size for NumPy fallback (8 bytes per float64)
+    if not use_numba:
+        cacheBytes = 32e6
+        chunkSize = max(1, int(np.floor(cacheBytes / (8 * N * 2))))
 
     # MATLAB: randStarts = ceil(rand(J*10, nOut) * N) → 1-based [1..N]
     randStarts = np.ceil(rng.rand(J * 10, nOut) * N).astype(np.intp)
@@ -228,9 +351,7 @@ def SAtom(
         originalBackend = matplotlib.get_backend()
         _switch_backend("QtAgg")
         plt.ion()
-        convFig, convAx = plt.subplots(
-            1, 1, num="TOM: Interactive Convergence Check"
-        )
+        convFig, convAx = plt.subplots(1, 1, num="TOM: Interactive Convergence Check")
         nextCheck = checkInterval
 
     # ------------------------------------------------------------------
@@ -245,66 +366,74 @@ def SAtom(
             randStarts = np.concatenate([randStarts, newStarts], axis=0)
 
         # Build behavioural mask
-        maskB = np.ones(N, dtype=bool)
-        for idxY in range(nOut):
-            idxStart = randStarts[randIdx, idxY]
-
-            # MATLAB colon is inclusive → length = subsetSize + 1
-            idxRange = np.mod(
-                np.arange(idxStart - 1, idxStart - 1 + subsetSize + 1), N
-            ) + 1
-
-            # Retrieve 1-based original row indices, convert to 0-based
-            idxBBefore = YtempSorted[idxY][idxRange - 1, nOut]
-
-            maskTmp = np.zeros(N, dtype=bool)
-            maskTmp[(idxBBefore - 1).astype(np.intp)] = True
-            maskB &= maskTmp
+        if use_numba:
+            maskB, remaining = _build_mask(
+                randStarts[randIdx], YsortedIdx, N, nOut, subsetSize
+            )
+        else:
+            maskB = np.ones(N, dtype=bool)
+            for idxY in range(nOut):
+                idxStart = randStarts[randIdx, idxY]
+                sortedPositions = np.mod(
+                    np.arange(idxStart - 1, idxStart - 1 + subsetSize + 1), N
+                )
+                origIndices = YsortedIdx[idxY, sortedPositions]
+                maskTmp = np.zeros(N, dtype=bool)
+                maskTmp[origIndices] = True
+                maskB &= maskTmp
+            remaining = int(maskB.sum())
 
         randIdx += 1
 
-        remaining = int(maskB.sum())
         if remaining == 0 or remaining == N:
             continue
 
-        # --- Compute KS row (with cache-aware chunking) -------------------
-        if SScompare == 0:  # Non-behavioural vs. all
-            maskN = ~maskB
-            n2 = float(N - remaining)
-            for colStart in range(0, nIn, chunkSize):
-                colEnd = min(colStart + chunkSize, nIn)
-                cols = slice(colStart, colEnd)
-                m2_chunk = maskN[orderMat[:, cols]].astype(np.float64)
-                cdf2_chunk = np.cumsum(m2_chunk, axis=0) / n2
-                KS[rep, cols] = np.max(
-                    np.abs(uniformCDF[:, np.newaxis] - cdf2_chunk), axis=0
-                )
-
-        elif SScompare == 1:  # Behavioural vs. all
-            n2 = float(remaining)
-            for colStart in range(0, nIn, chunkSize):
-                colEnd = min(colStart + chunkSize, nIn)
-                cols = slice(colStart, colEnd)
-                m2_chunk = maskB[orderMat[:, cols]].astype(np.float64)
-                cdf2_chunk = np.cumsum(m2_chunk, axis=0) / n2
-                KS[rep, cols] = np.max(
-                    np.abs(uniformCDF[:, np.newaxis] - cdf2_chunk), axis=0
-                )
-
-        else:  # SScompare == 2: Non-behavioural vs. behavioural
-            maskN = ~maskB
-            n1 = float(remaining)
-            n2 = float(N - remaining)
-            for colStart in range(0, nIn, chunkSize):
-                colEnd = min(colStart + chunkSize, nIn)
-                cols = slice(colStart, colEnd)
-                m1_chunk = maskB[orderMat[:, cols]].astype(np.float64)
-                m2_chunk = maskN[orderMat[:, cols]].astype(np.float64)
-                cdf1_chunk = np.cumsum(m1_chunk, axis=0) / n1
-                cdf2_chunk = np.cumsum(m2_chunk, axis=0) / n2
-                KS[rep, cols] = np.max(
-                    np.abs(cdf1_chunk - cdf2_chunk), axis=0
-                )
+        # --- Compute KS row -----------------------------------------------
+        if use_numba:
+            # Numba-accelerated: single fused pass per column
+            if SScompare == 0:  # Non-behavioural vs. all
+                maskN = ~maskB
+                KS[rep, :] = _ks_row_mode01(maskN, orderMat, uniformCDF, N, nIn)
+            elif SScompare == 1:  # Behavioural vs. all
+                KS[rep, :] = _ks_row_mode01(maskB, orderMat, uniformCDF, N, nIn)
+            else:  # SScompare == 2
+                maskN = ~maskB
+                KS[rep, :] = _ks_row_mode2(maskB, maskN, orderMat, N, nIn)
+        else:
+            # Pure-NumPy fallback with cache-aware chunking
+            if SScompare == 0:  # Non-behavioural vs. all
+                maskN = ~maskB
+                n2 = float(N - remaining)
+                for colStart in range(0, nIn, chunkSize):
+                    colEnd = min(colStart + chunkSize, nIn)
+                    cols = slice(colStart, colEnd)
+                    m2_chunk = maskN[orderMat[:, cols]]
+                    cdf2_chunk = np.cumsum(m2_chunk, axis=0, dtype=np.float64) / n2
+                    KS[rep, cols] = np.max(
+                        np.abs(uniformCDF[:, np.newaxis] - cdf2_chunk), axis=0
+                    )
+            elif SScompare == 1:  # Behavioural vs. all
+                n2 = float(remaining)
+                for colStart in range(0, nIn, chunkSize):
+                    colEnd = min(colStart + chunkSize, nIn)
+                    cols = slice(colStart, colEnd)
+                    m2_chunk = maskB[orderMat[:, cols]]
+                    cdf2_chunk = np.cumsum(m2_chunk, axis=0, dtype=np.float64) / n2
+                    KS[rep, cols] = np.max(
+                        np.abs(uniformCDF[:, np.newaxis] - cdf2_chunk), axis=0
+                    )
+            else:  # SScompare == 2
+                maskN = ~maskB
+                n1 = float(remaining)
+                n2 = float(N - remaining)
+                for colStart in range(0, nIn, chunkSize):
+                    colEnd = min(colStart + chunkSize, nIn)
+                    cols = slice(colStart, colEnd)
+                    m1_chunk = maskB[orderMat[:, cols]]
+                    m2_chunk = maskN[orderMat[:, cols]]
+                    cdf1_chunk = np.cumsum(m1_chunk, axis=0, dtype=np.float64) / n1
+                    cdf2_chunk = np.cumsum(m2_chunk, axis=0, dtype=np.float64) / n2
+                    KS[rep, cols] = np.max(np.abs(cdf1_chunk - cdf2_chunk), axis=0)
 
         rep += 1
 
@@ -345,9 +474,10 @@ def SAtom(
     # Trim to completed iterations
     KS = KS[:actualJ]
 
-    KS2_J_mean = np.cumsum(KS, axis=0) / np.arange(
-        1, actualJ + 1, dtype=np.float64
-    )[:, np.newaxis]
+    KS2_J_mean = (
+        np.cumsum(KS, axis=0)
+        / np.arange(1, actualJ + 1, dtype=np.float64)[:, np.newaxis]
+    )
 
     # Output (already float64)
     KS2_mean = np.mean(KS, axis=0)
@@ -379,9 +509,7 @@ def SAtom(
         ax_conv.set_ylabel(r"Mean of $D_{ij}$")
         if actualJ > 10:
             ax_conv.set_xlim(10, actualJ)
-        ax_conv.legend(
-            loc="center left", bbox_to_anchor=(1.0, 0.5), fontsize="small"
-        )
+        ax_conv.legend(loc="center left", bbox_to_anchor=(1.0, 0.5), fontsize="small")
         fig_conv.tight_layout()
         plt.show()
 
@@ -392,6 +520,7 @@ def SAtom(
 # Helper: Interactive convergence checkpoint
 # =========================================================================
 
+
 def _convergence_checkpoint(
     KS: np.ndarray,
     currentJ: int,
@@ -402,9 +531,10 @@ def _convergence_checkpoint(
     """Display running convergence plot and ask user to continue or stop."""
 
     KS_partial = KS[:currentJ]
-    cumMean = np.cumsum(KS_partial, axis=0) / np.arange(
-        1, currentJ + 1, dtype=np.float64
-    )[:, np.newaxis]
+    cumMean = (
+        np.cumsum(KS_partial, axis=0)
+        / np.arange(1, currentJ + 1, dtype=np.float64)[:, np.newaxis]
+    )
 
     currentMean = np.mean(KS_partial, axis=0)
     rank = np.argsort(currentMean)[::-1]