diff --git a/README.md b/README.md index b36d849..4fc9789 100644 --- a/README.md +++ b/README.md @@ -295,7 +295,7 @@ positive int = deny with errno, `"audit"`/`-2` = allow + flag. **Event fields:** `syscall`, `category` (file/network/process/memory), `pid`, `parent_pid`, `host`, `port`, `argv`, `denied`. -> **TOCTOU NOTE ** Per `seccomp_unotify(2)`, the kernel +> **TOCTOU NOTE** Per `seccomp_unotify(2)`, the kernel > re-reads user-memory pointers after `Continue`. Sandlock handles this > in two places: > @@ -303,14 +303,15 @@ positive int = deny with errno, `"audit"`/`-2` = allow + flag. > belongs in static Landlock rules (`fs_readable` / `fs_writable` / > `fs_denied`) — kernel-enforced and TOCTOU-immune. Use > `ctx.deny_path()` for runtime additions. -> - **`event.argv` is exposed and TOCTOU-safe.** Before returning -> `Continue` for an `execve`, the supervisor `PTRACE_SEIZE` + -> `PTRACE_INTERRUPT`s every sibling thread of the calling tid so the -> kernel's re-read happens with no other writer running. The pause -> has no observable cost: `execve`'s `de_thread` step kills sibling -> threads anyway. If the freeze cannot be established (e.g., YAMA -> blocks ptrace), the execve is denied with `EPERM` — the safety -> invariant is never silently relaxed. +> - **`event.argv` is exposed and TOCTOU-safe.** Before exposing +> `argv` to `policy_fn` or returning `Continue` for an +> `execve`, the supervisor freezes every task in `ProcessIndex`, +> including peer processes that may alias argv through shared memory. +> With `policy_fn` active, fork-like syscalls are traced for one +> ptrace creation event, so children are registered in `ProcessIndex` +> before they can run user code. If the freeze or creation tracking +> cannot be established (e.g., YAMA blocks ptrace), the syscall is +> denied with `EPERM`; the safety invariant is never silently relaxed. 
**Context methods:** - `ctx.restrict_network(ips)` / `ctx.grant_network(ips)` — network control @@ -455,8 +456,8 @@ what a command would do before committing. ### COW Fork & Map-Reduce Initialize expensive state once, then fork COW clones that share memory. -Each fork uses raw `fork(2)` (bypasses seccomp notification) for minimal -overhead. 1000 clones in ~530ms, ~1,900 forks/sec. +Each clone uses raw `fork(2)` with shared copy-on-write pages. 1000 +clones in ~530ms, ~1,900 forks/sec. Each clone's stdout is captured via its own pipe. `reduce()` reads all pipes and feeds combined output to a reducer's stdin — fully pipe-based diff --git a/crates/sandlock-core/src/arch.rs b/crates/sandlock-core/src/arch.rs index 6f1653f..14bd7f1 100644 --- a/crates/sandlock-core/src/arch.rs +++ b/crates/sandlock-core/src/arch.rs @@ -29,6 +29,17 @@ mod imp { pub const SYS_IOPERM: Option = Some(libc::SYS_ioperm); pub const SYS_IOPL: Option = Some(libc::SYS_iopl); pub const SYS_TIME: Option = Some(libc::SYS_time); + + /// Every syscall the kernel will dispatch through `handle_fork`. + /// Single source of truth for callers that enumerate fork-class + /// syscalls (BPF notif registration in `seccomp::dispatch`, + /// classification in `resource::is_process_creation_notif`). + pub const FORK_LIKE_SYSCALLS: &[i64] = &[ + libc::SYS_clone, + libc::SYS_clone3, + libc::SYS_vfork, + libc::SYS_fork, + ]; } #[cfg(target_arch = "aarch64")] @@ -60,6 +71,13 @@ mod imp { pub const SYS_IOPERM: Option = None; pub const SYS_IOPL: Option = None; pub const SYS_TIME: Option = None; + + /// Every syscall the kernel will dispatch through `handle_fork`. + /// aarch64 has no `fork`/`vfork` (glibc emulates via `clone`). 
+ pub const FORK_LIKE_SYSCALLS: &[i64] = &[ + libc::SYS_clone, + libc::SYS_clone3, + ]; } pub use imp::*; diff --git a/crates/sandlock-core/src/context.rs b/crates/sandlock-core/src/context.rs index 67627d7..0a3b4ed 100644 --- a/crates/sandlock-core/src/context.rs +++ b/crates/sandlock-core/src/context.rs @@ -240,6 +240,15 @@ pub fn notif_syscalls(policy: &Policy) -> Vec { libc::SYS_waitid as u32, ]; arch::push_optional_syscall(&mut nrs, arch::SYS_VFORK); + // Bare fork(2) carries none of the namespace/process-limit risk of + // clone/clone3 and was historically left out of the BPF filter so + // hot fork-loops (COW map-reduce) bypass the supervisor entirely. + // It only needs interception when policy_fn is active, so the + // supervisor can register the new child via ptrace fork events + // before it can run user code (argv-safety invariant). + if policy.policy_fn.is_some() { + arch::push_optional_syscall(&mut nrs, arch::SYS_FORK); + } if policy.max_memory.is_some() { nrs.push(libc::SYS_mmap as u32); @@ -949,9 +958,21 @@ pub(crate) fn confine_child(args: ChildSpawnArgs<'_>) -> ! { let mut notif = notif_syscalls(policy); if !extra_syscalls.is_empty() { notif.extend_from_slice(extra_syscalls); - notif.sort_unstable(); - notif.dedup(); } + // Argv-safety gate (companion to the policy_fn case in + // notif_syscalls): an extra handler bound to execve/execveat + // can call `read_child_mem` to inspect argv, so the supervisor + // must register newly forked children before they can run user + // code — same invariant policy_fn relies on. Bare fork(2) + // therefore needs to be intercepted here too. 
+ let exec_extra = extra_syscalls.iter().any(|&n| { + n == libc::SYS_execve as u32 || n == libc::SYS_execveat as u32 + }); + if exec_extra { + arch::push_optional_syscall(&mut notif, arch::SYS_FORK); + } + notif.sort_unstable(); + notif.dedup(); let filter = match bpf::assemble_filter(¬if, &deny, &args) { Ok(f) => f, Err(e) => fail!(format!("seccomp assemble: {}", e)), @@ -1093,6 +1114,24 @@ mod tests { if let Some(vfork) = arch::SYS_VFORK { assert!(nrs.contains(&(vfork as u32))); } + // Bare fork(2) is intercepted only when policy_fn is active — + // see notif_syscalls. The default policy has no policy_fn, so + // fork stays out of the BPF filter and hot fork-loops keep + // bypassing the supervisor. + if let Some(fork) = arch::SYS_FORK { + assert!(!nrs.contains(&(fork as u32))); + } + } + + #[test] + fn test_notif_syscalls_fork_gated_on_policy_fn() { + let Some(fork) = arch::SYS_FORK else { return }; + let policy = Policy::builder() + .policy_fn(|_event, _ctx| crate::policy_fn::Verdict::Allow) + .build() + .unwrap(); + let nrs = notif_syscalls(&policy); + assert!(nrs.contains(&(fork as u32))); } #[test] diff --git a/crates/sandlock-core/src/fork.rs b/crates/sandlock-core/src/fork.rs index 14ca53f..4cb4d09 100644 --- a/crates/sandlock-core/src/fork.rs +++ b/crates/sandlock-core/src/fork.rs @@ -5,17 +5,17 @@ //! COW clones that share memory pages with the template. Each clone //! receives `CLONE_ID=0..N-1` and execs `work_cmd`. //! -//! Uses raw `fork()` syscall (NR 57 on x86_64) to bypass seccomp -//! notification — the BPF filter only intercepts `clone`/`clone3`. +//! Uses raw `fork()` syscall (NR 57 on x86_64). The supervisor +//! intercepts fork-like syscalls for process accounting and, when +//! `policy_fn` is active, child registration before user code runs. 
use std::os::unix::io::RawFd; // ============================================================ -// Raw fork (bypasses seccomp clone interception) +// Raw fork // ============================================================ /// Raw fork() syscall — NR 57 on x86_64. -/// Unlike clone/clone3, this is NOT intercepted by the seccomp notif filter. fn raw_fork() -> std::io::Result { #[cfg(target_arch = "x86_64")] const NR_FORK: i64 = 57; diff --git a/crates/sandlock-core/src/policy_fn.rs b/crates/sandlock-core/src/policy_fn.rs index 2e5f0aa..b7a2ce1 100644 --- a/crates/sandlock-core/src/policy_fn.rs +++ b/crates/sandlock-core/src/policy_fn.rs @@ -63,9 +63,12 @@ pub enum SyscallCategory { /// (`fs_read` / `fs_write` / `fs_deny`); see issue #27. /// /// `argv` *is* exposed for `execve`/`execveat` and is TOCTOU-safe by -/// construction: before the supervisor returns `Continue` for an -/// execve, it `PTRACE_SEIZE`+`PTRACE_INTERRUPT`s every task in the -/// sandbox — both sibling threads of the calling tid (same TGID, share +/// construction: with `policy_fn` active, fork-like syscalls are traced +/// for one ptrace creation event, so children are registered in +/// `ProcessIndex` before they can run user code. Before the supervisor +/// exposes `argv` to `policy_fn` or returns `Continue` for an execve, it +/// then `PTRACE_SEIZE`+`PTRACE_INTERRUPT`s every task that could write +/// the memory — both sibling threads of the calling tid (same TGID, share /// `mm_struct`) and peer threads in other TGIDs that may alias argv /// pages via `MAP_SHARED` mappings or share `mm_struct` via /// `clone(CLONE_VM)`. The kernel's post-Continue re-read therefore @@ -94,8 +97,11 @@ pub struct SyscallEvent { /// Size argument (for mmap, brk). pub size: Option, /// Command arguments for execve/execveat. TOCTOU-safe: every task - /// in the sandbox (caller's siblings and peer processes) is frozen - /// before the kernel re-reads argv from child memory. 
+ /// in `ProcessIndex` (caller's siblings and peer processes) is + /// frozen before argv is read for this event and before the kernel + /// re-reads argv from child memory; fork-like syscalls register + /// children before they can run user code while `policy_fn` is + /// active. pub argv: Option>, /// Whether the supervisor denied this syscall. pub denied: bool, diff --git a/crates/sandlock-core/src/resource.rs b/crates/sandlock-core/src/resource.rs index d4277bf..9d96248 100644 --- a/crates/sandlock-core/src/resource.rs +++ b/crates/sandlock-core/src/resource.rs @@ -1,21 +1,25 @@ // Resource limit handlers — memory and process limit enforcement. // // Continue safety (issue #27): every `Continue` in this module is safe. -// All decisions here are on scalar register args (clone flags, mmap len, -// brk address, etc.) which are copied into the seccomp_notif struct at -// notification time — they are *not* pointers into racy user memory. -// The kernel's re-read of the syscall args after Continue comes from the -// suspended calling thread's saved registers, which a sibling thread -// cannot mutate. So even though we return Continue after taking a -// security-relevant action (e.g., counting an allocation against the -// memory limit), there is no TOCTOU substitution window for the values -// we examined. +// Most decisions here are on scalar register args (clone flags, mmap +// len, brk address, etc.) which are copied into the seccomp_notif +// struct at notification time — they are *not* pointers into racy user +// memory. The one exception is `clone3`, whose flags live in a +// `clone_args` struct that the supervisor reads from child memory; see +// `clone_flags` for the TOCTOU rationale. The reader is used only for +// resource accounting, not for any kernel-enforced security boundary. +// The kernel's re-read of the syscall args after Continue comes from +// the suspended calling thread's saved registers, which a sibling +// thread cannot mutate. 
+use std::io; +use std::mem; +use std::os::unix::io::RawFd; use std::sync::Arc; use tokio::sync::Mutex; use crate::seccomp::ctx::SupervisorCtx; -use crate::seccomp::notif::{spawn_pid_watcher, NotifAction, NotifPolicy}; +use crate::seccomp::notif::{read_child_mem, spawn_pid_watcher, NotifAction, NotifPolicy}; use crate::seccomp::state::ResourceState; use crate::sys::structs::{ SeccompNotif, CLONE_NS_FLAGS, EAGAIN, EPERM, @@ -27,35 +31,77 @@ const CLONE_THREAD: u64 = 0x0001_0000; /// MAP_ANONYMOUS flag — only anonymous mappings count toward memory limit. const MAP_ANONYMOUS: u64 = 0x20; +/// Effective clone flags for a fork-like notification. +/// +/// `clone(2)` exposes flags directly in `args[0]`. `clone3(2)` instead +/// passes a pointer to a `clone_args` struct in `args[0]` (size in +/// `args[1]`); its `flags` field is the first u64. `fork`/`vfork` +/// have no flags. Anything else returns `None`. +/// +/// TOCTOU note: the `clone3` read is from racy user memory — a sibling +/// thread could mutate the struct between this read and the kernel's +/// re-read after `Continue`. Callers use this only for resource +/// accounting (`proc_count`, fork-event tracking gate), never as a +/// security boundary, so a misread can throttle incorrectly but cannot +/// bypass any kernel-enforced deny. +fn clone_flags(notif: &SeccompNotif, notif_fd: RawFd) -> Option { + let args = ¬if.data.args; + let nr = notif.data.nr as i64; + if nr == libc::SYS_clone { + return Some(args[0]); + } + if nr == libc::SYS_clone3 { + let ptr = args[0]; + let size = args[1] as usize; + if ptr == 0 || size < 8 { + return None; + } + let buf = read_child_mem(notif_fd, notif.id, notif.pid, ptr, 8).ok()?; + let arr: [u8; 8] = buf.as_slice().try_into().ok()?; + return Some(u64::from_ne_bytes(arr)); + } + if Some(nr) == crate::arch::SYS_VFORK || Some(nr) == crate::arch::SYS_FORK { + return Some(0); + } + None +} + +/// True when the fork-like notification creates a thread (CLONE_THREAD +/// set), i.e. 
it should not bump the process count. Returns false for +/// non-fork notifs and for clone3 calls whose `clone_args` cannot be +/// read (fail-safe: count as a process rather than silently uncount). +fn is_thread_create(notif: &SeccompNotif, notif_fd: RawFd) -> bool { + matches!(clone_flags(notif, notif_fd), Some(f) if f & CLONE_THREAD != 0) +} + /// Handle fork/clone/vfork notifications. /// -/// Enforces namespace creation ban and process limits, registers the -/// new child in `ProcessIndex` (with an owned pidfd), and spawns a -/// per-child pidfd watcher that runs unified cleanup on exit. +/// Enforces namespace creation ban and process limits. /// /// Note: `notif.pid` here is the *parent* (the task issuing -/// clone/fork). The kernel hasn't run the syscall yet, so we don't -/// know the child's pid. The child is discovered and registered later, -/// on its first own seccomp notification, via `register_child_if_new`. +/// fork/clone/vfork). The kernel hasn't run the syscall yet, so we don't +/// know the child's pid yet. When `policy_fn` is active, the supervisor +/// wraps the eventual `Continue` in one-shot ptrace fork-event tracking +/// and registers the new child before it can run user code. pub(crate) async fn handle_fork( notif: &SeccompNotif, + notif_fd: RawFd, resource: &Arc>, _policy: &NotifPolicy, ) -> NotifAction { let nr = notif.data.nr as i64; let args = ¬if.data.args; - // For clone/vfork: check namespace flags in args[0]. - if nr == libc::SYS_clone || Some(nr) == crate::arch::SYS_VFORK { - if nr == libc::SYS_clone && (args[0] & CLONE_NS_FLAGS) != 0 { - return NotifAction::Errno(EPERM); - } - // For clone: if CLONE_THREAD is set, it's a thread — don't count, allow. - if nr == libc::SYS_clone && (args[0] & CLONE_THREAD) != 0 { - return NotifAction::Continue; - } + // Namespace flags are denied for clone (clone3's are caught by the + // BPF arg filter; vfork takes no flags). 
+ if nr == libc::SYS_clone && (args[0] & CLONE_NS_FLAGS) != 0 { + return NotifAction::Errno(EPERM); + } + + // Threads share their parent's process slot — don't count, allow. + if is_thread_create(notif, notif_fd) { + return NotifAction::Continue; } - // For clone3: BPF arg filter handles dangerous cases; proceed to limit check. let mut rs = resource.lock().await; @@ -75,10 +121,16 @@ pub(crate) async fn handle_fork( } /// If `notif.pid` is not yet tracked in the ProcessIndex, register -/// it: open a pidfd, record the canonical PidKey, and spawn the exit -/// watcher. Called from the supervisor's notification dispatcher -/// before per-syscall handlers run, so handlers can rely on -/// `ProcessIndex::key_for(notif.pid)` returning a fresh PidKey. +/// per-process supervisor state for it: open a pidfd, record the +/// canonical PidKey, and spawn the exit watcher. Called from the +/// supervisor's notification dispatcher before per-syscall handlers +/// run, so handlers can rely on `ProcessIndex::key_for(notif.pid)` +/// returning a fresh PidKey. +/// +/// With `policy_fn` active, fork-like syscalls additionally register +/// new child processes at creation time via ptrace fork events, before +/// the child can run user code. Without `policy_fn`, lazy registration +/// is enough because no argv-based security decision is exposed. /// /// The fast path is a single `RwLock` read: if the pid is already /// tracked, we trust the entry. PID-identity correctness comes from @@ -87,23 +139,352 @@ pub(crate) async fn handle_fork( /// parent has waited (which we observe), so a stale entry has no /// window in which to be hit. We deliberately do *not* re-stat /// /proc//stat on every notification. 
-pub(crate) async fn register_child_if_new(ctx: &Arc, pid: i32) { +pub(crate) fn register_pid_if_new(ctx: &Arc, pid: i32) -> bool { if ctx.processes.contains(pid) { - return; + return true; } let pidfd = match crate::sys::syscall::pidfd_open(pid as u32, 0) { Ok(fd) => fd, - Err(_) => return, // old kernel or process gone — GC backstop will clean up + Err(_) => { + // clone3 can create CLONE_THREAD tasks. Linux 6.9 added + // PIDFD_THREAD so pidfd_open works for non-leader TIDs too. + const PIDFD_THREAD: u32 = libc::O_EXCL as u32; + match crate::sys::syscall::pidfd_open(pid as u32, PIDFD_THREAD) { + Ok(fd) => fd, + Err(_) => { + if matches!(read_tgid_of_tid(pid), Some(tgid) if ctx.processes.contains(tgid)) { + return true; + } + return false; // old kernel or process gone + } + } + } }; let key = match ctx.processes.register(pid) { Some(k) => k, - None => return, // process exited between pidfd_open and stat read + None => return false, // process exited between pidfd_open and stat read }; // Hand the pidfd to the watcher; it owns the fd's lifetime now. spawn_pid_watcher(Arc::clone(ctx), key, pidfd); + true +} + +fn read_tgid_of_tid(tid: i32) -> Option { + let status = std::fs::read_to_string(format!("/proc/{}/status", tid)).ok()?; + for line in status.lines() { + if let Some(rest) = line.strip_prefix("Tgid:") { + return rest.trim().parse().ok(); + } + } + None +} + +pub(crate) async fn register_child_if_new(ctx: &Arc, pid: i32) { + let _ = register_pid_if_new(ctx, pid); +} + +/// One-shot ptrace attachment around a fork-like syscall. RAII guard: +/// on drop, detaches the caller so the supervisor cannot leak a ptrace +/// relationship if a code path between `prepare_*` and `finish_*` +/// panics or returns early. Functions that complete the tracking and +/// detach explicitly should still hand the trace to a consuming +/// function (or let it fall out of scope) — duplicate `PTRACE_DETACH` +/// is harmless (returns ESRCH and is ignored). 
+pub(crate) struct ProcessCreationTrace { + caller_tid: i32, +} + +impl Drop for ProcessCreationTrace { + fn drop(&mut self) { + detach_traced(self.caller_tid); + } +} + +fn is_process_creation_notif(notif: &SeccompNotif) -> bool { + crate::arch::FORK_LIKE_SYSCALLS.contains(&(notif.data.nr as i64)) +} + +/// True when `handle_fork` would have incremented the concurrent +/// process count for this notification if it returned `Continue`. +/// +/// Mirrors the thread-vs-process decision in `handle_fork`: a clone or +/// clone3 with `CLONE_THREAD` does not bump the count, so a later +/// rollback would be wrong. The clone3 flag check involves a racy read +/// from child memory — see `clone_flags`. +pub(crate) fn fork_counted_on_continue(notif: &SeccompNotif, notif_fd: RawFd) -> bool { + is_process_creation_notif(notif) && !is_thread_create(notif, notif_fd) +} + +/// True when this notification can create a new task that must be in +/// `ProcessIndex` before it can race a later execve argv decision. +pub(crate) fn requires_process_creation_tracking( + notif: &SeccompNotif, + notif_fd: RawFd, + policy: &NotifPolicy, +) -> bool { + policy.argv_safety_required && fork_counted_on_continue(notif, notif_fd) +} + +/// Arm ptrace fork-event tracking on the syscall's calling task. +/// +/// The caller is stopped in seccomp user notification when this runs. +/// After the supervisor sends `Continue`, the kernel executes the +/// fork-like syscall and reports `PTRACE_EVENT_{FORK,VFORK,CLONE}`; +/// the new child is born traced/stopped, so we can register it before +/// detaching either task. +/// +/// Runs the blocking `waitpid` on a tokio blocking-pool thread so the +/// notification handler's worker is not stalled if the wait stretches. 
+pub(crate) async fn prepare_process_creation_tracking( + caller_tid: i32, +) -> io::Result { + tokio::task::spawn_blocking(move || prepare_process_creation_tracking_blocking(caller_tid)) + .await + .map_err(|e| { + io::Error::new(io::ErrorKind::Other, format!("spawn_blocking join: {e}")) + })? +} + +fn prepare_process_creation_tracking_blocking( + caller_tid: i32, +) -> io::Result { + let opts = (libc::PTRACE_O_TRACEFORK + | libc::PTRACE_O_TRACEVFORK + | libc::PTRACE_O_TRACECLONE + | libc::PTRACE_O_TRACESYSGOOD) as libc::c_ulong; + let ret = unsafe { + libc::ptrace( + libc::PTRACE_SEIZE as libc::c_uint, + caller_tid, + 0, + opts, + ) + }; + if ret < 0 { + return Err(io::Error::last_os_error()); + } + + // Arm the RAII guard the moment SEIZE succeeds: any early return + // from here to the end of this function detaches via Drop. + let trace = ProcessCreationTrace { caller_tid }; + + let ret = unsafe { + libc::ptrace(libc::PTRACE_INTERRUPT as libc::c_uint, caller_tid, 0, 0) + }; + if ret < 0 { + return Err(io::Error::last_os_error()); + } + wait_for_ptrace_stop(caller_tid)?; + + // Arm a syscall-exit stop as a fallback. A successful fork-like + // syscall reports PTRACE_EVENT_{FORK,VFORK,CLONE}; a failed one has + // no child event, but it still reaches syscall-exit so the + // supervisor will not block forever waiting for a child that was + // never created. 
+ let ret = unsafe { + libc::ptrace(libc::PTRACE_SYSCALL as libc::c_uint, caller_tid, 0, 0) + }; + if ret < 0 { + return Err(io::Error::last_os_error()); + } + + Ok(trace) +} + +fn detach_traced(tid: i32) { + let _ = unsafe { libc::ptrace(libc::PTRACE_DETACH, tid, 0, 0) }; +} + +fn wait_for_ptrace_stop(tid: i32) -> io::Result { + let mut status: libc::c_int = 0; + loop { + let ret = unsafe { libc::waitpid(tid, &mut status, libc::__WALL) }; + if ret < 0 { + let err = io::Error::last_os_error(); + if err.raw_os_error() == Some(libc::EINTR) { + continue; + } + return Err(err); + } + break; + } + + if !libc::WIFSTOPPED(status) { + return Err(io::Error::new( + io::ErrorKind::Other, + format!("unexpected ptrace wait status: {status:#x}"), + )); + } + Ok(status) +} + +fn syscall_stop_kind(tid: i32) -> io::Result { + let mut info: libc::ptrace_syscall_info = unsafe { mem::zeroed() }; + let ret = unsafe { + libc::ptrace( + libc::PTRACE_GET_SYSCALL_INFO as libc::c_uint, + tid, + mem::size_of::(), + &mut info, + ) + }; + if ret < 0 { + return Err(io::Error::last_os_error()); + } + Ok(info.op) +} + +#[cfg(test)] +static CHILD_REGISTERED_HOOK: std::sync::Mutex< + Option>, +> = std::sync::Mutex::new(None); + +#[cfg(test)] +fn child_registered_for_test(child_pid: i32) { + if let Ok(guard) = CHILD_REGISTERED_HOOK.lock() { + if let Some(hook) = guard.as_ref() { + hook(child_pid); + } + } +} + +/// Complete one-shot process-creation tracking after `Continue`. +/// +/// Runs the blocking `waitpid` on a tokio blocking-pool thread so the +/// notification handler's worker is not stalled. +pub(crate) async fn finish_process_creation_tracking( + ctx: &Arc, + trace: ProcessCreationTrace, +) -> io::Result { + let ctx = Arc::clone(ctx); + tokio::task::spawn_blocking(move || finish_process_creation_tracking_blocking(&ctx, trace)) + .await + .map_err(|e| { + io::Error::new(io::ErrorKind::Other, format!("spawn_blocking join: {e}")) + })? 
+} + +fn finish_process_creation_tracking_blocking( + ctx: &Arc, + trace: ProcessCreationTrace, +) -> io::Result { + // Every early return below relies on `trace`'s Drop to detach the + // caller. The success path hands `trace` off to + // `finish_process_creation_event`, which keeps the same guarantee. + loop { + let status = wait_for_ptrace_stop(trace.caller_tid)?; + + let event = (status >> 16) & 0xffff; + let is_fork_event = event == libc::PTRACE_EVENT_FORK + || event == libc::PTRACE_EVENT_VFORK + || event == libc::PTRACE_EVENT_CLONE; + if is_fork_event { + return finish_process_creation_event(ctx, trace); + } + + let stopsig = libc::WSTOPSIG(status); + if event == 0 && stopsig == (libc::SIGTRAP | 0x80) { + let op = syscall_stop_kind(trace.caller_tid)?; + match op { + libc::PTRACE_SYSCALL_INFO_ENTRY => { + let ret = unsafe { + libc::ptrace( + libc::PTRACE_SYSCALL as libc::c_uint, + trace.caller_tid, + 0, + 0, + ) + }; + if ret < 0 { + return Err(io::Error::last_os_error()); + } + continue; + } + libc::PTRACE_SYSCALL_INFO_EXIT => { + return Ok(false); + } + op => { + return Err(io::Error::new( + io::ErrorKind::Other, + format!("unexpected ptrace syscall stop kind: {op}"), + )); + } + } + } + + return Err(io::Error::new( + io::ErrorKind::Other, + format!("unexpected ptrace event: {event}"), + )); + } +} + +fn finish_process_creation_event( + ctx: &Arc, + trace: ProcessCreationTrace, +) -> io::Result { + // `trace` detaches the caller on drop; the explicit child-side + // detaches stay manual since the child is not held by the guard. 
+ let mut child_pid: libc::c_ulong = 0; + let ret = unsafe { + libc::ptrace( + libc::PTRACE_GETEVENTMSG as libc::c_uint, + trace.caller_tid, + 0, + &mut child_pid, + ) + }; + if ret < 0 { + return Err(io::Error::last_os_error()); + } + + let child_pid = child_pid as i32; + if !register_pid_if_new(ctx, child_pid) { + let _ = unsafe { libc::kill(child_pid, libc::SIGKILL) }; + detach_traced(child_pid); + return Err(io::Error::new( + io::ErrorKind::Other, + format!("failed to register new child pid {child_pid}"), + )); + } + #[cfg(test)] + child_registered_for_test(child_pid); + + // Wait for the child's birth-traced ptrace-stop, then detach so it + // can run. Result ignored: the child may have already proceeded + // (PTRACE_O_TRACEFORK leaves it stopped, but a racing exit is + // possible) — detach is harmless either way. + let _ = wait_for_ptrace_stop(child_pid); + detach_traced(child_pid); + drop(trace); + Ok(true) +} + +/// Tear down a tracking session whose `Continue` was never sent +/// (e.g., `send_response` failed). Runs the blocking `waitpid` on the +/// tokio blocking pool. +pub(crate) async fn abort_process_creation_tracking(trace: ProcessCreationTrace) { + let _ = tokio::task::spawn_blocking(move || abort_process_creation_tracking_blocking(trace)) + .await; +} + +fn abort_process_creation_tracking_blocking(trace: ProcessCreationTrace) { + // INTERRUPT + wait so we can detach cleanly from a known state; + // the actual detach happens via `trace`'s Drop on scope exit. + let ret = unsafe { + libc::ptrace( + libc::PTRACE_INTERRUPT as libc::c_uint, + trace.caller_tid, + 0, + 0, + ) + }; + if ret == 0 { + let _ = wait_for_ptrace_stop(trace.caller_tid); + } } /// Handle wait4/waitid notifications — decrement the concurrent process count. @@ -120,6 +501,13 @@ pub(crate) async fn handle_wait( NotifAction::Continue } +/// Undo the optimistic process-count increment if a fork-like syscall +/// is denied after `handle_fork` allowed it. 
+pub(crate) async fn rollback_fork_count(resource: &Arc>) { + let mut rs = resource.lock().await; + rs.proc_count = rs.proc_count.saturating_sub(1); +} + /// Handle memory-related notifications (mmap, munmap, brk, mremap, shmget). /// /// Tracks anonymous memory usage and enforces the configured memory limit. @@ -210,3 +598,296 @@ pub(crate) async fn handle_memory( NotifAction::Continue } + +#[cfg(test)] +mod tests { + use super::*; + use crate::netlink::NetlinkState; + use crate::seccomp::state::{ + ChrootState, CowState, NetworkState, PolicyFnState, ProcessIndex, ProcfsState, + TimeRandomState, + }; + use crate::sys::structs::{SeccompData, SeccompNotif}; + use std::ptr; + + const GO: isize = 0; + const CHILD_RAN: isize = 1; + const REGISTERED_BEFORE_RUN: isize = 2; + const REGISTERED_PID: isize = 3; + const DONE: isize = 4; + const FORK_FAILED: isize = 5; + const FLAGS_LEN: usize = 4096; + + fn fake_notif(nr: i64, arg0: u64) -> SeccompNotif { + SeccompNotif { + id: 0, + pid: 1, + flags: 0, + data: SeccompData { + nr: nr as i32, + arch: 0, + instruction_pointer: 0, + args: [arg0, 0, 0, 0, 0, 0], + }, + } + } + + fn fake_policy(argv_safety_required: bool) -> NotifPolicy { + NotifPolicy { + max_memory_bytes: 0, + max_processes: 0, + has_memory_limit: false, + has_net_allowlist: false, + has_random_seed: false, + has_time_start: false, + argv_safety_required, + time_offset: 0, + num_cpus: None, + port_remap: false, + cow_enabled: false, + chroot_root: None, + chroot_readable: Vec::new(), + chroot_writable: Vec::new(), + chroot_denied: Vec::new(), + chroot_mounts: Vec::new(), + deterministic_dirs: false, + hostname: None, + has_http_acl: false, + virtual_etc_hosts: None, + } + } + + fn fake_supervisor_ctx(argv_safety_required: bool) -> Arc { + Arc::new(SupervisorCtx { + resource: Arc::new(Mutex::new(ResourceState::new(0, 0))), + cow: Arc::new(Mutex::new(CowState::new())), + procfs: Arc::new(Mutex::new(ProcfsState::new())), + network: 
Arc::new(Mutex::new(NetworkState::new())), + time_random: Arc::new(Mutex::new(TimeRandomState::new(None, None))), + policy_fn: Arc::new(Mutex::new(PolicyFnState::new())), + chroot: Arc::new(Mutex::new(ChrootState::new())), + netlink: Arc::new(NetlinkState::new()), + processes: Arc::new(ProcessIndex::new()), + policy: Arc::new(fake_policy(argv_safety_required)), + child_pidfd: None, + notif_fd: -1, + }) + } + + #[test] + fn process_creation_tracking_predicates_follow_argv_safety_gate() { + let no_argv_safety = fake_policy(false); + let argv_safety = fake_policy(true); + let clone_proc = fake_notif(libc::SYS_clone, 0); + let clone_thread = fake_notif(libc::SYS_clone, CLONE_THREAD); + let clone3 = fake_notif(libc::SYS_clone3, 0); + let openat = fake_notif(libc::SYS_openat, 0); + + // notif_fd = -1: clone3's user-memory read fails (id_valid), + // which fail-safes to "not a thread" → counted as process. + // Matches the synthetic clone3 notif's expected accounting. + let fd = -1; + + assert!(fork_counted_on_continue(&clone_proc, fd)); + assert!(!fork_counted_on_continue(&clone_thread, fd)); + assert!(fork_counted_on_continue(&clone3, fd)); + assert!(!fork_counted_on_continue(&openat, fd)); + + assert!(!requires_process_creation_tracking(&clone_proc, fd, &no_argv_safety)); + assert!(requires_process_creation_tracking(&clone_proc, fd, &argv_safety)); + assert!(!requires_process_creation_tracking(&clone_thread, fd, &argv_safety)); + assert!(requires_process_creation_tracking(&clone3, fd, &argv_safety)); + assert!(!requires_process_creation_tracking(&openat, fd, &argv_safety)); + + if let Some(fork_nr) = crate::arch::SYS_FORK { + let fork = fake_notif(fork_nr, 0); + assert!(fork_counted_on_continue(&fork, fd)); + assert!(requires_process_creation_tracking(&fork, fd, &argv_safety)); + } + if let Some(vfork_nr) = crate::arch::SYS_VFORK { + let vfork = fake_notif(vfork_nr, 0); + assert!(fork_counted_on_continue(&vfork, fd)); + 
assert!(requires_process_creation_tracking(&vfork, fd, &argv_safety)); + } + } + + struct SharedFlags { + ptr: *mut i32, + } + + impl SharedFlags { + fn new() -> Self { + let ptr = unsafe { + libc::mmap( + ptr::null_mut(), + FLAGS_LEN, + libc::PROT_READ | libc::PROT_WRITE, + libc::MAP_SHARED | libc::MAP_ANONYMOUS, + -1, + 0, + ) + }; + assert_ne!(ptr, libc::MAP_FAILED, "mmap shared flags"); + Self { + ptr: ptr.cast::(), + } + } + + fn read(&self, slot: isize) -> i32 { + unsafe { ptr::read_volatile(self.ptr.offset(slot)) } + } + + fn write(&self, slot: isize, value: i32) { + unsafe { ptr::write_volatile(self.ptr.offset(slot), value) }; + } + + fn addr(&self) -> usize { + self.ptr as usize + } + } + + impl Drop for SharedFlags { + fn drop(&mut self) { + unsafe { + libc::munmap(self.ptr.cast(), FLAGS_LEN); + } + } + } + + struct HookReset; + + impl Drop for HookReset { + fn drop(&mut self) { + if let Ok(mut hook) = CHILD_REGISTERED_HOOK.lock() { + *hook = None; + } + } + } + + struct CallerGuard { + pid: i32, + flags_addr: usize, + } + + impl CallerGuard { + fn new(pid: i32, flags: &SharedFlags) -> Self { + Self { + pid, + flags_addr: flags.addr(), + } + } + + fn disarm(&mut self) { + self.pid = 0; + } + } + + impl Drop for CallerGuard { + fn drop(&mut self) { + if self.pid <= 0 { + return; + } + let flags = self.flags_addr as *mut i32; + unsafe { + ptr::write_volatile(flags.offset(GO), 1); + ptr::write_volatile(flags.offset(DONE), 1); + libc::kill(self.pid, libc::SIGKILL); + let mut status = 0; + let _ = libc::waitpid(self.pid, &mut status, 0); + } + } + } + + #[cfg(target_arch = "x86_64")] + unsafe fn caller_wait_then_raw_fork(flags: *mut i32) -> ! 
{ + while ptr::read_volatile(flags.offset(GO)) == 0 { + core::hint::spin_loop(); + } + + let pid = libc::syscall(libc::SYS_fork) as i32; + if pid == 0 { + ptr::write_volatile(flags.offset(CHILD_RAN), 1); + while ptr::read_volatile(flags.offset(DONE)) == 0 { + core::hint::spin_loop(); + } + libc::_exit(0); + } + if pid > 0 { + let mut status = 0; + let _ = libc::waitpid(pid, &mut status, 0); + libc::_exit(0); + } + + ptr::write_volatile(flags.offset(FORK_FAILED), 1); + libc::_exit(1); + } + + #[cfg(target_arch = "x86_64")] + #[test] + fn process_creation_tracking_registers_child_before_user_code_runs() { + let flags = SharedFlags::new(); + let flags_addr = flags.addr(); + + let caller = unsafe { libc::fork() }; + assert!(caller >= 0, "fork caller"); + if caller == 0 { + unsafe { caller_wait_then_raw_fork(flags.ptr) }; + } + let mut caller_guard = CallerGuard::new(caller, &flags); + + let _hook_reset = HookReset; + { + let mut hook = CHILD_REGISTERED_HOOK.lock().expect("hook lock"); + *hook = Some(Box::new(move |child_pid| { + let flags = flags_addr as *mut i32; + unsafe { + let child_ran = ptr::read_volatile(flags.offset(CHILD_RAN)); + ptr::write_volatile(flags.offset(REGISTERED_PID), child_pid); + ptr::write_volatile( + flags.offset(REGISTERED_BEFORE_RUN), + if child_ran == 0 { 1 } else { -1 }, + ); + } + })); + } + + let ctx = fake_supervisor_ctx(true); + let rt = tokio::runtime::Builder::new_current_thread() + .enable_io() + .build() + .expect("tokio runtime"); + let trace = match rt.block_on(prepare_process_creation_tracking(caller)) { + Ok(trace) => trace, + Err(e) if matches!(e.raw_os_error(), Some(libc::EPERM | libc::EACCES)) => { + eprintln!("skipping ptrace fork-event test: ptrace denied: {e}"); + return; + } + Err(e) => panic!("prepare process-creation tracking: {e}"), + }; + + flags.write(GO, 1); + let created = rt + .block_on(finish_process_creation_tracking(&ctx, trace)) + .expect("finish process-creation tracking"); + assert!(created, "raw fork should 
produce a ptrace fork event"); + + let registered_pid = flags.read(REGISTERED_PID); + assert!(registered_pid > 0, "child pid should be captured by hook"); + assert!( + ctx.processes.contains(registered_pid), + "child should be registered in ProcessIndex" + ); + assert_eq!( + flags.read(REGISTERED_BEFORE_RUN), + 1, + "child should still be ptrace-stopped when registered" + ); + + flags.write(DONE, 1); + let mut status = 0; + let waited = unsafe { libc::waitpid(caller, &mut status, 0) }; + assert_eq!(waited, caller, "wait caller"); + assert_eq!(flags.read(FORK_FAILED), 0, "raw fork failed in caller"); + caller_guard.disarm(); + } +} diff --git a/crates/sandlock-core/src/sandbox.rs b/crates/sandlock-core/src/sandbox.rs index 02faf11..15c5480 100644 --- a/crates/sandlock-core/src/sandbox.rs +++ b/crates/sandlock-core/src/sandbox.rs @@ -300,7 +300,7 @@ impl Sandbox { /// Create N COW clones of this sandbox. /// /// Requires `new_with_fns()`. Forks a confined child, runs `init_fn`, - /// then forks N times using raw `fork()` (bypasses seccomp). Each + /// then forks N times using raw `fork()`. Each /// clone gets `CLONE_ID=0..N-1` and runs `work_fn(clone_id)`. /// /// Memory pages from `init_fn` are shared copy-on-write across all @@ -997,6 +997,16 @@ impl Sandbox { || !self.policy.http_deny.is_empty(), has_random_seed: self.policy.random_seed.is_some(), has_time_start: self.policy.time_start.is_some(), + // True if any consumer can inspect argv on execve: + // the policy_fn callback or an extra handler bound to + // execve/execveat (which can use read_child_mem). Both + // require the freeze + fork-event tracking to keep + // argv reads TOCTOU-safe. 
+ argv_safety_required: self.policy.policy_fn.is_some() + || self.extra_handlers.iter().any(|h| { + h.syscall_nr == libc::SYS_execve + || h.syscall_nr == libc::SYS_execveat + }), time_offset: time_offset_val, num_cpus: self.policy.num_cpus, port_remap: self.policy.port_remap, @@ -1047,7 +1057,7 @@ impl Sandbox { net_state.port_map.on_bind = Some(cb); } - // ProcfsState (sandbox membership lives in ProcessIndex now). + // ProcfsState (per-notification process state lives in ProcessIndex). let procfs_state = ProcfsState::new(); // ResourceState @@ -1127,8 +1137,10 @@ impl Sandbox { let time_random_state = Arc::new(Mutex::new(time_random_state)); let policy_fn_state = Arc::new(Mutex::new(policy_fn_state)); let chroot_state = Arc::new(Mutex::new(chroot_state)); - // Root child is registered (with watcher) on its first - // notification, the same path grandchildren take. + // Root child gets per-process state (with watcher) on its + // first notification. When policy_fn is active, fork-like + // syscalls are traced at creation time so descendants are + // registered before they can run user code. let processes = Arc::new(crate::seccomp::state::ProcessIndex::new()); let ctx = Arc::new(SupervisorCtx { diff --git a/crates/sandlock-core/src/sandbox_freeze.rs b/crates/sandlock-core/src/sandbox_freeze.rs index d08fe1f..39104ed 100644 --- a/crates/sandlock-core/src/sandbox_freeze.rs +++ b/crates/sandlock-core/src/sandbox_freeze.rs @@ -1,4 +1,4 @@ -//! Freeze sandbox threads of an execve caller before returning Continue. +//! Freeze sandbox threads of an execve caller before exposing argv. //! //! # Why //! @@ -18,20 +18,22 @@ //! that share the calling task's `mm_struct` via //! `clone(CLONE_VM)` without `CLONE_THREAD`. //! -//! `freeze_sandbox_for_execve` closes both classes. It enumerates every -//! TGID tracked in `ProcessIndex` (the canonical sandbox membership -//! set), walks `/proc//task` per TGID, and `PTRACE_SEIZE` + -//! `PTRACE_INTERRUPT` every TID. 
Together with the supervisor's -//! sequential notification dispatch (which prevents new clone/fork -//! notifications from being processed while the freeze is in flight), -//! every entity that could mutate argv is paused before the kernel -//! re-reads. +//! `freeze_sandbox_for_execve` closes both classes. When `policy_fn` +//! is active, every fork-like syscall is traced for one ptrace +//! fork/clone/vfork event and the child is registered in +//! `ProcessIndex` before it can run user code. The exec freeze can +//! therefore enumerate every tracked TGID, walk `/proc/<tgid>/task`, +//! and `PTRACE_SEIZE` + `PTRACE_INTERRUPT` every TID that could mutate +//! argv. //! //! # Sibling vs peer cleanup //! //! Sibling threads (same TGID as the caller) are killed by the kernel -//! during execve's `de_thread` step, so the supervisor never has to -//! detach them — their ptrace state is reaped along with the threads. +//! during execve's `de_thread` step when execve is allowed, so the +//! supervisor does not detach them on the allow path — their ptrace +//! state is reaped along with the threads. If the policy callback +//! denies execve after argv inspection, the supervisor detaches both +//! siblings and peers because `de_thread` will not run. //! //! Peer threads (different TGID) survive execve. The supervisor must //! `PTRACE_DETACH` them after `NOTIF_SEND` so they can resume normal @@ -187,46 +189,45 @@ fn read_tgid_of_tid(tid: i32) -> io::Result<i32> { )) } -/// Outcome of a sandbox-wide freeze. The supervisor must call -/// `detach_peers(&outcome.peer_tids)` after `NOTIF_SEND` to let the -/// peer processes resume. +/// Outcome of a sandbox-wide freeze. #[derive(Debug, Default)] pub(crate) struct SandboxFreeze { + /// Sibling TIDs in the caller's TGID. These die in `de_thread` if + /// execve is allowed, but must be detached if execve is denied + /// after `policy_fn` inspected argv. + pub sibling_tids: Vec<i32>, /// TIDs in *other* TGIDs that were ptrace-stopped. These survive /// execve and must be detached so they can resume normal - /// execution. Siblings of `caller_tid` are deliberately not in - /// this list — execve's `de_thread` kills them and the kernel - /// reaps their ptrace state automatically. + /// execution. pub peer_tids: Vec<i32>, } /// Freeze every sandbox thread that could mutate execve argv before -/// the kernel re-reads it. +/// the supervisor reads it for `policy_fn` and before the kernel +/// re-reads it. /// -/// Walks every TGID in `processes` (and defensively the caller's own -/// TGID), enumerates each TGID's threads via `/proc/<tgid>/task/`, -/// and `PTRACE_SEIZE` + `PTRACE_INTERRUPT` every TID except -/// `caller_tid`. Sibling threads of `caller_tid` and peer threads in -/// other TGIDs are both covered. +/// Walks every TGID in `processes`, enumerates each TGID's threads via +/// `/proc/<tgid>/task/`, and `PTRACE_SEIZE` + `PTRACE_INTERRUPT`s +/// every TID except `caller_tid`. Sibling threads of `caller_tid` and +/// peer threads in other TGIDs are both covered. `processes` is +/// complete for `policy_fn` runs because fork-like syscalls are tracked +/// before new children can run. /// /// Strict semantics: if any task refuses to be frozen, every /// already-frozen task is detached and the error is propagated. The /// caller is expected to deny the execve with `EPERM`, preserving the /// invariant that exposed argv is always TOCTOU-safe. /// -/// On success, returns the list of *peer* TIDs that survive execve and -/// must be detached after `NOTIF_SEND`. Sibling TIDs are not returned -/// because they die in `de_thread`. +/// On success, returns the sibling and peer TIDs that were frozen. The +/// caller detaches peers after an allowed execve, or detaches all TIDs +/// after a denied execve. pub(crate) fn freeze_sandbox_for_execve( processes: &crate::seccomp::state::ProcessIndex, caller_tid: i32, ) -> io::Result<SandboxFreeze> { let caller_tgid = read_tgid_of_tid(caller_tid)?; - - // ProcessIndex is the canonical sandbox membership set. The - // supervisor's `register_child_if_new` runs before per-syscall - // handlers, so the caller's TGID is guaranteed to be present. - let tgids: HashSet<i32> = processes.pids_snapshot(); + let mut tgids: HashSet<i32> = processes.pids_snapshot(); + tgids.insert(caller_tgid); let mut sibling_tids: Vec<i32> = Vec::new(); let mut peer_tids: Vec<i32> = Vec::new(); @@ -266,7 +267,10 @@ pub(crate) fn freeze_sandbox_for_execve( } } - Ok(SandboxFreeze { peer_tids }) + Ok(SandboxFreeze { + sibling_tids, + peer_tids, + }) } /// Detach peer TIDs after the kernel has re-read execve argv. Errors @@ -278,6 +282,17 @@ pub(crate) fn detach_peers(peer_tids: &[i32]) { } } +/// Detach every task in a freeze after execve was denied or the +/// notification response could not be sent. +pub(crate) fn detach_all(freeze: &SandboxFreeze) { + for tid in &freeze.sibling_tids { + detach(*tid); + } + for tid in &freeze.peer_tids { + detach(*tid); + } +} + /// Helper called from the dispatch hot path. Returns true if the /// notification is for an execve-class syscall whose Continue response /// requires freezing siblings. @@ -311,15 +326,45 @@ mod tests { /// issue #27 (Changaco): a peer process in the sandbox — different /// TGID, possibly aliasing argv pages via shared memory — must also /// be frozen before the kernel re-reads execve argv. Sibling-thread - /// freeze alone (`freeze_siblings_for_execve`) does not cover this. + /// freeze alone does not cover this. In real policy_fn runs, + /// fork-like syscall tracking registers peer processes before they + /// can run; this unit test mirrors that completed registration.
+ /// + /// # Why we spawn a separate "caller" process + /// + /// In production, `freeze_sandbox_for_execve` runs in the supervisor + /// process and `caller_tid` is the sandboxed child's tid — i.e. the + /// supervisor and the execve caller are in *different* TGIDs, and + /// every TID the freeze walks is a descendant of the supervisor. + /// Under YAMA `ptrace_scope=1` (the Ubuntu/Debian default), that + /// descendant relationship is exactly what makes PTRACE_SEIZE + /// permitted without any privilege. /// - /// This test registers a peer process in `ProcessIndex` and verifies - /// that `freeze_sandbox_for_execve` puts it in ptrace-stop, the same - /// way `freeze_siblings_for_execve` does for siblings. + /// If this test instead used the test thread's own tid as + /// `caller_tid`, `caller_tgid` would be the cargo test binary's + /// TGID, the freeze would walk the test binary's sibling threads + /// (libtest workers, runtime helpers), and PTRACE_SEIZE would be + /// rejected with EPERM by YAMA — sibling threads are not + /// descendants of each other. That would force the test to require + /// privileges sandlock itself does not require. So we spawn a + /// dedicated "caller" sleep to play the sandboxed-process role, + /// matching production topology. #[test] fn freeze_sandbox_includes_peer_process() { use std::process::{Command, Stdio}; + // The "execve caller" — stands in for the sandboxed process. + // Its tid is a descendant of the test process (the parent), so + // ptracing into its TGID is YAMA-allowed under ptrace_scope=1. 
+ let mut caller = Command::new("/bin/sleep") + .arg("60") + .stdin(Stdio::null()) + .stdout(Stdio::null()) + .stderr(Stdio::null()) + .spawn() + .expect("spawn caller sleep") + let caller_tid = caller.id() as i32; + let mut peer = Command::new("/bin/sleep") .arg("60") .stdin(Stdio::null()) @@ -329,23 +374,19 @@ .expect("spawn peer sleep"); let peer_pid = peer.id() as i32; - // Give the peer a moment to actually be running. + // Give both children a moment to actually be running. std::thread::sleep(std::time::Duration::from_millis(50)); - // Register the peer in a fresh ProcessIndex (mirrors what the - // supervisor's clone/fork notification handler would do). let processes = ProcessIndex::new(); processes .register(peer_pid) .expect("register peer in ProcessIndex"); - let our_tid = unsafe { libc::syscall(libc::SYS_gettid) } as i32; - - let outcome = freeze_sandbox_for_execve(&processes, our_tid) + let outcome = freeze_sandbox_for_execve(&processes, caller_tid) .expect("freeze_sandbox_for_execve"); // Peer's TID is its own TGID (single-threaded sleep), and it's - // a different TGID from ours, so it should be in peer_tids. + // a different TGID from the execve caller, so it should be in peer_tids. assert!( outcome.peer_tids.contains(&peer_pid), "peer pid {} should be in peer_tids: {:?}", @@ -370,5 +411,7 @@ detach_peers(&outcome.peer_tids); let _ = peer.kill(); let _ = peer.wait(); + let _ = caller.kill(); + let _ = caller.wait(); } } diff --git a/crates/sandlock-core/src/seccomp/ctx.rs b/crates/sandlock-core/src/seccomp/ctx.rs index bb1be30..78c9080 100644 --- a/crates/sandlock-core/src/seccomp/ctx.rs +++ b/crates/sandlock-core/src/seccomp/ctx.rs @@ -26,10 +26,12 @@ pub struct SupervisorCtx { pub chroot: Arc<Mutex<ChrootState>>, /// NETLINK_ROUTE virtualization state. pub netlink: Arc<NetlinkState>, - /// Per-process registry: pid → PidKey. Source of truth for - /// "which processes are in the sandbox" and the anchor for - /// unified per-process state cleanup. Wraps an internal RwLock, - /// so handlers can query it synchronously without `.await`. + /// Per-process registry: pid → PidKey. This anchors unified + /// per-process state cleanup. With policy_fn active, fork-like + /// syscalls populate it at child creation time before user code can + /// run; otherwise it is populated lazily from notifications. Wraps + /// an internal RwLock, so handlers can query it synchronously + /// without `.await`. pub processes: Arc<ProcessIndex>, /// Immutable policy — no lock needed. pub policy: Arc<NotifPolicy>, diff --git a/crates/sandlock-core/src/seccomp/dispatch.rs b/crates/sandlock-core/src/seccomp/dispatch.rs index 0dd53af..0adc66b 100644 --- a/crates/sandlock-core/src/seccomp/dispatch.rs +++ b/crates/sandlock-core/src/seccomp/dispatch.rs @@ -199,18 +199,14 @@ pub fn build_dispatch_table( // ------------------------------------------------------------------ // Fork/clone family (always on) // ------------------------------------------------------------------ - let mut fork_nrs = vec![libc::SYS_clone, libc::SYS_clone3]; - if let Some(vfork) = arch::SYS_VFORK { - fork_nrs.push(vfork); - } - for nr in fork_nrs { + for &nr in arch::FORK_LIKE_SYSCALLS { let policy = Arc::clone(policy); let resource = Arc::clone(resource); - table.register(nr, Box::new(move |notif, _ctx, _notif_fd| { + table.register(nr, Box::new(move |notif, _ctx, notif_fd| { let policy = Arc::clone(&policy); let resource = Arc::clone(&resource); Box::pin(async move { - crate::resource::handle_fork(&notif, &resource, &policy).await + crate::resource::handle_fork(&notif, notif_fd, &resource, &policy).await }) })); } @@ -844,6 +840,7 @@ mod extra_handler_tests { has_net_allowlist: false, has_random_seed: false, has_time_start: false, + argv_safety_required: false, time_offset: 0, num_cpus: None, port_remap: false, diff --git a/crates/sandlock-core/src/seccomp/notif.rs b/crates/sandlock-core/src/seccomp/notif.rs index 29414f0..01404d8 100644 +++
b/crates/sandlock-core/src/seccomp/notif.rs @@ -210,6 +210,13 @@ pub struct NotifPolicy { pub has_net_allowlist: bool, pub has_random_seed: bool, pub has_time_start: bool, + /// Argv-safety gate: the supervisor must freeze every task that + /// could mutate argv before any consumer reads it. True when + /// `policy_fn` is active or when an extra handler is bound to + /// execve/execveat (such handlers can call `read_child_mem`). + /// Also gates ptrace fork-event tracking so `ProcessIndex` is + /// complete when the freeze enumerates it. + pub argv_safety_required: bool, pub time_offset: i64, pub num_cpus: Option, pub port_remap: bool, @@ -559,6 +566,7 @@ fn syscall_name(nr: i64) -> &'static str { n if n == libc::SYS_clone => "clone", n if n == libc::SYS_clone3 => "clone3", n if Some(n) == arch::SYS_VFORK => "vfork", + n if Some(n) == arch::SYS_FORK => "fork", n if n == libc::SYS_execve => "execve", n if n == libc::SYS_execveat => "execveat", n if n == libc::SYS_mmap => "mmap", @@ -587,8 +595,8 @@ fn syscall_category(nr: i64) -> crate::policy_fn::SyscallCategory { || n == libc::SYS_sendmsg || n == libc::SYS_bind || n == libc::SYS_getsockname => SyscallCategory::Network, n if n == libc::SYS_clone || n == libc::SYS_clone3 - || Some(n) == arch::SYS_VFORK || n == libc::SYS_execve - || n == libc::SYS_execveat => SyscallCategory::Process, + || Some(n) == arch::SYS_VFORK || Some(n) == arch::SYS_FORK + || n == libc::SYS_execve || n == libc::SYS_execveat => SyscallCategory::Process, n if n == libc::SYS_mmap || n == libc::SYS_munmap || n == libc::SYS_brk || n == libc::SYS_mremap => SyscallCategory::Memory, @@ -825,10 +833,11 @@ async fn emit_policy_event( // decision is racy (issue #27). Path-based access control belongs // in static Landlock rules. 
// - // argv IS extracted for execve/execveat: the supervisor freezes - // every task in the sandbox (siblings + peers) before returning - // Continue (sandbox_freeze module), so the post-Continue re-read - // sees the same memory we read here. + // argv IS extracted for allowed execve/execveat notifications: + // the supervisor freezes every task in the sandbox (siblings + + // peers) before this callback reads argv and keeps that freeze + // through Continue, so the post-Continue re-read sees the same + // memory we read here. // // Network fields are TOCTOU-safe because connect/sendto/bind are // performed on-behalf via pidfd_getfd; the kernel never re-reads @@ -838,7 +847,7 @@ async fn emit_policy_event( let mut size = None; let mut argv = None; - if nr == libc::SYS_execve || nr == libc::SYS_execveat { + if !denied && (nr == libc::SYS_execve || nr == libc::SYS_execveat) { // execve(pathname, argv, envp): args[1] = argv ptr // execveat(dirfd, pathname, argv, ..): args[2] = argv ptr let argv_ptr = if nr == libc::SYS_execveat { @@ -914,10 +923,11 @@ async fn handle_notification( ) { let policy = &ctx.policy; - // Ensure every pid that produces a notification is tracked in the - // ProcessIndex with an exit watcher. The fork handler runs on the - // *parent* pid (the child doesn't exist yet at clone-time), so the - // child gets registered the first time it issues its own syscall. + // Ensure every pid that produces a notification has per-process + // supervisor state and an exit watcher. The fork handler runs on + // the *parent* pid (the child doesn't exist yet at clone-time), so + // the child gets registered the first time it issues a notified + // syscall. crate::resource::register_child_if_new(ctx, notif.pid as i32).await; // Re-patch vDSO if needed (exec replaces it with a fresh copy). 
@@ -951,39 +961,26 @@ } }; - // Emit event to policy_fn callback if active - if let Some(verdict) = emit_policy_event(&notif, &action, &ctx.policy_fn, fd).await { - use crate::policy_fn::Verdict; - match verdict { - Verdict::Deny => { action = NotifAction::Errno(libc::EPERM); } - Verdict::DenyWith(errno) => { action = NotifAction::Errno(errno); } - Verdict::Audit => { /* allow, but could log here */ } - Verdict::Allow => {} - } - } + let nr = notif.data.nr as i64; + let fork_counted = matches!(action, NotifAction::Continue) + && crate::resource::fork_counted_on_continue(&notif, fd); // TOCTOU-close for execve (issue #27): freeze every sandbox task - // that could mutate argv before the kernel re-reads it after - // Continue. This covers two distinct writer classes: + // that could mutate argv before policy_fn reads argv and before the + // kernel re-reads it after Continue. This covers two writer classes: // 1. Sibling threads of the calling tid (same TGID, share mm). // 2. Peer processes in other TGIDs that alias argv pages via // MAP_SHARED mappings or share mm via clone(CLONE_VM). - // Sibling-thread freeze alone closed (1) but not (2), as raised - // by Changaco on issue #27. // - // Only relevant when sending Continue: a denial (Errno) means the - // kernel never re-reads, so no freeze is needed. + // The freeze enumerates ProcessIndex. With policy_fn active, that + // index is complete: fork-like syscalls are traced at creation time + // below, before new children can run user code. // // Strict on failure: if we cannot establish the freeze, we cannot - // uphold the argv-safety invariant, so we deny the execve with - // EPERM rather than letting it through unprotected. - // - // Sibling threads die in execve's de_thread; the kernel reaps - // their ptrace state. Peer threads survive — we detach them after - // NOTIF_SEND so they resume normally. - let nr = notif.data.nr as i64; - let mut peer_tids_to_detach: Vec<i32> = Vec::new(); + // safely expose argv or allow execve, so we deny with EPERM. + let mut exec_freeze = None; if matches!(action, NotifAction::Continue) + && policy.argv_safety_required && crate::sandbox_freeze::requires_freeze_on_continue(nr) { match crate::sandbox_freeze::freeze_sandbox_for_execve( @@ -991,7 +988,7 @@ notif.pid as i32, ) { Ok(outcome) => { - peer_tids_to_detach = outcome.peer_tids; + exec_freeze = Some(outcome); } Err(e) => { eprintln!( @@ -1004,14 +1001,80 @@ } } + // Emit event to policy_fn callback if active. For execve, argv is + // only populated after `exec_freeze` has stopped every possible + // writer, and those tasks stay stopped until after NOTIF_SEND. + if let Some(verdict) = emit_policy_event(&notif, &action, &ctx.policy_fn, fd).await { + use crate::policy_fn::Verdict; + match verdict { + Verdict::Deny => { action = NotifAction::Errno(libc::EPERM); } + Verdict::DenyWith(errno) => { action = NotifAction::Errno(errno); } + Verdict::Audit => { /* allow, but could log here */ } + Verdict::Allow => {} + } + } + + if fork_counted && !matches!(action, NotifAction::Continue) { + crate::resource::rollback_fork_count(&ctx.resource).await; + } + + // With policy_fn active, fork-like syscalls are traced for exactly + // one ptrace event so ProcessIndex becomes complete before the new + // child can run user code. That closes the race where a peer + // process could exist without ever having produced a notification. + let mut creation_trace = None; + if matches!(action, NotifAction::Continue) + && crate::resource::requires_process_creation_tracking(&notif, fd, policy) + { + match crate::resource::prepare_process_creation_tracking(notif.pid as i32).await { + Ok(trace) => { + creation_trace = Some(trace); + } + Err(e) => { + eprintln!( + "sandlock: process-creation tracking failed for pid {}: {} \ + — denying fork-like syscall to preserve argv TOCTOU invariant", + notif.pid, e + ); + if fork_counted { + crate::resource::rollback_fork_count(&ctx.resource).await; + } + action = NotifAction::Errno(libc::EPERM); + } + } + } + // Ignore error — child may have exited between recv and response. - let _ = send_response(fd, notif.id, action); + let exec_continued = exec_freeze.is_some() && matches!(action, NotifAction::Continue); + let send_result = send_response(fd, notif.id, action); + + if let Some(trace) = creation_trace { + if send_result.is_ok() { + match crate::resource::finish_process_creation_tracking(ctx, trace).await { + Ok(true) => {} + Ok(false) => { + crate::resource::rollback_fork_count(&ctx.resource).await; + } + Err(e) => { + crate::resource::rollback_fork_count(&ctx.resource).await; + eprintln!( + "sandlock: process-creation tracking completion failed for pid {}: {}", + notif.pid, e + ); + } + } + } else { + crate::resource::rollback_fork_count(&ctx.resource).await; + crate::resource::abort_process_creation_tracking(trace).await; + } + } - // Detach peer processes after NOTIF_SEND so they resume. Siblings - // of the caller's TGID are intentionally not detached: they die in - // execve's de_thread and the kernel reaps the ptrace state.
- if !peer_tids_to_detach.is_empty() { - crate::sandbox_freeze::detach_peers(&peer_tids_to_detach); + if let Some(freeze) = exec_freeze { + if exec_continued && send_result.is_ok() { + crate::sandbox_freeze::detach_peers(&freeze.peer_tids); + } else { + crate::sandbox_freeze::detach_all(&freeze); + } } } diff --git a/crates/sandlock-core/src/seccomp/state.rs b/crates/sandlock-core/src/seccomp/state.rs index 199e7ec..b0a3a97 100644 --- a/crates/sandlock-core/src/seccomp/state.rs +++ b/crates/sandlock-core/src/seccomp/state.rs @@ -47,10 +47,10 @@ impl ResourceState { // ProcfsState — /proc virtualization state // ============================================================ -/// /proc virtualization runtime state. Sandbox membership lives in -/// `ProcessIndex`; per-process getdents caches live in -/// `PerProcessState::procfs_dir_cache`. This struct only holds -/// truly global virtualization state. +/// /proc virtualization runtime state. Per-notification process state +/// lives in `ProcessIndex`; per-process getdents caches live in +/// `PerProcessState::procfs_dir_cache`. This struct only holds truly +/// global virtualization state. pub struct ProcfsState { /// Base address of the last vDSO we patched (0 = not yet patched). pub vdso_patched_addr: u64, @@ -117,10 +117,17 @@ pub struct PerProcessState { } // ============================================================ -// ProcessIndex — sandbox membership + per-process state +// ProcessIndex — tracked processes + per-process state // ============================================================ -/// Source-of-truth registry for processes inside the sandbox. +/// Registry for tracked sandbox processes plus their per-process +/// supervisor state. +/// +/// In the default supervisor this is populated lazily from seccomp +/// notifications. 
When `policy_fn` is active, fork-like syscalls are +/// additionally traced for one ptrace creation event so children are +/// inserted here before they can run user code; this makes the index +/// complete for argv-safety freezes. /// /// Maps the kernel's numeric `pid` (the value that arrives in seccomp /// notifications) to the canonical `PidKey` plus an @@ -188,8 +195,8 @@ impl ProcessIndex { .map(|e| (e.key, Arc::clone(&e.state))) } - /// Cheap membership test — used by /proc virtualization to gate - /// access to `/proc//...` paths and by getdents filtering. + /// Cheap tracked-process test — used by /proc virtualization to + /// gate access to `/proc//...` paths and by getdents filtering. pub fn contains(&self, pid: i32) -> bool { self.inner .read() diff --git a/crates/sandlock-core/tests/integration/test_resource.rs b/crates/sandlock-core/tests/integration/test_resource.rs index 5d29f9d..d57148b 100644 --- a/crates/sandlock-core/tests/integration/test_resource.rs +++ b/crates/sandlock-core/tests/integration/test_resource.rs @@ -207,6 +207,43 @@ async fn test_process_limit_allows_sequential_reuse() { let _ = std::fs::remove_file(&out); } +#[tokio::test] +async fn test_threads_do_not_count_toward_process_limit_clone3() { + // Regression: handle_fork only checked CLONE_THREAD on SYS_clone, not + // SYS_clone3 (whose flags live in a clone_args struct in user memory). + // glibc 2.34+ implements pthread_create via clone3, so spawning many + // Python threads under a tight max_processes would over-count and the + // thread creations would fail with EAGAIN once the limit was hit. 
+ let out = temp_path("clone3-threads"); + + let script = format!(concat!( + "import threading, time\n", + "barrier = threading.Barrier(11)\n", // 10 threads + main + "def worker():\n", + " barrier.wait()\n", + "ts = [threading.Thread(target=worker) for _ in range(10)]\n", + "for t in ts: t.start()\n", + "barrier.wait()\n", // every thread reached this point => all 10 alive together + "for t in ts: t.join()\n", + "open('{out}', 'w').write('ok')\n", + ), out = out.display()); + + // max_processes=2 leaves zero headroom for child *processes*, so any + // pre-fix bug that counted threads as processes would block thread + // creation immediately. + let policy = base_policy().max_processes(2).build().unwrap(); + let result = Sandbox::run_interactive(&policy, &["python3", "-c", &script]).await.unwrap(); + assert!( + matches!(result.exit_status, ExitStatus::Code(0)), + "python should exit 0; got {:?}", + result.exit_status, + ); + let content = std::fs::read_to_string(&out).expect("temp file should exist"); + assert_eq!(content, "ok", "all 10 threads should have started concurrently"); + + let _ = std::fs::remove_file(&out); +} + #[tokio::test] async fn test_memory_limit_enforced() { let out = temp_path("mem-limit");