diff --git a/README.md b/README.md index b36d849..4fc9789 100644 --- a/README.md +++ b/README.md @@ -295,7 +295,7 @@ positive int = deny with errno, `"audit"`/`-2` = allow + flag. **Event fields:** `syscall`, `category` (file/network/process/memory), `pid`, `parent_pid`, `host`, `port`, `argv`, `denied`. -> **TOCTOU NOTE ** Per `seccomp_unotify(2)`, the kernel +> **TOCTOU NOTE** Per `seccomp_unotify(2)`, the kernel > re-reads user-memory pointers after `Continue`. Sandlock handles this > in two places: > @@ -303,14 +303,15 @@ positive int = deny with errno, `"audit"`/`-2` = allow + flag. > belongs in static Landlock rules (`fs_readable` / `fs_writable` / > `fs_denied`) — kernel-enforced and TOCTOU-immune. Use > `ctx.deny_path()` for runtime additions. -> - **`event.argv` is exposed and TOCTOU-safe.** Before returning -> `Continue` for an `execve`, the supervisor `PTRACE_SEIZE` + -> `PTRACE_INTERRUPT`s every sibling thread of the calling tid so the -> kernel's re-read happens with no other writer running. The pause -> has no observable cost: `execve`'s `de_thread` step kills sibling -> threads anyway. If the freeze cannot be established (e.g., YAMA -> blocks ptrace), the execve is denied with `EPERM` — the safety -> invariant is never silently relaxed. +> - **`event.argv` is exposed and TOCTOU-safe.** Before exposing +> `argv` to `policy_fn` or returning `Continue` for an +> `execve`, the supervisor freezes every task in `ProcessIndex`, +> including peer processes that may alias argv through shared memory. +> With `policy_fn` active, fork-like syscalls are traced for one +> ptrace creation event, so children are registered in `ProcessIndex` +> before they can run user code. If the freeze or creation tracking +> cannot be established (e.g., YAMA blocks ptrace), the syscall is +> denied with `EPERM`; the safety invariant is never silently relaxed. 
**Context methods:** - `ctx.restrict_network(ips)` / `ctx.grant_network(ips)` — network control @@ -455,8 +456,8 @@ what a command would do before committing. ### COW Fork & Map-Reduce Initialize expensive state once, then fork COW clones that share memory. -Each fork uses raw `fork(2)` (bypasses seccomp notification) for minimal -overhead. 1000 clones in ~530ms, ~1,900 forks/sec. +Each clone uses raw `fork(2)` with shared copy-on-write pages. 1000 +clones in ~530ms, ~1,900 forks/sec. Each clone's stdout is captured via its own pipe. `reduce()` reads all pipes and feeds combined output to a reducer's stdin — fully pipe-based diff --git a/crates/sandlock-core/src/arch.rs b/crates/sandlock-core/src/arch.rs index 6f1653f..14bd7f1 100644 --- a/crates/sandlock-core/src/arch.rs +++ b/crates/sandlock-core/src/arch.rs @@ -29,6 +29,17 @@ mod imp { pub const SYS_IOPERM: Option = Some(libc::SYS_ioperm); pub const SYS_IOPL: Option = Some(libc::SYS_iopl); pub const SYS_TIME: Option = Some(libc::SYS_time); + + /// Every syscall the kernel will dispatch through `handle_fork`. + /// Single source of truth for callers that enumerate fork-class + /// syscalls (BPF notif registration in `seccomp::dispatch`, + /// classification in `resource::is_process_creation_notif`). + pub const FORK_LIKE_SYSCALLS: &[i64] = &[ + libc::SYS_clone, + libc::SYS_clone3, + libc::SYS_vfork, + libc::SYS_fork, + ]; } #[cfg(target_arch = "aarch64")] @@ -60,6 +71,13 @@ mod imp { pub const SYS_IOPERM: Option = None; pub const SYS_IOPL: Option = None; pub const SYS_TIME: Option = None; + + /// Every syscall the kernel will dispatch through `handle_fork`. + /// aarch64 has no `fork`/`vfork` (glibc emulates via `clone`). 
+ pub const FORK_LIKE_SYSCALLS: &[i64] = &[ + libc::SYS_clone, + libc::SYS_clone3, + ]; } pub use imp::*; diff --git a/crates/sandlock-core/src/context.rs b/crates/sandlock-core/src/context.rs index 67627d7..0a3b4ed 100644 --- a/crates/sandlock-core/src/context.rs +++ b/crates/sandlock-core/src/context.rs @@ -240,6 +240,15 @@ pub fn notif_syscalls(policy: &Policy) -> Vec { libc::SYS_waitid as u32, ]; arch::push_optional_syscall(&mut nrs, arch::SYS_VFORK); + // Bare fork(2) carries none of the namespace/process-limit risk of + // clone/clone3 and was historically left out of the BPF filter so + // hot fork-loops (COW map-reduce) bypass the supervisor entirely. + // It only needs interception when policy_fn is active, so the + // supervisor can register the new child via ptrace fork events + // before it can run user code (argv-safety invariant). + if policy.policy_fn.is_some() { + arch::push_optional_syscall(&mut nrs, arch::SYS_FORK); + } if policy.max_memory.is_some() { nrs.push(libc::SYS_mmap as u32); @@ -949,9 +958,21 @@ pub(crate) fn confine_child(args: ChildSpawnArgs<'_>) -> ! { let mut notif = notif_syscalls(policy); if !extra_syscalls.is_empty() { notif.extend_from_slice(extra_syscalls); - notif.sort_unstable(); - notif.dedup(); } + // Argv-safety gate (companion to the policy_fn case in + // notif_syscalls): an extra handler bound to execve/execveat + // can call `read_child_mem` to inspect argv, so the supervisor + // must register newly forked children before they can run user + // code — same invariant policy_fn relies on. Bare fork(2) + // therefore needs to be intercepted here too. 
+ let exec_extra = extra_syscalls.iter().any(|&n| { + n == libc::SYS_execve as u32 || n == libc::SYS_execveat as u32 + }); + if exec_extra { + arch::push_optional_syscall(&mut notif, arch::SYS_FORK); + } + notif.sort_unstable(); + notif.dedup(); let filter = match bpf::assemble_filter(¬if, &deny, &args) { Ok(f) => f, Err(e) => fail!(format!("seccomp assemble: {}", e)), @@ -1093,6 +1114,24 @@ mod tests { if let Some(vfork) = arch::SYS_VFORK { assert!(nrs.contains(&(vfork as u32))); } + // Bare fork(2) is intercepted only when policy_fn is active — + // see notif_syscalls. The default policy has no policy_fn, so + // fork stays out of the BPF filter and hot fork-loops keep + // bypassing the supervisor. + if let Some(fork) = arch::SYS_FORK { + assert!(!nrs.contains(&(fork as u32))); + } + } + + #[test] + fn test_notif_syscalls_fork_gated_on_policy_fn() { + let Some(fork) = arch::SYS_FORK else { return }; + let policy = Policy::builder() + .policy_fn(|_event, _ctx| crate::policy_fn::Verdict::Allow) + .build() + .unwrap(); + let nrs = notif_syscalls(&policy); + assert!(nrs.contains(&(fork as u32))); } #[test] diff --git a/crates/sandlock-core/src/fork.rs b/crates/sandlock-core/src/fork.rs index 14ca53f..4cb4d09 100644 --- a/crates/sandlock-core/src/fork.rs +++ b/crates/sandlock-core/src/fork.rs @@ -5,17 +5,17 @@ //! COW clones that share memory pages with the template. Each clone //! receives `CLONE_ID=0..N-1` and execs `work_cmd`. //! -//! Uses raw `fork()` syscall (NR 57 on x86_64) to bypass seccomp -//! notification — the BPF filter only intercepts `clone`/`clone3`. +//! Uses raw `fork()` syscall (NR 57 on x86_64). The supervisor +//! intercepts fork-like syscalls for process accounting and, when +//! `policy_fn` is active, child registration before user code runs. 
use std::os::unix::io::RawFd; // ============================================================ -// Raw fork (bypasses seccomp clone interception) +// Raw fork // ============================================================ /// Raw fork() syscall — NR 57 on x86_64. -/// Unlike clone/clone3, this is NOT intercepted by the seccomp notif filter. fn raw_fork() -> std::io::Result { #[cfg(target_arch = "x86_64")] const NR_FORK: i64 = 57; diff --git a/crates/sandlock-core/src/policy_fn.rs b/crates/sandlock-core/src/policy_fn.rs index 2e5f0aa..b7a2ce1 100644 --- a/crates/sandlock-core/src/policy_fn.rs +++ b/crates/sandlock-core/src/policy_fn.rs @@ -63,9 +63,12 @@ pub enum SyscallCategory { /// (`fs_read` / `fs_write` / `fs_deny`); see issue #27. /// /// `argv` *is* exposed for `execve`/`execveat` and is TOCTOU-safe by -/// construction: before the supervisor returns `Continue` for an -/// execve, it `PTRACE_SEIZE`+`PTRACE_INTERRUPT`s every task in the -/// sandbox — both sibling threads of the calling tid (same TGID, share +/// construction: with `policy_fn` active, fork-like syscalls are traced +/// for one ptrace creation event, so children are registered in +/// `ProcessIndex` before they can run user code. Before the supervisor +/// exposes `argv` to `policy_fn` or returns `Continue` for an execve, it +/// then `PTRACE_SEIZE`+`PTRACE_INTERRUPT`s every task that could write +/// the memory — both sibling threads of the calling tid (same TGID, share /// `mm_struct`) and peer threads in other TGIDs that may alias argv /// pages via `MAP_SHARED` mappings or share `mm_struct` via /// `clone(CLONE_VM)`. The kernel's post-Continue re-read therefore @@ -94,8 +97,11 @@ pub struct SyscallEvent { /// Size argument (for mmap, brk). pub size: Option, /// Command arguments for execve/execveat. TOCTOU-safe: every task - /// in the sandbox (caller's siblings and peer processes) is frozen - /// before the kernel re-reads argv from child memory. 
+ /// in `ProcessIndex` (caller's siblings and peer processes) is + /// frozen before argv is read for this event and before the kernel + /// re-reads argv from child memory; fork-like syscalls register + /// children before they can run user code while `policy_fn` is + /// active. pub argv: Option>, /// Whether the supervisor denied this syscall. pub denied: bool, diff --git a/crates/sandlock-core/src/resource.rs b/crates/sandlock-core/src/resource.rs index d4277bf..9d96248 100644 --- a/crates/sandlock-core/src/resource.rs +++ b/crates/sandlock-core/src/resource.rs @@ -1,21 +1,25 @@ // Resource limit handlers — memory and process limit enforcement. // // Continue safety (issue #27): every `Continue` in this module is safe. -// All decisions here are on scalar register args (clone flags, mmap len, -// brk address, etc.) which are copied into the seccomp_notif struct at -// notification time — they are *not* pointers into racy user memory. -// The kernel's re-read of the syscall args after Continue comes from the -// suspended calling thread's saved registers, which a sibling thread -// cannot mutate. So even though we return Continue after taking a -// security-relevant action (e.g., counting an allocation against the -// memory limit), there is no TOCTOU substitution window for the values -// we examined. +// Most decisions here are on scalar register args (clone flags, mmap +// len, brk address, etc.) which are copied into the seccomp_notif +// struct at notification time — they are *not* pointers into racy user +// memory. The one exception is `clone3`, whose flags live in a +// `clone_args` struct that the supervisor reads from child memory; see +// `clone_flags` for the TOCTOU rationale. The reader is used only for +// resource accounting, not for any kernel-enforced security boundary. +// The kernel's re-read of the syscall args after Continue comes from +// the suspended calling thread's saved registers, which a sibling +// thread cannot mutate. 
+use std::io; +use std::mem; +use std::os::unix::io::RawFd; use std::sync::Arc; use tokio::sync::Mutex; use crate::seccomp::ctx::SupervisorCtx; -use crate::seccomp::notif::{spawn_pid_watcher, NotifAction, NotifPolicy}; +use crate::seccomp::notif::{read_child_mem, spawn_pid_watcher, NotifAction, NotifPolicy}; use crate::seccomp::state::ResourceState; use crate::sys::structs::{ SeccompNotif, CLONE_NS_FLAGS, EAGAIN, EPERM, @@ -27,35 +31,77 @@ const CLONE_THREAD: u64 = 0x0001_0000; /// MAP_ANONYMOUS flag — only anonymous mappings count toward memory limit. const MAP_ANONYMOUS: u64 = 0x20; +/// Effective clone flags for a fork-like notification. +/// +/// `clone(2)` exposes flags directly in `args[0]`. `clone3(2)` instead +/// passes a pointer to a `clone_args` struct in `args[0]` (size in +/// `args[1]`); its `flags` field is the first u64. `fork`/`vfork` +/// have no flags. Anything else returns `None`. +/// +/// TOCTOU note: the `clone3` read is from racy user memory — a sibling +/// thread could mutate the struct between this read and the kernel's +/// re-read after `Continue`. Callers use this only for resource +/// accounting (`proc_count`, fork-event tracking gate), never as a +/// security boundary, so a misread can throttle incorrectly but cannot +/// bypass any kernel-enforced deny. +fn clone_flags(notif: &SeccompNotif, notif_fd: RawFd) -> Option { + let args = ¬if.data.args; + let nr = notif.data.nr as i64; + if nr == libc::SYS_clone { + return Some(args[0]); + } + if nr == libc::SYS_clone3 { + let ptr = args[0]; + let size = args[1] as usize; + if ptr == 0 || size < 8 { + return None; + } + let buf = read_child_mem(notif_fd, notif.id, notif.pid, ptr, 8).ok()?; + let arr: [u8; 8] = buf.as_slice().try_into().ok()?; + return Some(u64::from_ne_bytes(arr)); + } + if Some(nr) == crate::arch::SYS_VFORK || Some(nr) == crate::arch::SYS_FORK { + return Some(0); + } + None +} + +/// True when the fork-like notification creates a thread (CLONE_THREAD +/// set), i.e. 
it should not bump the process count. Returns false for +/// non-fork notifs and for clone3 calls whose `clone_args` cannot be +/// read (fail-safe: count as a process rather than silently uncount). +fn is_thread_create(notif: &SeccompNotif, notif_fd: RawFd) -> bool { + matches!(clone_flags(notif, notif_fd), Some(f) if f & CLONE_THREAD != 0) +} + /// Handle fork/clone/vfork notifications. /// -/// Enforces namespace creation ban and process limits, registers the -/// new child in `ProcessIndex` (with an owned pidfd), and spawns a -/// per-child pidfd watcher that runs unified cleanup on exit. +/// Enforces namespace creation ban and process limits. /// /// Note: `notif.pid` here is the *parent* (the task issuing -/// clone/fork). The kernel hasn't run the syscall yet, so we don't -/// know the child's pid. The child is discovered and registered later, -/// on its first own seccomp notification, via `register_child_if_new`. +/// fork/clone/vfork). The kernel hasn't run the syscall yet, so we don't +/// know the child's pid yet. When `policy_fn` is active, the supervisor +/// wraps the eventual `Continue` in one-shot ptrace fork-event tracking +/// and registers the new child before it can run user code. pub(crate) async fn handle_fork( notif: &SeccompNotif, + notif_fd: RawFd, resource: &Arc>, _policy: &NotifPolicy, ) -> NotifAction { let nr = notif.data.nr as i64; let args = ¬if.data.args; - // For clone/vfork: check namespace flags in args[0]. - if nr == libc::SYS_clone || Some(nr) == crate::arch::SYS_VFORK { - if nr == libc::SYS_clone && (args[0] & CLONE_NS_FLAGS) != 0 { - return NotifAction::Errno(EPERM); - } - // For clone: if CLONE_THREAD is set, it's a thread — don't count, allow. - if nr == libc::SYS_clone && (args[0] & CLONE_THREAD) != 0 { - return NotifAction::Continue; - } + // Namespace flags are denied for clone (clone3's are caught by the + // BPF arg filter; vfork takes no flags). 
+ if nr == libc::SYS_clone && (args[0] & CLONE_NS_FLAGS) != 0 { + return NotifAction::Errno(EPERM); + } + + // Threads share their parent's process slot — don't count, allow. + if is_thread_create(notif, notif_fd) { + return NotifAction::Continue; } - // For clone3: BPF arg filter handles dangerous cases; proceed to limit check. let mut rs = resource.lock().await; @@ -75,10 +121,16 @@ pub(crate) async fn handle_fork( } /// If `notif.pid` is not yet tracked in the ProcessIndex, register -/// it: open a pidfd, record the canonical PidKey, and spawn the exit -/// watcher. Called from the supervisor's notification dispatcher -/// before per-syscall handlers run, so handlers can rely on -/// `ProcessIndex::key_for(notif.pid)` returning a fresh PidKey. +/// per-process supervisor state for it: open a pidfd, record the +/// canonical PidKey, and spawn the exit watcher. Called from the +/// supervisor's notification dispatcher before per-syscall handlers +/// run, so handlers can rely on `ProcessIndex::key_for(notif.pid)` +/// returning a fresh PidKey. +/// +/// With `policy_fn` active, fork-like syscalls additionally register +/// new child processes at creation time via ptrace fork events, before +/// the child can run user code. Without `policy_fn`, lazy registration +/// is enough because no argv-based security decision is exposed. /// /// The fast path is a single `RwLock` read: if the pid is already /// tracked, we trust the entry. PID-identity correctness comes from @@ -87,23 +139,352 @@ pub(crate) async fn handle_fork( /// parent has waited (which we observe), so a stale entry has no /// window in which to be hit. We deliberately do *not* re-stat /// /proc//stat on every notification. 
-pub(crate) async fn register_child_if_new(ctx: &Arc, pid: i32) { +pub(crate) fn register_pid_if_new(ctx: &Arc, pid: i32) -> bool { if ctx.processes.contains(pid) { - return; + return true; } let pidfd = match crate::sys::syscall::pidfd_open(pid as u32, 0) { Ok(fd) => fd, - Err(_) => return, // old kernel or process gone — GC backstop will clean up + Err(_) => { + // clone3 can create CLONE_THREAD tasks. Linux 6.9 added + // PIDFD_THREAD so pidfd_open works for non-leader TIDs too. + const PIDFD_THREAD: u32 = libc::O_EXCL as u32; + match crate::sys::syscall::pidfd_open(pid as u32, PIDFD_THREAD) { + Ok(fd) => fd, + Err(_) => { + if matches!(read_tgid_of_tid(pid), Some(tgid) if ctx.processes.contains(tgid)) { + return true; + } + return false; // old kernel or process gone + } + } + } }; let key = match ctx.processes.register(pid) { Some(k) => k, - None => return, // process exited between pidfd_open and stat read + None => return false, // process exited between pidfd_open and stat read }; // Hand the pidfd to the watcher; it owns the fd's lifetime now. spawn_pid_watcher(Arc::clone(ctx), key, pidfd); + true +} + +fn read_tgid_of_tid(tid: i32) -> Option { + let status = std::fs::read_to_string(format!("/proc/{}/status", tid)).ok()?; + for line in status.lines() { + if let Some(rest) = line.strip_prefix("Tgid:") { + return rest.trim().parse().ok(); + } + } + None +} + +pub(crate) async fn register_child_if_new(ctx: &Arc, pid: i32) { + let _ = register_pid_if_new(ctx, pid); +} + +/// One-shot ptrace attachment around a fork-like syscall. RAII guard: +/// on drop, detaches the caller so the supervisor cannot leak a ptrace +/// relationship if a code path between `prepare_*` and `finish_*` +/// panics or returns early. Functions that complete the tracking and +/// detach explicitly should still hand the trace to a consuming +/// function (or let it fall out of scope) — duplicate `PTRACE_DETACH` +/// is harmless (returns ESRCH and is ignored). 
+pub(crate) struct ProcessCreationTrace { + caller_tid: i32, +} + +impl Drop for ProcessCreationTrace { + fn drop(&mut self) { + detach_traced(self.caller_tid); + } +} + +fn is_process_creation_notif(notif: &SeccompNotif) -> bool { + crate::arch::FORK_LIKE_SYSCALLS.contains(&(notif.data.nr as i64)) +} + +/// True when `handle_fork` would have incremented the concurrent +/// process count for this notification if it returned `Continue`. +/// +/// Mirrors the thread-vs-process decision in `handle_fork`: a clone or +/// clone3 with `CLONE_THREAD` does not bump the count, so a later +/// rollback would be wrong. The clone3 flag check involves a racy read +/// from child memory — see `clone_flags`. +pub(crate) fn fork_counted_on_continue(notif: &SeccompNotif, notif_fd: RawFd) -> bool { + is_process_creation_notif(notif) && !is_thread_create(notif, notif_fd) +} + +/// True when this notification can create a new task that must be in +/// `ProcessIndex` before it can race a later execve argv decision. +pub(crate) fn requires_process_creation_tracking( + notif: &SeccompNotif, + notif_fd: RawFd, + policy: &NotifPolicy, +) -> bool { + policy.argv_safety_required && fork_counted_on_continue(notif, notif_fd) +} + +/// Arm ptrace fork-event tracking on the syscall's calling task. +/// +/// The caller is stopped in seccomp user notification when this runs. +/// After the supervisor sends `Continue`, the kernel executes the +/// fork-like syscall and reports `PTRACE_EVENT_{FORK,VFORK,CLONE}`; +/// the new child is born traced/stopped, so we can register it before +/// detaching either task. +/// +/// Runs the blocking `waitpid` on a tokio blocking-pool thread so the +/// notification handler's worker is not stalled if the wait stretches. 
+pub(crate) async fn prepare_process_creation_tracking( + caller_tid: i32, +) -> io::Result { + tokio::task::spawn_blocking(move || prepare_process_creation_tracking_blocking(caller_tid)) + .await + .map_err(|e| { + io::Error::new(io::ErrorKind::Other, format!("spawn_blocking join: {e}")) + })? +} + +fn prepare_process_creation_tracking_blocking( + caller_tid: i32, +) -> io::Result { + let opts = (libc::PTRACE_O_TRACEFORK + | libc::PTRACE_O_TRACEVFORK + | libc::PTRACE_O_TRACECLONE + | libc::PTRACE_O_TRACESYSGOOD) as libc::c_ulong; + let ret = unsafe { + libc::ptrace( + libc::PTRACE_SEIZE as libc::c_uint, + caller_tid, + 0, + opts, + ) + }; + if ret < 0 { + return Err(io::Error::last_os_error()); + } + + // Arm the RAII guard the moment SEIZE succeeds: any early return + // from here to the end of this function detaches via Drop. + let trace = ProcessCreationTrace { caller_tid }; + + let ret = unsafe { + libc::ptrace(libc::PTRACE_INTERRUPT as libc::c_uint, caller_tid, 0, 0) + }; + if ret < 0 { + return Err(io::Error::last_os_error()); + } + wait_for_ptrace_stop(caller_tid)?; + + // Arm a syscall-exit stop as a fallback. A successful fork-like + // syscall reports PTRACE_EVENT_{FORK,VFORK,CLONE}; a failed one has + // no child event, but it still reaches syscall-exit so the + // supervisor will not block forever waiting for a child that was + // never created. 
+ let ret = unsafe { + libc::ptrace(libc::PTRACE_SYSCALL as libc::c_uint, caller_tid, 0, 0) + }; + if ret < 0 { + return Err(io::Error::last_os_error()); + } + + Ok(trace) +} + +fn detach_traced(tid: i32) { + let _ = unsafe { libc::ptrace(libc::PTRACE_DETACH, tid, 0, 0) }; +} + +fn wait_for_ptrace_stop(tid: i32) -> io::Result { + let mut status: libc::c_int = 0; + loop { + let ret = unsafe { libc::waitpid(tid, &mut status, libc::__WALL) }; + if ret < 0 { + let err = io::Error::last_os_error(); + if err.raw_os_error() == Some(libc::EINTR) { + continue; + } + return Err(err); + } + break; + } + + if !libc::WIFSTOPPED(status) { + return Err(io::Error::new( + io::ErrorKind::Other, + format!("unexpected ptrace wait status: {status:#x}"), + )); + } + Ok(status) +} + +fn syscall_stop_kind(tid: i32) -> io::Result { + let mut info: libc::ptrace_syscall_info = unsafe { mem::zeroed() }; + let ret = unsafe { + libc::ptrace( + libc::PTRACE_GET_SYSCALL_INFO as libc::c_uint, + tid, + mem::size_of::(), + &mut info, + ) + }; + if ret < 0 { + return Err(io::Error::last_os_error()); + } + Ok(info.op) +} + +#[cfg(test)] +static CHILD_REGISTERED_HOOK: std::sync::Mutex< + Option>, +> = std::sync::Mutex::new(None); + +#[cfg(test)] +fn child_registered_for_test(child_pid: i32) { + if let Ok(guard) = CHILD_REGISTERED_HOOK.lock() { + if let Some(hook) = guard.as_ref() { + hook(child_pid); + } + } +} + +/// Complete one-shot process-creation tracking after `Continue`. +/// +/// Runs the blocking `waitpid` on a tokio blocking-pool thread so the +/// notification handler's worker is not stalled. +pub(crate) async fn finish_process_creation_tracking( + ctx: &Arc, + trace: ProcessCreationTrace, +) -> io::Result { + let ctx = Arc::clone(ctx); + tokio::task::spawn_blocking(move || finish_process_creation_tracking_blocking(&ctx, trace)) + .await + .map_err(|e| { + io::Error::new(io::ErrorKind::Other, format!("spawn_blocking join: {e}")) + })? 
+} + +fn finish_process_creation_tracking_blocking( + ctx: &Arc, + trace: ProcessCreationTrace, +) -> io::Result { + // Every early return below relies on `trace`'s Drop to detach the + // caller. The success path hands `trace` off to + // `finish_process_creation_event`, which keeps the same guarantee. + loop { + let status = wait_for_ptrace_stop(trace.caller_tid)?; + + let event = (status >> 16) & 0xffff; + let is_fork_event = event == libc::PTRACE_EVENT_FORK + || event == libc::PTRACE_EVENT_VFORK + || event == libc::PTRACE_EVENT_CLONE; + if is_fork_event { + return finish_process_creation_event(ctx, trace); + } + + let stopsig = libc::WSTOPSIG(status); + if event == 0 && stopsig == (libc::SIGTRAP | 0x80) { + let op = syscall_stop_kind(trace.caller_tid)?; + match op { + libc::PTRACE_SYSCALL_INFO_ENTRY => { + let ret = unsafe { + libc::ptrace( + libc::PTRACE_SYSCALL as libc::c_uint, + trace.caller_tid, + 0, + 0, + ) + }; + if ret < 0 { + return Err(io::Error::last_os_error()); + } + continue; + } + libc::PTRACE_SYSCALL_INFO_EXIT => { + return Ok(false); + } + op => { + return Err(io::Error::new( + io::ErrorKind::Other, + format!("unexpected ptrace syscall stop kind: {op}"), + )); + } + } + } + + return Err(io::Error::new( + io::ErrorKind::Other, + format!("unexpected ptrace event: {event}"), + )); + } +} + +fn finish_process_creation_event( + ctx: &Arc, + trace: ProcessCreationTrace, +) -> io::Result { + // `trace` detaches the caller on drop; the explicit child-side + // detaches stay manual since the child is not held by the guard. 
+ let mut child_pid: libc::c_ulong = 0; + let ret = unsafe { + libc::ptrace( + libc::PTRACE_GETEVENTMSG as libc::c_uint, + trace.caller_tid, + 0, + &mut child_pid, + ) + }; + if ret < 0 { + return Err(io::Error::last_os_error()); + } + + let child_pid = child_pid as i32; + if !register_pid_if_new(ctx, child_pid) { + let _ = unsafe { libc::kill(child_pid, libc::SIGKILL) }; + detach_traced(child_pid); + return Err(io::Error::new( + io::ErrorKind::Other, + format!("failed to register new child pid {child_pid}"), + )); + } + #[cfg(test)] + child_registered_for_test(child_pid); + + // Wait for the child's birth-traced ptrace-stop, then detach so it + // can run. Result ignored: the child may have already proceeded + // (PTRACE_O_TRACEFORK leaves it stopped, but a racing exit is + // possible) — detach is harmless either way. + let _ = wait_for_ptrace_stop(child_pid); + detach_traced(child_pid); + drop(trace); + Ok(true) +} + +/// Tear down a tracking session whose `Continue` was never sent +/// (e.g., `send_response` failed). Runs the blocking `waitpid` on the +/// tokio blocking pool. +pub(crate) async fn abort_process_creation_tracking(trace: ProcessCreationTrace) { + let _ = tokio::task::spawn_blocking(move || abort_process_creation_tracking_blocking(trace)) + .await; +} + +fn abort_process_creation_tracking_blocking(trace: ProcessCreationTrace) { + // INTERRUPT + wait so we can detach cleanly from a known state; + // the actual detach happens via `trace`'s Drop on scope exit. + let ret = unsafe { + libc::ptrace( + libc::PTRACE_INTERRUPT as libc::c_uint, + trace.caller_tid, + 0, + 0, + ) + }; + if ret == 0 { + let _ = wait_for_ptrace_stop(trace.caller_tid); + } } /// Handle wait4/waitid notifications — decrement the concurrent process count. @@ -120,6 +501,13 @@ pub(crate) async fn handle_wait( NotifAction::Continue } +/// Undo the optimistic process-count increment if a fork-like syscall +/// is denied after `handle_fork` allowed it. 
+pub(crate) async fn rollback_fork_count(resource: &Arc>) { + let mut rs = resource.lock().await; + rs.proc_count = rs.proc_count.saturating_sub(1); +} + /// Handle memory-related notifications (mmap, munmap, brk, mremap, shmget). /// /// Tracks anonymous memory usage and enforces the configured memory limit. @@ -210,3 +598,296 @@ pub(crate) async fn handle_memory( NotifAction::Continue } + +#[cfg(test)] +mod tests { + use super::*; + use crate::netlink::NetlinkState; + use crate::seccomp::state::{ + ChrootState, CowState, NetworkState, PolicyFnState, ProcessIndex, ProcfsState, + TimeRandomState, + }; + use crate::sys::structs::{SeccompData, SeccompNotif}; + use std::ptr; + + const GO: isize = 0; + const CHILD_RAN: isize = 1; + const REGISTERED_BEFORE_RUN: isize = 2; + const REGISTERED_PID: isize = 3; + const DONE: isize = 4; + const FORK_FAILED: isize = 5; + const FLAGS_LEN: usize = 4096; + + fn fake_notif(nr: i64, arg0: u64) -> SeccompNotif { + SeccompNotif { + id: 0, + pid: 1, + flags: 0, + data: SeccompData { + nr: nr as i32, + arch: 0, + instruction_pointer: 0, + args: [arg0, 0, 0, 0, 0, 0], + }, + } + } + + fn fake_policy(argv_safety_required: bool) -> NotifPolicy { + NotifPolicy { + max_memory_bytes: 0, + max_processes: 0, + has_memory_limit: false, + has_net_allowlist: false, + has_random_seed: false, + has_time_start: false, + argv_safety_required, + time_offset: 0, + num_cpus: None, + port_remap: false, + cow_enabled: false, + chroot_root: None, + chroot_readable: Vec::new(), + chroot_writable: Vec::new(), + chroot_denied: Vec::new(), + chroot_mounts: Vec::new(), + deterministic_dirs: false, + hostname: None, + has_http_acl: false, + virtual_etc_hosts: None, + } + } + + fn fake_supervisor_ctx(argv_safety_required: bool) -> Arc { + Arc::new(SupervisorCtx { + resource: Arc::new(Mutex::new(ResourceState::new(0, 0))), + cow: Arc::new(Mutex::new(CowState::new())), + procfs: Arc::new(Mutex::new(ProcfsState::new())), + network: 
Arc::new(Mutex::new(NetworkState::new())), + time_random: Arc::new(Mutex::new(TimeRandomState::new(None, None))), + policy_fn: Arc::new(Mutex::new(PolicyFnState::new())), + chroot: Arc::new(Mutex::new(ChrootState::new())), + netlink: Arc::new(NetlinkState::new()), + processes: Arc::new(ProcessIndex::new()), + policy: Arc::new(fake_policy(argv_safety_required)), + child_pidfd: None, + notif_fd: -1, + }) + } + + #[test] + fn process_creation_tracking_predicates_follow_argv_safety_gate() { + let no_argv_safety = fake_policy(false); + let argv_safety = fake_policy(true); + let clone_proc = fake_notif(libc::SYS_clone, 0); + let clone_thread = fake_notif(libc::SYS_clone, CLONE_THREAD); + let clone3 = fake_notif(libc::SYS_clone3, 0); + let openat = fake_notif(libc::SYS_openat, 0); + + // notif_fd = -1: clone3's user-memory read fails (id_valid), + // which fail-safes to "not a thread" → counted as process. + // Matches the synthetic clone3 notif's expected accounting. + let fd = -1; + + assert!(fork_counted_on_continue(&clone_proc, fd)); + assert!(!fork_counted_on_continue(&clone_thread, fd)); + assert!(fork_counted_on_continue(&clone3, fd)); + assert!(!fork_counted_on_continue(&openat, fd)); + + assert!(!requires_process_creation_tracking(&clone_proc, fd, &no_argv_safety)); + assert!(requires_process_creation_tracking(&clone_proc, fd, &argv_safety)); + assert!(!requires_process_creation_tracking(&clone_thread, fd, &argv_safety)); + assert!(requires_process_creation_tracking(&clone3, fd, &argv_safety)); + assert!(!requires_process_creation_tracking(&openat, fd, &argv_safety)); + + if let Some(fork_nr) = crate::arch::SYS_FORK { + let fork = fake_notif(fork_nr, 0); + assert!(fork_counted_on_continue(&fork, fd)); + assert!(requires_process_creation_tracking(&fork, fd, &argv_safety)); + } + if let Some(vfork_nr) = crate::arch::SYS_VFORK { + let vfork = fake_notif(vfork_nr, 0); + assert!(fork_counted_on_continue(&vfork, fd)); + 
assert!(requires_process_creation_tracking(&vfork, fd, &argv_safety)); + } + } + + struct SharedFlags { + ptr: *mut i32, + } + + impl SharedFlags { + fn new() -> Self { + let ptr = unsafe { + libc::mmap( + ptr::null_mut(), + FLAGS_LEN, + libc::PROT_READ | libc::PROT_WRITE, + libc::MAP_SHARED | libc::MAP_ANONYMOUS, + -1, + 0, + ) + }; + assert_ne!(ptr, libc::MAP_FAILED, "mmap shared flags"); + Self { + ptr: ptr.cast::(), + } + } + + fn read(&self, slot: isize) -> i32 { + unsafe { ptr::read_volatile(self.ptr.offset(slot)) } + } + + fn write(&self, slot: isize, value: i32) { + unsafe { ptr::write_volatile(self.ptr.offset(slot), value) }; + } + + fn addr(&self) -> usize { + self.ptr as usize + } + } + + impl Drop for SharedFlags { + fn drop(&mut self) { + unsafe { + libc::munmap(self.ptr.cast(), FLAGS_LEN); + } + } + } + + struct HookReset; + + impl Drop for HookReset { + fn drop(&mut self) { + if let Ok(mut hook) = CHILD_REGISTERED_HOOK.lock() { + *hook = None; + } + } + } + + struct CallerGuard { + pid: i32, + flags_addr: usize, + } + + impl CallerGuard { + fn new(pid: i32, flags: &SharedFlags) -> Self { + Self { + pid, + flags_addr: flags.addr(), + } + } + + fn disarm(&mut self) { + self.pid = 0; + } + } + + impl Drop for CallerGuard { + fn drop(&mut self) { + if self.pid <= 0 { + return; + } + let flags = self.flags_addr as *mut i32; + unsafe { + ptr::write_volatile(flags.offset(GO), 1); + ptr::write_volatile(flags.offset(DONE), 1); + libc::kill(self.pid, libc::SIGKILL); + let mut status = 0; + let _ = libc::waitpid(self.pid, &mut status, 0); + } + } + } + + #[cfg(target_arch = "x86_64")] + unsafe fn caller_wait_then_raw_fork(flags: *mut i32) -> ! 
{ + while ptr::read_volatile(flags.offset(GO)) == 0 { + core::hint::spin_loop(); + } + + let pid = libc::syscall(libc::SYS_fork) as i32; + if pid == 0 { + ptr::write_volatile(flags.offset(CHILD_RAN), 1); + while ptr::read_volatile(flags.offset(DONE)) == 0 { + core::hint::spin_loop(); + } + libc::_exit(0); + } + if pid > 0 { + let mut status = 0; + let _ = libc::waitpid(pid, &mut status, 0); + libc::_exit(0); + } + + ptr::write_volatile(flags.offset(FORK_FAILED), 1); + libc::_exit(1); + } + + #[cfg(target_arch = "x86_64")] + #[test] + fn process_creation_tracking_registers_child_before_user_code_runs() { + let flags = SharedFlags::new(); + let flags_addr = flags.addr(); + + let caller = unsafe { libc::fork() }; + assert!(caller >= 0, "fork caller"); + if caller == 0 { + unsafe { caller_wait_then_raw_fork(flags.ptr) }; + } + let mut caller_guard = CallerGuard::new(caller, &flags); + + let _hook_reset = HookReset; + { + let mut hook = CHILD_REGISTERED_HOOK.lock().expect("hook lock"); + *hook = Some(Box::new(move |child_pid| { + let flags = flags_addr as *mut i32; + unsafe { + let child_ran = ptr::read_volatile(flags.offset(CHILD_RAN)); + ptr::write_volatile(flags.offset(REGISTERED_PID), child_pid); + ptr::write_volatile( + flags.offset(REGISTERED_BEFORE_RUN), + if child_ran == 0 { 1 } else { -1 }, + ); + } + })); + } + + let ctx = fake_supervisor_ctx(true); + let rt = tokio::runtime::Builder::new_current_thread() + .enable_io() + .build() + .expect("tokio runtime"); + let trace = match rt.block_on(prepare_process_creation_tracking(caller)) { + Ok(trace) => trace, + Err(e) if matches!(e.raw_os_error(), Some(libc::EPERM | libc::EACCES)) => { + eprintln!("skipping ptrace fork-event test: ptrace denied: {e}"); + return; + } + Err(e) => panic!("prepare process-creation tracking: {e}"), + }; + + flags.write(GO, 1); + let created = rt + .block_on(finish_process_creation_tracking(&ctx, trace)) + .expect("finish process-creation tracking"); + assert!(created, "raw fork should 
produce a ptrace fork event"); + + let registered_pid = flags.read(REGISTERED_PID); + assert!(registered_pid > 0, "child pid should be captured by hook"); + assert!( + ctx.processes.contains(registered_pid), + "child should be registered in ProcessIndex" + ); + assert_eq!( + flags.read(REGISTERED_BEFORE_RUN), + 1, + "child should still be ptrace-stopped when registered" + ); + + flags.write(DONE, 1); + let mut status = 0; + let waited = unsafe { libc::waitpid(caller, &mut status, 0) }; + assert_eq!(waited, caller, "wait caller"); + assert_eq!(flags.read(FORK_FAILED), 0, "raw fork failed in caller"); + caller_guard.disarm(); + } +} diff --git a/crates/sandlock-core/src/sandbox.rs b/crates/sandlock-core/src/sandbox.rs index 02faf11..15c5480 100644 --- a/crates/sandlock-core/src/sandbox.rs +++ b/crates/sandlock-core/src/sandbox.rs @@ -300,7 +300,7 @@ impl Sandbox { /// Create N COW clones of this sandbox. /// /// Requires `new_with_fns()`. Forks a confined child, runs `init_fn`, - /// then forks N times using raw `fork()` (bypasses seccomp). Each + /// then forks N times using raw `fork()`. Each /// clone gets `CLONE_ID=0..N-1` and runs `work_fn(clone_id)`. /// /// Memory pages from `init_fn` are shared copy-on-write across all @@ -997,6 +997,16 @@ impl Sandbox { || !self.policy.http_deny.is_empty(), has_random_seed: self.policy.random_seed.is_some(), has_time_start: self.policy.time_start.is_some(), + // True if any consumer can inspect argv on execve: + // the policy_fn callback or an extra handler bound to + // execve/execveat (which can use read_child_mem). Both + // require the freeze + fork-event tracking to keep + // argv reads TOCTOU-safe. 
+ argv_safety_required: self.policy.policy_fn.is_some() + || self.extra_handlers.iter().any(|h| { + h.syscall_nr == libc::SYS_execve + || h.syscall_nr == libc::SYS_execveat + }), time_offset: time_offset_val, num_cpus: self.policy.num_cpus, port_remap: self.policy.port_remap, @@ -1047,7 +1057,7 @@ impl Sandbox { net_state.port_map.on_bind = Some(cb); } - // ProcfsState (sandbox membership lives in ProcessIndex now). + // ProcfsState (per-notification process state lives in ProcessIndex). let procfs_state = ProcfsState::new(); // ResourceState @@ -1127,8 +1137,10 @@ impl Sandbox { let time_random_state = Arc::new(Mutex::new(time_random_state)); let policy_fn_state = Arc::new(Mutex::new(policy_fn_state)); let chroot_state = Arc::new(Mutex::new(chroot_state)); - // Root child is registered (with watcher) on its first - // notification, the same path grandchildren take. + // Root child gets per-process state (with watcher) on its + // first notification. When policy_fn is active, fork-like + // syscalls are traced at creation time so descendants are + // registered before they can run user code. let processes = Arc::new(crate::seccomp::state::ProcessIndex::new()); let ctx = Arc::new(SupervisorCtx { diff --git a/crates/sandlock-core/src/sandbox_freeze.rs b/crates/sandlock-core/src/sandbox_freeze.rs index d08fe1f..39104ed 100644 --- a/crates/sandlock-core/src/sandbox_freeze.rs +++ b/crates/sandlock-core/src/sandbox_freeze.rs @@ -1,4 +1,4 @@ -//! Freeze sandbox threads of an execve caller before returning Continue. +//! Freeze sandbox threads of an execve caller before exposing argv. //! //! # Why //! @@ -18,20 +18,22 @@ //! that share the calling task's `mm_struct` via //! `clone(CLONE_VM)` without `CLONE_THREAD`. //! -//! `freeze_sandbox_for_execve` closes both classes. It enumerates every -//! TGID tracked in `ProcessIndex` (the canonical sandbox membership -//! set), walks `/proc//task` per TGID, and `PTRACE_SEIZE` + -//! `PTRACE_INTERRUPT` every TID. 
Together with the supervisor's -//! sequential notification dispatch (which prevents new clone/fork -//! notifications from being processed while the freeze is in flight), -//! every entity that could mutate argv is paused before the kernel -//! re-reads. +//! `freeze_sandbox_for_execve` closes both classes. When `policy_fn` +//! is active, every fork-like syscall is traced for one ptrace +//! fork/clone/vfork event and the child is registered in +//! `ProcessIndex` before it can run user code. The exec freeze can +//! therefore enumerate every tracked TGID, walk `/proc/<tgid>/task`, +//! and `PTRACE_SEIZE` + `PTRACE_INTERRUPT` every TID that could mutate +//! argv. //! //! # Sibling vs peer cleanup //! //! Sibling threads (same TGID as the caller) are killed by the kernel -//! during execve's `de_thread` step, so the supervisor never has to -//! detach them — their ptrace state is reaped along with the threads. +//! during execve's `de_thread` step when execve is allowed, so the +//! supervisor does not detach them on the allow path — their ptrace +//! state is reaped along with the threads. If the policy callback +//! denies execve after argv inspection, the supervisor detaches both +//! siblings and peers because `de_thread` will not run. //! //! Peer threads (different TGID) survive execve. The supervisor must //! `PTRACE_DETACH` them after `NOTIF_SEND` so they can resume normal @@ -187,46 +189,45 @@ fn read_tgid_of_tid(tid: i32) -> io::Result<i32> { )) } -/// Outcome of a sandbox-wide freeze. The supervisor must call -/// `detach_peers(&outcome.peer_tids)` after `NOTIF_SEND` to let the -/// peer processes resume. +/// Outcome of a sandbox-wide freeze. #[derive(Debug, Default)] pub(crate) struct SandboxFreeze { + /// Sibling TIDs in the caller's TGID. These die in `de_thread` if + /// execve is allowed, but must be detached if execve is denied + /// after `policy_fn` inspected argv. + pub sibling_tids: Vec<i32>, /// TIDs in *other* TGIDs that were ptrace-stopped. These survive /// execve and must be detached so they can resume normal - /// execution. Siblings of `caller_tid` are deliberately not in - /// this list — execve's `de_thread` kills them and the kernel - /// reaps their ptrace state automatically. + /// execution. pub peer_tids: Vec<i32>, } /// Freeze every sandbox thread that could mutate execve argv before -/// the kernel re-reads it. +/// the supervisor reads it for `policy_fn` and before the kernel +/// re-reads it. /// -/// Walks every TGID in `processes` (and defensively the caller's own -/// TGID), enumerates each TGID's threads via `/proc/<tgid>/task/`, -/// and `PTRACE_SEIZE` + `PTRACE_INTERRUPT` every TID except -/// `caller_tid`. Sibling threads of `caller_tid` and peer threads in -/// other TGIDs are both covered. +/// Walks every TGID in `processes`, enumerates each TGID's threads via +/// `/proc/<tgid>/task/`, and `PTRACE_SEIZE` + `PTRACE_INTERRUPT`s +/// every TID except `caller_tid`. Sibling threads of `caller_tid` and +/// peer threads in other TGIDs are both covered. `processes` is +/// complete for `policy_fn` runs because fork-like syscalls are tracked +/// before new children can run. /// /// Strict semantics: if any task refuses to be frozen, every /// already-frozen task is detached and the error is propagated. The /// caller is expected to deny the execve with `EPERM`, preserving the /// invariant that exposed argv is always TOCTOU-safe. /// -/// On success, returns the list of *peer* TIDs that survive execve and -/// must be detached after `NOTIF_SEND`. Sibling TIDs are not returned -/// because they die in `de_thread`. +/// On success, returns the sibling and peer TIDs that were frozen. The +/// caller detaches peers after an allowed execve, or detaches all TIDs +/// after a denied execve. pub(crate) fn freeze_sandbox_for_execve( processes: &crate::seccomp::state::ProcessIndex, caller_tid: i32, ) -> io::Result<SandboxFreeze> { let caller_tgid = read_tgid_of_tid(caller_tid)?; - - // ProcessIndex is the canonical sandbox membership set. The - // supervisor's `register_child_if_new` runs before per-syscall - // handlers, so the caller's TGID is guaranteed to be present. - let tgids: HashSet<i32> = processes.pids_snapshot(); + let mut tgids: HashSet<i32> = processes.pids_snapshot(); + tgids.insert(caller_tgid); let mut sibling_tids: Vec<i32> = Vec::new(); let mut peer_tids: Vec<i32> = Vec::new(); @@ -266,7 +267,10 @@ pub(crate) fn freeze_sandbox_for_execve( } } - Ok(SandboxFreeze { peer_tids }) + Ok(SandboxFreeze { + sibling_tids, + peer_tids, + }) } /// Detach peer TIDs after the kernel has re-read execve argv. Errors @@ -278,6 +282,17 @@ pub(crate) fn detach_peers(peer_tids: &[i32]) { } } +/// Detach every task in a freeze after execve was denied or the +/// notification response could not be sent. +pub(crate) fn detach_all(freeze: &SandboxFreeze) { + for tid in &freeze.sibling_tids { + detach(*tid); + } + for tid in &freeze.peer_tids { + detach(*tid); + } +} + /// Helper called from the dispatch hot path. Returns true if the /// notification is for an execve-class syscall whose Continue response /// requires freezing siblings. @@ -311,15 +326,45 @@ mod tests { /// issue #27 (Changaco): a peer process in the sandbox — different /// TGID, possibly aliasing argv pages via shared memory — must also /// be frozen before the kernel re-reads execve argv. Sibling-thread - /// freeze alone (`freeze_siblings_for_execve`) does not cover this. + /// freeze alone does not cover this. In real policy_fn runs, + /// fork-like syscall tracking registers peer processes before they + /// can run; this unit test mirrors that completed registration.
+ /// + /// # Why we spawn a separate "caller" process + /// + /// In production, `freeze_sandbox_for_execve` runs in the supervisor + /// process and `caller_tid` is the sandboxed child's tid — i.e. the + /// supervisor and the execve caller are in *different* TGIDs, and + /// every TID the freeze walks is a descendant of the supervisor. + /// Under YAMA `ptrace_scope=1` (the Ubuntu/Debian default), that + /// descendant relationship is exactly what makes PTRACE_SEIZE + /// permitted without any privilege. /// - /// This test registers a peer process in `ProcessIndex` and verifies - /// that `freeze_sandbox_for_execve` puts it in ptrace-stop, the same - /// way `freeze_siblings_for_execve` does for siblings. + /// If this test instead used the test thread's own tid as + /// `caller_tid`, `caller_tgid` would be the cargo test binary's + /// TGID, the freeze would walk the test binary's sibling threads + /// (libtest workers, runtime helpers), and PTRACE_SEIZE would be + /// rejected with EPERM by YAMA — sibling threads are not + /// descendants of each other. That would force the test to require + /// privileges sandlock itself does not require. So we spawn a + /// dedicated "caller" sleep to play the sandboxed-process role, + /// matching production topology. #[test] fn freeze_sandbox_includes_peer_process() { use std::process::{Command, Stdio}; + // The "execve caller" — stands in for the sandboxed process. + // Its tid is a descendant of the test process (the parent), so + // ptracing into its TGID is YAMA-allowed under ptrace_scope=1. 
+ let mut caller = Command::new("/bin/sleep") + .arg("60") + .stdin(Stdio::null()) + .stdout(Stdio::null()) + .stderr(Stdio::null()) + .spawn() + .expect("spawn caller sleep") + let caller_tid = caller.id() as i32; + let mut peer = Command::new("/bin/sleep") .arg("60") .stdin(Stdio::null()) @@ -329,23 +374,19 @@ .expect("spawn peer sleep"); let peer_pid = peer.id() as i32; - // Give the peer a moment to actually be running. + // Give both children a moment to actually be running. std::thread::sleep(std::time::Duration::from_millis(50)); - // Register the peer in a fresh ProcessIndex (mirrors what the - // supervisor's clone/fork notification handler would do). let processes = ProcessIndex::new(); processes .register(peer_pid) .expect("register peer in ProcessIndex"); - let our_tid = unsafe { libc::syscall(libc::SYS_gettid) } as i32; - - let outcome = freeze_sandbox_for_execve(&processes, our_tid) + let outcome = freeze_sandbox_for_execve(&processes, caller_tid) .expect("freeze_sandbox_for_execve"); // Peer's TID is its own TGID (single-threaded sleep), and it's - // a different TGID from ours, so it should be in peer_tids. + // a different TGID from the execve caller, so it should be in peer_tids. assert!( outcome.peer_tids.contains(&peer_pid), "peer pid {} should be in peer_tids: {:?}", @@ -370,5 +411,7 @@ detach_peers(&outcome.peer_tids); let _ = peer.kill(); let _ = peer.wait(); + let _ = caller.kill(); + let _ = caller.wait(); } } diff --git a/crates/sandlock-core/src/seccomp/ctx.rs b/crates/sandlock-core/src/seccomp/ctx.rs index bb1be30..78c9080 100644 --- a/crates/sandlock-core/src/seccomp/ctx.rs +++ b/crates/sandlock-core/src/seccomp/ctx.rs @@ -26,10 +26,12 @@ pub struct SupervisorCtx { pub chroot: Arc<Mutex<ChrootState>>, /// NETLINK_ROUTE virtualization state. pub netlink: Arc<NetlinkState>, - /// Per-process registry: pid → PidKey. Source of truth for - /// "which processes are in the sandbox" and the anchor for - /// unified per-process state cleanup. Wraps an internal RwLock, - /// so handlers can query it synchronously without `.await`. + /// Per-process registry: pid → PidKey. This anchors unified + /// per-process state cleanup. With policy_fn active, fork-like + /// syscalls populate it at child creation time before user code can + /// run; otherwise it is populated lazily from notifications. Wraps + /// an internal RwLock, so handlers can query it synchronously + /// without `.await`. pub processes: Arc<ProcessIndex>, /// Immutable policy — no lock needed. pub policy: Arc<NotifPolicy>, diff --git a/crates/sandlock-core/src/seccomp/dispatch.rs b/crates/sandlock-core/src/seccomp/dispatch.rs index 0dd53af..0adc66b 100644 --- a/crates/sandlock-core/src/seccomp/dispatch.rs +++ b/crates/sandlock-core/src/seccomp/dispatch.rs @@ -199,18 +199,14 @@ pub fn build_dispatch_table( // ------------------------------------------------------------------ // Fork/clone family (always on) // ------------------------------------------------------------------ - let mut fork_nrs = vec![libc::SYS_clone, libc::SYS_clone3]; - if let Some(vfork) = arch::SYS_VFORK { - fork_nrs.push(vfork); - } - for nr in fork_nrs { + for &nr in arch::FORK_LIKE_SYSCALLS { let policy = Arc::clone(policy); let resource = Arc::clone(resource); - table.register(nr, Box::new(move |notif, _ctx, _notif_fd| { + table.register(nr, Box::new(move |notif, _ctx, notif_fd| { let policy = Arc::clone(&policy); let resource = Arc::clone(&resource); Box::pin(async move { - crate::resource::handle_fork(&notif, &resource, &policy).await + crate::resource::handle_fork(&notif, notif_fd, &resource, &policy).await }) })); } @@ -844,6 +840,7 @@ mod extra_handler_tests { has_net_allowlist: false, has_random_seed: false, has_time_start: false, + argv_safety_required: false, time_offset: 0, num_cpus: None, port_remap: false, diff --git a/crates/sandlock-core/src/seccomp/notif.rs b/crates/sandlock-core/src/seccomp/notif.rs index 29414f0..01404d8 100644 +++
b/crates/sandlock-core/src/seccomp/notif.rs @@ -210,6 +210,13 @@ pub struct NotifPolicy { pub has_net_allowlist: bool, pub has_random_seed: bool, pub has_time_start: bool, + /// Argv-safety gate: the supervisor must freeze every task that + /// could mutate argv before any consumer reads it. True when + /// `policy_fn` is active or when an extra handler is bound to + /// execve/execveat (such handlers can call `read_child_mem`). + /// Also gates ptrace fork-event tracking so `ProcessIndex` is + /// complete when the freeze enumerates it. + pub argv_safety_required: bool, pub time_offset: i64, pub num_cpus: Option, pub port_remap: bool, @@ -559,6 +566,7 @@ fn syscall_name(nr: i64) -> &'static str { n if n == libc::SYS_clone => "clone", n if n == libc::SYS_clone3 => "clone3", n if Some(n) == arch::SYS_VFORK => "vfork", + n if Some(n) == arch::SYS_FORK => "fork", n if n == libc::SYS_execve => "execve", n if n == libc::SYS_execveat => "execveat", n if n == libc::SYS_mmap => "mmap", @@ -587,8 +595,8 @@ fn syscall_category(nr: i64) -> crate::policy_fn::SyscallCategory { || n == libc::SYS_sendmsg || n == libc::SYS_bind || n == libc::SYS_getsockname => SyscallCategory::Network, n if n == libc::SYS_clone || n == libc::SYS_clone3 - || Some(n) == arch::SYS_VFORK || n == libc::SYS_execve - || n == libc::SYS_execveat => SyscallCategory::Process, + || Some(n) == arch::SYS_VFORK || Some(n) == arch::SYS_FORK + || n == libc::SYS_execve || n == libc::SYS_execveat => SyscallCategory::Process, n if n == libc::SYS_mmap || n == libc::SYS_munmap || n == libc::SYS_brk || n == libc::SYS_mremap => SyscallCategory::Memory, @@ -825,10 +833,11 @@ async fn emit_policy_event( // decision is racy (issue #27). Path-based access control belongs // in static Landlock rules. 
// - // argv IS extracted for execve/execveat: the supervisor freezes - // every task in the sandbox (siblings + peers) before returning - // Continue (sandbox_freeze module), so the post-Continue re-read - // sees the same memory we read here. + // argv IS extracted for allowed execve/execveat notifications: + // the supervisor freezes every task in the sandbox (siblings + + // peers) before this callback reads argv and keeps that freeze + // through Continue, so the post-Continue re-read sees the same + // memory we read here. // // Network fields are TOCTOU-safe because connect/sendto/bind are // performed on-behalf via pidfd_getfd; the kernel never re-reads @@ -838,7 +847,7 @@ async fn emit_policy_event( let mut size = None; let mut argv = None; - if nr == libc::SYS_execve || nr == libc::SYS_execveat { + if !denied && (nr == libc::SYS_execve || nr == libc::SYS_execveat) { // execve(pathname, argv, envp): args[1] = argv ptr // execveat(dirfd, pathname, argv, ..): args[2] = argv ptr let argv_ptr = if nr == libc::SYS_execveat { @@ -914,10 +923,11 @@ async fn handle_notification( ) { let policy = &ctx.policy; - // Ensure every pid that produces a notification is tracked in the - // ProcessIndex with an exit watcher. The fork handler runs on the - // *parent* pid (the child doesn't exist yet at clone-time), so the - // child gets registered the first time it issues its own syscall. + // Ensure every pid that produces a notification has per-process + // supervisor state and an exit watcher. The fork handler runs on + // the *parent* pid (the child doesn't exist yet at clone-time), so + // the child gets registered the first time it issues a notified + // syscall. crate::resource::register_child_if_new(ctx, notif.pid as i32).await; // Re-patch vDSO if needed (exec replaces it with a fresh copy). 
@@ -951,39 +961,26 @@ } }; - // Emit event to policy_fn callback if active - if let Some(verdict) = emit_policy_event(&notif, &action, &ctx.policy_fn, fd).await { - use crate::policy_fn::Verdict; - match verdict { - Verdict::Deny => { action = NotifAction::Errno(libc::EPERM); } - Verdict::DenyWith(errno) => { action = NotifAction::Errno(errno); } - Verdict::Audit => { /* allow, but could log here */ } - Verdict::Allow => {} - } - } + let nr = notif.data.nr as i64; + let fork_counted = matches!(action, NotifAction::Continue) + && crate::resource::fork_counted_on_continue(&notif, fd); // TOCTOU-close for execve (issue #27): freeze every sandbox task - // that could mutate argv before the kernel re-reads it after - // Continue. This covers two distinct writer classes: + // that could mutate argv before policy_fn reads argv and before the + // kernel re-reads it after Continue. This covers two writer classes: // 1. Sibling threads of the calling tid (same TGID, share mm). // 2. Peer processes in other TGIDs that alias argv pages via // MAP_SHARED mappings or share mm via clone(CLONE_VM). - // Sibling-thread freeze alone closed (1) but not (2), as raised - // by Changaco on issue #27. // - // Only relevant when sending Continue: a denial (Errno) means the - // kernel never re-reads, so no freeze is needed. + // The freeze enumerates ProcessIndex. With policy_fn active, that + // index is complete: fork-like syscalls are traced at creation time + // below, before new children can run user code. // // Strict on failure: if we cannot establish the freeze, we cannot - // uphold the argv-safety invariant, so we deny the execve with - // EPERM rather than letting it through unprotected. - // - // Sibling threads die in execve's de_thread; the kernel reaps - // their ptrace state. Peer threads survive — we detach them after - // NOTIF_SEND so they resume normally. - let nr = notif.data.nr as i64; - let mut peer_tids_to_detach: Vec<i32> = Vec::new(); + // safely expose argv or allow execve, so we deny with EPERM. + let mut exec_freeze = None; if matches!(action, NotifAction::Continue) + && policy.argv_safety_required && crate::sandbox_freeze::requires_freeze_on_continue(nr) { match crate::sandbox_freeze::freeze_sandbox_for_execve( @@ -991,7 +988,7 @@ notif.pid as i32, ) { Ok(outcome) => { - peer_tids_to_detach = outcome.peer_tids; + exec_freeze = Some(outcome); } Err(e) => { eprintln!( @@ -1004,14 +1001,80 @@ } } + // Emit event to policy_fn callback if active. For execve, argv is + // only populated after `exec_freeze` has stopped every possible + // writer, and those tasks stay stopped until after NOTIF_SEND. + if let Some(verdict) = emit_policy_event(&notif, &action, &ctx.policy_fn, fd).await { + use crate::policy_fn::Verdict; + match verdict { + Verdict::Deny => { action = NotifAction::Errno(libc::EPERM); } + Verdict::DenyWith(errno) => { action = NotifAction::Errno(errno); } + Verdict::Audit => { /* allow, but could log here */ } + Verdict::Allow => {} + } + } + + if fork_counted && !matches!(action, NotifAction::Continue) { + crate::resource::rollback_fork_count(&ctx.resource).await; + } + + // With policy_fn active, fork-like syscalls are traced for exactly + // one ptrace event so ProcessIndex becomes complete before the new + // child can run user code. That closes the race where a peer + // process could exist without ever having produced a notification. + let mut creation_trace = None; + if matches!(action, NotifAction::Continue) + && crate::resource::requires_process_creation_tracking(&notif, fd, policy) + { + match crate::resource::prepare_process_creation_tracking(notif.pid as i32).await { + Ok(trace) => { + creation_trace = Some(trace); + } + Err(e) => { + eprintln!( + "sandlock: process-creation tracking failed for pid {}: {} \ + — denying fork-like syscall to preserve argv TOCTOU invariant", + notif.pid, e + ); + if fork_counted { + crate::resource::rollback_fork_count(&ctx.resource).await; + } + action = NotifAction::Errno(libc::EPERM); + } + } + } + // Ignore error — child may have exited between recv and response. - let _ = send_response(fd, notif.id, action); + let exec_continued = exec_freeze.is_some() && matches!(action, NotifAction::Continue); + let send_result = send_response(fd, notif.id, action); + + if let Some(trace) = creation_trace { + if send_result.is_ok() { + match crate::resource::finish_process_creation_tracking(ctx, trace).await { + Ok(true) => {} + Ok(false) => { + crate::resource::rollback_fork_count(&ctx.resource).await; + } + Err(e) => { + crate::resource::rollback_fork_count(&ctx.resource).await; + eprintln!( + "sandlock: process-creation tracking completion failed for pid {}: {}", + notif.pid, e + ); + } + } + } else { + crate::resource::rollback_fork_count(&ctx.resource).await; + crate::resource::abort_process_creation_tracking(trace).await; + } + } - // Detach peer processes after NOTIF_SEND so they resume. Siblings - // of the caller's TGID are intentionally not detached: they die in - // execve's de_thread and the kernel reaps the ptrace state.
- if !peer_tids_to_detach.is_empty() { - crate::sandbox_freeze::detach_peers(&peer_tids_to_detach); + if let Some(freeze) = exec_freeze { + if exec_continued && send_result.is_ok() { + crate::sandbox_freeze::detach_peers(&freeze.peer_tids); + } else { + crate::sandbox_freeze::detach_all(&freeze); + } } } diff --git a/crates/sandlock-core/src/seccomp/state.rs b/crates/sandlock-core/src/seccomp/state.rs index 199e7ec..b0a3a97 100644 --- a/crates/sandlock-core/src/seccomp/state.rs +++ b/crates/sandlock-core/src/seccomp/state.rs @@ -47,10 +47,10 @@ impl ResourceState { // ProcfsState — /proc virtualization state // ============================================================ -/// /proc virtualization runtime state. Sandbox membership lives in -/// `ProcessIndex`; per-process getdents caches live in -/// `PerProcessState::procfs_dir_cache`. This struct only holds -/// truly global virtualization state. +/// /proc virtualization runtime state. Per-notification process state +/// lives in `ProcessIndex`; per-process getdents caches live in +/// `PerProcessState::procfs_dir_cache`. This struct only holds truly +/// global virtualization state. pub struct ProcfsState { /// Base address of the last vDSO we patched (0 = not yet patched). pub vdso_patched_addr: u64, @@ -117,10 +117,17 @@ pub struct PerProcessState { } // ============================================================ -// ProcessIndex — sandbox membership + per-process state +// ProcessIndex — tracked processes + per-process state // ============================================================ -/// Source-of-truth registry for processes inside the sandbox. +/// Registry for tracked sandbox processes plus their per-process +/// supervisor state. +/// +/// In the default supervisor this is populated lazily from seccomp +/// notifications. 
When `policy_fn` is active, fork-like syscalls are +/// additionally traced for one ptrace creation event so children are +/// inserted here before they can run user code; this makes the index +/// complete for argv-safety freezes. /// /// Maps the kernel's numeric `pid` (the value that arrives in seccomp /// notifications) to the canonical `PidKey` plus an @@ -188,8 +195,8 @@ impl ProcessIndex { .map(|e| (e.key, Arc::clone(&e.state))) } - /// Cheap membership test — used by /proc virtualization to gate - /// access to `/proc//...` paths and by getdents filtering. + /// Cheap tracked-process test — used by /proc virtualization to + /// gate access to `/proc//...` paths and by getdents filtering. pub fn contains(&self, pid: i32) -> bool { self.inner .read() diff --git a/crates/sandlock-core/tests/integration/test_resource.rs b/crates/sandlock-core/tests/integration/test_resource.rs index 5d29f9d..d57148b 100644 --- a/crates/sandlock-core/tests/integration/test_resource.rs +++ b/crates/sandlock-core/tests/integration/test_resource.rs @@ -207,6 +207,43 @@ async fn test_process_limit_allows_sequential_reuse() { let _ = std::fs::remove_file(&out); } +#[tokio::test] +async fn test_threads_do_not_count_toward_process_limit_clone3() { + // Regression: handle_fork only checked CLONE_THREAD on SYS_clone, not + // SYS_clone3 (whose flags live in a clone_args struct in user memory). + // glibc 2.34+ implements pthread_create via clone3, so spawning many + // Python threads under a tight max_processes would over-count and the + // thread creations would fail with EAGAIN once the limit was hit. 
+ let out = temp_path("clone3-threads"); + + let script = format!(concat!( + "import threading, time\n", + "barrier = threading.Barrier(11)\n", // 10 threads + main + "def worker():\n", + " barrier.wait()\n", + "ts = [threading.Thread(target=worker) for _ in range(10)]\n", + "for t in ts: t.start()\n", + "barrier.wait()\n", // every thread reached this point => all 10 alive together + "for t in ts: t.join()\n", + "open('{out}', 'w').write('ok')\n", + ), out = out.display()); + + // max_processes=2 leaves zero headroom for child *processes*, so any + // pre-fix bug that counted threads as processes would block thread + // creation immediately. + let policy = base_policy().max_processes(2).build().unwrap(); + let result = Sandbox::run_interactive(&policy, &["python3", "-c", &script]).await.unwrap(); + assert!( + matches!(result.exit_status, ExitStatus::Code(0)), + "python should exit 0; got {:?}", + result.exit_status, + ); + let content = std::fs::read_to_string(&out).expect("temp file should exist"); + assert_eq!(content, "ok", "all 10 threads should have started concurrently"); + + let _ = std::fs::remove_file(&out); +} + #[tokio::test] async fn test_memory_limit_enforced() { let out = temp_path("mem-limit");