diff --git a/crates/sandlock-core/src/arch.rs b/crates/sandlock-core/src/arch.rs index 14bd7f1..89969e4 100644 --- a/crates/sandlock-core/src/arch.rs +++ b/crates/sandlock-core/src/arch.rs @@ -3,6 +3,7 @@ #[cfg(target_arch = "x86_64")] mod imp { pub const AUDIT_ARCH: u32 = 0xC000_003E; + pub const MAX_SYSCALL_NR: i64 = 462; pub const SYS_SECCOMP: i64 = 317; pub const SYS_MEMFD_CREATE: i64 = 319; pub const SYS_PIDFD_OPEN: i64 = 434; @@ -45,6 +46,7 @@ mod imp { #[cfg(target_arch = "aarch64")] mod imp { pub const AUDIT_ARCH: u32 = 0xC000_00B7; + pub const MAX_SYSCALL_NR: i64 = 463; pub const SYS_SECCOMP: i64 = 277; pub const SYS_MEMFD_CREATE: i64 = 279; pub const SYS_PIDFD_OPEN: i64 = 434; @@ -82,6 +84,16 @@ mod imp { pub use imp::*; +/// True if `nr` is plausibly a syscall number on the current architecture. +/// Used by [`crate::seccomp::syscall::Syscall::checked`] to reject foot-gun +/// cases like negative or arch-mismatched numbers. +/// +/// Conservative: validates `0 <= nr <= MAX_SYSCALL_NR`. Doesn't enumerate +/// every nr — kernel's seccomp filter rejects unknowns at JEQ stage anyway. +pub fn is_known_syscall(nr: i64) -> bool { + nr >= 0 && nr <= imp::MAX_SYSCALL_NR +} + pub fn push_optional_syscall(v: &mut Vec, nr: Option) { if let Some(nr) = nr { v.push(nr as u32); diff --git a/crates/sandlock-core/src/context.rs b/crates/sandlock-core/src/context.rs index aebcc6f..eeff9b8 100644 --- a/crates/sandlock-core/src/context.rs +++ b/crates/sandlock-core/src/context.rs @@ -763,7 +763,7 @@ pub(crate) struct ChildSpawnArgs<'a> { /// Sandbox instance name. When set, it is also exposed as the /// sandbox's virtual hostname. pub sandbox_name: Option<&'a str>, - /// Syscall numbers for which the parent registered `ExtraHandler`s. + /// Syscall numbers for which the parent registered user `Handler`s. /// Merged into the child's BPF notif list so the kernel actually /// raises USER_NOTIF for them. pub extra_syscalls: &'a [u32], diff --git a/crates/sandlock-core/src/error.rs b/crates/sandlock-core/src/error.rs index f929c23..463c8dc 100644 --- a/crates/sandlock-core/src/error.rs +++ b/crates/sandlock-core/src/error.rs @@ -11,6 +11,9 @@ pub enum SandlockError { #[error("memory protection error: {0}")] MemoryProtect(String), + + #[error("handler error: {0}")] + Handler(#[from] crate::seccomp::dispatch::HandlerError), } #[derive(Debug, Error)] diff --git a/crates/sandlock-core/src/lib.rs b/crates/sandlock-core/src/lib.rs index e0d3533..4b450ba 100644 --- a/crates/sandlock-core/src/lib.rs +++ b/crates/sandlock-core/src/lib.rs @@ -35,6 +35,10 @@ pub use sandbox::Sandbox; pub use pipeline::{Stage, Pipeline, Gather}; pub use dry_run::{Change, ChangeKind, DryRunResult}; +// Public extension API — see docs/extension-handlers.md. +pub use seccomp::dispatch::{Handler, HandlerCtx, HandlerError}; +pub use seccomp::syscall::{Syscall, SyscallError}; + /// Query the Landlock ABI version supported by the running kernel. pub fn landlock_abi_version() -> Result { landlock::abi_version() diff --git a/crates/sandlock-core/src/sandbox.rs b/crates/sandlock-core/src/sandbox.rs index af053ae..2bcd2bf 100644 --- a/crates/sandlock-core/src/sandbox.rs +++ b/crates/sandlock-core/src/sandbox.rs @@ -109,9 +109,10 @@ pub struct Sandbox { /// Optional callback invoked when a port bind is recorded. #[allow(clippy::type_complexity)] on_bind: Option) + Send + Sync>>, - /// User-supplied extra syscall handlers. Taken on spawn and - /// appended to the dispatch table after all builtin handlers. - extra_handlers: Vec, + /// User-supplied extra syscall handlers as `(syscall_nr, Arc)` + /// pairs. Taken on spawn and appended to the dispatch table after + /// all builtin handlers. + extra_handlers: Vec<(i64, Arc)>, } impl Sandbox { @@ -182,7 +183,9 @@ impl Sandbox { name: Option<&str>, cmd: &[&str], ) -> Result { - Self::run_with_extra_handlers(policy, name, cmd, Vec::new()).await + let mut sb = Self::new(policy, name)?; + sb.do_spawn(cmd, true).await?; + sb.wait().await } /// Run a sandboxed process with inherited stdio (interactive mode). @@ -198,69 +201,90 @@ impl Sandbox { /// One-shot run with user-supplied syscall handlers. /// - /// `extra_handlers` are registered in the dispatch table **after** all - /// builtin handlers for the same syscall. They observe the post-builtin - /// view (e.g. [`chroot`]-normalized paths on `openat`) and cannot be used - /// to bypass builtin confinement. See - /// [`crate::seccomp::dispatch::ExtraHandler`] for the ordering contract. + /// `extra_handlers` is any `IntoIterator` over `(syscall, handler)` pairs + /// where: + /// + /// * `syscall: S` is anything implementing `TryInto` — `i64`/`u32` + /// raw numbers (validated through + /// [`crate::seccomp::syscall::Syscall::checked`]), or a pre-validated + /// [`crate::seccomp::syscall::Syscall`]. + /// * `handler: H` is anything implementing + /// [`crate::seccomp::dispatch::Handler`] — a struct with explicit + /// `impl Handler` for stateful handlers, or a closure of shape + /// `Fn(&HandlerCtx) -> impl Future` via the + /// blanket impl. /// - /// When called with an empty vector, this function is identical to - /// [`Self::run`]. + /// Handlers are registered in the dispatch table **after** all builtin + /// handlers for the same syscall, so they observe the post-builtin view + /// (e.g. `chroot`-normalized paths on `openat`) and cannot bypass builtin + /// confinement. + /// + /// Validation happens up-front (before fork): each `syscall` is checked + /// through `Syscall::checked`, and the deny-list contract is enforced via + /// [`crate::seccomp::dispatch::validate_handler_syscalls_against_policy`]. /// /// # Example /// /// ```ignore /// use sandlock_core::{Policy, Sandbox}; - /// use sandlock_core::seccomp::dispatch::{ExtraHandler, HandlerFn}; /// use sandlock_core::seccomp::notif::NotifAction; /// /// # tokio_test::block_on(async { /// let policy = Policy::builder().fs_read("/usr").build().unwrap(); /// - /// let audit: HandlerFn = Box::new(|notif, _ctx, _fd| { - /// Box::pin(async move { - /// eprintln!("openat from pid {}", notif.data.pid); + /// let audit = |cx: &sandlock_core::HandlerCtx<'_>| { + /// let pid = cx.notif.data.pid; + /// async move { + /// eprintln!("openat from pid {}", pid); /// NotifAction::Continue - /// }) - /// }); + /// } + /// }; /// /// let result = Sandbox::run_with_extra_handlers( /// &policy, /// Some("audit"), /// &["/usr/bin/true"], - /// vec![ExtraHandler::new(libc::SYS_openat, audit)], + /// [(libc::SYS_openat, audit)], /// ).await.unwrap(); /// # }); /// ``` - pub async fn run_with_extra_handlers( + pub async fn run_with_extra_handlers( policy: &Policy, name: Option<&str>, cmd: &[&str], - extra_handlers: Vec, - ) -> Result { - // Reject extras that would weaken confinement (e.g. one registered - // on a default-deny syscall). See - // [`crate::seccomp::dispatch::validate_extras_against_policy`] for the - // rationale. Done before fork so the caller gets a clear error - // instead of a silently-broken sandbox. - if let Err(nr) = - crate::seccomp::dispatch::validate_extras_against_policy(&extra_handlers, policy) - { - return Err(SandboxError::Child(format!( - "ExtraHandler on syscall {} conflicts with the deny list \ - (DEFAULT_DENY_SYSCALLS or policy.deny_syscalls) and would let \ - user code bypass it via SECCOMP_USER_NOTIF_FLAG_CONTINUE", - nr - )) - .into()); - } - + extra_handlers: I, + ) -> Result + where + I: IntoIterator, + S: TryInto, + H: crate::seccomp::dispatch::Handler, + { + let pending = collect_extra_handlers(extra_handlers, policy)?; let mut sb = Self::new(policy, name)?; - sb.extra_handlers = extra_handlers; + sb.extra_handlers = pending; sb.do_spawn(cmd, true).await?; sb.wait().await } + /// Interactive-stdio counterpart of [`Self::run_with_extra_handlers`]. + pub async fn run_interactive_with_extra_handlers( + policy: &Policy, + name: Option<&str>, + cmd: &[&str], + extra_handlers: I, + ) -> Result + where + I: IntoIterator, + S: TryInto, + H: crate::seccomp::dispatch::Handler, + { + let pending = collect_extra_handlers(extra_handlers, policy)?; + let mut sb = Self::new(policy, name)?; + sb.extra_handlers = pending; + sb.do_spawn(cmd, false).await?; + sb.wait().await + } + /// Dry-run: spawn, wait, collect filesystem changes, then abort. /// Returns the run result plus a list of changes that would have been /// committed. The workdir is left unchanged. @@ -939,7 +963,7 @@ impl Sandbox { let extra_syscalls: Vec = self .extra_handlers .iter() - .map(|h| h.syscall_nr as u32) + .map(|h| h.0 as u32) .collect(); // This never returns. @@ -1036,8 +1060,8 @@ impl Sandbox { // argv reads TOCTOU-safe. argv_safety_required: self.policy.policy_fn.is_some() || self.extra_handlers.iter().any(|h| { - h.syscall_nr == libc::SYS_execve - || h.syscall_nr == libc::SYS_execveat + h.0 == libc::SYS_execve + || h.0 == libc::SYS_execveat }), time_offset: time_offset_val, num_cpus: self.policy.num_cpus, @@ -1207,7 +1231,8 @@ impl Sandbox { }); // Spawn notif supervisor. `extra_handlers` is consumed here - // (moved into the supervisor task) because HandlerFn is not Clone. + // (moved into the supervisor task) because each `Arc` + // is shared with the dispatch table and must outlive it. let extra_handlers = std::mem::take(&mut self.extra_handlers); self.notif_handle = Some(tokio::spawn( notif::supervisor(notif_fd, ctx, extra_handlers), @@ -1246,6 +1271,40 @@ impl Sandbox { } } +// ============================================================ +// Helpers +// ============================================================ + +/// Convert a user-supplied iterator of `(syscall, handler)` pairs into +/// the internal `Vec<(i64, Arc)>` shape used by the +/// supervisor, validating each syscall up-front against the deny list. +fn collect_extra_handlers( + extra_handlers: I, + policy: &Policy, +) -> Result)>, SandlockError> +where + I: IntoIterator, + S: TryInto, + H: crate::seccomp::dispatch::Handler, +{ + use crate::seccomp::dispatch::{Handler, HandlerError}; + + let pending: Vec<(i64, Arc)> = extra_handlers + .into_iter() + .map(|(syscall, handler)| { + let nr = syscall.try_into().map_err(HandlerError::from)?.raw(); + let h: Arc = Arc::new(handler); + Ok::<_, HandlerError>((nr, h)) + }) + .collect::>()?; + + let nrs: Vec = pending.iter().map(|(nr, _)| *nr).collect(); + crate::seccomp::dispatch::validate_handler_syscalls_against_policy(&nrs, policy) + .map_err(|nr_u| HandlerError::OnDenySyscall { syscall_nr: nr_u as i64 })?; + + Ok(pending) +} + // ============================================================ // Drop — kill and reap child if still running // ============================================================ diff --git a/crates/sandlock-core/src/seccomp/dispatch.rs b/crates/sandlock-core/src/seccomp/dispatch.rs index 3420a7f..c3e52ee 100644 --- a/crates/sandlock-core/src/seccomp/dispatch.rs +++ b/crates/sandlock-core/src/seccomp/dispatch.rs @@ -17,76 +17,134 @@ // them approve a syscall based on user-memory contents. use std::collections::HashMap; -use std::future::Future; use std::os::unix::io::RawFd; -use std::pin::Pin; use std::sync::Arc; use super::ctx::SupervisorCtx; use super::notif::{NotifAction, NotifPolicy}; use super::state::ResourceState; +use super::syscall::SyscallError; use crate::arch; use crate::sys::structs::SeccompNotif; +use thiserror::Error; use tokio::sync::Mutex; // ============================================================ // Types // ============================================================ -/// An async handler function. Receives the notification, the supervisor -/// context, and the notif fd. Returns a `NotifAction`. -pub type HandlerFn = Box< - dyn Fn(SeccompNotif, Arc, RawFd) -> Pin + Send>> - + Send - + Sync, ->; +// ============================================================ +// Handler trait — the new public extension API. +// ============================================================ -/// A user-supplied handler bound to a specific syscall number. -/// -/// Passed to [`crate::Sandbox::run_with_extra_handlers`]; appended to the -/// dispatch table **after** all builtin handlers for the same syscall. +/// Public extension trait for sandlock seccomp-notif handlers. /// -/// # Ordering and security boundary +/// Each implementor is registered against a [`crate::seccomp::syscall::Syscall`] +/// through [`crate::Sandbox::run_with_extra_handlers`] / +/// [`crate::Sandbox::run_interactive_with_extra_handlers`]. Receives +/// `&HandlerCtx` borrowed for the call; cannot outlive the dispatch +/// invocation. /// -/// Within a syscall's chain, handlers run in registration order and the -/// first non-[`NotifAction::Continue`] result wins. Builtin handlers are -/// registered first (for example `chroot` path-normalization on `openat`), -/// so an `ExtraHandler` observes the post-builtin view of each syscall. -/// This ordering is fixed and cannot be changed by downstream crates — -/// it is the security boundary that prevents user handlers from bypassing -/// sandlock confinement. +/// State lives on the implementor — no `Arc::clone` ladders, no +/// closure ceremony at registration time. /// -/// # Example -/// -/// ```ignore -/// use sandlock_core::seccomp::dispatch::{ExtraHandler, HandlerFn}; -/// use sandlock_core::seccomp::notif::NotifAction; +/// `handle` returns a boxed `Future` so the trait stays dyn-compatible +/// (the supervisor stores user handlers as `Vec>`, +/// keyed by syscall number). Returning `impl Future` directly via +/// RPITIT would be more efficient but is not object-safe, and changing +/// the storage to a non-erased shape would force a generic dispatch +/// chain incompatible with arbitrary user handler types. +pub trait Handler: Send + Sync + 'static { + fn handle<'a>( + &'a self, + cx: &'a HandlerCtx, + ) -> std::pin::Pin + Send + 'a>>; +} + +/// Context passed to `Handler::handle`. /// -/// let audit: HandlerFn = Box::new(|notif, _ctx, _fd| { -/// Box::pin(async move { -/// eprintln!("openat from pid {}", notif.data.pid); -/// NotifAction::Continue -/// }) -/// }); +/// `notif` is the kernel notification (owned by value — it's a small +/// `repr(C)` struct, cheap to copy). `notif_fd` is the supervisor's +/// seccomp listener fd, used by helpers like `read_child_mem` / +/// `write_child_mem` / `read_child_cstr` for TOCTOU-safe child memory +/// access. /// -/// let extras = vec![ExtraHandler::new(libc::SYS_openat, audit)]; -/// ``` -pub struct ExtraHandler { - pub syscall_nr: i64, - pub handler: HandlerFn, +/// Handler state lives on the implementor (`&self`). Supervisor-internal +/// state is intentionally not exposed here so the `SupervisorCtx` +/// internal fields are not part of the downstream extension contract. +pub struct HandlerCtx { + pub notif: SeccompNotif, + pub notif_fd: RawFd, +} + +// Blanket impl: any Fn(&HandlerCtx) -> Future is a Handler. +// +// Lets lightweight closure-style handlers work without ceremony at the +// call site. Handlers that need state should use `struct + explicit +// impl Handler` instead. +impl Handler for F +where + F: Fn(&HandlerCtx) -> Fut + Send + Sync + 'static, + Fut: std::future::Future + Send + 'static, +{ + fn handle<'a>( + &'a self, + cx: &'a HandlerCtx, + ) -> std::pin::Pin + Send + 'a>> { + Box::pin((self)(cx)) + } } -impl ExtraHandler { - pub fn new(syscall_nr: i64, handler: HandlerFn) -> Self { - Self { syscall_nr, handler } +// Concrete impls for `Box` and `Arc` so callers +// can erase concrete handler types behind a smart pointer when mixing +// different handler shapes in one `IntoIterator` passed to +// `run_with_extra_handlers` — e.g. `Vec<(i64, Box)>` lets a +// downstream register handlers of different concrete types without +// writing a per-crate wrapper enum. +// +// These are concrete `Box` / `Arc` rather than +// `` blankets to avoid coherence overlap with the +// `impl Handler for F where F: Fn(&HandlerCtx) -> Fut` blanket +// above. +impl Handler for Box { + fn handle<'a>( + &'a self, + cx: &'a HandlerCtx, + ) -> std::pin::Pin + Send + 'a>> { + (**self).handle(cx) } } -/// Reject extras that would weaken sandlock's confinement guarantees. +impl Handler for std::sync::Arc { + fn handle<'a>( + &'a self, + cx: &'a HandlerCtx, + ) -> std::pin::Pin + Send + 'a>> { + (**self).handle(cx) + } +} + +/// Errors raised when registering user handlers via +/// [`crate::Sandbox::run_with_extra_handlers`]. +#[derive(Debug, Error, PartialEq, Eq)] +pub enum HandlerError { + #[error("invalid syscall in handler registration: {0}")] + InvalidSyscall(#[from] SyscallError), + + #[error( + "handler on syscall {syscall_nr} conflicts with the deny list \ + (DEFAULT_DENY_SYSCALLS or policy.deny_syscalls) and would let \ + user code bypass it via SECCOMP_USER_NOTIF_FLAG_CONTINUE" + )] + OnDenySyscall { syscall_nr: i64 }, +} + +/// Reject handler registrations that would weaken sandlock's confinement +/// guarantees. /// /// The cBPF program emits notif JEQs *before* deny JEQs, so a syscall -/// present in both lists hits `SECCOMP_RET_USER_NOTIF` first. An extra +/// present in both lists hits `SECCOMP_RET_USER_NOTIF` first. A handler /// registered on a syscall that is on the deny list would therefore /// convert a kernel-deny into a user-supervised path: a handler returning /// `NotifAction::Continue` becomes `SECCOMP_USER_NOTIF_FLAG_CONTINUE` and @@ -98,38 +156,36 @@ impl ExtraHandler { /// `allow_syscalls` is set; both branches are guarded by this function. /// /// **Allowlist mode** (`policy.allow_syscalls = Some(_)`): the resolved -/// deny list is empty, so this function returns `Ok(())` for any extra. +/// deny list is empty, so this function returns `Ok(())` for any syscall. /// That is sound because the BPF deny block is empty in this mode too — /// confinement comes from the allowlist enforced at the kernel level, -/// and there is no notif/deny overlap for an extra to bypass. +/// and there is no notif/deny overlap to bypass. +/// +/// Takes only the syscall numbers because that's all it needs to check. +/// Called from the `run_with_extra_handlers` entry points before any +/// handler is registered against the dispatch table. /// /// Returns the offending syscall number on rejection so the caller can /// surface it to the end user. -/// -/// Visibility: kept `pub(crate)` because the only safe consumption path -/// is via [`crate::Sandbox::run_with_extra_handlers`], which calls this -/// function before fork. Downstream crates that pre-build their own -/// `Vec` get the same enforcement transparently through -/// that entry point — there is no `ExtraHandler::register_into` API -/// that would let a user bypass it. -pub(crate) fn validate_extras_against_policy( - extras: &[ExtraHandler], +pub(crate) fn validate_handler_syscalls_against_policy( + syscall_nrs: &[i64], policy: &crate::policy::Policy, ) -> Result<(), u32> { let deny: std::collections::HashSet = crate::context::deny_syscall_numbers(policy).into_iter().collect(); - for extra in extras { - let nr = extra.syscall_nr as u32; - if deny.contains(&nr) { - return Err(nr); + for &nr in syscall_nrs { + let nr_u = nr as u32; + if deny.contains(&nr_u) { + return Err(nr_u); } } Ok(()) } + /// Ordered chain of handlers for a single syscall number. struct HandlerChain { - handlers: Vec, + handlers: Vec>, } /// Maps syscall numbers to handler chains. @@ -145,29 +201,43 @@ impl DispatchTable { } } - /// Register a handler for the given syscall number. Handlers are called in - /// registration order; the first non-Continue result wins. - pub fn register(&mut self, syscall_nr: i64, handler: HandlerFn) { + /// Register a handler for the given syscall number. Handlers are + /// called in registration order; the first non-Continue result wins. + /// + /// Generic over `H: Handler` — accepts either a struct with explicit + /// `impl Handler for ...` or a closure (via blanket impl). + pub fn register(&mut self, syscall_nr: i64, handler: H) { + self.register_arc(syscall_nr, std::sync::Arc::new(handler)); + } + + /// Register a pre-`Arc`'d handler. Used both by builtin chunks + /// that share state via `Arc::clone` (one `ForkHandler` instance + /// registers against `SYS_clone`/`SYS_clone3`/`SYS_vfork`) and by + /// `run_with_extra_handlers` when each item already arrives as + /// `Arc`. + pub(crate) fn register_arc( + &mut self, + syscall_nr: i64, + handler: std::sync::Arc, + ) { self.chains .entry(syscall_nr) - .or_insert_with(|| HandlerChain { - handlers: Vec::new(), - }) + .or_insert_with(|| HandlerChain { handlers: Vec::new() }) .handlers .push(handler); } /// Dispatch a notification through the handler chain for its syscall number. - pub async fn dispatch( + pub(crate) async fn dispatch( &self, notif: SeccompNotif, - ctx: &Arc, notif_fd: RawFd, ) -> NotifAction { let nr = notif.data.nr as i64; if let Some(chain) = self.chains.get(&nr) { + let handler_ctx = HandlerCtx { notif, notif_fd }; for handler in &chain.handlers { - let action = handler(notif, Arc::clone(ctx), notif_fd).await; + let action = handler.handle(&handler_ctx).await; if !matches!(action, NotifAction::Continue) { return action; } @@ -185,14 +255,15 @@ impl DispatchTable { /// monolithic `dispatch()` function is translated into a `table.register()` call. /// Priority is preserved by registration order. /// -/// `extra_handlers` are appended **after** all builtin handlers, so they +/// `pending_handlers` are appended **after** all builtin handlers, so they /// observe the post-builtin view (e.g. `chroot`-normalized paths on /// `openat`). Builtins cannot be overridden or removed — this is the /// security boundary for downstream crates. -pub fn build_dispatch_table( +pub(crate) fn build_dispatch_table( policy: &Arc, resource: &Arc>, - extra_handlers: Vec, + ctx: &Arc, + pending_handlers: Vec<(i64, std::sync::Arc)>, ) -> DispatchTable { let mut table = DispatchTable::new(); @@ -200,28 +271,32 @@ pub fn build_dispatch_table( // Fork/clone family (always on) // ------------------------------------------------------------------ for &nr in arch::FORK_LIKE_SYSCALLS { - let policy = Arc::clone(policy); - let resource = Arc::clone(resource); - table.register(nr, Box::new(move |notif, _ctx, notif_fd| { - let policy = Arc::clone(&policy); - let resource = Arc::clone(&resource); - Box::pin(async move { + let policy_for_fork = Arc::clone(policy); + let resource_for_fork = Arc::clone(resource); + table.register(nr, move |cx: &HandlerCtx| { + let notif = cx.notif; + let notif_fd = cx.notif_fd; + let policy = Arc::clone(&policy_for_fork); + let resource = Arc::clone(&resource_for_fork); + async move { crate::resource::handle_fork(¬if, notif_fd, &resource, &policy).await - }) - })); + } + }); } // ------------------------------------------------------------------ // Wait family (always on) // ------------------------------------------------------------------ for &nr in &[libc::SYS_wait4, libc::SYS_waitid] { - let resource = Arc::clone(resource); - table.register(nr, Box::new(move |notif, _ctx, _notif_fd| { - let resource = Arc::clone(&resource); - Box::pin(async move { + let resource_for_wait = Arc::clone(resource); + let __sup = Arc::clone(ctx); + table.register(nr, move |cx: &HandlerCtx| { + let notif = cx.notif; + let resource = Arc::clone(&resource_for_wait); + async move { crate::resource::handle_wait(¬if, &resource).await - }) - })); + } + }); } // ------------------------------------------------------------------ @@ -232,13 +307,16 @@ pub fn build_dispatch_table( libc::SYS_mmap, libc::SYS_munmap, libc::SYS_brk, libc::SYS_mremap, libc::SYS_shmget, ] { - let policy = Arc::clone(policy); - table.register(nr, Box::new(move |notif, ctx, _notif_fd| { - let policy = Arc::clone(&policy); - Box::pin(async move { - crate::resource::handle_memory(¬if, &ctx, &policy).await - }) - })); + let policy_for_mem = Arc::clone(policy); + let __sup = Arc::clone(ctx); + table.register(nr, move |cx: &HandlerCtx| { + let notif = cx.notif; + let sup = Arc::clone(&__sup); + let policy = Arc::clone(&policy_for_mem); + async move { + crate::resource::handle_memory(¬if, &sup, &policy).await + } + }); } } @@ -247,11 +325,15 @@ pub fn build_dispatch_table( // ------------------------------------------------------------------ if policy.has_net_allowlist || policy.has_http_acl { for &nr in &[libc::SYS_connect, libc::SYS_sendto, libc::SYS_sendmsg] { - table.register(nr, Box::new(|notif, ctx, notif_fd| { - Box::pin(async move { - crate::network::handle_net(¬if, &ctx, notif_fd).await - }) - })); + let __sup = Arc::clone(ctx); + table.register(nr, move |cx: &HandlerCtx| { + let notif = cx.notif; + let sup = Arc::clone(&__sup); + let notif_fd = cx.notif_fd; + async move { + crate::network::handle_net(¬if, &sup, notif_fd).await + } + }); } } @@ -259,33 +341,41 @@ pub fn build_dispatch_table( // Deterministic random — getrandom() // ------------------------------------------------------------------ if policy.has_random_seed { - table.register(libc::SYS_getrandom, Box::new(|notif, ctx, notif_fd| { - Box::pin(async move { - let mut tr = ctx.time_random.lock().await; + let __sup = Arc::clone(ctx); + table.register(libc::SYS_getrandom, move |cx: &HandlerCtx| { + let notif = cx.notif; + let sup = Arc::clone(&__sup); + let notif_fd = cx.notif_fd; + async move { + let mut tr = sup.time_random.lock().await; if let Some(ref mut rng) = tr.random_state { crate::random::handle_getrandom(¬if, rng, notif_fd) } else { NotifAction::Continue } - }) - })); + } + }); } // ------------------------------------------------------------------ // Deterministic random — /dev/urandom opens (openat) // ------------------------------------------------------------------ if policy.has_random_seed { - table.register(libc::SYS_openat, Box::new(|notif, ctx, notif_fd| { - Box::pin(async move { - let mut tr = ctx.time_random.lock().await; + let __sup = Arc::clone(ctx); + table.register(libc::SYS_openat, move |cx: &HandlerCtx| { + let notif = cx.notif; + let sup = Arc::clone(&__sup); + let notif_fd = cx.notif_fd; + async move { + let mut tr = sup.time_random.lock().await; if let Some(ref mut rng) = tr.random_state { if let Some(action) = crate::random::handle_random_open(¬if, rng, notif_fd) { return action; } } NotifAction::Continue - }) - })); + } + }); } // ------------------------------------------------------------------ @@ -298,11 +388,13 @@ pub fn build_dispatch_table( libc::SYS_timerfd_settime as i64, libc::SYS_timer_settime as i64, ] { - table.register(nr, Box::new(move |notif, _ctx, notif_fd| { - Box::pin(async move { + table.register(nr, move |cx: &HandlerCtx| { + let notif = cx.notif; + let notif_fd = cx.notif_fd; + async move { crate::time::handle_timer(¬if, time_offset, notif_fd) - }) - })); + } + }); } } @@ -310,97 +402,114 @@ pub fn build_dispatch_table( // Chroot path interception (before COW) // ------------------------------------------------------------------ if policy.chroot_root.is_some() { - register_chroot_handlers(&mut table, policy); + register_chroot_handlers(&mut table, policy, ctx); } // ------------------------------------------------------------------ // COW filesystem interception // ------------------------------------------------------------------ if policy.cow_enabled { - register_cow_handlers(&mut table); + register_cow_handlers(&mut table, ctx); } // ------------------------------------------------------------------ // /proc virtualization (always on) // ------------------------------------------------------------------ { - let policy = Arc::clone(policy); - let resource = Arc::clone(resource); - table.register(libc::SYS_openat, Box::new(move |notif, ctx, notif_fd| { - let policy = Arc::clone(&policy); - let resource = Arc::clone(&resource); - let processes = Arc::clone(&ctx.processes); - let network = Arc::clone(&ctx.network); - Box::pin(async move { + let policy_for_proc_open = Arc::clone(policy); + let resource_for_proc_open = Arc::clone(resource); + let __sup = Arc::clone(ctx); + table.register(libc::SYS_openat, move |cx: &HandlerCtx| { + let notif = cx.notif; + let sup = Arc::clone(&__sup); + let notif_fd = cx.notif_fd; + let policy = Arc::clone(&policy_for_proc_open); + let resource = Arc::clone(&resource_for_proc_open); + async move { + let processes = Arc::clone(&sup.processes); + let network = Arc::clone(&sup.network); crate::procfs::handle_proc_open(¬if, &processes, &resource, &network, &policy, notif_fd).await - }) - })); + } + }); } let mut getdents_nrs = vec![libc::SYS_getdents64]; if let Some(getdents) = arch::SYS_GETDENTS { getdents_nrs.push(getdents); } for nr in getdents_nrs { - let policy = Arc::clone(policy); - table.register(nr, Box::new(move |notif, ctx, notif_fd| { - let policy = Arc::clone(&policy); - let processes = Arc::clone(&ctx.processes); - Box::pin(async move { + let policy_for_getdents = Arc::clone(policy); + let __sup = Arc::clone(ctx); + table.register(nr, move |cx: &HandlerCtx| { + let notif = cx.notif; + let sup = Arc::clone(&__sup); + let notif_fd = cx.notif_fd; + let policy = Arc::clone(&policy_for_getdents); + async move { + let processes = Arc::clone(&sup.processes); crate::procfs::handle_getdents(¬if, &processes, &policy, notif_fd).await - }) - })); + } + }); } // ------------------------------------------------------------------ // Virtual CPU count // ------------------------------------------------------------------ if let Some(n) = policy.num_cpus { - table.register(libc::SYS_sched_getaffinity, Box::new(move |notif, _ctx, notif_fd| { - Box::pin(async move { + table.register(libc::SYS_sched_getaffinity, move |cx: &HandlerCtx| { + let notif = cx.notif; + let notif_fd = cx.notif_fd; + async move { crate::procfs::handle_sched_getaffinity(¬if, n, notif_fd) - }) - })); + } + }); } // ------------------------------------------------------------------ // Hostname virtualization // ------------------------------------------------------------------ if let Some(ref hostname) = policy.virtual_hostname { - let hostname = hostname.clone(); - let hostname2 = hostname.clone(); - table.register(libc::SYS_uname, Box::new(move |notif, _ctx, notif_fd| { - let hostname = hostname.clone(); - Box::pin(async move { + let hostname_for_uname = hostname.clone(); + let hostname_for_open = hostname.clone(); + table.register(libc::SYS_uname, move |cx: &HandlerCtx| { + let notif = cx.notif; + let notif_fd = cx.notif_fd; + let hostname = hostname_for_uname.clone(); + async move { crate::procfs::handle_uname(¬if, &hostname, notif_fd) - }) - })); - table.register(libc::SYS_openat, Box::new(move |notif, _ctx, notif_fd| { - let hostname = hostname2.clone(); - Box::pin(async move { + } + }); + table.register(libc::SYS_openat, move |cx: &HandlerCtx| { + let notif = cx.notif; + let notif_fd = cx.notif_fd; + let hostname = hostname_for_open.clone(); + async move { if let Some(action) = crate::procfs::handle_hostname_open(¬if, &hostname, notif_fd) { action } else { NotifAction::Continue } - }) - })); + } + }); } // ------------------------------------------------------------------ // /etc/hosts virtualization (for net_allow_hosts) // ------------------------------------------------------------------ if let Some(ref etc_hosts) = policy.virtual_etc_hosts { - let etc_hosts = etc_hosts.clone(); - table.register(libc::SYS_openat, Box::new(move |notif, _ctx, notif_fd| { - let etc_hosts = etc_hosts.clone(); - Box::pin(async move { + let etc_hosts_for_open = etc_hosts.clone(); + let __sup = Arc::clone(ctx); + table.register(libc::SYS_openat, move |cx: &HandlerCtx| { + let notif = cx.notif; + let notif_fd = cx.notif_fd; + let etc_hosts = etc_hosts_for_open.clone(); + async move { if let Some(action) = crate::procfs::handle_etc_hosts_open(¬if, &etc_hosts, notif_fd) { action } else { NotifAction::Continue } - }) - })); + } + }); } // ------------------------------------------------------------------ @@ -412,12 +521,16 @@ pub fn build_dispatch_table( getdents_nrs.push(getdents); } for nr in getdents_nrs { - table.register(nr, Box::new(|notif, ctx, notif_fd| { - let processes = Arc::clone(&ctx.processes); - Box::pin(async move { + let __sup = Arc::clone(ctx); + table.register(nr, move |cx: &HandlerCtx| { + let notif = cx.notif; + let sup = Arc::clone(&__sup); + let notif_fd = cx.notif_fd; + async move { + let processes = Arc::clone(&sup.processes); crate::procfs::handle_sorted_getdents(¬if, &processes, notif_fd).await - }) - })); + } + }); } } @@ -434,74 +547,99 @@ pub fn build_dispatch_table( // runs first and returns `Continue` for non-cookie fds. // ------------------------------------------------------------------ { - table.register(libc::SYS_socket, Box::new(|notif, ctx, _fd| { - let state = Arc::clone(&ctx.netlink); - Box::pin(async move { + let __sup = Arc::clone(ctx); + table.register(libc::SYS_socket, move |cx: &HandlerCtx| { + let notif = cx.notif; + let sup = Arc::clone(&__sup); + async move { + let state = Arc::clone(&sup.netlink); crate::netlink::handlers::handle_socket(¬if, &state).await - }) - })); - table.register(libc::SYS_bind, Box::new(|notif, ctx, _fd| { - let state = Arc::clone(&ctx.netlink); - Box::pin(async move { + } + }); + let __sup = Arc::clone(ctx); + table.register(libc::SYS_bind, move |cx: &HandlerCtx| { + let notif = cx.notif; + let sup = Arc::clone(&__sup); + async move { + let state = Arc::clone(&sup.netlink); crate::netlink::handlers::handle_bind(¬if, &state).await - }) - })); - table.register(libc::SYS_getsockname, Box::new(|notif, ctx, notif_fd| { - let state = Arc::clone(&ctx.netlink); - Box::pin(async move { + } + }); + let __sup = Arc::clone(ctx); + table.register(libc::SYS_getsockname, move |cx: &HandlerCtx| { + let notif = cx.notif; + let sup = Arc::clone(&__sup); + let notif_fd = cx.notif_fd; + async move { + let state = Arc::clone(&sup.netlink); crate::netlink::handlers::handle_getsockname(¬if, &state, notif_fd).await - }) - })); + } + }); // Zero the msg_name region on recv so glibc sees nl_pid=0 // (the kernel only writes sun_family on unix socketpair recvmsg, // leaving the rest of the buffer as stack garbage otherwise). for &nr in &[libc::SYS_recvfrom, libc::SYS_recvmsg] { - table.register(nr, Box::new(|notif, ctx, notif_fd| { - let state = Arc::clone(&ctx.netlink); - Box::pin(async move { + let __sup = Arc::clone(ctx); + table.register(nr, move |cx: &HandlerCtx| { + let notif = cx.notif; + let sup = Arc::clone(&__sup); + let notif_fd = cx.notif_fd; + async move { + let state = Arc::clone(&sup.netlink); crate::netlink::handlers::handle_netlink_recvmsg(¬if, &state, notif_fd).await - }) - })); + } + }); } // Unregister on close so the (pid, fd) slot isn't left in the // cookie set once the child reuses the fd for something else. - table.register(libc::SYS_close, Box::new(|notif, ctx, _fd| { - let state = Arc::clone(&ctx.netlink); - Box::pin(async move { + let __sup = Arc::clone(ctx); + table.register(libc::SYS_close, move |cx: &HandlerCtx| { + let notif = cx.notif; + let sup = Arc::clone(&__sup); + async move { + let state = Arc::clone(&sup.netlink); crate::netlink::handlers::handle_close(¬if, &state).await - }) - })); + } + }); } // ------------------------------------------------------------------ // Bind — on-behalf // ------------------------------------------------------------------ if policy.port_remap || policy.has_net_allowlist { - table.register(libc::SYS_bind, Box::new(|notif, ctx, notif_fd| { - Box::pin(async move { - crate::port_remap::handle_bind(¬if, &ctx.network, notif_fd).await - }) - })); + let __sup = Arc::clone(ctx); + table.register(libc::SYS_bind, move |cx: &HandlerCtx| { + let notif = cx.notif; + let sup = Arc::clone(&__sup); + let notif_fd = cx.notif_fd; + async move { + crate::port_remap::handle_bind(¬if, &sup.network, notif_fd).await + } + }); } // ------------------------------------------------------------------ // getsockname — port remap // ------------------------------------------------------------------ if policy.port_remap { - table.register(libc::SYS_getsockname, Box::new(|notif, ctx, notif_fd| { - Box::pin(async move { - crate::port_remap::handle_getsockname(¬if, &ctx.network, notif_fd).await - }) - })); + let __sup = Arc::clone(ctx); + table.register(libc::SYS_getsockname, move |cx: &HandlerCtx| { + let notif = cx.notif; + let sup = Arc::clone(&__sup); + let notif_fd = cx.notif_fd; + async move { + crate::port_remap::handle_getsockname(¬if, &sup.network, notif_fd).await + } + }); } // ------------------------------------------------------------------ - // Extra handlers supplied by the caller of `Sandbox::run_with_extra_handlers`. - // Appended last so builtin handlers keep their security-critical priority - // (chroot path normalization, COW writes, resource accounting). + // Pending user handlers — appended after builtins so builtin handlers + // keep their security-critical priority (chroot path normalization, + // COW writes, resource accounting). // ------------------------------------------------------------------ - for extra in extra_handlers { - table.register(extra.syscall_nr, extra.handler); + for (nr, h) in pending_handlers { + table.register_arc(nr, h); } table @@ -511,17 +649,28 @@ pub fn build_dispatch_table( // Chroot handler registration // ============================================================ -fn register_chroot_handlers(table: &mut DispatchTable, policy: &Arc) { +fn register_chroot_handlers( + table: &mut DispatchTable, + policy: &Arc, + ctx: &Arc, +) { use crate::chroot::dispatch::ChrootCtx; - // Helper macro to reduce boilerplate for chroot handlers that unconditionally - // return (non-fallthrough). + // Helper macro — produces a closure satisfying Handler via blanket impl. + // The closure clones `policy` (Arc) before the async block; inside the + // async block it borrows fields of that cloned Arc to build `ChrootCtx`. macro_rules! chroot_handler { ($policy:expr, $handler:expr) => {{ let policy = Arc::clone($policy); - let handler_fn: HandlerFn = Box::new(move |notif, ctx, notif_fd| { + let chroot_state = Arc::clone(&ctx.chroot); + let cow_state = Arc::clone(&ctx.cow); + move |cx: &HandlerCtx| { + let notif = cx.notif; + let chroot_state = Arc::clone(&chroot_state); + let cow_state = Arc::clone(&cow_state); + let notif_fd = cx.notif_fd; let policy = Arc::clone(&policy); - Box::pin(async move { + async move { let chroot_ctx = ChrootCtx { root: policy.chroot_root.as_ref().unwrap(), readable: &policy.chroot_readable, @@ -529,20 +678,26 @@ fn register_chroot_handlers(table: &mut DispatchTable, policy: &Arc denied: &policy.chroot_denied, mounts: &policy.chroot_mounts, }; - $handler(¬if, &ctx.chroot, &ctx.cow, notif_fd, &chroot_ctx).await - }) - }); - handler_fn + $handler(¬if, &chroot_state, &cow_state, notif_fd, &chroot_ctx).await + } + } }}; } - // Helper for chroot handlers that may fall through (return Continue). + // Same shape for fall-through variants (semantically identical here; + // kept separate for symmetry with the old code). macro_rules! chroot_handler_fallthrough { ($policy:expr, $handler:expr) => {{ let policy = Arc::clone($policy); - let handler_fn: HandlerFn = Box::new(move |notif, ctx, notif_fd| { + let chroot_state = Arc::clone(&ctx.chroot); + let cow_state = Arc::clone(&ctx.cow); + move |cx: &HandlerCtx| { + let notif = cx.notif; + let chroot_state = Arc::clone(&chroot_state); + let cow_state = Arc::clone(&cow_state); + let notif_fd = cx.notif_fd; let policy = Arc::clone(&policy); - Box::pin(async move { + async move { let chroot_ctx = ChrootCtx { root: policy.chroot_root.as_ref().unwrap(), readable: &policy.chroot_readable, @@ -550,10 +705,9 @@ fn register_chroot_handlers(table: &mut DispatchTable, policy: &Arc denied: &policy.chroot_denied, mounts: &policy.chroot_mounts, }; - $handler(¬if, &ctx.chroot, &ctx.cow, notif_fd, &chroot_ctx).await - }) - }); - handler_fn + $handler(¬if, &chroot_state, &cow_state, notif_fd, &chroot_ctx).await + } + } }}; } @@ -589,36 +743,46 @@ fn register_chroot_handlers(table: &mut DispatchTable, policy: &Arc crate::chroot::dispatch::handle_chroot_legacy_unlink)); } if let Some(nr) = arch::SYS_RMDIR { + let __sup = Arc::clone(ctx); table.register(nr, chroot_handler!(policy, crate::chroot::dispatch::handle_chroot_legacy_rmdir)); } if let Some(nr) = arch::SYS_MKDIR { + let __sup = Arc::clone(ctx); table.register(nr, chroot_handler!(policy, crate::chroot::dispatch::handle_chroot_legacy_mkdir)); } if let Some(nr) = arch::SYS_RENAME { + let __sup = Arc::clone(ctx); table.register(nr, chroot_handler!(policy, crate::chroot::dispatch::handle_chroot_legacy_rename)); } if let Some(nr) = arch::SYS_SYMLINK { + let __sup = Arc::clone(ctx); table.register(nr, chroot_handler!(policy, crate::chroot::dispatch::handle_chroot_legacy_symlink)); } if let Some(nr) = arch::SYS_LINK { + let __sup = Arc::clone(ctx); table.register(nr, chroot_handler!(policy, crate::chroot::dispatch::handle_chroot_legacy_link)); } if let Some(nr) = arch::SYS_CHMOD { + let __sup = Arc::clone(ctx); table.register(nr, chroot_handler!(policy, crate::chroot::dispatch::handle_chroot_legacy_chmod)); } // chown — non-follow if let Some(chown) = arch::SYS_CHOWN { - let policy = Arc::clone(policy); - table.register(chown, Box::new(move |notif, ctx, notif_fd| { - let policy = Arc::clone(&policy); - Box::pin(async move { + let policy_for_chown = Arc::clone(policy); + let __sup = Arc::clone(ctx); + table.register(chown, move |cx: &HandlerCtx| { + let notif = cx.notif; + let sup = Arc::clone(&__sup); + let notif_fd = cx.notif_fd; + let policy = Arc::clone(&policy_for_chown); + async move { let chroot_ctx = ChrootCtx { root: policy.chroot_root.as_ref().unwrap(), readable: &policy.chroot_readable, @@ -626,17 +790,21 @@ fn register_chroot_handlers(table: &mut DispatchTable, policy: &Arc denied: &policy.chroot_denied, mounts: &policy.chroot_mounts, }; - crate::chroot::dispatch::handle_chroot_legacy_chown(¬if, &ctx.chroot, &ctx.cow, notif_fd, &chroot_ctx, false).await - }) - })); + crate::chroot::dispatch::handle_chroot_legacy_chown(¬if, &sup.chroot, &sup.cow, notif_fd, &chroot_ctx, false).await + } + }); } // lchown — follow if let Some(lchown) = arch::SYS_LCHOWN { - let policy = Arc::clone(policy); - table.register(lchown, Box::new(move |notif, ctx, notif_fd| { - let policy = Arc::clone(&policy); - Box::pin(async move { + let policy_for_lchown = Arc::clone(policy); + let __sup = Arc::clone(ctx); + table.register(lchown, move |cx: &HandlerCtx| { + let notif = cx.notif; + let sup = Arc::clone(&__sup); + let notif_fd = cx.notif_fd; + let policy = Arc::clone(&policy_for_lchown); + async move { let chroot_ctx = ChrootCtx { root: policy.chroot_root.as_ref().unwrap(), readable: &policy.chroot_readable, @@ -644,9 +812,9 @@ fn register_chroot_handlers(table: &mut DispatchTable, policy: &Arc denied: &policy.chroot_denied, mounts: &policy.chroot_mounts, }; - crate::chroot::dispatch::handle_chroot_legacy_chown(¬if, &ctx.chroot, &ctx.cow, notif_fd, &chroot_ctx, true).await - }) - })); + crate::chroot::dispatch::handle_chroot_legacy_chown(¬if, &sup.chroot, &sup.cow, notif_fd, &chroot_ctx, true).await + } + }); } // stat family @@ -691,17 +859,22 @@ fn register_chroot_handlers(table: &mut DispatchTable, policy: &Arc getdents_nrs.push(getdents); } for nr in getdents_nrs { + let __sup = Arc::clone(ctx); table.register(nr, chroot_handler!(policy, crate::chroot::dispatch::handle_chroot_getdents)); } // chdir, getcwd, statfs, utimensat + let __sup = Arc::clone(ctx); table.register(libc::SYS_chdir as i64, chroot_handler!(policy, crate::chroot::dispatch::handle_chroot_chdir)); + let __sup = Arc::clone(ctx); table.register(libc::SYS_getcwd as i64, chroot_handler!(policy, crate::chroot::dispatch::handle_chroot_getcwd)); + let __sup = Arc::clone(ctx); table.register(libc::SYS_statfs as i64, chroot_handler!(policy, crate::chroot::dispatch::handle_chroot_statfs)); + let __sup = Arc::clone(ctx); table.register(libc::SYS_utimensat as i64, chroot_handler!(policy, crate::chroot::dispatch::handle_chroot_utimensat)); } @@ -710,16 +883,22 @@ fn register_chroot_handlers(table: &mut DispatchTable, policy: &Arc // COW handler registration // ============================================================ -fn register_cow_handlers(table: &mut DispatchTable) { - // Helper to grab cow + processes from ctx in one place. +fn register_cow_handlers(table: &mut DispatchTable, ctx: &Arc) { + // Helper to grab cow + processes from cx.sup in one place. macro_rules! cow_call { - ($handler:expr) => { - Box::new(|notif, ctx, notif_fd| { - let cow = Arc::clone(&ctx.cow); - let processes = Arc::clone(&ctx.processes); - Box::pin(async move { $handler(¬if, &cow, &processes, notif_fd).await }) - }) - }; + ($handler:expr) => {{ + let cow_state = Arc::clone(&ctx.cow); + let processes_state = Arc::clone(&ctx.processes); + move |cx: &HandlerCtx| { + let notif = cx.notif; + let cow_state = Arc::clone(&cow_state); + let processes_state = Arc::clone(&processes_state); + let notif_fd = cx.notif_fd; + async move { + $handler(¬if, &cow_state, &processes_state, notif_fd).await + } + } + }}; } // Write syscalls (*at variants + legacy) @@ -840,9 +1019,9 @@ mod extra_handler_tests { has_net_allowlist: false, has_random_seed: false, has_time_start: false, - argv_safety_required: false, time_offset: 0, num_cpus: None, + argv_safety_required: false, port_remap: false, cow_enabled: false, chroot_root: None, @@ -860,15 +1039,6 @@ mod extra_handler_tests { }) } - #[test] - fn extra_handler_ctor_preserves_fields() { - let h: HandlerFn = Box::new(|_notif, _ctx, _fd| { - Box::pin(async { NotifAction::Continue }) - }); - let eh = ExtraHandler::new(libc::SYS_openat, h); - assert_eq!(eh.syscall_nr, libc::SYS_openat); - } - /// All registered handlers run, in registration order, when each /// returns `Continue`. Verifies that `register` appends to the /// underlying `Vec` and that `dispatch` walks it front-to-back. @@ -878,22 +1048,22 @@ mod extra_handler_tests { let order = Arc::new(std::sync::Mutex::new(Vec::::new())); for tag in [1u8, 2u8, 3u8] { - let order = Arc::clone(&order); + let order_clone = Arc::clone(&order); table.register( libc::SYS_openat, - Box::new(move |_notif, _ctx, _fd| { - let order = Arc::clone(&order); - Box::pin(async move { + move |_cx: &HandlerCtx| { + let order = Arc::clone(&order_clone); + async move { order.lock().unwrap().push(tag); NotifAction::Continue - }) - }), + } + }, ); } - let ctx = fake_supervisor_ctx(); + let _ctx = fake_supervisor_ctx(); let action = table - .dispatch(fake_notif(libc::SYS_openat as i32), &ctx, -1) + .dispatch(fake_notif(libc::SYS_openat as i32), -1) .await; assert!(matches!(action, NotifAction::Continue)); @@ -905,13 +1075,13 @@ mod extra_handler_tests { ); } - /// Append-after-builtin contract: when an `ExtraHandler` is registered - /// after a builtin-like handler, dispatch invokes the builtin first - /// and the extra second. This is the security-load-bearing invariant — + /// Append-after-builtin contract: when a user handler is registered + /// after a builtin, dispatch invokes the builtin first and the + /// user handler second. This is the security-load-bearing invariant — /// a builtin returning a non-`Continue` `NotifAction` must short-circuit - /// before the extra runs (covered by + /// before the user handler runs (covered by /// `dispatch_stops_at_first_non_continue`); when the builtin returns - /// `Continue`, the extra observes the post-builtin view. + /// `Continue`, the user handler observes the post-builtin view. #[tokio::test] async fn dispatch_runs_builtin_before_extra() { let mut table = DispatchTable::new(); @@ -921,33 +1091,32 @@ mod extra_handler_tests { let order_builtin = Arc::clone(&order); table.register( libc::SYS_openat, - Box::new(move |_notif, _ctx, _fd| { + move |_cx: &HandlerCtx| { let order = Arc::clone(&order_builtin); - Box::pin(async move { + async move { order.lock().unwrap().push(b'B'); NotifAction::Continue - }) - }), + } + }, ); - // Extra after, tagged 'E'. Routed through `ExtraHandler` to mirror - // how `build_dispatch_table` consumes user-supplied handlers. + // Extra after, tagged 'E'. Registered after builtin to mirror + // append-after-builtin placement from `build_dispatch_table`. let order_extra = Arc::clone(&order); - let extra = ExtraHandler::new( + table.register( libc::SYS_openat, - Box::new(move |_notif, _ctx, _fd| { + move |_cx: &HandlerCtx| { let order = Arc::clone(&order_extra); - Box::pin(async move { + async move { order.lock().unwrap().push(b'E'); NotifAction::Continue - }) - }), + } + }, ); - table.register(extra.syscall_nr, extra.handler); - let ctx = fake_supervisor_ctx(); + let _ctx = fake_supervisor_ctx(); let action = table - .dispatch(fake_notif(libc::SYS_openat as i32), &ctx, -1) + .dispatch(fake_notif(libc::SYS_openat as i32), -1) .await; assert!(matches!(action, NotifAction::Continue)); @@ -974,31 +1143,31 @@ mod extra_handler_tests { let calls_first = Arc::clone(&calls); table.register( libc::SYS_openat, - Box::new(move |_notif, _ctx, _fd| { + move |_cx: &HandlerCtx| { let calls = Arc::clone(&calls_first); - Box::pin(async move { + async move { calls.fetch_add(1, Ordering::SeqCst); NotifAction::Errno(libc::EACCES) - }) - }), + } + }, ); // Second handler — must NOT be called. let calls_second = Arc::clone(&calls); table.register( libc::SYS_openat, - Box::new(move |_notif, _ctx, _fd| { + move |_cx: &HandlerCtx| { let calls = Arc::clone(&calls_second); - Box::pin(async move { + async move { calls.fetch_add(1, Ordering::SeqCst); NotifAction::Continue - }) - }), + } + }, ); - let ctx = fake_supervisor_ctx(); + let _ctx = fake_supervisor_ctx(); let action = table - .dispatch(fake_notif(libc::SYS_openat as i32), &ctx, -1) + .dispatch(fake_notif(libc::SYS_openat as i32), -1) .await; match action { @@ -1012,24 +1181,12 @@ mod extra_handler_tests { ); } - #[test] - fn extras_vec_empty_leaves_table_without_change() { - // build_dispatch_table with empty extras should not add any entries. - // We verify the for-loop degenerates to nop. - let extras: Vec = Vec::new(); - let mut handler_count = 0usize; - for _ in extras { - handler_count += 1; - } - assert_eq!(handler_count, 0, "empty extras registers zero handlers"); - } - - /// `validate_extras_against_policy` must reject extras whose syscall is in - /// the policy's user-specified `deny_syscalls` list, with the same - /// rationale as DEFAULT_DENY: the BPF program emits notif JEQs before - /// deny JEQs, so a user handler returning `Continue` would translate into - /// `SECCOMP_USER_NOTIF_FLAG_CONTINUE` and silently bypass the kernel-level - /// deny. + /// `validate_handler_syscalls_against_policy` must reject handlers whose + /// syscall is in the policy's user-specified `deny_syscalls` list, with + /// the same rationale as DEFAULT_DENY: the BPF program emits notif JEQs + /// before deny JEQs, so a user handler returning `Continue` would + /// translate into `SECCOMP_USER_NOTIF_FLAG_CONTINUE` and silently bypass + /// the kernel-level deny. /// /// Uses `mremap` because it is in `syscall_name_to_nr` but not in /// `DEFAULT_DENY_SYSCALLS` — putting it into `deny_syscalls` is the only @@ -1046,15 +1203,90 @@ mod extra_handler_tests { .deny_syscalls(vec!["mremap".into()]) .build() .expect("policy builds"); - let handler: HandlerFn = - Box::new(|_notif, _ctx, _fd| Box::pin(async { NotifAction::Continue })); - let extras = vec![ExtraHandler::new(libc::SYS_mremap, handler)]; - let result = validate_extras_against_policy(&extras, &policy); + let result = validate_handler_syscalls_against_policy(&[libc::SYS_mremap], &policy); assert_eq!( result, Err(libc::SYS_mremap as u32), - "extras on user-specified deny must be rejected, naming the offending syscall" + "handler on user-specified deny must be rejected, naming the offending syscall" + ); + } + + // ---- Handler trait tests -------------------------------------- + + #[tokio::test] + async fn handler_via_blanket_impl_dispatches_closures() { + use std::sync::atomic::{AtomicU64, Ordering}; + let counter = Arc::new(AtomicU64::new(0)); + let counter_clone = Arc::clone(&counter); + + let h = move |cx: &HandlerCtx| { + let counter = Arc::clone(&counter_clone); + async move { + counter.fetch_add(1, Ordering::SeqCst); + let _ = cx.notif.pid; // touch ctx so it's exercised + NotifAction::Continue + } + }; + + let _sup = fake_supervisor_ctx(); + let notif = fake_notif(libc::SYS_openat as i32); + let cx = HandlerCtx { notif, notif_fd: -1 }; + + let action = h.handle(&cx).await; + assert!(matches!(action, NotifAction::Continue)); + assert_eq!(counter.load(Ordering::SeqCst), 1); + } + + /// Struct-based `Handler` registered through `DispatchTable::register` + /// MUST be invoked when `dispatch()` walks the chain — and `&self` + /// state MUST persist across notifications. Bridges the gap between + /// the trait-shape unit tests above (which call `.handle()` directly) + /// and the dispatch ordering tests (which use closures via blanket + /// impl). Without this test, a regression where the dispatch walker + /// dropped `Arc` calls but kept closures working would + /// not be caught at the unit layer. + #[tokio::test] + async fn dispatch_invokes_struct_handler_with_persistent_self_state() { + use std::sync::atomic::{AtomicU64, Ordering}; + + struct StructHandler { + calls: AtomicU64, + } + + impl Handler for StructHandler { + fn handle<'a>( + &'a self, + _cx: &'a HandlerCtx, + ) -> std::pin::Pin + Send + 'a>> { + Box::pin(async move { + self.calls.fetch_add(1, Ordering::SeqCst); + NotifAction::Continue + }) + } + } + + let mut table = DispatchTable::new(); + let handler = std::sync::Arc::new(StructHandler { + calls: AtomicU64::new(0), + }); + table.register_arc(libc::SYS_openat, handler.clone() as std::sync::Arc); + + let _sup = fake_supervisor_ctx(); + let notif = fake_notif(libc::SYS_openat as i32); + + // Three independent dispatches against the same registered handler. + // Walker MUST hit the struct's handle() each time, accumulating + // state on &self.calls. + for _ in 0..3 { + let action = table.dispatch(notif, -1).await; + assert!(matches!(action, NotifAction::Continue)); + } + + assert_eq!( + handler.calls.load(Ordering::SeqCst), + 3, + "dispatch must invoke the struct-based handler on every walk" ); } } diff --git a/crates/sandlock-core/src/seccomp/mod.rs b/crates/sandlock-core/src/seccomp/mod.rs index 4f22b80..231fc86 100644 --- a/crates/sandlock-core/src/seccomp/mod.rs +++ b/crates/sandlock-core/src/seccomp/mod.rs @@ -1,5 +1,6 @@ pub mod bpf; -pub mod ctx; +pub(crate) mod ctx; pub mod dispatch; pub mod notif; -pub mod state; +pub(crate) mod state; +pub mod syscall; diff --git a/crates/sandlock-core/src/seccomp/notif.rs b/crates/sandlock-core/src/seccomp/notif.rs index 19a900b..b031ba4 100644 --- a/crates/sandlock-core/src/seccomp/notif.rs +++ b/crates/sandlock-core/src/seccomp/notif.rs @@ -423,9 +423,9 @@ fn write_child_mem_vm(pid: u32, addr: u64, data: &[u8]) -> Result<(), NotifError /// still live (kernel did not abort or release the trapped syscall while the /// supervisor was reading guest memory). /// -/// Public — used by downstream `ExtraHandler`s (sandbox-sber/vfs-engine etc.) -/// to read syscall arguments that the kernel passes by pointer (paths in -/// `openat`, buffers in `write`/`writev`). +/// Public — used by downstream `Handler` implementations to read syscall +/// arguments that the kernel passes by pointer (paths in `openat`, buffers +/// in `write`/`writev`, etc.). pub fn read_child_mem( notif_fd: RawFd, id: u64, @@ -441,7 +441,19 @@ pub fn read_child_mem( /// Read a NUL-terminated string from child memory without crossing unmapped /// page boundaries in a single `process_vm_readv` call. -pub(crate) fn read_child_cstr( +/// +/// TOCTOU-safe — internally calls [`read_child_mem`], inheriting the +/// `id_valid` checks bracketing each `process_vm_readv` call. +/// +/// Page-aware: reads up to a page boundary at a time and stops at the +/// first NUL byte, never crossing into unmapped memory. Returns +/// `None` for `addr == 0`, `max_len == 0`, a read failure, or a string +/// that exceeds `max_len` without a NUL. +/// +/// Public — used by downstream `Handler` implementations that read +/// path arguments from notifications (`openat`, `unlinkat`, `statx`, +/// `newfstatat`, etc.). +pub fn read_child_cstr( notif_fd: RawFd, id: u64, pid: u32, @@ -473,9 +485,10 @@ pub(crate) fn read_child_cstr( /// Write bytes to a child process via `process_vm_writev` with TOCTOU validation. /// -/// Same TOCTOU contract as [`read_child_mem`]. Public for downstream -/// `ExtraHandler`s that synthesise syscall results into guest memory -/// (e.g. fake `getdents64` listings populated from a virtual tree-index). +/// Same TOCTOU contract as [`read_child_mem`]. Public for downstream +/// `Handler` implementations that synthesise syscall results into +/// guest memory (e.g. fake `getdents64` listings populated from a +/// virtual directory index, or synthesised `stat` buffers). pub fn write_child_mem( notif_fd: RawFd, id: u64, @@ -954,10 +967,10 @@ async fn handle_notification( NotifAction::Errno(libc::EACCES) } else { drop(pfs); - dispatch_table.dispatch(notif, ctx, fd).await + dispatch_table.dispatch(notif, fd).await } } else { - dispatch_table.dispatch(notif, ctx, fd).await + dispatch_table.dispatch(notif, fd).await } }; @@ -1086,13 +1099,13 @@ async fn handle_notification( /// /// Runs until the notification fd is closed (child exits or filter is removed). /// -/// `extra_handlers` are user-supplied syscall handlers registered after all -/// builtin handlers (see [`super::dispatch::ExtraHandler`]). For the default -/// behaviour without any custom handlers pass an empty `Vec`. +/// `pending_handlers` are user-supplied syscall handlers registered after all +/// builtin handlers. For the default behaviour without any custom handlers +/// pass an empty `Vec`. pub async fn supervisor( notif_fd: OwnedFd, ctx: Arc, - extra_handlers: Vec, + pending_handlers: Vec<(i64, std::sync::Arc)>, ) { let fd = notif_fd.as_raw_fd(); @@ -1100,7 +1113,8 @@ pub async fn supervisor( let dispatch_table = Arc::new(super::dispatch::build_dispatch_table( &ctx.policy, &ctx.resource, - extra_handlers, + &ctx, + pending_handlers, )); // Try to enable sync wakeup (Linux 6.7+, ignore error on older kernels). @@ -1212,6 +1226,14 @@ pub(crate) async fn cleanup_pid(ctx: &super::ctx::SupervisorCtx, key: super::sta mod tests { use super::*; + #[test] + fn read_child_cstr_returns_none_for_null_addr_or_zero_max_len() { + // Smoke: addr == 0 short-circuits without touching the child. + assert!(read_child_cstr(-1, 0, 0, 0, 4096).is_none()); + // max_len == 0 also short-circuits. + assert!(read_child_cstr(-1, 0, 0, 0xdeadbeef, 0).is_none()); + } + #[test] fn test_notif_action_debug() { // Ensure all variants implement Debug. diff --git a/crates/sandlock-core/src/seccomp/syscall.rs b/crates/sandlock-core/src/seccomp/syscall.rs new file mode 100644 index 0000000..86fc232 --- /dev/null +++ b/crates/sandlock-core/src/seccomp/syscall.rs @@ -0,0 +1,78 @@ +//! `Syscall` — checked syscall number newtype. +//! +//! Closes the footgun where `add_handler(-5, h)` would compile but +//! silently never fire because the cBPF filter cannot emit a JEQ for +//! an architecture-unknown syscall number. + +use thiserror::Error; + +#[derive(Debug, Error, PartialEq, Eq)] +pub enum SyscallError { + #[error("syscall number {0} is negative")] + Negative(i64), + #[error("syscall number {0} is unknown for the current architecture")] + UnknownForArch(i64), +} + +#[derive(Copy, Clone, Debug, PartialEq, Eq, Hash)] +pub struct Syscall(i64); + +impl Syscall { + /// Validates that `nr` is non-negative and known on the current architecture. + pub fn checked(nr: i64) -> Result { + if nr < 0 { + return Err(SyscallError::Negative(nr)); + } + if !crate::arch::is_known_syscall(nr) { + return Err(SyscallError::UnknownForArch(nr)); + } + Ok(Self(nr)) + } + + pub fn raw(self) -> i64 { + self.0 + } +} + +impl TryFrom for Syscall { + type Error = SyscallError; + fn try_from(nr: i64) -> Result { + Self::checked(nr) + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn checked_accepts_valid_openat() { + let s = Syscall::checked(libc::SYS_openat).expect("openat is valid"); + assert_eq!(s.raw(), libc::SYS_openat); + } + + #[test] + fn checked_rejects_negative() { + match Syscall::checked(-5) { + Err(SyscallError::Negative(-5)) => {} + other => panic!("expected Negative(-5), got {:?}", other), + } + } + + #[test] + fn checked_rejects_arch_unknown() { + // 99_999 is above any reasonable MAX_SYSCALL_NR. + match Syscall::checked(99_999) { + Err(SyscallError::UnknownForArch(99_999)) => {} + other => panic!("expected UnknownForArch(99_999), got {:?}", other), + } + } + + #[test] + fn try_from_i64_delegates_to_checked() { + let s: Syscall = libc::SYS_openat.try_into().expect("openat valid"); + assert_eq!(s.raw(), libc::SYS_openat); + let bad: Result = (-1i64).try_into(); + assert!(matches!(bad, Err(SyscallError::Negative(-1)))); + } +} diff --git a/crates/sandlock-core/tests/integration/test_extra_handlers.rs b/crates/sandlock-core/tests/integration/test_extra_handlers.rs index 65b70be..a9a00c9 100644 --- a/crates/sandlock-core/tests/integration/test_extra_handlers.rs +++ b/crates/sandlock-core/tests/integration/test_extra_handlers.rs @@ -1,34 +1,33 @@ -//! Integration tests for the user-supplied `ExtraHandler` API. +//! Integration tests for the user-supplied `Handler` extension API +//! (`Sandbox::run_with_extra_handlers`). //! //! These tests exercise the full plumbing through the kernel: the guest //! issues a syscall, the BPF filter raises a `USER_NOTIF`, the supervisor -//! walks the dispatch chain (builtins first, extras last) and the kernel -//! applies the `NotifAction` returned by the extra handler. Any of the +//! walks the dispatch chain (builtins first, user handlers last) and the +//! kernel applies the `NotifAction` returned by the handler. Any of the //! following regressions would break them: //! -//! * extra-handler syscalls not added to the BPF filter → kernel never +//! * user-handler syscalls not added to the BPF filter → kernel never //! raises a notification, the handler silently never fires; -//! * extras registered before builtins → handler observes pre-builtin -//! arguments (e.g. unnormalized chroot paths) or short-circuits a -//! security-critical builtin; +//! * user handlers registered before builtins → handler observes +//! pre-builtin arguments (e.g. unnormalized chroot paths) or +//! short-circuits a security-critical builtin; //! * `Continue` not translated to `SECCOMP_USER_NOTIF_FLAG_CONTINUE` → //! observe-only handlers wedge the guest. //! //! Each test uses `SYS_getcwd` because under the default policy it is -//! **not** intercepted by any builtin (`getcwd` is added only for -//! chroot or COW path virtualization). This isolates the behaviour under test -//! to the extras path. The guest must run `/bin/pwd` (the binary), not -//! `pwd` (the shell builtin which reads `$PWD` and never issues the -//! syscall) — otherwise any errno injected by an extra handler can't -//! reach the user-visible exit code. +//! **not** intercepted by any builtin (`uname` is added only when the +//! policy sets a `hostname`). This isolates the behaviour under test +//! to the user-handler path. use std::path::PathBuf; use std::sync::atomic::{AtomicUsize, Ordering}; use std::sync::{Arc, Mutex}; -use sandlock_core::seccomp::dispatch::{ExtraHandler, HandlerFn}; use sandlock_core::seccomp::notif::NotifAction; -use sandlock_core::{Policy, Sandbox}; +use sandlock_core::{ + Handler, HandlerCtx, HandlerError, Policy, Sandbox, SandlockError, SyscallError, +}; /// Read a NUL-terminated path from the sandboxed child's address space. /// @@ -89,7 +88,7 @@ fn temp_out(name: &str) -> PathBuf { /// does not intercept (`SYS_getcwd`) MUST receive notifications and its /// `NotifAction::Errno` MUST surface in the guest as the corresponding /// errno. This is the security contract: without BPF plumbing the -/// kernel would never raise USER_NOTIF for `getcwd` and the handler +/// kernel would never raise USER_NOTIF for `uname` and the handler /// would silently never fire — the maintainer-cited footgun. #[tokio::test] async fn extra_handler_intercepts_syscall_outside_builtin_set() { @@ -99,19 +98,22 @@ async fn extra_handler_intercepts_syscall_outside_builtin_set() { let calls = Arc::new(AtomicUsize::new(0)); let calls_in_handler = Arc::clone(&calls); - let handler: HandlerFn = Box::new(move |_notif, _ctx, _fd| { + let handler = move |_cx: &HandlerCtx| { let calls = Arc::clone(&calls_in_handler); - Box::pin(async move { + async move { calls.fetch_add(1, Ordering::SeqCst); NotifAction::Errno(libc::EACCES) - }) - }); - - let extras = vec![ExtraHandler::new(libc::SYS_getcwd, handler)]; + } + }; - let result = Sandbox::run_with_extra_handlers(&policy, Some("test"), &["sh", "-c", &cmd], extras) - .await - .expect("sandbox spawn failed"); + let result = Sandbox::run_with_extra_handlers( + &policy, + None, + &["sh", "-c", &cmd], + [(libc::SYS_getcwd, handler)], + ) + .await + .expect("sandbox spawn failed"); let contents = std::fs::read_to_string(&out).unwrap_or_default(); let _ = std::fs::remove_file(&out); @@ -124,7 +126,7 @@ async fn extra_handler_intercepts_syscall_outside_builtin_set() { ); assert_ne!( code, 0, - "getcwd must observe the errno injected by the extra handler" + "uname must observe the errno injected by the extra handler" ); } @@ -139,19 +141,22 @@ async fn extra_handler_continue_lets_syscall_proceed() { let calls = Arc::new(AtomicUsize::new(0)); let calls_in_handler = Arc::clone(&calls); - let handler: HandlerFn = Box::new(move |_notif, _ctx, _fd| { + let handler = move |_cx: &HandlerCtx| { let calls = Arc::clone(&calls_in_handler); - Box::pin(async move { + async move { calls.fetch_add(1, Ordering::SeqCst); NotifAction::Continue - }) - }); - - let extras = vec![ExtraHandler::new(libc::SYS_getcwd, handler)]; + } + }; - let result = Sandbox::run_with_extra_handlers(&policy, Some("test"), &["sh", "-c", &cmd], extras) - .await - .expect("sandbox spawn failed"); + let result = Sandbox::run_with_extra_handlers( + &policy, + None, + &["sh", "-c", &cmd], + [(libc::SYS_getcwd, handler)], + ) + .await + .expect("sandbox spawn failed"); let contents = std::fs::read_to_string(&out).unwrap_or_default(); let _ = std::fs::remove_file(&out); @@ -164,18 +169,20 @@ async fn extra_handler_continue_lets_syscall_proceed() { ); assert_eq!( code, 0, - "Continue must let the kernel execute getcwd normally" + "Continue must let the kernel execute uname normally" ); } -/// `Sandbox::run_with_extra_handlers(_, _, _, vec![])` must be observably -/// identical to `Sandbox::run(_, _, _)`. +/// `Sandbox::run_with_extra_handlers(_, _, vec![])` must be observably +/// identical to `Sandbox::run(_, _)`. Guards the documented backwards +/// compatibility contract. #[tokio::test] async fn empty_extras_preserves_default_behaviour() { let policy = base_policy().build().unwrap(); - let baseline = Sandbox::run(&policy, Some("test"), &["pwd"]).await.unwrap(); - let with_extras = Sandbox::run_with_extra_handlers(&policy, Some("test"), &["pwd"], Vec::new()) + let baseline = Sandbox::run(&policy, None, &["/bin/pwd"]).await.unwrap(); + let no_handlers: [(i64, fn(&HandlerCtx) -> std::future::Ready); 0] = []; + let with_extras = Sandbox::run_with_extra_handlers(&policy, None, &["/bin/pwd"], no_handlers) .await .unwrap(); @@ -200,22 +207,25 @@ async fn extra_handler_runs_after_builtin_returns_continue() { let openat_calls = Arc::new(AtomicUsize::new(0)); let openat_in_handler = Arc::clone(&openat_calls); - let handler: HandlerFn = Box::new(move |_notif, _ctx, _fd| { + let handler = move |_cx: &HandlerCtx| { let openat_calls = Arc::clone(&openat_in_handler); - Box::pin(async move { + async move { openat_calls.fetch_add(1, Ordering::SeqCst); // Continue lets the kernel resume the syscall — the builtin // already returned Continue for non-/proc paths and this // handler must not break the chain. NotifAction::Continue - }) - }); - - let extras = vec![ExtraHandler::new(libc::SYS_openat, handler)]; + } + }; - let result = Sandbox::run_with_extra_handlers(&policy, Some("test"), &["sh", "-c", &cmd], extras) - .await - .expect("sandbox spawn failed"); + let result = Sandbox::run_with_extra_handlers( + &policy, + None, + &["sh", "-c", &cmd], + [(libc::SYS_openat, handler)], + ) + .await + .expect("sandbox spawn failed"); let contents = std::fs::read_to_string(&out).unwrap_or_default(); let _ = std::fs::remove_file(&out); @@ -254,23 +264,27 @@ async fn builtin_non_continue_blocks_extra() { let observed: Arc>> = Arc::new(Mutex::new(Vec::new())); let observed_in_handler = Arc::clone(&observed); - let handler: HandlerFn = Box::new(move |notif, _ctx, _fd| { + let handler = move |cx: &HandlerCtx| { let observed = Arc::clone(&observed_in_handler); - Box::pin(async move { + let notif = cx.notif; + async move { // openat(dirfd, pathname, flags, mode) → args[1] is the path let path_addr = notif.data.args[1]; if let Some(p) = read_path_from_child(notif.pid, path_addr) { observed.lock().unwrap().push(p); } NotifAction::Continue - }) - }); - - let extras = vec![ExtraHandler::new(libc::SYS_openat, handler)]; + } + }; - let _ = Sandbox::run_with_extra_handlers(&policy, Some("test"), &["sh", "-c", &cmd], extras) - .await - .expect("sandbox spawn failed"); + let _ = Sandbox::run_with_extra_handlers( + &policy, + None, + &["sh", "-c", &cmd], + [(libc::SYS_openat, handler)], + ) + .await + .expect("sandbox spawn failed"); let _ = std::fs::remove_file(&out); let paths = observed.lock().unwrap(); @@ -302,6 +316,30 @@ async fn builtin_non_continue_blocks_extra() { /// short-circuit the chain before `extra1` ran. #[tokio::test] async fn chain_of_extras_runs_in_insertion_order() { + // Two struct instances with the same concrete type keep the iterator's + // `H` parameter homogeneous; an `id` field plus a configurable action + // distinguishes their behaviour. + struct Counter { + c: Arc, + action: NotifAction, + } + + impl Handler for Counter { + fn handle<'a>( + &'a self, + _cx: &'a HandlerCtx, + ) -> std::pin::Pin + Send + 'a>> { + Box::pin(async move { + self.c.fetch_add(1, Ordering::SeqCst); + match self.action { + NotifAction::Continue => NotifAction::Continue, + NotifAction::Errno(e) => NotifAction::Errno(e), + _ => unreachable!("test only uses Continue / Errno"), + } + }) + } + } + let policy = base_policy().build().unwrap(); let out = temp_out("chain-order"); let cmd = format!("/bin/pwd; echo $? > {}", out.display()); @@ -309,32 +347,23 @@ async fn chain_of_extras_runs_in_insertion_order() { let c1 = Arc::new(AtomicUsize::new(0)); let c2 = Arc::new(AtomicUsize::new(0)); - let c1_in_h = Arc::clone(&c1); - let h1: HandlerFn = Box::new(move |_n, _c, _f| { - let c = Arc::clone(&c1_in_h); - Box::pin(async move { - c.fetch_add(1, Ordering::SeqCst); - NotifAction::Continue - }) - }); - - let c2_in_h = Arc::clone(&c2); - let h2: HandlerFn = Box::new(move |_n, _c, _f| { - let c = Arc::clone(&c2_in_h); - Box::pin(async move { - c.fetch_add(1, Ordering::SeqCst); - NotifAction::Errno(libc::EACCES) - }) - }); - - let extras = vec![ - ExtraHandler::new(libc::SYS_getcwd, h1), - ExtraHandler::new(libc::SYS_getcwd, h2), - ]; + let h1 = Counter { + c: Arc::clone(&c1), + action: NotifAction::Continue, + }; + let h2 = Counter { + c: Arc::clone(&c2), + action: NotifAction::Errno(libc::EACCES), + }; - let result = Sandbox::run_with_extra_handlers(&policy, Some("test"), &["sh", "-c", &cmd], extras) - .await - .expect("sandbox spawn failed"); + let result = Sandbox::run_with_extra_handlers( + &policy, + None, + &["sh", "-c", &cmd], + [(libc::SYS_getcwd, h1), (libc::SYS_getcwd, h2)], + ) + .await + .expect("sandbox spawn failed"); let contents = std::fs::read_to_string(&out).unwrap_or_default(); let _ = std::fs::remove_file(&out); @@ -352,7 +381,7 @@ async fn chain_of_extras_runs_in_insertion_order() { ); assert_ne!( code, 0, - "getcwd must observe the EACCES injected by the second handler" + "uname must observe the EACCES injected by the second handler" ); } @@ -366,12 +395,15 @@ async fn chain_of_extras_runs_in_insertion_order() { #[tokio::test] async fn extra_handler_on_default_deny_syscall_is_rejected() { let policy = base_policy().build().unwrap(); - let handler: HandlerFn = Box::new(|_notif, _ctx, _fd| { - Box::pin(async { NotifAction::Continue }) - }); - let extras = vec![ExtraHandler::new(libc::SYS_mount, handler)]; + let handler = |_cx: &HandlerCtx| async { NotifAction::Continue }; - let result = Sandbox::run_with_extra_handlers(&policy, Some("test"), &["true"], extras).await; + let result = Sandbox::run_with_extra_handlers( + &policy, + None, + &["true"], + [(libc::SYS_mount, handler)], + ) + .await; assert!( result.is_err(), @@ -379,7 +411,7 @@ async fn extra_handler_on_default_deny_syscall_is_rejected() { ); let msg = format!("{}", result.unwrap_err()); assert!( - msg.contains("default-deny") || msg.contains("bypass"), + msg.contains("deny") || msg.contains("bypass"), "error must explain why the registration is rejected, got: {}", msg ); @@ -403,12 +435,15 @@ async fn extra_handler_on_user_specified_deny_is_rejected() { .deny_syscalls(vec!["mremap".into()]) .build() .unwrap(); - let handler: HandlerFn = Box::new(|_notif, _ctx, _fd| { - Box::pin(async { NotifAction::Continue }) - }); - let extras = vec![ExtraHandler::new(libc::SYS_mremap, handler)]; + let handler = |_cx: &HandlerCtx| async { NotifAction::Continue }; - let result = Sandbox::run_with_extra_handlers(&policy, Some("test"), &["true"], extras).await; + let result = Sandbox::run_with_extra_handlers( + &policy, + None, + &["true"], + [(libc::SYS_mremap, handler)], + ) + .await; assert!( result.is_err(), @@ -427,3 +462,235 @@ async fn extra_handler_on_user_specified_deny_is_rejected() { msg ); } + +// ============================================================ +// New Handler trait API — integration tests +// ============================================================ + +/// A closure-shaped handler (via the blanket `impl Handler for F`) +/// passed to `run_with_extra_handlers` MUST observe notifications and the +/// guest MUST see the handler's `Errno`. This verifies the parameter-type +/// rework on `run_with_extra_handlers` doesn't drop notifications. +#[tokio::test] +async fn handler_via_blanket_impl_dispatches_in_sandbox() { + let policy = base_policy().build().unwrap(); + let out = temp_out("blanket-impl-eacces"); + let cmd = format!("/bin/pwd; echo $? > {}", out.display()); + + let calls = Arc::new(AtomicUsize::new(0)); + let calls_in_handler = Arc::clone(&calls); + let handler = move |_cx: &HandlerCtx| { + let calls = Arc::clone(&calls_in_handler); + async move { + calls.fetch_add(1, Ordering::SeqCst); + NotifAction::Errno(libc::EACCES) + } + }; + + let _result = Sandbox::run_with_extra_handlers( + &policy, + None, + &["sh", "-c", &cmd], + [(libc::SYS_getcwd, handler)], + ) + .await + .expect("run_with_extra_handlers"); + + assert!( + calls.load(Ordering::SeqCst) > 0, + "handler must have fired through BPF -> notif -> dispatch" + ); + + let exit_code = std::fs::read_to_string(&out) + .map(|s| s.trim().parse::().unwrap_or(-1)) + .unwrap_or(-1); + let _ = std::fs::remove_file(&out); + assert_ne!( + exit_code, 0, + "/bin/pwd must have failed because EACCES was returned" + ); +} + +/// A struct-based `Handler` (with state on `&self`, not captured `Arc`) +/// MUST be invocable through `run_with_extra_handlers` and accumulate +/// state across multiple notifications within one sandbox run. +/// +/// This exercises the full struct-impl-Handler shape end-to-end: the +/// handler owns its own `Arc` field, gets registered +/// against `SYS_getcwd`, and the dispatch walker invokes +/// `UnameCounter::handle` on every notification. Returning `Errno(EPERM)` +/// serialises the notification cycle (kernel waits for the response before +/// letting the child proceed), so the counter is guaranteed observable +/// after `run_with_extra_handlers` returns. +/// +/// Without this test, a regression where dispatch dropped the +/// struct-`Arc` path but kept closures-via-blanket-impl +/// working would not be caught at the integration layer. +#[tokio::test] +async fn struct_handler_state_persists_across_sandbox_calls() { + struct UnameCounter { + calls: Arc, + } + + impl Handler for UnameCounter { + fn handle<'a>( + &'a self, + _cx: &'a HandlerCtx, + ) -> std::pin::Pin + Send + 'a>> { + Box::pin(async move { + self.calls.fetch_add(1, Ordering::SeqCst); + NotifAction::Errno(libc::EPERM) + }) + } + } + + let policy = base_policy().build().unwrap(); + let calls = Arc::new(AtomicUsize::new(0)); + let handler = UnameCounter { + calls: Arc::clone(&calls), + }; + + let out = temp_out("struct-handler-counter"); + let cmd = format!("/bin/pwd; /bin/pwd; echo done > {}", out.display()); + + Sandbox::run_with_extra_handlers( + &policy, + None, + &["sh", "-c", &cmd], + [(libc::SYS_getcwd, handler)], + ) + .await + .expect("run_with_extra_handlers"); + let _ = std::fs::remove_file(&out); + + assert!( + calls.load(Ordering::SeqCst) >= 2, + "struct-based handler MUST have observed at least 2 uname calls \ + (state persists across notifications via &self), got {}", + calls.load(Ordering::SeqCst) + ); +} + +/// `run_with_extra_handlers` with a negative syscall number MUST return +/// `HandlerError::InvalidSyscall(SyscallError::Negative)` up-front, before +/// fork. Closes the silent-never-fires footgun. +#[tokio::test] +async fn run_with_extra_handlers_rejects_negative_syscall() { + let policy = base_policy().build().unwrap(); + let handler = |_cx: &HandlerCtx| async { NotifAction::Continue }; + + let result = + Sandbox::run_with_extra_handlers(&policy, None, &["true"], [(-5i64, handler)]).await; + + match result { + Err(SandlockError::Handler(HandlerError::InvalidSyscall(SyscallError::Negative(-5)))) => {} + other => panic!( + "expected Handler(InvalidSyscall(Negative(-5))), got {:?}", + other.err() + ), + } +} + +/// Same as above but for an arch-unknown syscall number. +#[tokio::test] +async fn run_with_extra_handlers_rejects_arch_unknown_syscall() { + let policy = base_policy().build().unwrap(); + let handler = |_cx: &HandlerCtx| async { NotifAction::Continue }; + + let result = + Sandbox::run_with_extra_handlers(&policy, None, &["true"], [(99_999i64, handler)]).await; + + match result { + Err(SandlockError::Handler(HandlerError::InvalidSyscall( + SyscallError::UnknownForArch(99_999), + ))) => {} + other => panic!( + "expected Handler(InvalidSyscall(UnknownForArch(99_999))), got {:?}", + other.err() + ), + } +} + +/// Two handlers passed in one `IntoIterator` on the same syscall MUST +/// fire in iteration order, with the chain short-circuiting on the +/// first non-Continue. Mirror of `chain_of_extras_runs_in_insertion_order` +/// — that test already covers chain ordering through the dispatch path, +/// this one covers ordering specifically through the new +/// `IntoIterator` parameter shape. Two instances of the +/// same struct keep the iterator's `H` type homogeneous. +#[tokio::test] +async fn run_with_extra_handlers_preserves_insertion_order_in_sandbox_chain() { + struct OrderTracker { + id: u8, + order: Arc>>, + action: NotifAction, + } + + impl Handler for OrderTracker { + fn handle<'a>( + &'a self, + _cx: &'a HandlerCtx, + ) -> std::pin::Pin + Send + 'a>> { + Box::pin(async move { + self.order.lock().unwrap().push(self.id); + match self.action { + NotifAction::Continue => NotifAction::Continue, + NotifAction::Errno(e) => NotifAction::Errno(e), + _ => unreachable!("test only uses Continue / Errno"), + } + }) + } + } + + let policy = base_policy().build().unwrap(); + let out = temp_out("run-with-extras-order"); + let cmd = format!("/bin/pwd; echo $? > {}", out.display()); + + let order = Arc::new(Mutex::new(Vec::::new())); + let h1 = OrderTracker { + id: 1, + order: Arc::clone(&order), + action: NotifAction::Continue, + }; + let h2 = OrderTracker { + id: 2, + order: Arc::clone(&order), + action: NotifAction::Errno(libc::EACCES), + }; + + Sandbox::run_with_extra_handlers( + &policy, + None, + &["sh", "-c", &cmd], + [(libc::SYS_getcwd, h1), (libc::SYS_getcwd, h2)], + ) + .await + .expect("run_with_extra_handlers"); + + let order = order.lock().unwrap(); + assert!(order.len() >= 2, "expected at least 2 dispatches, got {:?}", *order); + assert_eq!(order[0], 1, "h1 must run before h2; order: {:?}", *order); + assert_eq!(order[1], 2, "h2 must run after h1; order: {:?}", *order); + + let _ = std::fs::remove_file(&out); +} + +/// `run_with_extra_handlers` on a default-deny syscall MUST return +/// `HandlerError::OnDenySyscall` up-front (before fork) — closes the +/// kernel-deny -> NOTIF_FLAG_CONTINUE bypass attack. +#[tokio::test] +async fn run_with_extra_handlers_rejects_handler_on_default_deny_syscall() { + let policy = base_policy().build().unwrap(); + let handler = |_cx: &HandlerCtx| async { NotifAction::Continue }; + + // SYS_mount is in DEFAULT_DENY_SYSCALLS. + let result = + Sandbox::run_with_extra_handlers(&policy, None, &["true"], [(libc::SYS_mount, handler)]).await; + + match result { + Err(SandlockError::Handler(HandlerError::OnDenySyscall { syscall_nr })) => { + assert_eq!(syscall_nr, libc::SYS_mount); + } + other => panic!("expected Handler(OnDenySyscall), got {:?}", other.err()), + } +} diff --git a/docs/extension-handlers.md b/docs/extension-handlers.md index 514b5f0..5704d68 100644 --- a/docs/extension-handlers.md +++ b/docs/extension-handlers.md @@ -1,4 +1,4 @@ -# `ExtraHandler`: user-supplied seccomp-notification handlers +# `Handler`: user-supplied seccomp-notification handlers `sandlock-core` routes every intercepted syscall through a chain-of-responsibility table where builtin handlers (`chroot`, `cow`, `procfs`, `network`, `port_remap`, @@ -7,43 +7,303 @@ each chain, handlers run in registration order; the first non-[`NotifAction::Continue`](../crates/sandlock-core/src/seccomp/notif.rs) result wins. -`ExtraHandler` is the public extension point that lets downstream crates append -their own handler functions to the chain after all builtins. It is the -supported alternative to forking the crate or duplicating +`Handler` is the public extension trait that lets downstream crates append their +own handlers to the chain after all builtins. It is the supported alternative to +forking the crate or duplicating [`notif::supervisor`](../crates/sandlock-core/src/seccomp/notif.rs) — one [`SECCOMP_FILTER_FLAG_NEW_LISTENER`](https://man7.org/linux/man-pages/man2/seccomp.2.html) -per process means one supervisor, so user code that needs to intercept extra +per process means one supervisor task, so user code that needs to intercept extra syscalls in the same sandbox as the builtins must run inside the same dispatch loop. ## API -A handler is an async closure bound to a syscall number: +### `Handler` trait + +The `Handler` trait has a single async method `handle(&self, cx: &HandlerCtx) -> NotifAction`. +State lives on the struct's fields — no `Arc::clone` ladders, no `Box::pin` ceremony at call site. ```rust -use sandlock_core::seccomp::dispatch::{ExtraHandler, HandlerFn}; +use std::sync::atomic::{AtomicU64, Ordering}; +use sandlock_core::{Handler, HandlerCtx, Policy, Sandbox}; use sandlock_core::seccomp::notif::NotifAction; -use sandlock_core::{Policy, Sandbox}; +use async_trait::async_trait; -let policy = Policy::builder().fs_read("/usr").fs_write("/tmp").build()?; +struct OpenAudit { count: AtomicU64 } -let handler: HandlerFn = Box::new(|notif, _ctx, _fd| { - Box::pin(async move { - // inspect notif.data.args, perform side effects, return action +#[async_trait] +impl Handler for OpenAudit { + async fn handle(&self, cx: &HandlerCtx) -> NotifAction { + let n = self.count.fetch_add(1, Ordering::SeqCst) + 1; + eprintln!("[audit #{n}] pid={} openat", cx.notif.pid); NotifAction::Continue - }) -}); + } +} +let policy = Policy::builder().fs_read("/usr").fs_write("/tmp").build()?; Sandbox::run_with_extra_handlers( &policy, &["python3", "-c", "print(42)"], - vec![ExtraHandler::new(libc::SYS_openat, handler)], -).await?; + [(libc::SYS_openat, OpenAudit { count: AtomicU64::new(0) })], +) +.await?; +``` + +`HandlerCtx` is passed by reference for the dispatch call. It exposes only the kernel +notification (`notif`) and the supervisor's seccomp listener fd (`notif_fd`); supervisor-internal +state is intentionally not part of this contract — handler state lives on the implementor. + +### Closures via blanket impl + +For trivial single-shot handlers, closures work via the blanket `impl Handler for F`: + +```rust +let audit = |cx: &HandlerCtx| async move { + eprintln!("openat from pid {}", cx.notif.pid); + NotifAction::Continue +}; + +Sandbox::run_with_extra_handlers(&policy, &cmd, [(libc::SYS_openat, audit)]).await?; +``` + +Use closures for prototyping or trivial state; switch to a struct as soon as the handler grows +non-trivial captures. + +### `Syscall::checked` newtype + +`Syscall::checked(nr)` validates against the architecture's known syscall set and rejects negatives: + +```rust +use sandlock_core::{Syscall, SyscallError}; + +assert!(Syscall::checked(libc::SYS_openat).is_ok()); +assert!(matches!(Syscall::checked(-5), Err(SyscallError::Negative(-5)))); +assert!(matches!(Syscall::checked(99_999), Err(SyscallError::UnknownForArch(99_999)))); +``` + +`run_with_extra_handlers` accepts an `IntoIterator` where `S: TryInto`, +so callers can pass raw `i64`/`u32` syscall numbers and they are validated up-front: + +```rust +Sandbox::run_with_extra_handlers(&policy, &cmd, [(libc::SYS_openat, openat_h)]).await?; +``` + +Without `Syscall::checked`, passing `-5` as a syscall number would compile but never fire — the +cBPF filter cannot emit a JEQ for an arch-unknown number. + +### Entry points + +There are two entry points; both spawn the sandbox, wait for it to exit, and return the result: + +| name | stdio | +| --- | --- | +| `Sandbox::run_with_extra_handlers(policy, cmd, handlers)` | captured (returned in `RunResult`) | +| `Sandbox::run_interactive_with_extra_handlers(policy, cmd, handlers)` | inherited from the parent | + +Both have the same generic shape: + +```rust +pub async fn run_with_extra_handlers( + policy: &Policy, + cmd: &[&str], + extra_handlers: I, +) -> Result +where + I: IntoIterator, + S: TryInto, + H: Handler; +``` + +Multiple handlers — passed in one array literal: + +```rust +Sandbox::run_with_extra_handlers( + &policy, + &cmd, + [ + (libc::SYS_openat, openat_handler), + (libc::SYS_close, close_handler), + (libc::SYS_mmap, mmap_deny), + ], +) +.await?; +``` + +When the iterator mixes handlers of different opaque types (e.g. several different closures, or +a closure plus a struct), erase them via `Box` (or `Arc`) — both +implement `Handler` themselves, so `H` resolves to a single type: + +```rust +let openat_h: Box = Box::new(my_openat_handler); +let close_h: Box = Box::new(MyCloseStruct { ... }); + +Sandbox::run_with_extra_handlers( + &policy, + None, + &cmd, + [(libc::SYS_openat, openat_h), (libc::SYS_close, close_h)], +) +.await?; +``` + +Errors at registration time, before fork: + +- `SyscallError::Negative` / `SyscallError::UnknownForArch` from `Syscall::checked` (wrapped in + `HandlerError::InvalidSyscall`, then in `SandlockError::Handler`). +- `HandlerError::OnDenySyscall` if any registered syscall is in `policy.deny_syscalls` or + `DEFAULT_DENY_SYSCALLS` (see [Security boundary](#security-boundary)). + +### Interactive mode + +For REPL-like workflows (a sandboxed shell, a long-running supervised process whose stdin/stdout +should be inherited from the host), use `run_interactive_with_extra_handlers`. The handler API +is identical: + +```rust +Sandbox::run_interactive_with_extra_handlers( + &policy, + &["bash"], + [(libc::SYS_openat, audit_handler)], +) +.await?; // host stdin/stdout inherited ``` -[`Sandbox::run`](../crates/sandlock-core/src/sandbox.rs) is preserved and -delegates to `run_with_extra_handlers` with an empty `Vec`, so callers that do -not need extras observe no API change. +`run_interactive_with_extra_handlers` does not capture stdout/stderr — the child sees the parent's +terminal directly. + +### Reading syscall arguments + +The kernel passes most syscall arguments by pointer (paths in `openat`, buffers in `write`/`writev`, +`struct stat` slot in `newfstatat`, …). To read those out of guest memory inside a handler, use the +TOCTOU-safe helpers in [`crate::seccomp::notif`](../crates/sandlock-core/src/seccomp/notif.rs): + +| Helper | Purpose | +|---|---| +| [`read_child_cstr`](../crates/sandlock-core/src/seccomp/notif.rs) | NUL-terminated string (paths). Page-aware, never crosses unmapped boundaries. | +| [`read_child_mem`](../crates/sandlock-core/src/seccomp/notif.rs) | Fixed-length byte buffer. | +| [`write_child_mem`](../crates/sandlock-core/src/seccomp/notif.rs) | Synthesise return data into the guest (e.g. fake `getdents64` listings, synthesised `stat` buffers). | + +All three bracket the syscall with `id_valid` checks before and after `process_vm_readv` / +`process_vm_writev`, so they will not race with the kernel aborting or releasing the trapped +syscall while the supervisor is reading guest memory. + +Example: an `openat` handler that reads the path argument and rejects access to a denylist +of suffixes: + +```rust +use sandlock_core::seccomp::dispatch::{Handler, HandlerCtx}; +use sandlock_core::seccomp::notif::{read_child_cstr, NotifAction}; +use async_trait::async_trait; + +struct ExtensionDenyHandler { denied_suffixes: Vec } + +#[async_trait] +impl Handler for ExtensionDenyHandler { + async fn handle(&self, cx: &HandlerCtx) -> NotifAction { + // openat(2): args[1] is `const char *pathname`. 4096 = PATH_MAX. + let path = match read_child_cstr(cx.notif_fd, cx.notif.id, cx.notif.pid, + cx.notif.data.args[1], 4096) { + Some(p) => p, + // Couldn't read path (rare: NULL pointer, kernel released the + // notification mid-read, etc.). Pass through and let the kernel + // handle the syscall — usually it will fail with EFAULT itself. + None => return NotifAction::Continue, + }; + + if self.denied_suffixes.iter().any(|s| path.ends_with(s)) { + return NotifAction::Errno(libc::EACCES); + } + NotifAction::Continue + } +} +``` + +Synthesising data INTO the guest follows the same pattern with `write_child_mem`. For example, a +`getdents64` handler that returns an empty directory listing: + +```rust +async fn handle(&self, cx: &HandlerCtx) -> NotifAction { + // args[1] is `struct linux_dirent64 *dirp` — write zero bytes (empty + // listing) and return 0 (no entries) to the guest. + let buf_addr = cx.notif.data.args[1]; + if let Err(_e) = write_child_mem(cx.notif_fd, cx.notif.id, cx.notif.pid, buf_addr, &[]) { + return NotifAction::Errno(libc::EFAULT); + } + NotifAction::ReturnValue(0) +} +``` + +### State patterns + +Common confusion: when a handler holds mutable state, what kind of synchronisation is needed? +`Handler::handle` takes `&self`, so anything mutated must be in an interior-mutability container. +Pick by access pattern: + +| Pattern | Use when | Example | +|---|---|---| +| `AtomicU64` / `AtomicUsize` | Counter or single value, lock-free. | Audit call count. | +| `parking_lot::Mutex` (or `std::sync::Mutex`) | Short critical section, never crosses `.await`. | Append to a `Vec` log buffer. | +| `tokio::sync::RwLock` | Read-heavy, value rebuilt occasionally. | A small in-memory virtual file table refreshed on changes. | +| `dashmap::DashMap` | High-fanout per-key concurrent access. | Per-pid open-file table indexed by `(pid, fd)`. | + +⚠️ **Continue-site safety** (see [Semantics → Continue-site safety](#continue-site-safety)) applies +to async locks: never hold a `tokio::sync::Mutex`/`RwLock` guard across an `.await` inside a +handler, or the next notification will park behind it and the trapped syscall will never resume. +Use sync `parking_lot::Mutex` for short critical sections instead — it cannot deadlock the +supervisor loop because it cannot be held across `.await`. + +```rust +use std::sync::atomic::{AtomicU64, Ordering}; + +struct CallStats { + openat: AtomicU64, + close: AtomicU64, +} + +#[async_trait] +impl Handler for CallStats { + async fn handle(&self, cx: &HandlerCtx) -> NotifAction { + match cx.notif.data.nr as i64 { + n if n == libc::SYS_openat => self.openat.fetch_add(1, Ordering::Relaxed), + n if n == libc::SYS_close => self.close.fetch_add(1, Ordering::Relaxed), + _ => 0, + }; + NotifAction::Continue + } +} +``` + +For one handler instance shared across multiple syscall registrations, write a thin wrapper: + +```rust +struct DispatchCallStats(std::sync::Arc); + +#[async_trait] +impl Handler for DispatchCallStats { + async fn handle(&self, cx: &HandlerCtx) -> NotifAction { + self.0.handle(cx).await + } +} + +let stats = std::sync::Arc::new(CallStats { + openat: AtomicU64::new(0), + close: AtomicU64::new(0), +}); + +Sandbox::run_with_extra_handlers( + &policy, + &cmd, + [ + (libc::SYS_openat, DispatchCallStats(std::sync::Arc::clone(&stats))), + (libc::SYS_close, DispatchCallStats(std::sync::Arc::clone(&stats))), + ], +) +.await?; + +println!("openat: {}, close: {}", + stats.openat.load(Ordering::Relaxed), + stats.close.load(Ordering::Relaxed)); +``` ## Semantics @@ -54,134 +314,144 @@ For each intercepted syscall: 1. Builtin handlers registered inside [`build_dispatch_table`](../crates/sandlock-core/src/seccomp/dispatch.rs) run first, in their internal registration order. -2. `extra_handlers` run afterwards, in `Vec` order. -3. Multiple extras on the same syscall run in insertion order. +2. Handlers passed to `run_with_extra_handlers` run afterwards, in iterator order. +3. Multiple iterator entries on the same syscall run in insertion order. -The chain stops as soon as a handler returns a non-`NotifAction::Continue` -result; subsequent handlers in the chain are not invoked. This contract is -enforced structurally — `build_dispatch_table` registers builtins into an empty -table *before* iterating `extra_handlers`, and the chain evaluator -short-circuits on the first non-`Continue`. +The chain stops as soon as a handler returns a non-`NotifAction::Continue` result; subsequent +handlers in the chain are not invoked. This contract is enforced structurally — +`build_dispatch_table` registers builtins into an empty table *before* iterating user-supplied +handlers, and the chain evaluator short-circuits on the first non-`Continue`. The contract is exercised at two layers: - Unit, in [`seccomp::dispatch::extra_handler_tests`](../crates/sandlock-core/src/seccomp/dispatch.rs): `dispatch_walks_chain_in_registration_order`, `dispatch_runs_builtin_before_extra`, - `dispatch_stops_at_first_non_continue` drive the actual `dispatch()` walker - against a minimal `SupervisorCtx`. + `dispatch_stops_at_first_non_continue` drive `dispatch()` walker against a minimal `SupervisorCtx`. - End-to-end, in [`tests/integration/test_extra_handlers.rs`](../crates/sandlock-core/tests/integration/test_extra_handlers.rs): - `extra_handler_runs_after_builtin_returns_continue`, + `run_with_extra_handlers_preserves_insertion_order_in_sandbox_chain`, `builtin_non_continue_blocks_extra`, - `chain_of_extras_runs_in_insertion_order` drive a live Landlock+seccomp - sandbox. + `extra_handler_runs_after_builtin_returns_continue` drive a live Landlock+seccomp sandbox. ### Return values -`HandlerFn` returns [`NotifAction`](../crates/sandlock-core/src/seccomp/notif.rs): +`Handler::handle` returns [`NotifAction`](../crates/sandlock-core/src/seccomp/notif.rs): | Variant | Effect | |---|---| | `Continue` | Fall through to the next handler in the chain; if last, the kernel resumes the syscall (`SECCOMP_USER_NOTIF_FLAG_CONTINUE`). | | `Errno(e)` | Return `-e` to the guest; the kernel does not run the syscall. | -| `ReturnValue(val)` | Return `val` to the guest; the kernel does not run the syscall (useful for faking `write` and similar). | +| `ReturnValue(val)` | Return `val` to the guest; the kernel does not run the syscall (useful for synthesising `read`/`fstat`/`getdents64`/...). | | `InjectFd { srcfd, targetfd }` | Inject `srcfd` into the guest at slot `targetfd`, then continue. | +| `InjectFdSendTracked { srcfd, newfd_flags, on_success }` | Inject `srcfd`; `on_success` callback runs synchronously when the kernel returns the slot, so downstream tracking cannot race with the guest seeing the new fd. | | `Kill { sig, pgid }` | Send `sig` to the guest's process group. | ### Continue-site safety -The supervisor processes notifications sequentially in a single tokio task, so -the response sent for one notification gates the kernel resumption of the -trapped syscall. A handler must not hold any -[`SupervisorCtx`](../crates/sandlock-core/src/seccomp/ctx.rs) internal lock -(`tokio::sync::Mutex`/`RwLock`) across an `.await` point: if the guard is alive -when control returns to the supervisor loop, the next notification that needs -the same lock parks, the response for the current notification is not sent, and -the child stays trapped in the syscall. Acquire, mutate, drop — `await` only -after the guard is out of scope. See [issue #27][i27] for the underlying -contract that this convention extends to user handlers. +The supervisor processes notifications sequentially in a single tokio task, so the response sent +for one notification gates the kernel resumption of the trapped syscall. Sandlock-internal +locks (`tokio::sync::Mutex`/`RwLock`) live on the supervisor; user handlers do not have access +to them through `HandlerCtx`, so the contract here is local to handler-owned state on `&self`: +a `tokio::sync::Mutex` or `RwLock` field on your handler must not be held across an +`.await` point. If the guard is alive when control returns to the supervisor loop, the next +notification that needs the same lock parks, the response for the current notification is not +sent, and the child stays trapped in the syscall. Acquire, mutate, drop — `await` only after +the guard is out of scope. + +See [issue #27][i27] for the underlying supervisor-loop contract that this convention extends to +user handlers. [i27]: https://github.com/multikernel/sandlock/issues/27 ## Security boundary -Extras run after builtins. By the time a user handler observes a notification, -builtins have already normalised paths (chroot), validated access (Landlock -pre-checks at the BPF/notif layer), and short-circuited any call that conflicts -with the policy. +User handlers run after builtins. By the time a user handler observes a notification, builtins +have already normalised paths (chroot), validated access (Landlock pre-checks at the BPF/notif +layer), and short-circuited any call that conflicts with the policy. -Extras cannot: +User handlers cannot: - Remove a builtin handler. -- Reorder a builtin handler to run after the extra. +- Reorder a builtin handler to run after the user handler. - Skip a builtin's `Errno`/`ReturnValue`/`Kill` response. -Extras can: +User handlers can: -- Observe every syscall sandlock intercepts via `SECCOMP_RET_USER_NOTIF` — - builtins for that syscall must have returned `Continue` for the extra to - see it. -- Fake results (`ReturnValue`, `Errno`) — but only after the builtins for the - same syscall returned `Continue`, so they cannot subvert confinement. +- Observe every syscall sandlock intercepts via `SECCOMP_RET_USER_NOTIF` — builtins for that + syscall must have returned `Continue` for the user handler to see it. +- Fake results (`ReturnValue`, `Errno`) — but only after the builtins for the same syscall + returned `Continue`, so they cannot subvert confinement. +- Inject fds (`InjectFd`/`InjectFdSendTracked`) — useful for materialising virtual file content + via `memfd` without ever touching the host filesystem. ### BPF coverage -`run_with_extra_handlers` collects the syscall numbers declared by the supplied -`Vec` and merges them into the cBPF notification list installed -in the child before `execve`. Without this step the kernel never raises -`SECCOMP_RET_USER_NOTIF` for a syscall that no builtin intercepts, and the user -handler silently never fires. The merge is dedup-aware: an `openat` registered -both by a builtin and an extra produces a single JEQ in the assembled program. +`run_with_extra_handlers` collects the syscall numbers declared by the user-supplied handlers and merges them +into the cBPF notification list installed in the child before `execve`. Without this step the +kernel never raises `SECCOMP_RET_USER_NOTIF` for a syscall that no builtin intercepts, and the +user handler silently never fires. The merge is dedup-aware: an `openat` registered both by a +builtin and a user handler produces a single JEQ in the assembled program. + +Validation runs at registration time (before fork). If `Syscall::checked` fails, `run_with_extra_handlers` +returns the error without enqueueing the handler. ### Deny-list bypass guard -The cBPF program emits notif JEQs *before* deny JEQs, so a syscall present in -both lists hits `SECCOMP_RET_USER_NOTIF` first. An extra registered on a -syscall in +The cBPF program emits notif JEQs *before* deny JEQs, so a syscall present in both lists hits +`SECCOMP_RET_USER_NOTIF` first. A handler registered on a syscall in [`DEFAULT_DENY_SYSCALLS`](../crates/sandlock-core/src/sys/structs.rs) — or in -`policy.deny_syscalls` — would convert a kernel-deny into a user-supervised -path; a handler returning `NotifAction::Continue` would become -`SECCOMP_USER_NOTIF_FLAG_CONTINUE` and the kernel would actually run the -syscall, silently bypassing deny. - -`run_with_extra_handlers` rejects this configuration at registration time -(before fork) and returns `SandboxError::Child` naming the offending syscall. -The check is implemented in -[`validate_extras_against_policy`](../crates/sandlock-core/src/seccomp/dispatch.rs) -and covers both the default-deny branch (`DEFAULT_DENY_SYSCALLS`) and the -user-specified branch (`policy.deny_syscalls`); both branches are unit-tested +`policy.deny_syscalls` — would convert a kernel-deny into a user-supervised path; a handler +returning `NotifAction::Continue` would become `SECCOMP_USER_NOTIF_FLAG_CONTINUE` and the kernel +would actually run the syscall, silently bypassing deny. + +`run_with_extra_handlers` rejects this configuration at registration time and returns +`HandlerError::OnDenySyscall { syscall_nr }`. The check is implemented in +[`validate_handler_syscalls_against_policy`](../crates/sandlock-core/src/seccomp/dispatch.rs) +and covers both the default-deny branch (`DEFAULT_DENY_SYSCALLS`) and the user-specified branch +(`policy.deny_syscalls`); both branches are tested (`validate_extras_rejects_user_specified_deny`, `extra_handler_on_default_deny_syscall_is_rejected`, -`extra_handler_on_user_specified_deny_is_rejected`). +`run_with_extra_handlers_rejects_handler_on_default_deny_syscall`, +`run_with_extra_handlers_rejects_negative_syscall`, +`run_with_extra_handlers_rejects_arch_unknown_syscall`). -In allowlist mode (`policy.allow_syscalls = Some(_)`) the resolved deny list is -empty and the guard is a no-op — but so is the BPF deny block, and confinement -comes entirely from the kernel-enforced allowlist, so there is no overlap to -bypass. +In allowlist mode (`policy.allow_syscalls = Some(_)`) the resolved deny list is empty and the +guard is a no-op — but so is the BPF deny block, and confinement comes entirely from the +kernel-enforced allowlist, so there is no overlap to bypass. ## Panics -`DispatchTable::dispatch` does not wrap handler calls in `catch_unwind`. A -panic inside a user handler propagates up the `tokio::spawn` task that drives -the supervisor, leading to task failure and the child being killed by -sandlock's watchdog. +`DispatchTable::dispatch` does not wrap handler calls in `catch_unwind`. A panic inside a user +handler propagates up the `tokio::spawn` task that drives the supervisor, leading to task failure +and the child being killed by sandlock's watchdog. To tolerate bugs in downstream handlers, wrap each one with -[`futures::FutureExt::catch_unwind`][catch] (the synchronous -`std::panic::catch_unwind` does not apply to async futures): +[`futures::FutureExt::catch_unwind`][catch] (the synchronous `std::panic::catch_unwind` does not +apply to async futures): ```rust +use async_trait::async_trait; use futures::future::FutureExt as _; use std::panic::AssertUnwindSafe; -let safe: HandlerFn = Box::new(|notif, ctx, fd| { - Box::pin(async move { - AssertUnwindSafe(actual_handler(notif, ctx, fd)) +struct PanicSafe(H); + +#[async_trait] +impl Handler for PanicSafe { + async fn handle(&self, cx: &HandlerCtx) -> NotifAction { + AssertUnwindSafe(self.0.handle(cx)) .catch_unwind() .await .unwrap_or(NotifAction::Continue) // fail-open on panic - }) -}); + } +} + +Sandbox::run_with_extra_handlers( + &policy, + &cmd, + [(libc::SYS_openat, PanicSafe(actual_handler))], +) +.await?; ``` [catch]: https://docs.rs/futures/latest/futures/future/trait.FutureExt.html#method.catch_unwind @@ -190,70 +460,106 @@ let safe: HandlerFn = Box::new(|notif, ctx, fd| { ### VFS engine: real-time uploads to object storage -A deployment that streams guest-generated artefacts to object storage as the -process runs (rather than collecting them after exit) needs interceptors on -`openat(O_CREAT)`, `write`, and `close` to translate filesystem operations -into multipart-upload calls. Those interceptors must live inside the same -supervisor task as sandlock's builtins — `SECCOMP_FILTER_FLAG_NEW_LISTENER` -allows only one listener per process, so a second supervisor cannot run -alongside. +A deployment that streams guest-generated artefacts to object storage as the process runs (rather +than collecting them after exit) needs interceptors on `openat(O_CREAT)`, `write`, and `close` to +translate filesystem operations into multipart-upload calls. Those interceptors must live inside +the same supervisor task as sandlock's builtins — `SECCOMP_FILTER_FLAG_NEW_LISTENER` allows only +one listener per process. + +Wrap each handler in `Box` so the iterator's `H` parameter is uniform across +heterogeneous handler types: ```rust -let extras = vec![ - ExtraHandler::new(libc::SYS_openat, s3_open_handler), - ExtraHandler::new(libc::SYS_write, s3_write_handler), - ExtraHandler::new(libc::SYS_close, s3_close_handler), -]; -Sandbox::run_with_extra_handlers(&policy, &cmd, extras).await?; +Sandbox::run_with_extra_handlers( + &policy, + None, + &cmd, + [ + (libc::SYS_openat, Box::new(S3OpenHandler::new(&cfg)?) as Box), + (libc::SYS_close, Box::new(S3CloseHandler::new(&cfg)?) as Box), + (libc::SYS_mmap, Box::new(MmapDenyManaged::new(&open_files)) as Box), + ], +) +.await?; ``` -Each handler observes the post-builtin view: by the time `s3_open_handler` -runs, the `openat` arguments are already chroot-normalised, so the path the -handler inspects can be trusted against the configured policy. +Each handler observes the post-builtin view: by the time `S3OpenHandler::handle` runs, the +`openat` arguments are already chroot-normalised, so the path the handler inspects can be trusted +against the configured policy. + +### Synthetic file content via `InjectFdSendTracked` + +A read-only virtual file (e.g. `/etc/hostname`, an in-memory configuration generated per-call) +can be exposed by intercepting `openat` and injecting a sealed `memfd` containing the content. +The kernel returns the new fd slot to the guest, the handler's `on_success` callback runs +synchronously to register the fd in the handler's bookkeeping, and the guest reads the content +via the `memfd` — no host filesystem touched. ### Deterministic audit trail for compliance -Regulated environments (CIS, GDPR data-residency) require a guaranteed audit -log of every file read/write the user code performs, tamper-proof against the -guest. Python wrappers (`wrapt`, import hooks) are easy for the guest to -circumvent through `ctypes` or raw syscalls; eBPF file tracing requires -`CAP_BPF`, which is often unavailable in managed Kubernetes. +Regulated environments (CIS, GDPR data-residency) require a guaranteed audit log of every file +read/write the user code performs, tamper-proof against the guest. Python wrappers (`wrapt`, +import hooks) are easy for the guest to circumvent through `ctypes` or raw syscalls; eBPF file +tracing requires `CAP_BPF`, which is often unavailable in managed Kubernetes. -An `ExtraHandler` on `SYS_openat`/`SYS_write`/`SYS_unlinkat` captures the call -before the kernel acts on it. The guest cannot bypass it without bypassing -seccomp itself, which sandlock blocks at the BPF level. +A `Handler` on `SYS_openat`/`SYS_write`/`SYS_unlinkat` captures the call before the kernel acts +on it. The guest cannot bypass it without bypassing seccomp itself, which sandlock blocks at the +BPF level. A minimal runnable example lives in [`examples/openat_audit.rs`](../crates/sandlock-core/examples/openat_audit.rs). ## Limitations -- **No builtin override.** Security-critical handlers (`chroot`, `cow`) always - run first. To change builtin behaviour, modify sandlock directly. -- **No before-builtin priority.** An audit handler that wants to observe calls - rejected by builtins is a coherent use case, but it requires a - `HandlerPriority` enum that has not been added; the current API only supports - appending to the chain. -- **No declarative `Policy` extension.** Adding handlers is a runtime action, - not a serialisable part of the policy. `Policy` remains a pure data struct. +- **No builtin override.** Security-critical handlers (`chroot`, `cow`) always run first. To + change builtin behaviour, modify sandlock directly. +- **No before-builtin priority.** An audit handler that wants to observe calls rejected by + builtins is a coherent use case, but it requires a `HandlerPriority` enum that has not been + added; the current API only supports appending to the chain. +- **No declarative `Policy` extension.** Adding handlers is a runtime action, not a serialisable + part of the policy. `Policy` remains a pure data struct. -## Backwards compatibility +## Downstream usage -`Sandbox::run(policy, cmd)` is preserved and delegates to -`Sandbox::run_with_extra_handlers(policy, cmd, Vec::new())`. Existing unit and -integration tests pass without modification; downstream callers that do not -need extras need no change. +A typical downstream crate exports a struct per handler kind: -## Downstream usage +```rust +pub struct OpenatHandler { + pub virtual_tree: Arc>, + pub workspace: PathBuf, + /* ... */ +} + +#[async_trait] +impl Handler for OpenatHandler { + async fn handle(&self, cx: &HandlerCtx) -> NotifAction { + /* read path arg via sandlock_core::seccomp::notif::read_child_cstr, + consult self.virtual_tree, return NotifAction::InjectFdSendTracked + / Errno / ... */ + } +} +``` -A typical downstream crate exports a builder: +The host binary instantiates the handlers and passes them as one +`IntoIterator`. When the handler types differ +(common in a real downstream), erase them via `Box` so the +iterator's `H` parameter stays homogeneous: ```rust -pub fn build_vfs_handlers( - config: VfsConfig, -) -> Vec { /* ... */ } +Sandbox::run_with_extra_handlers( + &policy, + None, + &cmd, + [ + (libc::SYS_openat, Box::new(OpenatHandler { virtual_tree, workspace }) as Box), + (libc::SYS_close, Box::new(CloseHandler { virtual_tree, oft, store }) as Box), + (libc::SYS_getdents64, Box::new(DirReadHandler { virtual_tree, oft }) as Box), + ], +) +.await?; ``` -which the supervisor binary passes straight into `run_with_extra_handlers`. The -crate links against `sandlock-core` as an ordinary dependency — no fork, no +For a single concrete handler type the bare struct works without the `Box::new` wrapper. + +The crate links against `sandlock-core` as an ordinary dependency — no fork, no `[patch.crates-io]`, no duplication of `notif::supervisor`.