diff --git a/README.md b/README.md index 1fc4d01..cddb812 100644 --- a/README.md +++ b/README.md @@ -332,7 +332,7 @@ positive int = deny with errno, `"audit"`/`-2` = allow + flag. ### Rust API ```rust -use sandlock_core::{Policy, Sandbox, Pipeline, Stage, confine_current_process}; +use sandlock_core::{ConfinePolicy, Policy, Sandbox, Pipeline, Stage, confine}; // Basic run let policy = Policy::builder() @@ -352,11 +352,11 @@ let policy = Policy::builder() let result = Sandbox::run(&policy, Some("agent-box"), &["python3", "agent.py"]).await?; // Confine the current process (Landlock filesystem only, irreversible) -let policy = Policy::builder() +let policy = ConfinePolicy::builder() .fs_read("/usr").fs_read("/lib") .fs_write("/tmp") - .build()?; -confine_current_process(&policy)?; + .build(); +confine(&policy)?; // Pipeline let result = ( @@ -393,6 +393,7 @@ fs_readable = ["/usr", "/lib", "/lib64", "/bin", "/etc"] clean_env = true max_memory = "512M" max_processes = 50 +block_syscalls = [] [env] CC = "gcc" @@ -648,8 +649,7 @@ Policy( fs_denied=["/proc/kcore"], # Explicitly denied # Syscall filtering (seccomp) - deny_syscalls=None, # None = default blocklist - allow_syscalls=None, # Allowlist mode (stricter) + block_syscalls=[], # Extra syscalls to block in addition to Sandlock defaults # Network — see "Network Model" above. Each entry is `host:port[,port,...]`, # `:port`, `*:port`, `host:*`, or `:*` / `*:*`. 
Empty list = deny all @@ -660,7 +660,7 @@ Policy( # HTTP ACL (transparent proxy) http_allow=["POST api.openai.com/v1/*"], # Allow rules (METHOD host/path) - http_deny=["* */admin/*"], # Deny rules (checked first) + http_deny=["* */admin/*"], # Block rules (checked first) http_ports=[80], # Ports to intercept (default: [80]) https_ca="ca.pem", # CA cert for HTTPS MITM (adds port 443) https_key="ca-key.pem", # CA key for HTTPS MITM diff --git a/crates/sandlock-cli/src/main.rs b/crates/sandlock-cli/src/main.rs index 7e779de..7d4501b 100644 --- a/crates/sandlock-cli/src/main.rs +++ b/crates/sandlock-cli/src/main.rs @@ -280,6 +280,7 @@ async fn main() -> Result<()> { if let Some(cpu) = base.max_cpu { b = b.max_cpu(cpu); } if let Some(seed) = base.random_seed { b = b.random_seed(seed); } if let Some(n) = base.num_cpus { b = b.num_cpus(n); } + b = b.block_syscalls(base.block_syscalls.clone()); b = b.allow_udp(base.allow_udp); b = b.allow_icmp(base.allow_icmp); b = b.allow_sysv_ipc(base.allow_sysv_ipc); @@ -685,11 +686,17 @@ fn no_supervisor_exec(policy: &Policy, cmd: &[&str]) -> Result<()> { use std::ffi::CString; // 1. Apply Landlock confinement (sets NO_NEW_PRIVS + Landlock rules) - sandlock_core::confine_current_process(policy) + if unsafe { libc::prctl(libc::PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0) } != 0 { + return Err(anyhow!( + "prctl(PR_SET_NO_NEW_PRIVS) failed: {}", + std::io::Error::last_os_error() + )); + } + sandlock_core::landlock::confine(policy) .map_err(|e| anyhow!("Landlock confinement failed: {}", e))?; // 2. 
Install deny-only seccomp filter (blocks dangerous syscalls without supervisor) - let deny_nrs = sandlock_core::context::no_supervisor_deny_syscall_numbers(policy); + let deny_nrs = sandlock_core::context::no_supervisor_blocklist_syscall_numbers(policy); let filter = sandlock_core::seccomp::bpf::assemble_filter(&[], &deny_nrs, &[]) .map_err(|e| anyhow!("seccomp assemble failed: {}", e))?; sandlock_core::seccomp::bpf::install_deny_filter(&filter) diff --git a/crates/sandlock-core/examples/openat_audit.rs b/crates/sandlock-core/examples/openat_audit.rs index 31b07f5..cd2f2f4 100644 --- a/crates/sandlock-core/examples/openat_audit.rs +++ b/crates/sandlock-core/examples/openat_audit.rs @@ -24,9 +24,8 @@ use std::env; use std::sync::atomic::{AtomicUsize, Ordering}; use std::sync::Arc; -use sandlock_core::seccomp::dispatch::{ExtraHandler, HandlerFn}; use sandlock_core::seccomp::notif::NotifAction; -use sandlock_core::{Policy, Sandbox}; +use sandlock_core::{HandlerCtx, Policy, Sandbox}; #[tokio::main] async fn main() -> Result<(), Box> { @@ -51,21 +50,22 @@ async fn main() -> Result<(), Box> { let counter = Arc::new(AtomicUsize::new(0)); let counter_clone = Arc::clone(&counter); - let audit: HandlerFn = Box::new(move |notif, _ctx, _fd| { + let audit = move |cx: &HandlerCtx| { let counter = Arc::clone(&counter_clone); - Box::pin(async move { + let pid = cx.notif.pid; + async move { let n = counter.fetch_add(1, Ordering::SeqCst) + 1; - eprintln!("[audit #{n}] pid={} openat", notif.pid); + eprintln!("[audit #{n}] pid={pid} openat"); // Continue = let the default table and the kernel handle it. 
NotifAction::Continue - }) - }); + } + }; let result = Sandbox::run_with_extra_handlers( &policy, Some("openat-audit"), &cmd_ref, - vec![ExtraHandler::new(libc::SYS_openat, audit)], + [(libc::SYS_openat, audit)], ) .await?; diff --git a/crates/sandlock-core/src/context.rs b/crates/sandlock-core/src/context.rs index eeff9b8..6bf5b3b 100644 --- a/crates/sandlock-core/src/context.rs +++ b/crates/sandlock-core/src/context.rs @@ -11,7 +11,7 @@ use crate::seccomp::bpf::{self, stmt, jump}; use crate::sys::structs::{ AF_INET, AF_INET6, BPF_ABS, BPF_ALU, BPF_AND, BPF_JEQ, BPF_JSET, BPF_JMP, BPF_K, BPF_LD, BPF_RET, BPF_W, - CLONE_NS_FLAGS, DEFAULT_DENY_SYSCALLS, EPERM, SYSV_IPC_DENY_SYSCALLS, + CLONE_NS_FLAGS, DEFAULT_BLOCKLIST_SYSCALLS, EPERM, SYSV_IPC_BLOCKLIST_SYSCALLS, SECCOMP_RET_ALLOW, SECCOMP_RET_ERRNO, SIOCETHTOOL, SIOCGIFADDR, SIOCGIFBRDADDR, SIOCGIFCONF, SIOCGIFDSTADDR, SIOCGIFFLAGS, SIOCGIFHWADDR, SIOCGIFINDEX, SIOCGIFNAME, SIOCGIFNETMASK, @@ -125,7 +125,7 @@ pub(crate) fn read_u32_fd(fd: RawFd) -> io::Result { /// Map a syscall name to its `libc::SYS_*` number. /// -/// Covers all names in `DEFAULT_DENY_SYSCALLS` plus extras needed for +/// Covers all names in `DEFAULT_BLOCKLIST_SYSCALLS` plus extras needed for /// notif and arg-filter lists. pub fn syscall_name_to_nr(name: &str) -> Option { let nr: i64 = match name { @@ -272,7 +272,7 @@ pub fn notif_syscalls(policy: &Policy, sandbox_name: Option<&str>) -> Vec { // layout puts notif JEQs before deny JEQs, so a syscall on // both lists would notify (RET_USER_NOTIF) and silently // bypass the kernel-level deny. When --allow-sysv-ipc is - // unset, shmget belongs only on the deny list. + // unset, shmget belongs only on the blocklist. 
if policy.allow_sysv_ipc { nrs.push(libc::SYS_shmget as u32); } @@ -442,16 +442,18 @@ pub fn notif_syscalls(policy: &Policy, sandbox_name: Option<&str>) -> Vec { nrs } -/// Resolve `NO_SUPERVISOR_DENY_SYSCALLS` names to numbers, plus +/// Resolve `NO_SUPERVISOR_BLOCKLIST_SYSCALLS` names to numbers, plus /// SysV IPC syscalls when `policy.allow_sysv_ipc` is false. -pub fn no_supervisor_deny_syscall_numbers(policy: &Policy) -> Vec { - use crate::sys::structs::NO_SUPERVISOR_DENY_SYSCALLS; - let mut nrs: Vec = NO_SUPERVISOR_DENY_SYSCALLS +pub fn no_supervisor_blocklist_syscall_numbers(policy: &Policy) -> Vec { + use crate::sys::structs::NO_SUPERVISOR_BLOCKLIST_SYSCALLS; + let mut nrs: Vec = NO_SUPERVISOR_BLOCKLIST_SYSCALLS .iter() + .copied() + .chain(policy.block_syscalls.iter().map(String::as_str)) .filter_map(|n| syscall_name_to_nr(n)) .collect(); if !policy.allow_sysv_ipc { - for name in SYSV_IPC_DENY_SYSCALLS { + for name in SYSV_IPC_BLOCKLIST_SYSCALLS { if let Some(nr) = syscall_name_to_nr(name) { if !nrs.contains(&nr) { nrs.push(nr); @@ -459,36 +461,24 @@ pub fn no_supervisor_deny_syscall_numbers(policy: &Policy) -> Vec { } } } + nrs.sort_unstable(); + nrs.dedup(); nrs } -/// Resolve `deny_syscalls` names to numbers. -/// -/// If both `deny_syscalls` and `allow_syscalls` are `None`, returns the -/// numbers for `DEFAULT_DENY_SYSCALLS`. +/// Resolve the default syscall blocklist plus policy extras to numbers. /// -/// SysV IPC syscalls are appended to the resolved deny list when -/// `policy.allow_sysv_ipc` is false — both for the default branch and -/// the user-supplied `deny_syscalls` branch. They are not appended in -/// allowlist mode (`allow_syscalls = Some(_)`); a user enumerating the -/// exact set of permitted syscalls is already in control. 
-pub fn deny_syscall_numbers(policy: &Policy) -> Vec { - let mut nrs: Vec = if let Some(ref names) = policy.deny_syscalls { - names - .iter() - .filter_map(|n| syscall_name_to_nr(n)) - .collect() - } else if policy.allow_syscalls.is_none() { - DEFAULT_DENY_SYSCALLS - .iter() - .filter_map(|n| syscall_name_to_nr(n)) - .collect() - } else { - // allow_syscalls is set — no deny list - return Vec::new(); - }; +/// SysV IPC syscalls are appended to the resolved blocklist when +/// `policy.allow_sysv_ipc` is false. +pub fn blocklist_syscall_numbers(policy: &Policy) -> Vec { + let mut nrs: Vec = DEFAULT_BLOCKLIST_SYSCALLS + .iter() + .copied() + .chain(policy.block_syscalls.iter().map(String::as_str)) + .filter_map(|n| syscall_name_to_nr(n)) + .collect(); if !policy.allow_sysv_ipc { - for name in SYSV_IPC_DENY_SYSCALLS { + for name in SYSV_IPC_BLOCKLIST_SYSCALLS { if let Some(nr) = syscall_name_to_nr(name) { if !nrs.contains(&nr) { nrs.push(nr); @@ -496,6 +486,8 @@ pub fn deny_syscall_numbers(policy: &Policy) -> Vec { } } } + nrs.sort_unstable(); + nrs.dedup(); nrs } @@ -980,7 +972,7 @@ pub(crate) fn confine_child(args: ChildSpawnArgs<'_>) -> ! { } // 9. Assemble and install seccomp filter (IRREVERSIBLE) - let deny = deny_syscall_numbers(policy); + let deny = blocklist_syscall_numbers(policy); let args = arg_filters(policy); let mut keep_fd: i32 = -1; @@ -1188,7 +1180,7 @@ mod tests { #[test] fn test_notif_syscalls_memory() { // shmget only appears in notif when SysV IPC is allowed — - // otherwise it is on the kernel deny list and notifying would + // otherwise it is on the kernel blocklist and notifying would // bypass the deny (notif JEQs precede deny JEQs in the BPF // layout). 
let policy = Policy::builder() @@ -1269,9 +1261,9 @@ mod tests { } #[test] - fn test_deny_syscall_numbers_default() { + fn test_blocklist_syscall_numbers_default() { let policy = Policy::builder().build().unwrap(); - let nrs = deny_syscall_numbers(&policy); + let nrs = blocklist_syscall_numbers(&policy); // Should contain mount, ptrace, etc. assert!(nrs.contains(&(libc::SYS_mount as u32))); assert!(nrs.contains(&(libc::SYS_ptrace as u32))); @@ -1286,13 +1278,13 @@ mod tests { } #[test] - fn test_deny_syscall_numbers_custom() { + fn test_blocklist_syscall_numbers_custom() { let policy = Policy::builder() - .deny_syscalls(vec!["mount".into(), "ptrace".into()]) + .block_syscalls(vec!["mount".into(), "ptrace".into()]) .build() .unwrap(); - let nrs = deny_syscall_numbers(&policy); - // User-supplied deny list still gets SysV IPC appended + let nrs = blocklist_syscall_numbers(&policy); + // User-supplied blocklist still gets SysV IPC appended // (allow_sysv_ipc defaults to false). assert!(nrs.contains(&(libc::SYS_mount as u32))); assert!(nrs.contains(&(libc::SYS_ptrace as u32))); @@ -1300,41 +1292,28 @@ mod tests { } #[test] - fn test_deny_syscall_numbers_custom_with_sysv_ipc_allowed() { + fn test_blocklist_syscall_numbers_custom_with_sysv_ipc_allowed() { let policy = Policy::builder() - .deny_syscalls(vec!["mount".into(), "ptrace".into()]) + .block_syscalls(vec!["mount".into(), "ptrace".into()]) .allow_sysv_ipc(true) .build() .unwrap(); - let nrs = deny_syscall_numbers(&policy); - // Exactly the user-supplied two — no SysV IPC append. - assert_eq!(nrs.len(), 2); + let nrs = blocklist_syscall_numbers(&policy); + // Default blocklist plus user extras — no SysV IPC append. 
assert!(nrs.contains(&(libc::SYS_mount as u32))); assert!(nrs.contains(&(libc::SYS_ptrace as u32))); + assert!(nrs.contains(&(libc::SYS_bpf as u32))); assert!(!nrs.contains(&(libc::SYS_shmget as u32))); } #[test] - fn test_deny_syscall_numbers_empty_when_allow_set() { - let policy = Policy::builder() - .allow_syscalls(vec!["read".into(), "write".into()]) - .build() - .unwrap(); - let nrs = deny_syscall_numbers(&policy); - // Allowlist mode: user enumerated exactly what is permitted — - // we do not append SysV IPC denials (the absence of those - // syscalls in allow_syscalls already denies them). - assert!(nrs.is_empty()); - } - - #[test] - fn test_deny_syscall_numbers_default_with_sysv_ipc_allowed() { + fn test_blocklist_syscall_numbers_default_with_sysv_ipc_allowed() { let policy = Policy::builder() .allow_sysv_ipc(true) .build() .unwrap(); - let nrs = deny_syscall_numbers(&policy); - // Default deny list still present, but SysV IPC is permitted. + let nrs = blocklist_syscall_numbers(&policy); + // Default blocklist still present, but SysV IPC is permitted. 
assert!(nrs.contains(&(libc::SYS_mount as u32))); assert!(!nrs.contains(&(libc::SYS_shmget as u32))); assert!(!nrs.contains(&(libc::SYS_msgget as u32))); @@ -1342,21 +1321,21 @@ mod tests { } #[test] - fn test_no_supervisor_deny_includes_sysv_ipc_by_default() { + fn test_no_supervisor_blocklist_includes_sysv_ipc_by_default() { let policy = Policy::builder().build().unwrap(); - let nrs = no_supervisor_deny_syscall_numbers(&policy); + let nrs = no_supervisor_blocklist_syscall_numbers(&policy); assert!(nrs.contains(&(libc::SYS_shmget as u32))); assert!(nrs.contains(&(libc::SYS_msgget as u32))); assert!(nrs.contains(&(libc::SYS_semget as u32))); } #[test] - fn test_no_supervisor_deny_excludes_sysv_ipc_when_allowed() { + fn test_no_supervisor_blocklist_excludes_sysv_ipc_when_allowed() { let policy = Policy::builder() .allow_sysv_ipc(true) .build() .unwrap(); - let nrs = no_supervisor_deny_syscall_numbers(&policy); + let nrs = no_supervisor_blocklist_syscall_numbers(&policy); assert!(!nrs.contains(&(libc::SYS_shmget as u32))); assert!(!nrs.contains(&(libc::SYS_msgget as u32))); assert!(!nrs.contains(&(libc::SYS_semget as u32))); @@ -1428,7 +1407,7 @@ mod tests { #[test] fn test_syscall_name_to_nr_covers_defaults() { - // Every name in DEFAULT_DENY_SYSCALLS should resolve unless the + // Every name in DEFAULT_BLOCKLIST_SYSCALLS should resolve unless the // running architecture does not expose that syscall. 
let expected_unresolved: &[&str] = &[ "nfsservctl", @@ -1438,7 +1417,7 @@ mod tests { "iopl", ]; let mut skipped = 0; - for name in DEFAULT_DENY_SYSCALLS { + for name in DEFAULT_BLOCKLIST_SYSCALLS { match syscall_name_to_nr(name) { Some(_) => {} None => { diff --git a/crates/sandlock-core/src/error.rs b/crates/sandlock-core/src/error.rs index 463c8dc..3570da5 100644 --- a/crates/sandlock-core/src/error.rs +++ b/crates/sandlock-core/src/error.rs @@ -21,14 +21,14 @@ pub enum PolicyError { #[error("invalid policy: {0}")] Invalid(String), - #[error("deny_syscalls and allow_syscalls are mutually exclusive")] - MutuallyExclusiveSyscalls, - #[error("fs_isolation requires workdir to be set")] FsIsolationRequiresWorkdir, #[error("max_cpu must be 1-100, got {0}")] InvalidCpuPercent(u8), + + #[error("confine() only accepts Landlock filesystem policy; unsupported fields: {0}")] + UnsupportedForConfine(String), } #[derive(Debug, Error)] diff --git a/crates/sandlock-core/src/landlock.rs b/crates/sandlock-core/src/landlock.rs index 80db874..3b48b81 100644 --- a/crates/sandlock-core/src/landlock.rs +++ b/crates/sandlock-core/src/landlock.rs @@ -175,6 +175,15 @@ pub const MIN_ABI: u32 = 6; /// Requires Landlock ABI v6 or later. Returns an error if the kernel does /// not meet this requirement. pub fn confine(policy: &Policy) -> Result<(), SandlockError> { + confine_inner(policy, true) +} + +/// Apply Landlock filesystem confinement without TCP bind/connect rules. +pub fn confine_filesystem(policy: &Policy) -> Result<(), SandlockError> { + confine_inner(policy, false) +} + +fn confine_inner(policy: &Policy, handle_net: bool) -> Result<(), SandlockError> { // Step 1 -- detect and validate ABI version. let abi = abi_version().map_err(|e| { SandlockError::Sandbox(crate::error::SandboxError::Confinement(e)) @@ -205,7 +214,9 @@ pub fn confine(policy: &Policy) -> Result<(), SandlockError> { // the on-behalf path becomes `NetworkPolicy::Unrestricted` (no // additional check). 
Bind enforcement is unaffected. let net_wildcard = policy.net_allow.iter().any(|r| r.all_ports); - let handled_access_net = if net_wildcard { + let handled_access_net = if !handle_net { + 0 + } else if net_wildcard { LANDLOCK_ACCESS_NET_BIND_TCP } else { LANDLOCK_ACCESS_NET_BIND_TCP | LANDLOCK_ACCESS_NET_CONNECT_TCP @@ -287,10 +298,12 @@ pub fn confine(policy: &Policy) -> Result<(), SandlockError> { } // Step 5 -- add network port rules. - for &port in &policy.net_bind { - add_net_rule(&ruleset_fd, port, LANDLOCK_ACCESS_NET_BIND_TCP).map_err(|e| { - SandlockError::Sandbox(crate::error::SandboxError::Confinement(e)) - })?; + if handle_net { + for &port in &policy.net_bind { + add_net_rule(&ruleset_fd, port, LANDLOCK_ACCESS_NET_BIND_TCP).map_err(|e| { + SandlockError::Sandbox(crate::error::SandboxError::Confinement(e)) + })?; + } } // For TCP connect, Landlock is the only enforcer on the direct path. // The on-behalf path (when enabled) re-checks (ip, port) against the @@ -302,7 +315,7 @@ pub fn confine(policy: &Policy) -> Result<(), SandlockError> { // When `net_wildcard` is set we already excluded CONNECT_TCP from // `handled_access_net`, so adding rules here would fail with EINVAL. // Skip — the on-behalf path is the sole enforcer. 
- if !net_wildcard { + if handle_net && !net_wildcard { let mut connect_ports: std::collections::HashSet = std::collections::HashSet::new(); for rule in &policy.net_allow { for &p in &rule.ports { diff --git a/crates/sandlock-core/src/lib.rs b/crates/sandlock-core/src/lib.rs index 4b450ba..3554063 100644 --- a/crates/sandlock-core/src/lib.rs +++ b/crates/sandlock-core/src/lib.rs @@ -29,7 +29,7 @@ pub(crate) mod http_acl; pub use error::SandlockError; pub use checkpoint::Checkpoint; -pub use policy::{Policy, PolicyBuilder}; +pub use policy::{ConfinePolicy, ConfinePolicyBuilder, Policy, PolicyBuilder}; pub use result::{RunResult, ExitStatus}; pub use sandbox::Sandbox; pub use pipeline::{Stage, Pipeline, Gather}; @@ -49,17 +49,12 @@ pub const MIN_LANDLOCK_ABI: u32 = landlock::MIN_ABI; /// Confine the calling process with Landlock restrictions. /// -/// This applies `PR_SET_NO_NEW_PRIVS` and Landlock rules from the policy's -/// filesystem (`fs_readable`, `fs_writable`) fields. IPC and signal -/// isolation are always enabled. The confinement is **irreversible**. -/// -/// `fs_denied` is not enforced here because it requires supervisor-mediated -/// path interception rather than Landlock's allowlist model. -/// -/// Network, seccomp, resource limits, and other policy fields are ignored. +/// This applies `PR_SET_NO_NEW_PRIVS` and Landlock rules from the policy. +/// IPC and signal isolation are always enabled. The confinement is +/// **irreversible**. /// /// This does NOT fork or exec — it confines the current process in-place. 
-pub fn confine_current_process(policy: &Policy) -> Result<(), SandlockError> { +pub fn confine(policy: &ConfinePolicy) -> Result<(), SandlockError> { // Set NO_NEW_PRIVS (required for Landlock) if unsafe { libc::prctl(libc::PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0) } != 0 { return Err(SandlockError::Sandbox( @@ -72,13 +67,15 @@ pub fn confine_current_process(policy: &Policy) -> Result<(), SandlockError> { )); } - // Build a stripped policy with only Landlock-native fields that - // confine_current_process supports: filesystem + IPC + signals. - // Network rules are excluded — they require the full sandbox. - let mut stripped = policy.clone(); - stripped.net_bind.clear(); - stripped.net_allow.clear(); + let mut builder = Policy::builder(); + for path in &policy.fs_readable { + builder = builder.fs_read(path.clone()); + } + for path in &policy.fs_writable { + builder = builder.fs_write(path.clone()); + } + let stripped = builder.build()?; - // Apply Landlock rules - landlock::confine(&stripped) + // Apply Landlock filesystem rules. + landlock::confine_filesystem(&stripped) } diff --git a/crates/sandlock-core/src/policy.rs b/crates/sandlock-core/src/policy.rs index c8dd6aa..0f3ffbb 100644 --- a/crates/sandlock-core/src/policy.rs +++ b/crates/sandlock-core/src/policy.rs @@ -56,6 +56,100 @@ impl ByteSize { } } +/// Policy for confining the current process in place. 
+#[derive(Debug, Clone, Default, PartialEq, Eq, Serialize, Deserialize)] +pub struct ConfinePolicy { + pub fs_writable: Vec, + pub fs_readable: Vec, +} + +impl ConfinePolicy { + pub fn builder() -> ConfinePolicyBuilder { + ConfinePolicyBuilder::default() + } +} + +#[derive(Default)] +pub struct ConfinePolicyBuilder { + fs_writable: Vec, + fs_readable: Vec, +} + +impl ConfinePolicyBuilder { + pub fn fs_write(mut self, path: impl Into) -> Self { + self.fs_writable.push(path.into()); + self + } + + pub fn fs_read(mut self, path: impl Into) -> Self { + self.fs_readable.push(path.into()); + self + } + + pub fn build(self) -> ConfinePolicy { + ConfinePolicy { + fs_writable: self.fs_writable, + fs_readable: self.fs_readable, + } + } +} + +impl TryFrom<&Policy> for ConfinePolicy { + type Error = PolicyError; + + fn try_from(policy: &Policy) -> Result { + let mut unsupported = Vec::new(); + if !policy.fs_denied.is_empty() { unsupported.push("fs_denied"); } + if !policy.block_syscalls.is_empty() { unsupported.push("block_syscalls"); } + if !policy.net_allow.is_empty() { unsupported.push("net_allow"); } + if !policy.net_bind.is_empty() { unsupported.push("net_bind"); } + if policy.allow_udp { unsupported.push("allow_udp"); } + if policy.allow_icmp { unsupported.push("allow_icmp"); } + if policy.allow_sysv_ipc { unsupported.push("allow_sysv_ipc"); } + if !policy.http_allow.is_empty() { unsupported.push("http_allow"); } + if !policy.http_deny.is_empty() { unsupported.push("http_deny"); } + if !policy.http_ports.is_empty() { unsupported.push("http_ports"); } + if policy.https_ca.is_some() { unsupported.push("https_ca"); } + if policy.https_key.is_some() { unsupported.push("https_key"); } + if policy.max_memory.is_some() { unsupported.push("max_memory"); } + if policy.max_processes != 64 { unsupported.push("max_processes"); } + if policy.max_open_files.is_some() { unsupported.push("max_open_files"); } + if policy.max_cpu.is_some() { unsupported.push("max_cpu"); } + if 
policy.random_seed.is_some() { unsupported.push("random_seed"); } + if policy.time_start.is_some() { unsupported.push("time_start"); } + if policy.no_randomize_memory { unsupported.push("no_randomize_memory"); } + if policy.no_huge_pages { unsupported.push("no_huge_pages"); } + if policy.no_coredump { unsupported.push("no_coredump"); } + if policy.deterministic_dirs { unsupported.push("deterministic_dirs"); } + if policy.fs_isolation != FsIsolation::None { unsupported.push("fs_isolation"); } + if policy.workdir.is_some() { unsupported.push("workdir"); } + if policy.cwd.is_some() { unsupported.push("cwd"); } + if policy.fs_storage.is_some() { unsupported.push("fs_storage"); } + if policy.max_disk.is_some() { unsupported.push("max_disk"); } + if policy.on_exit != BranchAction::Commit { unsupported.push("on_exit"); } + if policy.on_error != BranchAction::Abort { unsupported.push("on_error"); } + if !policy.fs_mount.is_empty() { unsupported.push("fs_mount"); } + if policy.chroot.is_some() { unsupported.push("chroot"); } + if policy.clean_env { unsupported.push("clean_env"); } + if !policy.env.is_empty() { unsupported.push("env"); } + if policy.gpu_devices.is_some() { unsupported.push("gpu_devices"); } + if policy.cpu_cores.is_some() { unsupported.push("cpu_cores"); } + if policy.num_cpus.is_some() { unsupported.push("num_cpus"); } + if policy.port_remap { unsupported.push("port_remap"); } + if policy.uid.is_some() { unsupported.push("uid"); } + if policy.policy_fn.is_some() { unsupported.push("policy_fn"); } + + if !unsupported.is_empty() { + return Err(PolicyError::UnsupportedForConfine(unsupported.join(", "))); + } + + Ok(Self { + fs_writable: policy.fs_writable.clone(), + fs_readable: policy.fs_readable.clone(), + }) + } +} + /// Filesystem isolation mode. 
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize, Default)] pub enum FsIsolation { @@ -287,7 +381,7 @@ pub fn prefix_or_exact_match(pattern: &str, value: &str) -> bool { /// Evaluate HTTP ACL rules against a request. /// -/// - Deny rules are checked first; if any match, return false. +/// - Block rules are checked first; if any match, return false. /// - Allow rules are checked next; if any match, return true. /// - If allow rules exist but none matched, return false (deny-by-default). /// - If no rules at all, return true (unrestricted). @@ -298,7 +392,7 @@ pub fn http_acl_check( host: &str, path: &str, ) -> bool { - // Deny rules checked first + // Block rules checked first for rule in deny { if rule.matches(method, host, path) { return false; @@ -309,7 +403,7 @@ pub fn http_acl_check( return true; // unrestricted } if allow.is_empty() { - // Only deny rules exist; anything not denied is allowed + // Only block rules exist; anything not denied is allowed return true; } for rule in allow { @@ -328,9 +422,8 @@ pub struct Policy { pub fs_readable: Vec, pub fs_denied: Vec, - // Syscall filtering - pub deny_syscalls: Option>, - pub allow_syscalls: Option>, + // Extra syscall filtering on top of Sandlock's default blocklist. + pub block_syscalls: Vec, // Network /// Outbound endpoint allowlist as a list of `(host?, ports)` rules. @@ -445,6 +538,22 @@ impl Policy { } } +fn validate_syscall_names(names: &[String]) -> Result<(), PolicyError> { + let unknown: Vec<&str> = names + .iter() + .map(String::as_str) + .filter(|name| crate::context::syscall_name_to_nr(name).is_none()) + .collect(); + if unknown.is_empty() { + Ok(()) + } else { + Err(PolicyError::Invalid(format!( + "unknown syscall name(s): {}", + unknown.join(", ") + ))) + } +} + /// Fluent builder for `Policy`. 
#[derive(Default)] pub struct PolicyBuilder { @@ -452,8 +561,7 @@ pub struct PolicyBuilder { fs_readable: Vec, fs_denied: Vec, - deny_syscalls: Option>, - allow_syscalls: Option>, + block_syscalls: Vec, /// Raw `--net-allow` specs; parsed in `build()` to surface errors. net_allow: Vec, @@ -528,13 +636,8 @@ impl PolicyBuilder { self } - pub fn deny_syscalls(mut self, calls: Vec) -> Self { - self.deny_syscalls = Some(calls); - self - } - - pub fn allow_syscalls(mut self, calls: Vec) -> Self { - self.allow_syscalls = Some(calls); + pub fn block_syscalls(mut self, calls: Vec) -> Self { + self.block_syscalls.extend(calls); self } @@ -743,10 +846,7 @@ impl PolicyBuilder { } pub fn build(self) -> Result { - // Validate: deny_syscalls and allow_syscalls are mutually exclusive - if self.deny_syscalls.is_some() && self.allow_syscalls.is_some() { - return Err(PolicyError::MutuallyExclusiveSyscalls); - } + validate_syscall_names(&self.block_syscalls)?; // Validate: max_cpu must be 1-100 if let Some(cpu) = self.max_cpu { @@ -835,8 +935,7 @@ impl PolicyBuilder { fs_writable: self.fs_writable, fs_readable: self.fs_readable, fs_denied: self.fs_denied, - deny_syscalls: self.deny_syscalls, - allow_syscalls: self.allow_syscalls, + block_syscalls: self.block_syscalls, net_allow, net_bind: self.net_bind, allow_udp: self.allow_udp, diff --git a/crates/sandlock-core/src/profile.rs b/crates/sandlock-core/src/profile.rs index 58e8b9a..f388257 100644 --- a/crates/sandlock-core/src/profile.rs +++ b/crates/sandlock-core/src/profile.rs @@ -1,4 +1,4 @@ -use crate::policy::{Policy, ByteSize}; +use crate::policy::{ByteSize, Policy}; use crate::error::SandlockError; use std::path::PathBuf; @@ -44,6 +44,12 @@ pub fn parse_profile(content: &str) -> Result { "profile field 'name' is not policy; pass the sandbox name at run time".into(), ))); } + if sandbox.contains_key("syscall_policy") { + return Err(SandlockError::Policy(crate::error::PolicyError::Invalid( + "profile field 'syscall_policy' was 
removed; Sandlock always applies its \ + default syscall blocklist, and 'block_syscalls' only adds entries".into(), + ))); + } let mut builder = Policy::builder(); @@ -99,7 +105,7 @@ pub fn parse_profile(content: &str) -> Result { if let Some(v) = sandbox.get("allow_sysv_ipc").and_then(|v| v.as_bool()) { builder = builder.allow_sysv_ipc(v); } -if let Some(v) = sandbox.get("clean_env").and_then(|v| v.as_bool()) { + if let Some(v) = sandbox.get("clean_env").and_then(|v| v.as_bool()) { builder = builder.clean_env(v); } if let Some(v) = sandbox.get("deterministic_dirs").and_then(|v| v.as_bool()) { @@ -116,10 +122,10 @@ if let Some(v) = sandbox.get("clean_env").and_then(|v| v.as_bool()) { for p in ports { if let Some(n) = p.as_integer() { builder = builder.net_bind_port(n as u16); } } } - // Parse syscall lists - if let Some(syscalls) = sandbox.get("deny_syscalls").and_then(|v| v.as_array()) { + // Parse extra syscall blocklist entries. + if let Some(syscalls) = sandbox.get("block_syscalls").and_then(|v| v.as_array()) { let names: Vec = syscalls.iter().filter_map(|v| v.as_str().map(String::from)).collect(); - builder = builder.deny_syscalls(names); + builder = builder.block_syscalls(names); } builder.build().map_err(|e| SandlockError::Policy(e)) @@ -202,6 +208,13 @@ max_processes = 10 assert!(err.to_string().contains("not policy")); } + #[test] + fn reject_removed_syscall_policy_in_profile() { + let err = parse_profile(r#"syscall_policy = "none""#).unwrap_err(); + assert!(err.to_string().contains("syscall_policy")); + assert!(err.to_string().contains("removed")); + } + #[test] fn list_profiles_empty_dir() { // With no profile dir, should return empty vec diff --git a/crates/sandlock-core/src/sandbox.rs b/crates/sandlock-core/src/sandbox.rs index 8ad2d21..6595116 100644 --- a/crates/sandlock-core/src/sandbox.rs +++ b/crates/sandlock-core/src/sandbox.rs @@ -220,7 +220,7 @@ impl Sandbox { /// confinement. 
/// /// Validation happens up-front (before fork): each `syscall` is checked - /// through `Syscall::checked`, and the deny-list contract is enforced via + /// through `Syscall::checked`, and the blocklist contract is enforced via /// [`crate::seccomp::dispatch::validate_handler_syscalls_against_policy`]. /// /// # Example @@ -410,7 +410,7 @@ impl Sandbox { let _ = crate::landlock::confine(&policy); - let deny = crate::context::deny_syscall_numbers(&policy); + let deny = crate::context::blocklist_syscall_numbers(&policy); let args = crate::context::arg_filters(&policy); let filter = match crate::seccomp::bpf::assemble_filter(&[], &deny, &args) { Ok(f) => f, @@ -1277,7 +1277,7 @@ impl Sandbox { /// Convert a user-supplied iterator of `(syscall, handler)` pairs into /// the internal `Vec<(i64, Arc)>` shape used by the -/// supervisor, validating each syscall up-front against the deny list. +/// supervisor, validating each syscall up-front against the blocklist. fn collect_extra_handlers( extra_handlers: I, policy: &Policy, diff --git a/crates/sandlock-core/src/seccomp/bpf.rs b/crates/sandlock-core/src/seccomp/bpf.rs index b52d1e5..2ab1c69 100644 --- a/crates/sandlock-core/src/seccomp/bpf.rs +++ b/crates/sandlock-core/src/seccomp/bpf.rs @@ -5,7 +5,7 @@ // [arg filter block] variable length (pre-built SockFilter instructions) // [LD syscall nr] 1 instruction // [notif JEQ instructions] 1 per notif syscall -// [deny JEQ instructions] 1 per deny syscall +// [deny JEQ instructions] 1 per blocklisted syscall // [RET ALLOW] index = ret_allow_idx (default fall-through) // [RET USER_NOTIF] index = ret_notif_idx // [RET ERRNO(EPERM)] index = ret_errno_idx @@ -45,7 +45,7 @@ pub(crate) fn jump(code: u16, k: u32, jt: u8, jf: u8) -> SockFilter { /// Assemble a cBPF program for `seccomp(SECCOMP_SET_MODE_FILTER, ...)`. 
/// /// * `notif_syscalls` — syscalls that generate SECCOMP_RET_USER_NOTIF -/// * `deny_syscalls` — syscalls that return ERRNO(EPERM) +/// * `block_syscalls` — syscalls that return ERRNO(EPERM) /// * `arg_block` — pre-built arg filter instructions (from `context::arg_filters`) /// /// Returns an error if the resulting program would exceed the kernel's @@ -56,7 +56,7 @@ pub(crate) fn jump(code: u16, k: u32, jt: u8, jf: u8) -> SockFilter { /// changes could silently truncate offsets. pub fn assemble_filter( notif_syscalls: &[u32], - deny_syscalls: &[u32], + block_syscalls: &[u32], arg_block: &[SockFilter], ) -> Result, std::io::Error> { // ---- compute final layout sizes ---- @@ -64,7 +64,7 @@ pub fn assemble_filter( let arg_block_len = arg_block.len(); let load_nr = 1usize; let notif_jmps = notif_syscalls.len(); - let deny_jmps = deny_syscalls.len(); + let deny_jmps = block_syscalls.len(); let ret_section = 4usize; // ALLOW, USER_NOTIF, ERRNO, KILL let total = arch_block + arg_block_len + load_nr + notif_jmps + deny_jmps + ret_section; @@ -106,7 +106,7 @@ pub fn assemble_filter( // ---- 5. 
Deny syscall JEQ instructions ---- let ret_errno_idx = total - 2; let deny_base = notif_base + notif_jmps; - for (i, &nr) in deny_syscalls.iter().enumerate() { + for (i, &nr) in block_syscalls.iter().enumerate() { let pos = deny_base + i; let jt = (ret_errno_idx - (pos + 1)) as u8; prog.push(jump(BPF_JMP | BPF_JEQ | BPF_K, nr, jt, 0)); diff --git a/crates/sandlock-core/src/seccomp/dispatch.rs b/crates/sandlock-core/src/seccomp/dispatch.rs index b293ed4..d6b560d 100644 --- a/crates/sandlock-core/src/seccomp/dispatch.rs +++ b/crates/sandlock-core/src/seccomp/dispatch.rs @@ -133,9 +133,8 @@ pub enum HandlerError { InvalidSyscall(#[from] SyscallError), #[error( - "handler on syscall {syscall_nr} conflicts with the deny list \ - (DEFAULT_DENY_SYSCALLS or policy.deny_syscalls) and would let \ - user code bypass it via SECCOMP_USER_NOTIF_FLAG_CONTINUE" + "handler on syscall {syscall_nr} conflicts with the policy syscall blocklist \ + and would let user code bypass it via SECCOMP_USER_NOTIF_FLAG_CONTINUE" )] OnDenySyscall { syscall_nr: i64 }, } @@ -145,21 +144,13 @@ pub enum HandlerError { /// /// The cBPF program emits notif JEQs *before* deny JEQs, so a syscall /// present in both lists hits `SECCOMP_RET_USER_NOTIF` first. A handler -/// registered on a syscall that is on the deny list would therefore +/// registered on a syscall that is on the blocklist would therefore /// convert a kernel-deny into a user-supervised path: a handler returning /// `NotifAction::Continue` becomes `SECCOMP_USER_NOTIF_FLAG_CONTINUE` and /// the kernel actually runs the syscall — silently bypassing deny. /// -/// The deny list is whatever [`crate::context::deny_syscall_numbers`] -/// resolves: `policy.deny_syscalls` if set, otherwise -/// `DEFAULT_DENY_SYSCALLS` when neither `deny_syscalls` nor -/// `allow_syscalls` is set; both branches are guarded by this function. 
-/// -/// **Allowlist mode** (`policy.allow_syscalls = Some(_)`): the resolved -/// deny list is empty, so this function returns `Ok(())` for any syscall. -/// That is sound because the BPF deny block is empty in this mode too — -/// confinement comes from the allowlist enforced at the kernel level, -/// and there is no notif/deny overlap to bypass. +/// The blocklist is whatever [`crate::context::blocklist_syscall_numbers`] +/// resolves from Sandlock's default syscall blocklist plus policy extras. /// /// Takes only the syscall numbers because that's all it needs to check. /// Called from the `run_with_extra_handlers` entry points before any @@ -171,10 +162,10 @@ pub(crate) fn validate_handler_syscalls_against_policy( syscall_nrs: &[i64], policy: &crate::policy::Policy, ) -> Result<(), i64> { - let deny: std::collections::HashSet = - crate::context::deny_syscall_numbers(policy).into_iter().collect(); + let blocklist: std::collections::HashSet = + crate::context::blocklist_syscall_numbers(policy).into_iter().collect(); for &nr in syscall_nrs { - if deny.contains(&(nr as u32)) { + if blocklist.contains(&(nr as u32)) { return Err(nr); } } @@ -1169,25 +1160,25 @@ mod extra_handler_tests { } /// `validate_handler_syscalls_against_policy` must reject handlers whose - /// syscall is in the policy's user-specified `deny_syscalls` list, with - /// the same rationale as DEFAULT_DENY: the BPF program emits notif JEQs - /// before deny JEQs, so a user handler returning `Continue` would - /// translate into `SECCOMP_USER_NOTIF_FLAG_CONTINUE` and silently bypass - /// the kernel-level deny. + /// syscall is in the policy's user-specified blocklist, with the same + /// rationale as DEFAULT_BLOCKLIST: the BPF program emits notif JEQs before + /// deny JEQs, so a user handler returning `Continue` would translate into + /// `SECCOMP_USER_NOTIF_FLAG_CONTINUE` and silently bypass the kernel-level + /// block. 
/// /// Uses `mremap` because it is in `syscall_name_to_nr` but not in - /// `DEFAULT_DENY_SYSCALLS` — putting it into `deny_syscalls` is the only - /// way it ends up on the deny list, so the test isolates the user-supplied - /// path of `deny_syscall_numbers` from the default branch covered by - /// `extra_handler_on_default_deny_syscall_is_rejected`. + /// `DEFAULT_BLOCKLIST_SYSCALLS` — putting it into `block_syscalls` is the only + /// way it ends up on the extra blocklist, so the test isolates the user-supplied + /// path of `blocklist_syscall_numbers` from the default branch covered by + /// `extra_handler_on_default_blocklist_syscall_is_rejected`. /// /// Pure-logic counterpart to the integration test of the same name — /// runs without a live sandbox so the contract is enforced even on /// hosts where seccomp integration tests are skipped. #[test] - fn validate_extras_rejects_user_specified_deny() { + fn validate_extras_rejects_user_specified_blocklist() { let policy = crate::policy::Policy::builder() - .deny_syscalls(vec!["mremap".into()]) + .block_syscalls(vec!["mremap".into()]) .build() .expect("policy builds"); @@ -1195,7 +1186,7 @@ mod extra_handler_tests { assert_eq!( result, Err(libc::SYS_mremap), - "handler on user-specified deny must be rejected, naming the offending syscall" + "handler on user-specified blocklist must be rejected, naming the offending syscall" ); } diff --git a/crates/sandlock-core/src/sys/structs.rs b/crates/sandlock-core/src/sys/structs.rs index f08b32b..02f8f4c 100644 --- a/crates/sandlock-core/src/sys/structs.rs +++ b/crates/sandlock-core/src/sys/structs.rs @@ -263,10 +263,10 @@ pub const EAGAIN: i32 = 11; pub const ECONNREFUSED: i32 = 111; // ============================================================ -// Default deny syscall list +// Default blocklisted syscall list // ============================================================ -/// SysV IPC syscalls. Appended to the kernel-level deny list when +/// SysV IPC syscalls. 
Appended to the kernel-level blocklist when /// `policy.allow_sysv_ipc` is false. Sandlock does not use an IPC /// namespace, so without these denials two sandboxes on the same host /// share a SysV keyspace and can rendezvous via a well-known key. @@ -275,7 +275,7 @@ pub const ECONNREFUSED: i32 = 111; /// just `open("/dev/shm/")`, gated by Landlock filesystem rules. /// POSIX message queues (`mq_open` and friends) are also out of scope /// for this flag. -pub const SYSV_IPC_DENY_SYSCALLS: &[&str] = &[ +pub const SYSV_IPC_BLOCKLIST_SYSCALLS: &[&str] = &[ "shmget", "shmat", "shmdt", @@ -290,7 +290,7 @@ pub const SYSV_IPC_DENY_SYSCALLS: &[&str] = &[ "semtimedop", ]; -pub const DEFAULT_DENY_SYSCALLS: &[&str] = &[ +pub const DEFAULT_BLOCKLIST_SYSCALLS: &[&str] = &[ "mount", "umount2", "pivot_root", @@ -330,12 +330,12 @@ pub const DEFAULT_DENY_SYSCALLS: &[&str] = &[ /// Deny list for --no-supervisor mode. /// -/// More relaxed than DEFAULT_DENY_SYSCALLS because a full sandbox supervisor +/// More relaxed than DEFAULT_BLOCKLIST_SYSCALLS because a full sandbox supervisor /// may run inside the outer no-supervisor sandbox and needs syscalls like /// ptrace, process_vm_readv/writev, unshare, mount, and setns. /// /// Only blocks syscalls that could damage the host or escape all containment. -pub const NO_SUPERVISOR_DENY_SYSCALLS: &[&str] = &[ +pub const NO_SUPERVISOR_BLOCKLIST_SYSCALLS: &[&str] = &[ // Swap / reboot / shutdown — host-wide damage "swapon", "swapoff", diff --git a/crates/sandlock-core/tests/integration/test_extra_handlers.rs b/crates/sandlock-core/tests/integration/test_extra_handlers.rs index 3116809..8fa0bc1 100644 --- a/crates/sandlock-core/tests/integration/test_extra_handlers.rs +++ b/crates/sandlock-core/tests/integration/test_extra_handlers.rs @@ -388,15 +388,15 @@ async fn chain_of_extras_runs_in_insertion_order() { ); } -/// Default-deny bypass guard: registering an extra on a syscall in -/// `DEFAULT_DENY_SYSCALLS` (e.g. 
`mount`) MUST be rejected at registration +/// Default-blocklist bypass guard: registering an extra on a syscall in +/// `DEFAULT_BLOCKLIST_SYSCALLS` (e.g. `mount`) MUST be rejected at registration /// time. Without this check the extra-syscall ends up in the BPF notif /// block, which is matched *before* the deny block, so a user handler /// returning `Continue` would translate into /// `SECCOMP_USER_NOTIF_FLAG_CONTINUE` and the kernel would actually run -/// `mount` — silently bypassing default deny. +/// `mount` — silently bypassing the default blocklist. #[tokio::test] -async fn extra_handler_on_default_deny_syscall_is_rejected() { +async fn extra_handler_on_default_blocklist_syscall_is_rejected() { let policy = base_policy().build().unwrap(); let handler = |_cx: &HandlerCtx| async { NotifAction::Continue }; @@ -410,32 +410,32 @@ async fn extra_handler_on_default_deny_syscall_is_rejected() { assert!( result.is_err(), - "extras on a default-deny syscall must be rejected up-front" + "extras on a default-blocklist syscall must be rejected up-front" ); let msg = format!("{}", result.unwrap_err()); assert!( - msg.contains("deny") || msg.contains("bypass"), + msg.contains("blocklist") || msg.contains("bypass"), "error must explain why the registration is rejected, got: {}", msg ); } -/// User-supplied `policy.deny_syscalls` must be honoured by the same guard -/// that protects DEFAULT_DENY: an extra registered on a syscall the caller -/// explicitly asked to deny would otherwise let a `Continue` from the +/// User-supplied `block_syscalls` entries must be honoured by the same guard +/// that protects DEFAULT_BLOCKLIST: an extra registered on a syscall the caller +/// explicitly asked to block would otherwise let a `Continue` from the /// handler reach the deny-JEQ via the notif path and bypass the kernel /// rejection at user-space discretion.
/// -/// Counterpart to `extra_handler_on_default_deny_syscall_is_rejected`, -/// driving the user-list branch of `deny_syscall_numbers` (see +/// Counterpart to `extra_handler_on_default_blocklist_syscall_is_rejected`, +/// driving the user-list branch of `blocklist_syscall_numbers` (see /// `crates/sandlock-core/src/context.rs`). Uses `SYS_mremap` because it is -/// in `syscall_name_to_nr` but **not** in DEFAULT_DENY — putting it into -/// `deny_syscalls` is the only way it lands on the deny list, isolating the -/// user-supplied branch under test from the default-deny branch. +/// in `syscall_name_to_nr` but **not** in DEFAULT_BLOCKLIST — putting it into +/// `block_syscalls` is the only way it lands on the blocklist, isolating the +/// user-supplied branch under test from the default-blocklist branch. #[tokio::test] -async fn extra_handler_on_user_specified_deny_is_rejected() { +async fn extra_handler_on_user_specified_blocklist_is_rejected() { let policy = base_policy() - .deny_syscalls(vec!["mremap".into()]) + .block_syscalls(vec!["mremap".into()]) .build() .unwrap(); let handler = |_cx: &HandlerCtx| async { NotifAction::Continue }; @@ -450,7 +450,7 @@ async fn extra_handler_on_user_specified_deny_is_rejected() { assert!( result.is_err(), - "extras on a user-specified deny syscall must be rejected up-front" + "extras on a user-specified blocklist syscall must be rejected up-front" ); let msg = format!("{}", result.unwrap_err()); assert!( @@ -678,15 +678,15 @@ async fn run_with_extra_handlers_preserves_insertion_order_in_sandbox_chain() { let _ = std::fs::remove_file(&out); } -/// `run_with_extra_handlers` on a default-deny syscall MUST return +/// `run_with_extra_handlers` on a default-blocklist syscall MUST return /// `HandlerError::OnDenySyscall` up-front (before fork) — closes the /// kernel-deny -> NOTIF_FLAG_CONTINUE bypass attack. 
#[tokio::test] -async fn run_with_extra_handlers_rejects_handler_on_default_deny_syscall() { +async fn run_with_extra_handlers_rejects_handler_on_default_blocklist_syscall() { let policy = base_policy().build().unwrap(); let handler = |_cx: &HandlerCtx| async { NotifAction::Continue }; - // SYS_mount is in DEFAULT_DENY_SYSCALLS. + // SYS_mount is in DEFAULT_BLOCKLIST_SYSCALLS. let result = Sandbox::run_with_extra_handlers(&policy, None, &["true"], [(libc::SYS_mount, handler)]).await; diff --git a/crates/sandlock-core/tests/integration/test_policy.rs b/crates/sandlock-core/tests/integration/test_policy.rs index b2c6d0f..265cb5e 100644 --- a/crates/sandlock-core/tests/integration/test_policy.rs +++ b/crates/sandlock-core/tests/integration/test_policy.rs @@ -4,6 +4,7 @@ use sandlock_core::policy::{ByteSize, FsIsolation, BranchAction, Policy}; fn test_default_policy() { let policy = Policy::builder().build().unwrap(); assert_eq!(policy.max_processes, 64); + assert!(policy.block_syscalls.is_empty()); assert!(!policy.allow_udp, "UDP is denied by default"); assert!(!policy.allow_icmp, "ICMP raw is denied by default"); assert!(policy.uid.is_none()); @@ -64,10 +65,9 @@ fn test_builder_resource_limits() { } #[test] -fn test_mutually_exclusive_syscalls() { +fn test_unknown_syscall_is_rejected() { let result = Policy::builder() - .deny_syscalls(vec!["mount".into()]) - .allow_syscalls(vec!["read".into()]) + .block_syscalls(vec!["definitely_not_a_syscall".into()]) .build(); assert!(result.is_err()); } diff --git a/crates/sandlock-core/tests/integration/test_sandbox.rs b/crates/sandlock-core/tests/integration/test_sandbox.rs index 748a288..54b93a1 100644 --- a/crates/sandlock-core/tests/integration/test_sandbox.rs +++ b/crates/sandlock-core/tests/integration/test_sandbox.rs @@ -57,7 +57,7 @@ async fn test_denied_syscall() { .fs_read("/dev") .build() .unwrap(); - // mount is in DEFAULT_DENY_SYSCALLS; redirect stderr to /dev/null + // mount is in DEFAULT_BLOCKLIST_SYSCALLS; 
redirect stderr to /dev/null // (need /dev readable for this) let result = Sandbox::run( &policy, Some("test"), diff --git a/crates/sandlock-core/tests/integration/test_seccomp_enforce.rs b/crates/sandlock-core/tests/integration/test_seccomp_enforce.rs index 6838280..07116e2 100644 --- a/crates/sandlock-core/tests/integration/test_seccomp_enforce.rs +++ b/crates/sandlock-core/tests/integration/test_seccomp_enforce.rs @@ -25,7 +25,7 @@ fn temp_out(name: &str) -> PathBuf { } // ------------------------------------------------------------------ -// 1. mount() is blocked by default seccomp deny list +// 1. mount() is blocked by default seccomp blocklist // ------------------------------------------------------------------ #[tokio::test] async fn test_mount_blocked() { @@ -377,7 +377,7 @@ async fn test_sysv_shmget_allowed_when_opted_in() { } // ------------------------------------------------------------------ -// 8. TCP always allowed (default deny posture for raw + UDP) +// 8. TCP always allowed (default-deny posture for raw + UDP) // ------------------------------------------------------------------ #[tokio::test] async fn test_tcp_always_allowed() { diff --git a/crates/sandlock-ffi/src/lib.rs b/crates/sandlock-ffi/src/lib.rs index 058df9f..3cf42a9 100644 --- a/crates/sandlock-ffi/src/lib.rs +++ b/crates/sandlock-ffi/src/lib.rs @@ -505,27 +505,14 @@ pub unsafe extern "C" fn sandlock_policy_builder_time_start( /// # Safety /// `b` must be a valid builder pointer. `names` is a comma-separated NUL-terminated string.
#[no_mangle] -pub unsafe extern "C" fn sandlock_policy_builder_deny_syscalls( +pub unsafe extern "C" fn sandlock_policy_builder_block_syscalls( b: *mut PolicyBuilder, names: *const c_char, ) -> *mut PolicyBuilder { if b.is_null() || names.is_null() { return b; } let builder = *Box::from_raw(b); let s = CStr::from_ptr(names).to_str().unwrap_or(""); let calls: Vec = s.split(',').map(|s| s.trim().to_string()).filter(|s| !s.is_empty()).collect(); - Box::into_raw(Box::new(builder.deny_syscalls(calls))) -} - -/// # Safety -/// `b` must be a valid builder pointer. `names` is a comma-separated NUL-terminated string. -#[no_mangle] -pub unsafe extern "C" fn sandlock_policy_builder_allow_syscalls( - b: *mut PolicyBuilder, names: *const c_char, -) -> *mut PolicyBuilder { - if b.is_null() || names.is_null() { return b; } - let builder = *Box::from_raw(b); - let s = CStr::from_ptr(names).to_str().unwrap_or(""); - let calls: Vec = s.split(',').map(|s| s.trim().to_string()).filter(|s| !s.is_empty()).collect(); - Box::into_raw(Box::new(builder.allow_syscalls(calls))) + Box::into_raw(Box::new(builder.block_syscalls(calls))) } /// # Safety @@ -656,7 +643,11 @@ pub unsafe extern "C" fn sandlock_confine( ) -> c_int { if policy.is_null() { return -1; } let policy = &(*policy)._private; - match sandlock_core::confine_current_process(policy) { + let policy = match sandlock_core::ConfinePolicy::try_from(policy) { + Ok(policy) => policy, + Err(_) => return -1, + }; + match sandlock_core::confine(&policy) { Ok(()) => 0, Err(_) => -1, } diff --git a/docs/extension-handlers.md b/docs/extension-handlers.md index cfafb75..cd24e2e 100644 --- a/docs/extension-handlers.md +++ b/docs/extension-handlers.md @@ -162,8 +162,8 @@ Errors at registration time, before fork: - `SyscallError::Negative` / `SyscallError::UnknownForArch` from `Syscall::checked` (wrapped in `HandlerError::InvalidSyscall`, then in `SandlockError::Handler`). 
-- `HandlerError::OnDenySyscall` if any registered syscall is in `policy.deny_syscalls` or - `DEFAULT_DENY_SYSCALLS` (see [Security boundary](#security-boundary)). +- `HandlerError::OnDenySyscall` if any registered syscall is in Sandlock's default syscall + blocklist or the policy's extra `block_syscalls` list (see [Security boundary](#security-boundary)). ### Interactive mode @@ -427,29 +427,28 @@ builtin and a user handler produces a single JEQ in the assembled program. Validation runs at registration time (before fork). If `Syscall::checked` fails, `run_with_extra_handlers` returns the error without enqueueing the handler. -### Deny-list bypass guard +### Blocklist bypass guard -The cBPF program emits notif JEQs *before* deny JEQs, so a syscall present in both lists hits -`SECCOMP_RET_USER_NOTIF` first. A handler registered on a syscall in -[`DEFAULT_DENY_SYSCALLS`](../crates/sandlock-core/src/sys/structs.rs) — or in -`policy.deny_syscalls` — would convert a kernel-deny into a user-supervised path; a handler -returning `NotifAction::Continue` would become `SECCOMP_USER_NOTIF_FLAG_CONTINUE` and the kernel -would actually run the syscall, silently bypassing deny. +The cBPF program emits notif JEQs *before* deny JEQs, so a syscall present in both lists +hits `SECCOMP_RET_USER_NOTIF` first. A handler registered on a syscall in +[`DEFAULT_BLOCKLIST_SYSCALLS`](../crates/sandlock-core/src/sys/structs.rs) — or in the policy's +extra `block_syscalls` list — would convert a kernel-deny into a user-supervised +path; a handler returning `NotifAction::Continue` would become +`SECCOMP_USER_NOTIF_FLAG_CONTINUE` and the kernel would actually run the syscall, silently +bypassing deny. `run_with_extra_handlers` rejects this configuration at registration time and returns `HandlerError::OnDenySyscall { syscall_nr }`.
The check is implemented in [`validate_handler_syscalls_against_policy`](../crates/sandlock-core/src/seccomp/dispatch.rs) -and covers both the default-deny branch (`DEFAULT_DENY_SYSCALLS`) and the user-specified branch -(`policy.deny_syscalls`); both branches are tested -(`validate_extras_rejects_user_specified_deny`, -`extra_handler_on_default_deny_syscall_is_rejected`, -`run_with_extra_handlers_rejects_handler_on_default_deny_syscall`, +and covers both the default blocklist (`DEFAULT_BLOCKLIST_SYSCALLS`) and the +user-specified extras (`block_syscalls`); both branches are tested +(`validate_extras_rejects_user_specified_blocklist`, +`extra_handler_on_default_blocklist_syscall_is_rejected`, +`run_with_extra_handlers_rejects_handler_on_default_blocklist_syscall`, `run_with_extra_handlers_rejects_negative_syscall`, `run_with_extra_handlers_rejects_arch_unknown_syscall`). -In allowlist mode (`policy.allow_syscalls = Some(_)`) the resolved deny list is empty and the -guard is a no-op — but so is the BPF deny block, and confinement comes entirely from the -kernel-enforced allowlist, so there is no overlap to bypass. +Sandlock always installs its default syscall blocklist, so this guard is always active. ## Panics diff --git a/python/README.md b/python/README.md index 999b293..31e09b0 100644 --- a/python/README.md +++ b/python/README.md @@ -72,12 +72,12 @@ Unset fields mean "no restriction" unless noted otherwise. Enforce method + host + path rules on HTTP traffic via a transparent MITM proxy. When `http_allow` is set, all non-matching HTTP requests are -denied by default. Deny rules are checked first and take precedence. +denied by default. Block rules are checked first and take precedence. 
| Parameter | Type | Default | Description | |-----------|------|---------|-------------| | `http_allow` | `list[str]` | `[]` | Allow rules in `"METHOD host/path"` format | -| `http_deny` | `list[str]` | `[]` | Deny rules in `"METHOD host/path"` format | +| `http_deny` | `list[str]` | `[]` | Block rules in `"METHOD host/path"` format | | `http_ports` | `list[int]` | `[80]` | TCP ports to intercept (443 added when `https_ca` is set) | | `https_ca` | `str \| None` | `None` | CA certificate for HTTPS MITM | | `https_key` | `str \| None` | `None` | CA private key for HTTPS MITM | @@ -144,10 +144,9 @@ policy = Policy( | Parameter | Type | Default | Description | |-----------|------|---------|-------------| -| `deny_syscalls` | `list[str] \| None` | `None` | Syscall names to block (blocklist mode) | -| `allow_syscalls` | `list[str] \| None` | `None` | Syscall names to allow (allowlist mode) | +| `block_syscalls` | `list[str]` | `[]` | Extra syscalls to block in addition to Sandlock defaults | -Set one or neither, not both. +Sandlock always applies its default syscall blocklist. 
#### Deterministic execution diff --git a/python/src/sandlock/_profile.py b/python/src/sandlock/_profile.py index 5c75f1b..3170196 100644 --- a/python/src/sandlock/_profile.py +++ b/python/src/sandlock/_profile.py @@ -27,9 +27,8 @@ "fs_writable": list, "fs_readable": list, "fs_denied": list, - # Syscall filtering - "deny_syscalls": list, - "allow_syscalls": list, + # Extra syscall blocklist entries + "block_syscalls": list, # Network "net_allow": list, "net_bind": list, @@ -127,7 +126,6 @@ def policy_from_dict(data: dict, source: str = "") -> Policy: raise PolicyError( f"unknown fields in {source}: {', '.join(sorted(unknown))}" ) - kwargs: dict = {} for key, value in data.items(): expected = _SIMPLE_FIELDS[key] @@ -151,7 +149,6 @@ def policy_from_dict(data: dict, source: str = "") -> Policy: f"got {value!r}" ) continue - # Type checking if not isinstance(value, expected): raise PolicyError( diff --git a/python/src/sandlock/_sdk.py b/python/src/sandlock/_sdk.py index 458998d..97564bf 100644 --- a/python/src/sandlock/_sdk.py +++ b/python/src/sandlock/_sdk.py @@ -103,8 +103,7 @@ def _builder_fn(name, *extra_args): _b_clean_env = _builder_fn("sandlock_policy_builder_clean_env", ctypes.c_bool) _b_env_var = _builder_fn("sandlock_policy_builder_env_var", ctypes.c_char_p, ctypes.c_char_p) _b_time_start = _builder_fn("sandlock_policy_builder_time_start", ctypes.c_uint64) -_b_deny_syscalls = _builder_fn("sandlock_policy_builder_deny_syscalls", ctypes.c_char_p) -_b_allow_syscalls = _builder_fn("sandlock_policy_builder_allow_syscalls", ctypes.c_char_p) +_b_block_syscalls = _builder_fn("sandlock_policy_builder_block_syscalls", ctypes.c_char_p) _b_max_open_files = _builder_fn("sandlock_policy_builder_max_open_files", ctypes.c_uint32) _b_no_randomize_memory = _builder_fn("sandlock_policy_builder_no_randomize_memory", ctypes.c_bool) _b_no_huge_pages = _builder_fn("sandlock_policy_builder_no_huge_pages", ctypes.c_bool) @@ -184,11 +183,12 @@ def confine(policy: "PolicyDataclass") -> 
None: """Confine the calling process with Landlock restrictions. Applies PR_SET_NO_NEW_PRIVS and Landlock rules from the policy's - filesystem, IPC, and signal isolation fields. The confinement is - **irreversible**. + filesystem fields. IPC and signal isolation are always enabled. The + confinement is **irreversible**. - Only filesystem paths are used (IPC and signal isolation are always enabled). - Network, resource limits, and other policy fields are ignored. + Only filesystem paths are accepted. Policies containing supervisor, + seccomp, network, resource, environment, or COW settings are rejected + rather than silently ignored. This does NOT fork or exec — it confines the current process in-place. @@ -202,7 +202,7 @@ def confine(policy: "PolicyDataclass") -> None: ret = _lib.sandlock_confine(native.ptr) if ret != 0: from .exceptions import ConfinementError - raise ConfinementError("confine_current_process failed") + raise ConfinementError("confine failed") _lib.sandlock_policy_build.restype = _c_policy_p @@ -750,7 +750,7 @@ def __del__(self): "http_allow", "http_deny", "http_ports", "https_ca", "https_key", "uid", "random_seed", "time_start", "clean_env", "env", - "deny_syscalls", "allow_syscalls", "max_open_files", + "block_syscalls", "max_open_files", "no_randomize_memory", "no_huge_pages", "no_coredump", "deterministic_dirs", # Managed outside _build_from_policy: "notif_policy", @@ -868,10 +868,8 @@ def _build_from_policy(policy: PolicyDataclass): for k, v in (policy.env or {}).items(): b = _b_env_var(b, _encode(k), _encode(v)) - if policy.deny_syscalls: - b = _b_deny_syscalls(b, _encode(",".join(policy.deny_syscalls))) - if policy.allow_syscalls: - b = _b_allow_syscalls(b, _encode(",".join(policy.allow_syscalls))) + if policy.block_syscalls: + b = _b_block_syscalls(b, _encode(",".join(policy.block_syscalls or []))) if policy.max_open_files is not None: b = _b_max_open_files(b, policy.max_open_files) diff --git a/python/src/sandlock/policy.py 
b/python/src/sandlock/policy.py index f9ac8e9..8edead2 100644 --- a/python/src/sandlock/policy.py +++ b/python/src/sandlock/policy.py @@ -120,8 +120,8 @@ class DryRunResult: class Policy: """Immutable sandbox policy. - All fields are optional — unset fields mean "no restriction" - (except ``deny_syscalls`` which defaults to a safe blocklist). + Most fields are optional — unset fields mean "no restriction". Sandlock's + default syscall blocklist is always applied. """ # Filesystem (Landlock) @@ -134,13 +134,8 @@ class Policy: fs_denied: Sequence[str] = field(default_factory=list) """Paths explicitly denied (neither read nor write).""" - # Syscall filtering (seccomp) — set one or neither, not both - deny_syscalls: Sequence[str] | None = None - """Syscall names to block (blocklist mode). None = default blocklist.""" - - allow_syscalls: Sequence[str] | None = None - """Syscall names to allow (allowlist mode). Everything else is blocked. - Stricter than deny_syscalls — unknown/new syscalls are denied by default.""" + block_syscalls: Sequence[str] = field(default_factory=list) + """Additional syscall names to block on top of Sandlock's default blocklist.""" # Network — endpoint allowlist (IP × port via seccomp on-behalf path) net_allow: Sequence[str] = field(default_factory=list) @@ -193,7 +188,7 @@ class Policy: A transparent MITM proxy is spawned in the supervisor.""" http_deny: Sequence[str] = field(default_factory=list) - """HTTP deny rules. Checked before allow rules. Format: "METHOD host/path".""" + """HTTP block rules. Checked before allow rules. Format: "METHOD host/path".""" http_ports: Sequence[int] = field(default_factory=list) """TCP ports to intercept for HTTP ACL. 
Defaults to [80] (plus 443 with diff --git a/python/tests/test_policy.py b/python/tests/test_policy.py index e47fbbd..1956ba6 100644 --- a/python/tests/test_policy.py +++ b/python/tests/test_policy.py @@ -50,7 +50,7 @@ def test_defaults(self): assert p.fs_writable == [] assert p.fs_readable == [] assert p.fs_denied == [] - assert p.deny_syscalls is None + assert p.block_syscalls == [] assert p.net_bind == [] assert p.net_allow == [] assert p.max_memory is None @@ -211,5 +211,3 @@ def test_specs_preserved_as_strings(self): ":8080", ] - - diff --git a/python/tests/test_sandbox.py b/python/tests/test_sandbox.py index 10c3c89..5fd5263 100644 --- a/python/tests/test_sandbox.py +++ b/python/tests/test_sandbox.py @@ -403,8 +403,8 @@ def test_time_start(self): assert result.success assert result.stdout.strip() == b"2000" - def test_deny_syscalls(self): - p = _policy(deny_syscalls=["mount"]) + def test_block_syscalls(self): + p = _policy(block_syscalls=["mount"]) result = Sandbox(p).run(["echo", "ok"]) assert result.success assert result.stdout.strip() == b"ok"