diff --git a/Cargo.lock b/Cargo.lock index 3d4a27f1..7ef52e71 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -3754,6 +3754,7 @@ dependencies = [ "fs2", "futures-lite", "hex", + "nix 0.30.1", "regex", "reqwest", "seahash", diff --git a/Cargo.toml b/Cargo.toml index 1613ed30..17820094 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -36,7 +36,7 @@ glob = "0.3" hex = "0.4" http = "1.1" indicatif = "0.17" -nix = { version = "0.30", features = ["signal"] } +nix = { version = "0.30", features = ["signal", "resource"] } pem = "3" pyo3 = { version = "0.28", features = ["extension-module"] } promptly = "0.3" diff --git a/crates/tower-runtime/src/local.rs b/crates/tower-runtime/src/local.rs index ec2ea64f..ed1a6e13 100644 --- a/crates/tower-runtime/src/local.rs +++ b/crates/tower-runtime/src/local.rs @@ -293,6 +293,11 @@ async fn execute_local_app( return Err(Error::Cancelled); } + // Pre-allocate disk space that we can free if the app fills the disk, + // so the runner still has room to report the failure. + let balloon_path = working_dir.join(".tower-disk-reserve"); + create_balloon_file(&balloon_path).await; + let mut child = uv.run(&working_dir, &program_path, &env_vars).await?; // Drain the logs to the output channel. @@ -312,7 +317,9 @@ async fn execute_local_app( BufReader::new(stderr), )); - let _ = sx.send(wait_for_process(ctx.clone(), &cancel_token, child).await); + let code = wait_for_process(ctx.clone(), &cancel_token, child).await; + let _ = tokio::fs::remove_file(&balloon_path).await; + let _ = sx.send(code); } // Everything was properly executed I suppose. @@ -548,7 +555,20 @@ async fn wait_for_process( if let Ok(res) = timeout { if let Ok(status) = res { - break status.code().expect("no status code"); + // On Unix, a process killed by a signal has no exit code. + // Return the negative signal number (e.g. -9 for SIGKILL) + // instead of panicking. + break status.code().unwrap_or_else(|| { + #[cfg(unix)] + { + use std::os::unix::process::ExitStatusExt; + status.signal().map(|s| -(s as i32)).unwrap_or(-1) + } + #[cfg(not(unix))] + { + -1 + } + }); } else { // something went wrong. debug!(ctx: &ctx, "failed to get status due to some kind of IO error: {}" , res.err().expect("no error somehow")); @@ -584,3 +604,40 @@ async fn drain_output( fn is_bash_package(package: &Package) -> bool { return package.manifest.invoke.ends_with(".sh"); } + +const BALLOON_SIZE: usize = 8 * 1024 * 1024; // 8 MB + +async fn create_balloon_file(path: &PathBuf) { + let data = vec![0u8; BALLOON_SIZE]; + let _ = tokio::fs::write(path, &data).await; +} + +#[cfg(test)] +mod tests { + use super::*; + + #[cfg(unix)] + #[tokio::test] + async fn test_signal_killed_process_returns_negative_code() { + use nix::sys::signal::{kill, Signal}; + use nix::unistd::Pid; + + let child = Command::new("sleep") + .arg("60") + .stdin(Stdio::null()) + .stdout(Stdio::piped()) + .stderr(Stdio::piped()) + .spawn() + .expect("failed to spawn sleep"); + + let pid = child.id().expect("no pid") as i32; + let cancel_token = CancellationToken::new(); + let ctx = tower_telemetry::Context::new("test".to_string()); + + // Kill the child with SIGKILL before waiting + kill(Pid::from_raw(pid), Signal::SIGKILL).expect("failed to send SIGKILL"); + + let code = wait_for_process(ctx, &cancel_token, child).await; + assert_eq!(code, -9, "SIGKILL should produce exit code -9"); + } +} diff --git a/crates/tower-uv/Cargo.toml b/crates/tower-uv/Cargo.toml index 5c42f83e..80581436 100644 --- a/crates/tower-uv/Cargo.toml +++ b/crates/tower-uv/Cargo.toml @@ -19,3 +19,4 @@ seahash = { workspace = true } tokio = { workspace = true } tokio-tar = { workspace = true } tower-telemetry = { workspace = true } +nix = { workspace = true } diff --git a/crates/tower-uv/src/lib.rs b/crates/tower-uv/src/lib.rs index 94e9b6f9..4688f870 100644 --- a/crates/tower-uv/src/lib.rs +++ b/crates/tower-uv/src/lib.rs @@ -11,6 +11,7 @@ use tokio::process::{Child, Command}; use tower_telemetry::debug; pub mod install; +mod resource_limits; // UV_VERSION is the version of UV to download and install when setting up a local UV deployment. pub const UV_VERSION: &str = "0.9.27"; @@ -427,6 +428,19 @@ impl Uv { #[cfg(unix)] { cmd.process_group(0); + + // Cap the child's virtual memory so an OOM in the app kills the + // child but leaves the runner alive to report the failure. + if let Some(limit) = resource_limits::detect_memory_limit() { + unsafe { + cmd.pre_exec(move || { + use nix::sys::resource::{setrlimit, Resource}; + let rlim = nix::sys::resource::rlim_t::from(limit); + setrlimit(Resource::RLIMIT_AS, rlim, rlim) + .map_err(|e| std::io::Error::new(std::io::ErrorKind::Other, e)) + }); + } + } } if self.protected_mode { diff --git a/crates/tower-uv/src/resource_limits.rs b/crates/tower-uv/src/resource_limits.rs new file mode 100644 index 00000000..cd9243e9 --- /dev/null +++ b/crates/tower-uv/src/resource_limits.rs @@ -0,0 +1,50 @@ +/// Subtracted from the detected limit before applying RLIMIT_AS to the child. +const HEADROOM_BYTES: u64 = 5 * 1024 * 1024; + +/// Detect the effective memory limit for this environment. +/// +/// Returns a byte count the subprocess should be constrained to, leaving +/// headroom for the parent process to report errors. +/// +/// Detection order (all unix-only, returns `None` on other platforms): +/// 1. `RLIMIT_AS` — if already set to a finite value, subtract headroom +/// 2. cgroup v2 `memory.max` +/// 3. cgroup v1 `memory.limit_in_bytes` +#[cfg(unix)] +pub fn detect_memory_limit() -> Option { + use nix::sys::resource::{getrlimit, Resource}; + + if let Ok((soft, _hard)) = getrlimit(Resource::RLIMIT_AS) { + let unlimited = nix::libc::RLIM_INFINITY; + if soft != unlimited && soft > HEADROOM_BYTES { + return Some(soft - HEADROOM_BYTES); + } + } + + if let Ok(contents) = std::fs::read_to_string("/sys/fs/cgroup/memory.max") { + let trimmed = contents.trim(); + if trimmed != "max" { + if let Ok(limit) = trimmed.parse::() { + if limit > HEADROOM_BYTES { + return Some(limit - HEADROOM_BYTES); + } + } + } + } + + const CGROUP_V1_SENTINEL: u64 = 0x7FFF_FFFF_FFFF_F000; + if let Ok(contents) = std::fs::read_to_string("/sys/fs/cgroup/memory/memory.limit_in_bytes") { + if let Ok(limit) = contents.trim().parse::() { + if limit < CGROUP_V1_SENTINEL && limit > HEADROOM_BYTES { + return Some(limit - HEADROOM_BYTES); + } + } + } + + None +} + +#[cfg(not(unix))] +pub fn detect_memory_limit() -> Option { + None +}