diff --git a/bench/pause-window/PROBE-multi-branch-anomaly.md b/bench/pause-window/PROBE-multi-branch-anomaly.md index 8b811a1..93d1430 100644 --- a/bench/pause-window/PROBE-multi-branch-anomaly.md +++ b/bench/pause-window/PROBE-multi-branch-anomaly.md @@ -145,9 +145,89 @@ jump was an IO or kernel-bitmap issue. Given this probe: - Phase 3 changes from "1 s tick" to "cap per-VM diff BRANCHes to N, recycle source via Full BRANCH + restore beyond N". +## Follow-up: thread-level attribution (2026-05-21) + +Original probe used strace `-c` on the whole process, which can't +distinguish between *user-space CPU* and *off-CPU blocked waiting*. +Re-probed with two extra tools: + +- **bpftrace `profile:hz:199`** on the FC pid, sampling on-CPU stacks + for 30 s while firing a deliberately slow BRANCH inside that window. + Result: only ~18 samples landed during the ~1.6 s slow BRANCH out of + ~320 expected → **FC was off-CPU ~94 % of the BRANCH window**. +- **`/proc/$pid/task/*/stack` polled at 30 ms intervals** across all + threads. Histogram of the top kernel-frame across the slow window: + +| Count | Top kernel frame | Thread role | Meaning | +|---:|---|---|---| +| 90 | `ep_poll` | main | HTTP API socket idle — expected | +| 88 | `[kvm]` (in `kvm_vcpu_halt`) | vCPU | guest paused — expected, this *is* the pause | +| 50 | `vhost_task_fn` | vhost-net | idle — expected | +| 17 | `futex_wait_queue` | (unknown FC thread) | **blocked on a userspace futex/mutex** | +| 3 | `submit_bio_wait` | snapshot writer | block-layer IO completion wait | +| 2 | `jbd2_log_wait_commit` | snapshot writer | ext4 journal commit wait | + +Full vCPU kernel stack for the 88 samples: + +``` +kvm_vcpu_block ← kvm_vcpu_halt ← kvm_arch_vcpu_ioctl_run + ← vcpu_run ← kvm_vcpu_ioctl ← __x64_sys_ioctl +``` + +Full futex stack for the 17 samples: + +``` +futex_wait_queue ← futex_wait ← __futex_wait + ← do_futex ← __x64_sys_futex +``` + +Standard "thread asleep on a userspace futex"; the kernel can't +identify *which* futex from a static stack. + +### Revised picture + +The pause-time growth has at least three contributors, not just user-space CPU: + +1. **Userspace futex contention** (17/250 in-kernel-sleep samples). + Some FC thread waits on a mutex/futex; bpftrace can't identify the + specific futex without more invasive instrumentation. Hypothesis: + the snapshot worker contends with vhost / vCPU teardown threads + that are holding a shared state lock; lock hold-time may grow with + accumulated snapshot count. +2. **ext4 journal + block IO writeback** (5 in-kernel-sleep samples, + only ~2 % of off-CPU time but present every BRANCH). Snapshot's + memory.bin write goes through the page cache → flusher → jbd2 + journal commit. On SSD this is bounded but visible. +3. **User-space CPU on the FC snapshot worker thread** — the + un-attributable remainder (~70 % of off-CPU thread-poll points + returned an empty kernel stack, meaning the thread was in + user-mode). bpftrace's frame-pointer-based stack unwind fails on + FC's static-pie release build → can't symbolize. Needs either + DWARF unwinding (perf, when we can install it) or a debug FC + rebuild. + +### Implications for #118 Phase 2/3 — refined + +- **Phase 2 (`io_uring` writer)** still doesn't address the dominant + causes here. The block IO path is only ~2 % of the BRANCH window; + futex contention and user CPU dominate. +- **Phase 3 (1 s pre-emptive tick)** would compound the futex + contention if the snapshot worker holds the contended lock during + its background work. Need to identify the lock before designing + Phase 3. +- **New candidate work** surfaced by this probe: a one-week effort to + identify the futex (use bpftrace's `tracepoint:syscalls:sys_enter_futex` + with `args.uaddr` capture, correlate with stack of waker) → if + contention is fixable, may cut multi-BRANCH pause growth without + touching the IO path at all. + ## Files -- `bench/pause-window/probe-multi-branch-strace.sh` — the script - that produced the dataset above. Strace `-c` summary per BRANCH; - cheap; runs in ~50 s for N=10. +- `bench/pause-window/probe-multi-branch-strace.sh` — the original + per-BRANCH strace summary. Cheap; runs in ~50 s for N=10. +- `bench/pause-window/probe-bpftrace-fc.sh` — bpftrace user-stack + sampling at 199 Hz; established the 94 % off-CPU finding. +- `bench/pause-window/probe-syscall-poll.sh` — /proc/$pid/syscall + poll loop (200 Hz); did not pinpoint a single syscall (consistent + with the off-CPU finding). - This document. diff --git a/bench/pause-window/probe-bpftrace-fc.sh b/bench/pause-window/probe-bpftrace-fc.sh new file mode 100644 index 0000000..ad12b3f --- /dev/null +++ b/bench/pause-window/probe-bpftrace-fc.sh @@ -0,0 +1,94 @@ +#!/usr/bin/env bash +# probe-bpftrace-fc.sh — bpftrace-sample firecracker during a slow BRANCH +# to identify the user-space function the multi-BRANCH anomaly burns +# time in. Companion to PROBE-multi-branch-anomaly.md. +# +# Approach: +# 1. spawn a source sandbox +# 2. take N-1 BRANCHes to warm into the slow regime (BRANCH 6+ is +# ~800-1500 ms vs ~250 ms for BRANCH 1-2 on coding-agent-fork-prewarm-v1) +# 3. start bpftrace sampling user-space stacks at 199 Hz on FC pid +# 4. fire the Nth BRANCH (slow); bpftrace captures stacks throughout +# 5. detach bpftrace, print top stacks by sample count +# +# Output: /tmp/fc-stacks-.txt with the sampled stacks. +# The hottest stack tells us what user-space code Firecracker is in +# when the BRANCH N >= 6 takes ~5x longer than BRANCH 1-2. +set -euo pipefail + +FORKD_URL=${FORKD_URL:-http://127.0.0.1:8889} +FORKD_TOKEN=${FORKD_TOKEN:-$(cat "${FORKD_TOKEN_FILE:-/etc/forkd/token}" 2>/dev/null || echo "")} +TAG=${TAG:-coding-agent-fork-prewarm-v1} +WARMUP_BRANCHES=${WARMUP_BRANCHES:-6} +GAP_SECS=${GAP_SECS:-3} +OUT="/tmp/fc-bpftrace-$(date +%s).txt" + +auth=(-H "Authorization: Bearer $FORKD_TOKEN") + +echo "[probe] starting, will write to $OUT" >&2 + +# Spawn source +echo "[probe] spawn source from $TAG" >&2 +spawn=$(curl -fsS "${auth[@]}" -H "Content-Type: application/json" \ + -d "{\"snapshot_tag\":\"$TAG\",\"n\":1,\"per_child_netns\":true}" \ + "$FORKD_URL/v1/sandboxes") +sb_id=$(echo "$spawn" | jq -r '.[0].id') +fc_pid=$(echo "$spawn" | jq -r '.[0].pid') +echo "[probe] sandbox=$sb_id fc_pid=$fc_pid" >&2 +sleep 2 + +# Warmup: take WARMUP_BRANCHES-1 BRANCHes so the next one is in the slow regime +for i in $(seq 1 $((WARMUP_BRANCHES - 1))); do + sleep "$GAP_SECS" + btag="warmup-${i}-$(date +%s%N)" + resp=$(curl -fsS "${auth[@]}" -H "Content-Type: application/json" \ + -d "{\"tag\":\"$btag\",\"diff\":true}" \ + "$FORKD_URL/v1/sandboxes/$sb_id/branch") + pause_ms=$(echo "$resp" | jq -r '.pause_ms') + echo "[probe] warmup BRANCH $i: pause_ms=$pause_ms" >&2 +done + +# Start bpftrace BEFORE the final BRANCH; sample user stacks at 199 Hz. +# Filter to just our FC pid. ustack(perf,128) tries hard to symbolize. +echo "[probe] starting bpftrace on pid $fc_pid (199 Hz, user stacks)" >&2 +sudo bpftrace -e " +profile:hz:199 / pid == $fc_pid / +{ + @[ustack(perf, 32)] = count(); +} +interval:s:30 { exit(); } +" > "$OUT" 2>&1 & +bp_pid=$! +sleep 0.5 + +# Fire the slow BRANCH +echo "[probe] firing final (slow) BRANCH" >&2 +sleep "$GAP_SECS" +btag="profiled-$(date +%s%N)" +t0_ns=$(date +%s%N) +resp=$(curl -fsS "${auth[@]}" -H "Content-Type: application/json" \ + -d "{\"tag\":\"$btag\",\"diff\":true}" \ + "$FORKD_URL/v1/sandboxes/$sb_id/branch") +t1_ns=$(date +%s%N) +wall_ms=$(( (t1_ns - t0_ns) / 1000000 )) +pause_ms=$(echo "$resp" | jq -r '.pause_ms') +diff_ms=$(echo "$resp" | jq -r '.diff_ms') +echo "[probe] profiled BRANCH: wall=${wall_ms}ms pause_ms=${pause_ms} diff_ms=${diff_ms}" >&2 + +# Stop bpftrace +sudo kill -INT "$bp_pid" 2>/dev/null || true +wait "$bp_pid" 2>/dev/null || true + +# Cleanup +curl -fsS -X DELETE "${auth[@]}" "$FORKD_URL/v1/sandboxes/$sb_id" > /dev/null || true + +echo "" >&2 +echo "[probe] done. Top stacks by sample count:" >&2 +echo "[probe] output saved at $OUT" >&2 + +# Print top hottest stacks. bpftrace -e ... > out produces output in @{...} +# format. We grep + sort. +echo "" >&2 +echo "==== top stacks ====" >&2 +# Find lines with @[..] = N and sort by N desc. +awk '/^@\[/,/^\]: [0-9]+$/' "$OUT" | tail -100 >&2 || true diff --git a/bench/pause-window/probe-syscall-poll.sh b/bench/pause-window/probe-syscall-poll.sh new file mode 100644 index 0000000..ac9944e --- /dev/null +++ b/bench/pause-window/probe-syscall-poll.sh @@ -0,0 +1,104 @@ +#!/usr/bin/env bash +# probe-syscall-poll.sh — poll /proc/$pid/syscall during a slow BRANCH to +# see which syscall (if any) firecracker is blocked in when wall time +# explodes. Companion to PROBE-multi-branch-anomaly.md. +# +# The bpftrace probe (probe-bpftrace-fc.sh) showed FC was off-CPU ~95% +# of the slow BRANCH, contradicting the "user-space CPU" interpretation +# in PROBE-multi-branch-anomaly.md. This script polls /proc//syscall +# at 200 Hz to see *which* syscall FC is sleeping in. +# +# Output: /tmp/syscall-poll-.txt — one line per sample: +# ... +# or "running" if FC was on-CPU at that moment. +set -euo pipefail + +FORKD_URL=${FORKD_URL:-http://127.0.0.1:8889} +FORKD_TOKEN=${FORKD_TOKEN:-$(cat "${FORKD_TOKEN_FILE:-/etc/forkd/token}" 2>/dev/null || echo "")} +TAG=${TAG:-coding-agent-fork-prewarm-v1} +WARMUP_BRANCHES=${WARMUP_BRANCHES:-5} +GAP_SECS=${GAP_SECS:-3} +OUT="/tmp/syscall-poll-$(date +%s).txt" +auth=(-H "Authorization: Bearer $FORKD_TOKEN") + +echo "[probe] output → $OUT" >&2 + +spawn=$(curl -fsS "${auth[@]}" -H "Content-Type: application/json" \ + -d "{\"snapshot_tag\":\"$TAG\",\"n\":1,\"per_child_netns\":true}" \ + "$FORKD_URL/v1/sandboxes") +sb_id=$(echo "$spawn" | jq -r '.[0].id') +fc_pid=$(echo "$spawn" | jq -r '.[0].pid') +echo "[probe] sandbox=$sb_id fc_pid=$fc_pid" >&2 +sleep 2 + +# Warmup +for i in $(seq 1 "$WARMUP_BRANCHES"); do + sleep "$GAP_SECS" + btag="warmup-${i}-$(date +%s%N)" + resp=$(curl -fsS "${auth[@]}" -H "Content-Type: application/json" \ + -d "{\"tag\":\"$btag\",\"diff\":true}" \ + "$FORKD_URL/v1/sandboxes/$sb_id/branch") + echo "[probe] warmup BRANCH $i: pause_ms=$(echo "$resp" | jq -r .pause_ms)" >&2 +done + +# Poll syscall in background. ~200 Hz (sleep 0.005). +sleep 1 +( + end_ns=$(( $(date +%s%N) + 3000000000 )) # poll for 3s max + while [ "$(date +%s%N)" -lt "$end_ns" ]; do + ts=$(date +%s%N) + sc=$(sudo cat /proc/$fc_pid/syscall 2>/dev/null || echo "gone") + printf "%s %s\n" "$ts" "$sc" + sleep 0.005 + done +) > "$OUT" & +poll_pid=$! + +# Fire the slow BRANCH while polling runs +sleep "$GAP_SECS" +echo "[probe] firing profiled BRANCH" >&2 +t0_ns=$(date +%s%N) +resp=$(curl -fsS "${auth[@]}" -H "Content-Type: application/json" \ + -d "{\"tag\":\"profiled-$(date +%s%N)\",\"diff\":true}" \ + "$FORKD_URL/v1/sandboxes/$sb_id/branch") +t1_ns=$(date +%s%N) +echo "[probe] BRANCH pause_ms=$(echo "$resp" | jq -r .pause_ms) wall=$(( (t1_ns - t0_ns) / 1000000 ))ms" >&2 + +# Wait for poller (3s window) and clean up +wait "$poll_pid" 2>/dev/null || true +curl -fsS -X DELETE "${auth[@]}" "$FORKD_URL/v1/sandboxes/$sb_id" > /dev/null || true + +# Print histogram of syscall_nr across the samples +echo "" >&2 +echo "===== syscall_nr histogram (during the slow window) =====" >&2 +awk -v t0="$t0_ns" -v t1="$t1_ns" ' +$1 >= t0 && $1 <= t1 { + # /proc/pid/syscall format: " ... " or "running" + if ($2 == "running") { + nr = "running" + } else { + nr = $2 + } + counts[nr]++ + total++ +} +END { + if (total == 0) { + print "no samples during the slow window" + } else { + for (k in counts) { + printf " %-12s %5d %.1f%%\n", k, counts[k], counts[k]*100.0/total + } + printf " %-12s %5d total samples\n", "(all)", total + } +}' "$OUT" | sort -k2 -n -r >&2 + +echo "" >&2 +echo "Raw poll log: $OUT" >&2 +echo "Syscall numbers on x86_64 (key ones for KVM):" >&2 +echo " 16 = ioctl (KVM ioctls land here)" >&2 +echo " 35 = nanosleep" >&2 +echo " 17 = pread64" >&2 +echo " 18 = pwrite64" >&2 +echo " 74 = fsync" >&2 +echo " 202 = futex" >&2 diff --git a/crates/forkd-cli/src/hub.rs b/crates/forkd-cli/src/hub.rs index 3b220a5..04226b4 100644 --- a/crates/forkd-cli/src/hub.rs +++ b/crates/forkd-cli/src/hub.rs @@ -431,11 +431,15 @@ fn hex(bytes: &[u8]) -> String { } /// Render a list-of-local-snapshots line for `forkd images list`. Walks -/// `snapshots/` under the data dir and reports tag + total size. +/// `snapshots/` under the data dir and reports tag + total size + +/// memory.bin size + dir mtime. pub struct LocalSnapshotInfo { pub tag: String, pub total_bytes: u64, + pub memory_bytes: u64, pub has_rootfs: bool, + /// Unix seconds. Best-effort: directory mtime; 0 if unreadable. + pub created_at_unix: u64, } pub fn list_local(snapshots_root: &Path) -> Result> { @@ -453,6 +457,7 @@ pub fn list_local(snapshots_root: &Path) -> Result> { let tag = entry.file_name().to_string_lossy().into_owned(); let dir = entry.path(); let mut total: u64 = 0; + let mut memory: u64 = 0; let mut has_rootfs = false; for name in SNAPSHOT_FILES { let p = dir.join(name); @@ -460,19 +465,58 @@ pub fn list_local(snapshots_root: &Path) -> Result> { total += m.len(); if *name == "rootfs.ext4" { has_rootfs = true; + } else if *name == "memory.bin" { + memory = m.len(); } } } + let created_at_unix = std::fs::metadata(&dir) + .ok() + .and_then(|m| m.modified().ok()) + .and_then(|t| t.duration_since(std::time::UNIX_EPOCH).ok()) + .map(|d| d.as_secs()) + .unwrap_or(0); out.push(LocalSnapshotInfo { tag, total_bytes: total, + memory_bytes: memory, has_rootfs, + created_at_unix, }); } - out.sort_by(|a, b| a.tag.cmp(&b.tag)); + // Most recent first; ties broken by tag. + out.sort_by(|a, b| { + b.created_at_unix + .cmp(&a.created_at_unix) + .then_with(|| a.tag.cmp(&b.tag)) + }); Ok(out) } +/// Format a unix timestamp as a human-readable "age" relative to now. +/// Examples: "3m ago", "12h ago", "2d ago", "—" if unknown. +pub fn human_age(created_at_unix: u64) -> String { + if created_at_unix == 0 { + return "—".to_string(); + } + let now = std::time::SystemTime::now() + .duration_since(std::time::UNIX_EPOCH) + .map(|d| d.as_secs()) + .unwrap_or(0); + let dt = now.saturating_sub(created_at_unix); + if dt < 60 { + format!("{dt}s ago") + } else if dt < 3600 { + format!("{}m ago", dt / 60) + } else if dt < 86400 { + format!("{}h ago", dt / 3600) + } else if dt < 86400 * 30 { + format!("{}d ago", dt / 86400) + } else { + format!("{}mo ago", dt / 86400 / 30) + } +} + /// Pretty MiB / GiB formatter for `forkd images list`. pub fn human_bytes(n: u64) -> String { let n = n as f64; diff --git a/crates/forkd-cli/src/main.rs b/crates/forkd-cli/src/main.rs index 453c953..5548073 100644 --- a/crates/forkd-cli/src/main.rs +++ b/crates/forkd-cli/src/main.rs @@ -243,6 +243,24 @@ enum Cmd { #[arg(long)] mem_size_mib: Option, }, + /// Remove one or more snapshot tags. Tries the daemon's DELETE + /// /v1/snapshots/:tag first (clean: removes registry entry + on-disk + /// files atomically); falls back to direct disk removal if the + /// daemon isn't running. + /// + /// Examples: + /// forkd rmi pyagent + /// forkd rmi pyagent langgraph python-numpy + Rmi { + /// Snapshot tags to remove. + tags: Vec, + /// Controller daemon base URL. + #[arg(long, env = "FORKD_URL", default_value = "http://127.0.0.1:8889")] + daemon_url: String, + /// Bearer token (matches the daemon's --token-file). + #[arg(long, env = "FORKD_TOKEN")] + daemon_token: Option, + }, /// List live sandboxes (GET /v1/sandboxes). Table output. Ls { /// Controller daemon base URL. @@ -643,6 +661,11 @@ fn main() -> Result<()> { boot_wait_secs, mem_size_mib, ), + Cmd::Rmi { + tags, + daemon_url, + daemon_token, + } => rmi_cmd(&daemon_url, daemon_token, tags), Cmd::Ls { daemon_url, daemon_token, @@ -1118,6 +1141,79 @@ fn push_cmd( Ok(()) } +/// `forkd rmi ...` — delete snapshots. Daemon-first; falls back +/// to direct disk removal when the daemon is unreachable. +fn rmi_cmd(daemon_url: &str, token: Option, tags: Vec) -> Result<()> { + if tags.is_empty() { + bail!("no tags provided. Usage: forkd rmi ..."); + } + let agent = ureq::AgentBuilder::new() + .timeout(std::time::Duration::from_secs(10)) + .build(); + let snapshots_root = data_dir().join("snapshots"); + let mut errs = 0usize; + + for tag in &tags { + let result = (|| -> Result<&'static str> { + // 1. Try daemon DELETE first. + let url = format!("{}/v1/snapshots/{}", daemon_url.trim_end_matches('/'), tag); + let mut req = agent.delete(&url); + if let Some(t) = token.as_deref() { + req = req.set("Authorization", &format!("Bearer {t}")); + } + match req.call() { + Ok(_) => Ok("daemon"), + Err(ureq::Error::Status(404, _)) => { + // Daemon doesn't know it; try disk fallback. + fallback_remove(&snapshots_root, tag)?; + Ok("disk") + } + Err(ureq::Error::Status(code, r)) => { + let body = r.into_string().unwrap_or_default(); + bail!("daemon HTTP {code}: {body}"); + } + Err(_transport) => { + // Daemon down → disk fallback. + fallback_remove(&snapshots_root, tag)?; + Ok("disk (daemon unreachable)") + } + } + })(); + match result { + Ok(src) => println!(" ✓ {tag} ({src})"), + Err(e) => { + println!(" ✗ {tag} ({e})"); + errs += 1; + } + } + } + if errs > 0 { + bail!("{errs} of {} removals failed", tags.len()); + } + Ok(()) +} + +fn fallback_remove(snapshots_root: &std::path::Path, tag: &str) -> Result<()> { + // Validate tag against the same rules the daemon enforces. + if tag.is_empty() + || tag.len() > 64 + || !tag + .chars() + .all(|c| c.is_ascii_alphanumeric() || c == '-' || c == '_') + { + bail!("invalid tag (must be 1-64 chars, alnum / dash / underscore)"); + } + let dir = snapshots_root.join(tag); + if !dir.exists() { + bail!( + "snapshot {tag} not found (no daemon entry, no disk dir at {})", + dir.display() + ); + } + std::fs::remove_dir_all(&dir).with_context(|| format!("rm -rf {}", dir.display()))?; + Ok(()) +} + fn images_cmd() -> Result<()> { let root = data_dir().join("snapshots"); let infos = hub::list_local(&root)?; @@ -1128,15 +1224,34 @@ fn images_cmd() -> Result<()> { ); return Ok(()); } - println!("{:<32} {:>12} ROOTFS?", "TAG", "SIZE"); - for info in infos { + let tag_w = infos.iter().map(|i| i.tag.len()).max().unwrap_or(8).max(3); + println!( + " {:10} {:>10} {:>10} ROOTFS", + "TAG", + "SIZE", + "MEMORY", + "CREATED", + tag_w = tag_w, + ); + let mut total_bytes: u64 = 0; + for info in &infos { println!( - "{:<32} {:>12} {}", + " {:10} {:>10} {:>10} {}", info.tag, hub::human_bytes(info.total_bytes), - if info.has_rootfs { "yes" } else { "—" } + hub::human_bytes(info.memory_bytes), + hub::human_age(info.created_at_unix), + if info.has_rootfs { "yes" } else { "—" }, + tag_w = tag_w, ); + total_bytes += info.total_bytes; } + println!( + "\n {} snapshot{} · {} total", + infos.len(), + if infos.len() == 1 { "" } else { "s" }, + hub::human_bytes(total_bytes), + ); Ok(()) }