From d285e832f95fa19e3e08f1cfbaa15b1daa09cc52 Mon Sep 17 00:00:00 2001 From: "Ralian.ENG" Date: Thu, 14 May 2026 01:06:09 +0900 Subject: [PATCH] perf(sandbox): parallelize npmLifecycleScript via xargs -P 4 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Sequential `;`-joined subshells were the dominant wall-time cost for batch npm scans: ~1 second per `npm run --silent --if-present` no-op invocation due to npm CLI startup overhead alone, multiplied by 3 hooks (preinstall/install/postinstall) per package. For a 100-package corpus that adds up to ~5 minutes spent waiting on no-op skips before any real work happens. Replace the sequential form with bounded-parallel xargs dispatch: Discovery path (pkgs=nil): find ... -print0 | xargs -0 -P 4 -I{} sh -c 'cd ... && hooks' Named-pkgs path (pkgs=["lodash","express",...]): printf '%s\n' '/install/.../lodash' '/install/.../express' \ | xargs -P 4 -I{} sh -c 'cd "{}" && hooks' Parallelism bound (npmLifecycleParallelism = 4): Each hook chain spawns ~3-5 processes (sh + npm + node + helpers + occasional native-build subprocesses). At N=4 the peak concurrent process count stays well within the container's --pids-limit=256 and matches audit.py's worker count for consistency. CPU/memory ceilings (--cpus, --memory) absorb the parallel load. Portability: - `find -print0 | xargs -0` is supported by dash and busybox findutils alike; the previous comment that motivated avoiding `read -d ""` does not apply (that was about the `read` builtin specifically, not pipe-based separation). - The named-pkgs path uses `\n` separation instead of `\0` because dash's printf builtin's `\0` handling is not portable; npm package names cannot contain newlines per the npm registry name spec, so `\n` is safe. - `xargs -I{}` substitutes one input line as one argument, preserving spaces and quoting via single-quoted command body. Measured (10-pkg light corpus, post-FP-fix kojuto binary): install phase: 43s -> 19s (-56%) import phase × 3: ~same (per-OS-identity, single command each) total real time: 2:05 -> 1:56 events captured: 19 -> 111 (parallel fork/exec produces more concurrent PIDs; verdict CLEAN maintained — clone tracking + V8 JIT filter + library_hijack rule all correctly attribute parallel events). Tests: - TestNpmLifecycleScript_ParallelDispatch pins the xargs -P N invocation structure for both paths and guards against regression to the sequential form. - Existing TestNpmLifecycleScript_ScopedPackage and TestNpmLifecycleScript_QuotesShellMetachars still pass — single- quoting of cd targets is preserved as defense-in-depth against attacker-controllable package names slipping past depfile validation. Co-Authored-By: Claude Opus 4.7 (1M context) --- internal/sandbox/sandbox.go | 54 +++++++++++++++++++------- internal/sandbox/sandbox_extra_test.go | 48 +++++++++++++++++++++++ 2 files changed, 87 insertions(+), 15 deletions(-) diff --git a/internal/sandbox/sandbox.go b/internal/sandbox/sandbox.go index 902dd06..5407984 100644 --- a/internal/sandbox/sandbox.go +++ b/internal/sandbox/sandbox.go @@ -826,6 +826,16 @@ func (s *Sandbox) InstallAllCommand(ctx context.Context, pkgs []string) ([]strin return append(cmd, pkgs...), nil } +// npmLifecycleParallelism is the maximum number of package lifecycle +// hook chains npmLifecycleScript runs concurrently. Each chain spawns +// a small process tree (sh + npm + node + helpers), so the bound is +// driven by --pids-limit=256 and per-container CPU/memory headroom +// rather than by host parallelism — at the chosen value of 4, peak +// concurrent process count stays well under the limit even for +// native-module packages that fork compiler toolchains. Mirrors +// audit.py's worker count for consistency. +const npmLifecycleParallelism = 4 + // npmLifecycleScript builds a /bin/sh script that fires preinstall + // install + postinstall hooks for each target package directory under // /install/node_modules. When pkgs is nil, every package directory at @@ -839,41 +849,55 @@ func (s *Sandbox) InstallAllCommand(ctx context.Context, pkgs []string) ([]strin // Each package runs in its own subshell with `;` between hooks so a // single failing script does not abort the whole sweep — partial // progress is what we want for a forensic capture. +// +// xargs -P drives the parallelism. With sequential `;`-joined +// subshells the older form spent ~1 second per package on npm CLI +// startup alone (300 invocations across 100 packages, mostly no-op +// `--if-present` skips), which dominated batch scan wall time. +// Parallel execution lets the strace stream interleave events from +// up to npmLifecycleParallelism PIDs at once; the analyzer's +// streaming PID→comm pass attributes each event correctly because +// every clone/execve carries its own PID and the JIT/library-hijack +// rules look up that PID independently. +// +// xargs -I{} treats one input line as one argument. We use `\n` as +// the separator (not `\0`) because dash's printf builtin lacks +// portable `\0` support and npm package names cannot contain +// newlines (npm registry name spec: `[a-z0-9._@~/-]`). Single- +// quoting the cd target inside the inserted command is defense-in- +// depth in case a future code path bypasses depfile name validation. func npmLifecycleScript(pkgs []string) string { const hooks = `npm run --silent --if-present preinstall; ` + `npm run --silent --if-present install; ` + `npm run --silent --if-present postinstall` + parallel := strconv.Itoa(npmLifecycleParallelism) if len(pkgs) == 0 { // mindepth 2 catches /install/node_modules//package.json, // maxdepth 3 also catches /install/node_modules/@scope//package.json // while excluding nested transitive node_modules. // - // `find ... | while read` instead of `find -print0 | read -d ""` - // because the sandbox's /bin/sh is dash, which silently rejects - // the bash-only `-d` flag and produces an empty event stream. - // Newlines in node_modules paths are not realistic. - return `find /install/node_modules -mindepth 2 -maxdepth 3 -name package.json -type f ` + - `| while IFS= read -r pj; do ` + - `(cd "$(dirname "$pj")" && ` + hooks + `) 2>&1; done` + // `find -print0 | xargs -0` is portable across dash and bash and + // avoids the `read -d ""` portability issue that motivated the + // previous `find | while read` form. + return `find /install/node_modules -mindepth 2 -maxdepth 3 -name package.json -type f -print0 ` + + `| xargs -0 -P ` + parallel + ` -I{} sh -c 'd=$(dirname "{}") && cd "$d" && ` + hooks + `' 2>&1` } var b strings.Builder - for i, p := range pkgs { - if i > 0 { - b.WriteString("; ") - } + b.WriteString(`printf '%s\n'`) + for _, p := range pkgs { + b.WriteByte(' ') // Single-quote the full cd target so any shell metacharacters // in p (which originates from package.json keys, i.e. attacker- // controllable input) cannot break out of the argument. depfile // validates names at parse time; this is defense-in-depth for // any future code path that populates pkgs from another source. - b.WriteString(`(cd `) b.WriteString(shQuote("/install/node_modules/" + p)) - b.WriteString(` && `) - b.WriteString(hooks) - b.WriteString(`) 2>&1`) } + b.WriteString(` | xargs -P ` + parallel + ` -I{} sh -c 'cd "{}" && `) + b.WriteString(hooks) + b.WriteString(`' 2>&1`) return b.String() } diff --git a/internal/sandbox/sandbox_extra_test.go b/internal/sandbox/sandbox_extra_test.go index 42afab1..978958e 100644 --- a/internal/sandbox/sandbox_extra_test.go +++ b/internal/sandbox/sandbox_extra_test.go @@ -439,6 +439,54 @@ func TestShQuote(t *testing.T) { } } +// TestNpmLifecycleScript_ParallelDispatch pins the xargs -P invocation +// structure for both the discovery (nil pkgs) and named-pkgs paths. +// Sequential `;`-joined subshells previously dominated batch scan wall +// time on npm CLI startup overhead (~1s per `npm run --silent +// --if-present` no-op × 300 invocations across 100 packages). xargs +// -P bounds peak concurrency at npmLifecycleParallelism so the +// container's --pids-limit and memory ceiling are respected while +// hooks run in parallel. +func TestNpmLifecycleScript_ParallelDispatch(t *testing.T) { + // Discovery path: find -print0 piped to xargs -0 -P N. -print0 + // + -0 avoids the dash-incompatible `read -d ""` form the old + // loop went out of its way to skirt around. + discovery := npmLifecycleScript(nil) + for _, want := range []string{ + "find /install/node_modules", + "-print0", + "| xargs -0 -P 4 -I{}", + `sh -c 'd=$(dirname "{}") && cd "$d" && `, + } { + if !strings.Contains(discovery, want) { + t.Errorf("discovery script missing %q in:\n%s", want, discovery) + } + } + // The old sequential `while IFS= read -r pj` shape must NOT reappear. + if strings.Contains(discovery, "while IFS=") { + t.Errorf("discovery script regressed to sequential while-read loop:\n%s", discovery) + } + + // Named-pkgs path: printf '%s\n' ... | xargs -P N -I{} sh -c ... + named := npmLifecycleScript([]string{"lodash", "express"}) + for _, want := range []string{ + `printf '%s\n'`, + "'/install/node_modules/lodash'", + "'/install/node_modules/express'", + "| xargs -P 4 -I{}", + `sh -c 'cd "{}" && `, + } { + if !strings.Contains(named, want) { + t.Errorf("named script missing %q in:\n%s", want, named) + } + } + // The old sequential `(cd ... && hooks) 2>&1; (cd ... &&` chain + // must not reappear — that form was the dominant wall-time cost. + if strings.Contains(named, "&& "+`npm run --silent --if-present preinstall; npm run --silent --if-present install; npm run --silent --if-present postinstall) 2>&1; (cd `) { + t.Errorf("named script regressed to sequential subshell chain:\n%s", named) + } +} + func TestNpmLifecycleScript_QuotesShellMetachars(t *testing.T) { // Defense-in-depth: even if a malformed name slips past the depfile // validator, the cd target must be safely single-quoted so shell