Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
54 changes: 39 additions & 15 deletions internal/sandbox/sandbox.go
Original file line number Diff line number Diff line change
Expand Up @@ -826,6 +826,16 @@ func (s *Sandbox) InstallAllCommand(ctx context.Context, pkgs []string) ([]strin
return append(cmd, pkgs...), nil
}

// npmLifecycleParallelism is the maximum number of package lifecycle
// hook chains npmLifecycleScript runs concurrently. Each chain spawns
// a small process tree (sh + npm + node + helpers), so the bound is
// driven by --pids-limit=256 and per-container CPU/memory headroom
// rather than by host parallelism — at the chosen value of 4, peak
// concurrent process count stays well under the limit even for
// native-module packages that fork compiler toolchains. Mirrors
// audit.py's worker count for consistency.
const npmLifecycleParallelism = 4

// npmLifecycleScript builds a /bin/sh script that fires preinstall +
// install + postinstall hooks for each target package directory under
// /install/node_modules. When pkgs is nil, every package directory at
Expand All @@ -839,41 +849,55 @@ func (s *Sandbox) InstallAllCommand(ctx context.Context, pkgs []string) ([]strin
// Each package runs in its own subshell with `;` between hooks so a
// single failing script does not abort the whole sweep — partial
// progress is what we want for a forensic capture.
//
// xargs -P drives the parallelism. With sequential `;`-joined
// subshells the older form spent ~1 second per package on npm CLI
// startup alone (300 invocations across 100 packages, mostly no-op
// `--if-present` skips), which dominated batch scan wall time.
// Parallel execution lets the strace stream interleave events from
// up to npmLifecycleParallelism PIDs at once; the analyzer's
// streaming PID→comm pass attributes each event correctly because
// every clone/execve carries its own PID and the JIT/library-hijack
// rules look up that PID independently.
//
// xargs -I{} treats one input line as one argument. We use `\n` as
// the separator (not `\0`) because dash's printf builtin lacks
// portable `\0` support and npm package names cannot contain
// newlines (npm registry name spec: `[a-z0-9._@~/-]`). Single-
// quoting the cd target inside the inserted command is defense-in-
// depth in case a future code path bypasses depfile name validation.
func npmLifecycleScript(pkgs []string) string {
const hooks = `npm run --silent --if-present preinstall; ` +
`npm run --silent --if-present install; ` +
`npm run --silent --if-present postinstall`
parallel := strconv.Itoa(npmLifecycleParallelism)

if len(pkgs) == 0 {
// mindepth 2 catches /install/node_modules/<pkg>/package.json,
// maxdepth 3 also catches /install/node_modules/@scope/<pkg>/package.json
// while excluding nested transitive node_modules.
//
// `find ... | while read` instead of `find -print0 | read -d ""`
// because the sandbox's /bin/sh is dash, which silently rejects
// the bash-only `-d` flag and produces an empty event stream.
// Newlines in node_modules paths are not realistic.
return `find /install/node_modules -mindepth 2 -maxdepth 3 -name package.json -type f ` +
`| while IFS= read -r pj; do ` +
`(cd "$(dirname "$pj")" && ` + hooks + `) 2>&1; done`
// `find -print0 | xargs -0` is portable across dash and bash and
// avoids the `read -d ""` portability issue that motivated the
// previous `find | while read` form.
return `find /install/node_modules -mindepth 2 -maxdepth 3 -name package.json -type f -print0 ` +
`| xargs -0 -P ` + parallel + ` -I{} sh -c 'd=$(dirname "{}") && cd "$d" && ` + hooks + `' 2>&1`
}

var b strings.Builder
for i, p := range pkgs {
if i > 0 {
b.WriteString("; ")
}
b.WriteString(`printf '%s\n'`)
for _, p := range pkgs {
b.WriteByte(' ')
// Single-quote the full cd target so any shell metacharacters
// in p (which originates from package.json keys, i.e. attacker-
// controllable input) cannot break out of the argument. depfile
// validates names at parse time; this is defense-in-depth for
// any future code path that populates pkgs from another source.
b.WriteString(`(cd `)
b.WriteString(shQuote("/install/node_modules/" + p))
b.WriteString(` && `)
b.WriteString(hooks)
b.WriteString(`) 2>&1`)
}
b.WriteString(` | xargs -P ` + parallel + ` -I{} sh -c 'cd "{}" && `)
b.WriteString(hooks)
b.WriteString(`' 2>&1`)
return b.String()
}

Expand Down
48 changes: 48 additions & 0 deletions internal/sandbox/sandbox_extra_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -439,6 +439,54 @@ func TestShQuote(t *testing.T) {
}
}

// TestNpmLifecycleScript_ParallelDispatch pins the xargs -P invocation
// structure for both the discovery (nil pkgs) and named-pkgs paths.
// Sequential `;`-joined subshells previously dominated batch scan wall
// time on npm CLI startup overhead (~1s per `npm run --silent
// --if-present` no-op × 300 invocations across 100 packages). xargs
// -P bounds peak concurrency at npmLifecycleParallelism so the
// container's --pids-limit and memory ceiling are respected while
// hooks run in parallel.
func TestNpmLifecycleScript_ParallelDispatch(t *testing.T) {
// Discovery path: find -print0 piped to xargs -0 -P N. -print0
// + -0 avoids the dash-incompatible `read -d ""` form the old
// loop went out of its way to skirt around.
discovery := npmLifecycleScript(nil)
for _, want := range []string{
"find /install/node_modules",
"-print0",
"| xargs -0 -P 4 -I{}",
`sh -c 'd=$(dirname "{}") && cd "$d" && `,
} {
if !strings.Contains(discovery, want) {
t.Errorf("discovery script missing %q in:\n%s", want, discovery)
}
}
// The old sequential `while IFS= read -r pj` shape must NOT reappear.
if strings.Contains(discovery, "while IFS=") {
t.Errorf("discovery script regressed to sequential while-read loop:\n%s", discovery)
}

// Named-pkgs path: printf '%s\n' ... | xargs -P N -I{} sh -c ...
named := npmLifecycleScript([]string{"lodash", "express"})
for _, want := range []string{
`printf '%s\n'`,
"'/install/node_modules/lodash'",
"'/install/node_modules/express'",
"| xargs -P 4 -I{}",
`sh -c 'cd "{}" && `,
} {
if !strings.Contains(named, want) {
t.Errorf("named script missing %q in:\n%s", want, named)
}
}
// The old sequential `(cd ... && hooks) 2>&1; (cd ... &&` chain
// must not reappear — that form was the dominant wall-time cost.
if strings.Contains(named, "&& "+`npm run --silent --if-present preinstall; npm run --silent --if-present install; npm run --silent --if-present postinstall) 2>&1; (cd `) {
t.Errorf("named script regressed to sequential subshell chain:\n%s", named)
}
}

func TestNpmLifecycleScript_QuotesShellMetachars(t *testing.T) {
// Defense-in-depth: even if a malformed name slips past the depfile
// validator, the cd target must be safely single-quoted so shell
Expand Down
Loading