From 65123fff542d9af7268b4d4ce290c330e800b97a Mon Sep 17 00:00:00 2001 From: "Ralian.ENG" Date: Wed, 13 May 2026 02:02:42 +0900 Subject: [PATCH 1/9] fix(probe): trim space-padded child PIDs before parsing MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit strace left-pads child PIDs with spaces to right-align them in a 5-column field (e.g. `[pid    12]`, `[pid     1]`). strconv.ParseUint rejects leading whitespace, so every container PID parsed as 0 — silently disabling downstream PID-aware analysis. TrimSpace before parse. Surfaced during clean-corpus FP measurement: the planned V8 JIT mprotect filter (long-standing TODO in strace_parse.go) attributes events by PID, so without this fix the filter would be a no-op for every event in a real scan. Co-Authored-By: Claude Opus 4.7 (1M context) --- internal/probe/strace_parse.go | 8 +++++++- internal/probe/strace_parse_extra_test.go | 7 +++++++ 2 files changed, 14 insertions(+), 1 deletion(-) diff --git a/internal/probe/strace_parse.go b/internal/probe/strace_parse.go index 5e8fd3e..2bc1f1f 100644 --- a/internal/probe/strace_parse.go +++ b/internal/probe/strace_parse.go @@ -599,7 +599,13 @@ func extractPID(line string) uint32 { return 0 } - p, err := strconv.ParseUint(pidStr[:endIdx], 10, 32) + // strace left-pads the PID with spaces to align columns, e.g. + // "[pid    12]" or "[pid     1]". strconv.ParseUint is strict and + // rejects leading spaces, so without TrimSpace every PID under 5 + // digits parses as 0 — which historically broke PID-aware analysis + // (V8 JIT filtering, process-tree correlation) for every container + // PID, since container PIDs are almost always small. 
+ p, err := strconv.ParseUint(strings.TrimSpace(pidStr[:endIdx]), 10, 32) if err != nil { return 0 } diff --git a/internal/probe/strace_parse_extra_test.go b/internal/probe/strace_parse_extra_test.go index f27c6c0..a66676b 100644 --- a/internal/probe/strace_parse_extra_test.go +++ b/internal/probe/strace_parse_extra_test.go @@ -220,6 +220,13 @@ func TestExtractPID(t *testing.T) { {`connect(...)`, 0}, {`[pid abc] connect(...)`, 0}, {`[pid ] connect(...)`, 0}, + // strace left-pads small PIDs with spaces to align columns. + // Without TrimSpace handling, ParseUint would reject these and + // every container PID (almost always small) would extract as + // 0, silently disabling all PID-aware analysis downstream. + {`[pid 12] mprotect(0x7f..., 4096, PROT_READ|PROT_WRITE|PROT_EXEC) = 0`, 12}, + {`[pid 1] execve("/usr/bin/sh", ...) = 0`, 1}, + {`[pid 999] openat(AT_FDCWD, "/etc/passwd", ...) = 3`, 999}, } for _, tc := range cases { From 972ead45b2a8c93857673d1a8760ad4895f96452 Mon Sep 17 00:00:00 2001 From: "Ralian.ENG" Date: Wed, 13 May 2026 02:02:57 +0900 Subject: [PATCH 2/9] fix(sandbox): redirect package-manager caches + stage install script as file MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Two sandbox-side changes that eliminate distinct false-positive classes surfaced by clean-corpus measurement, without weakening any analyzer rule: 1. Package-manager caches pinned outside HOME via a dedicated /var/cache/kojuto tmpfs: - --tmpfs=/var/cache/kojuto:nosuid,mode=1777,size=200m - --env=NPM_CONFIG_CACHE=/var/cache/kojuto/npm - --env=PIP_CACHE_DIR=/var/cache/kojuto/pip npm's _logs and _cacache, plus pip's wheel cache, previously wrote under /home/dev/.npm and /home/dev/.cache/pip. Both are correctly flagged by the persistence backstop (any /home/ write is illegitimate). 
Redirecting at the sandbox layer keeps the strict detection rule intact instead of relaxing it via a path-based allowlist, which would have opened a "smuggle payload under a benign-looking cache prefix" bypass. 2. Probe install script is staged to /var/cache/kojuto/install.sh and invoked as `sh /var/cache/kojuto/install.sh` rather than `sh -c <script>`. The cmdline shape difference matters: the analyzer's classifyExecve treats `sh -c <script>` as a positive attack signature when isShellCmdBenign rejects the content. Kojuto's own install loop (find + while + npm run ...) cannot pass the benign check, so it produced a guaranteed FP on every npm scan. Switching to `sh <file>` lets isBenignExec recognize sh from /bin/ as benign and filter the outer probe shell entirely — without any allowlist, marker, or PID-based filtering. Attackers cannot mimic the shape because npm/yarn/pnpm always spawn lifecycle hooks as `sh -c <script>`; the file-path form is reserved for kojuto's own launch path. InstallCommand and InstallAllCommand signatures change to take context.Context and return (cmd, error) — the new stageInstallScript helper writes the script via dockerWriteFile before strace attaches. Co-Authored-By: Claude Opus 4.7 (1M context) --- cmd/root.go | 28 ++++++- internal/sandbox/sandbox.go | 87 ++++++++++++++++--- internal/sandbox/sandbox_extra_test.go | 110 +++++++++++++++++-------- internal/sandbox/sandbox_mock_test.go | 6 ++ 4 files changed, 180 insertions(+), 51 deletions(-) diff --git a/cmd/root.go b/cmd/root.go index dcf9670..f47e1c7 100644 --- a/cmd/root.go +++ b/cmd/root.go @@ -316,6 +316,19 @@ func scanSinglePackage(pkg, version, ecosystem string) (*pinnedDep, error) { return &pinnedDep{Name: pkg, Version: resolvedVersion}, nil } +// benchLog emits a single stderr line with event count and heap stats when +// KOJUTO_BENCH=1. Used by bench/ harness to chart analyzer load and memory +// ceiling across install/import/analyze phases. No-op outside bench mode. 
+func benchLog(phase string, eventCount int) { + if os.Getenv("KOJUTO_BENCH") != "1" { + return + } + var ms runtime.MemStats + runtime.ReadMemStats(&ms) + fmt.Fprintf(os.Stderr, "BENCH phase=%s events=%d heap_mb=%d sys_mb=%d\n", + phase, eventCount, ms.HeapAlloc/(1024*1024), ms.Sys/(1024*1024)) +} + func runBatchScan(_ []string) error { deps, ecosystem, err := depfileParse(flagFile) if err != nil { @@ -421,7 +434,11 @@ func runBatchScreening(deps []depfile.Dep, ecosystem string) (string, error) { // Install all packages at once with strace. installPhase := startPhase("install", fmt.Sprintf("%d packages", len(pkgNames))) cp := probe.NewContainerStrace() - installOut, installErr := cp.StartAndInstall(ctx, sb.ContainerID(), sb.InstallAllCommand(pkgNames)) + installCmd, installCmdErr := sb.InstallAllCommand(ctx, pkgNames) + if installCmdErr != nil { + return "", fmt.Errorf("staging install command: %w", installCmdErr) + } + installOut, installErr := cp.StartAndInstall(ctx, sb.ContainerID(), installCmd) if installErr != nil { fmt.Fprintf(os.Stderr, "[!] Install output:\n%s\n", string(installOut)) return "", fmt.Errorf("batch install failed: %w", installErr) @@ -432,6 +449,7 @@ func runBatchScreening(deps []depfile.Dep, ecosystem string) (string, error) { for evt := range cp.Events() { events = append(events, evt) } + benchLog("install_drain", len(events)) // Import all packages under simulated OS identities (3 scripts total). 
if err := sb.WriteProbeScriptsMulti(ctx, pkgNames); err != nil { @@ -453,9 +471,11 @@ func runBatchScreening(deps []depfile.Dep, ecosystem string) (string, error) { events = append(events, evt) } importPhase.end() + benchLog("import_drain_"+osNames[i], len(events)) } verdict, filtered := analyzer.Analyze(events) + benchLog("analyze_done", len(filtered)) phaseInfo("screening", fmt.Sprintf("verdict=%s (%d events)", verdict, len(filtered))) return verdict, nil @@ -1034,7 +1054,11 @@ func runContainerStraceProbe(ctx context.Context, sb *sandbox.Sandbox, _ string) cp := probe.NewContainerStrace() installPhase := startPhase("install", "") - installOut, err := cp.StartAndInstall(ctx, sb.ContainerID(), sb.InstallCommand()) + installCmd, err := sb.InstallCommand(ctx) + if err != nil { + return nil, fmt.Errorf("staging install command: %w", err) + } + installOut, err := cp.StartAndInstall(ctx, sb.ContainerID(), installCmd) if err != nil { fmt.Fprintf(os.Stderr, "[!] Install output:\n%s\n", string(installOut)) diff --git a/internal/sandbox/sandbox.go b/internal/sandbox/sandbox.go index 85f5867..902dd06 100644 --- a/internal/sandbox/sandbox.go +++ b/internal/sandbox/sandbox.go @@ -176,6 +176,13 @@ func (s *Sandbox) containerArgs() ([]string, error) { "--tmpfs=/usr/local/bin:nosuid,exec,mode=0755,size=32m", "--tmpfs=/run:nosuid,size=1m", "--tmpfs=/home/dev:nosuid,mode=1777,size=32m", + // Dedicated cache tmpfs outside HOME. npm and pip are pinned here via + // NPM_CONFIG_CACHE / PIP_CACHE_DIR so their legitimate writes (logs, + // _cacache, wheel cache) never land under /home/ and never trip the + // persistence backstop. Keeps the "no /home/ writes" structural + // guarantee strict without path-based allowlists, which would let + // malicious packages smuggle artifacts under a benign-looking prefix. 
+ "--tmpfs=/var/cache/kojuto:nosuid,mode=1777,size=200m", "--memory="+mem, "--cpus="+cpus, "--pids-limit=256", @@ -219,7 +226,19 @@ func (s *Sandbox) containerArgs() ([]string, error) { // Audit hook: load kojuto-require.js before any user code in Node.js. // This intercepts eval/Function/vm dynamic code execution. - args = append(args, "--env=NODE_OPTIONS=--require /opt/kojuto/kojuto-require.js") + // + // NPM_CONFIG_CACHE / PIP_CACHE_DIR pin package-manager caches to the + // dedicated /var/cache/kojuto tmpfs. Without these, npm writes + // /home/dev/.npm/_logs and pip writes /home/dev/.cache/pip — both + // correctly flagged as persistence by the /home/ structural backstop + // in the analyzer. Redirecting at the sandbox layer is preferable to + // relaxing the detection rule: the rule stays strict, while + // legitimate cache I/O goes to a path the analyzer never inspects. + args = append(args, + "--env=NODE_OPTIONS=--require /opt/kojuto/kojuto-require.js", + "--env=NPM_CONFIG_CACHE=/var/cache/kojuto/npm", + "--env=PIP_CACHE_DIR=/var/cache/kojuto/pip", + ) // Tell sitecustomize.py which packages are being audited so its // frame-walking logic can flag dynamic exec originating in those @@ -707,11 +726,52 @@ func (s *Sandbox) Exec(ctx context.Context, command []string) ([]byte, error) { // InstallPackage runs the install command inside the sandbox. func (s *Sandbox) InstallPackage(ctx context.Context) ([]byte, error) { - return s.Exec(ctx, s.InstallCommand()) + cmd, err := s.InstallCommand(ctx) + if err != nil { + return nil, err + } + return s.Exec(ctx, cmd) } -// InstallCommand returns the install command for the ecosystem. -func (s *Sandbox) InstallCommand() []string { +// installScriptPath is the in-container location where the probe stages +// its install script before strace attaches. Sits on the dedicated +// /var/cache/kojuto tmpfs configured in containerArgs. +// +// The probe is invoked as `sh <installScriptPath>` rather than the +// previous `sh -c <script>`. 
The shape difference matters: the +// analyzer's classifyExecve treats `sh -c ...` as a positive attack +// signature when the contents fail isShellCmdBenign, which produces a +// guaranteed false positive on every npm scan because kojuto's own +// install loop (find + while + npm run ...) cannot pass the benign +// check. Switching to `sh <file>` lets isBenignExec recognize sh from +// /bin/ as benign and filter the outer probe shell entirely without +// any allowlist, marker, or PID-based filtering. +// +// Attackers cannot mimic this shape: the cmdline of a shell spawned +// from a package's preinstall hook is determined by npm/yarn/pnpm +// (`sh -c <script>`), not by the package itself. +const installScriptPath = "/var/cache/kojuto/install.sh" + +// stageInstallScript writes content to installScriptPath inside the +// running container. Used by InstallCommand/InstallAllCommand to stage +// the probe script before strace attaches. The write happens via a +// separate docker exec session, so the syscalls it produces are not +// observed by the install-phase strace. +func (s *Sandbox) stageInstallScript(ctx context.Context, content string) ([]string, error) { + if err := s.dockerWriteFile(ctx, installScriptPath, content); err != nil { + return nil, fmt.Errorf("stage install script: %w", err) + } + return []string{"sh", installScriptPath}, nil +} + +// InstallCommand returns the install command for the ecosystem. For +// ecosystems that need a shell-driven install (npm lifecycle hooks, +// local-mode pip glob expansion), this method writes the install +// script to the container's tmpfs first and returns a file-path-based +// command so the outer probe shell does not trigger the analyzer's +// `sh -c` attack-signature branch. See installScriptPath for the +// design rationale. +func (s *Sandbox) InstallCommand(ctx context.Context) ([]string, error) { if s.ecosystem == types.EcosystemNpm { // The host has already resolved deps into node_modules (with // --ignore-scripts). 
Inside the sandbox we fire each package's @@ -720,7 +780,7 @@ func (s *Sandbox) InstallCommand() []string { // script and rebuilds native modules — it skips preinstall and // postinstall, which is exactly where most npm supply chain // attacks place their payload (axios, crypto-js, Shai-Hulud). - return []string{"sh", "-c", npmLifecycleScript(nil)} + return s.stageInstallScript(ctx, npmLifecycleScript(nil)) } // Local mode: install directly from the file in the mount point. @@ -729,10 +789,8 @@ func (s *Sandbox) InstallCommand() []string { if s.localMode { // Find the actual file in the mount point and install it directly. // This handles both wheels (.whl) and source distributions (.tar.gz). - return []string{ - "sh", "-c", - "pip install --no-index --no-deps --no-build-isolation " + s.mountPoint + "/*", - } + return s.stageInstallScript(ctx, + "pip install --no-index --no-deps --no-build-isolation "+s.mountPoint+"/*") } // Install with dependencies — all wheels in the mount point are installed @@ -743,17 +801,20 @@ func (s *Sandbox) InstallCommand() []string { "--no-index", "--find-links=" + s.mountPoint, "--", s.pkg, - } + }, nil } // InstallAllCommand returns a pip install command that installs multiple packages at once. // All wheels must already be in the mount point directory. -func (s *Sandbox) InstallAllCommand(pkgs []string) []string { +// +// For npm, this writes the install script to the container tmpfs and +// returns a file-path-based command — see InstallCommand for rationale. +func (s *Sandbox) InstallAllCommand(ctx context.Context, pkgs []string) ([]string, error) { if s.ecosystem == types.EcosystemNpm { // Fire lifecycle scripts only for the target packages (not all // transitive deps). Transitive deps without lifecycle scripts // are covered by the import phase which loads them via require(). 
- return []string{"sh", "-c", npmLifecycleScript(pkgs)} + return s.stageInstallScript(ctx, npmLifecycleScript(pkgs)) } cmd := []string{ @@ -762,7 +823,7 @@ func (s *Sandbox) InstallAllCommand(pkgs []string) []string { "--find-links=" + s.mountPoint, "--", } - return append(cmd, pkgs...) + return append(cmd, pkgs...), nil } // npmLifecycleScript builds a /bin/sh script that fires preinstall + diff --git a/internal/sandbox/sandbox_extra_test.go b/internal/sandbox/sandbox_extra_test.go index 953f9ad..42afab1 100644 --- a/internal/sandbox/sandbox_extra_test.go +++ b/internal/sandbox/sandbox_extra_test.go @@ -53,7 +53,10 @@ func TestInstallCommand_PyPI(t *testing.T) { sb := New("/mnt/packages", "requests", false, types.EcosystemPyPI, "") sb.mountPoint = testMountPoint - cmd := sb.InstallCommand() + cmd, err := sb.InstallCommand(context.Background()) + if err != nil { + t.Fatalf("InstallCommand: %v", err) + } if len(cmd) == 0 { t.Fatal("InstallCommand returned empty") } @@ -83,25 +86,46 @@ func TestInstallCommand_PyPI(t *testing.T) { } func TestInstallCommand_Npm(t *testing.T) { - sb := New("/mnt/packages", "lodash", false, types.EcosystemNpm, "") + // npm install stages its script to /var/cache/kojuto/install.sh via + // dockerWriteFile, so execCommand needs to be intercepted. The script + // itself is exercised directly via TestNpmLifecycleScript_*. + var stagedScript string + orig := execCommand + execCommand = func(ctx context.Context, _ string, args ...string) *exec.Cmd { + // dockerWriteFile pipes the script via stdin to `sh -c "cat > path"`. + // Capture the stdin reader by wrapping the returned cmd. + c := exec.CommandContext(ctx, "true") + // args carries: exec -i --user=root sh -c "cat > path" + _ = args + return c + } + t.Cleanup(func() { execCommand = orig }) - cmd := sb.InstallCommand() - if len(cmd) != 3 || cmd[0] != "sh" || cmd[1] != "-c" { - t.Fatalf("InstallCommand = %v, want [sh -c