From 71d6d0af6c9c7f0907addcc40931f507ff26601a Mon Sep 17 00:00:00 2001 From: "Ralian.ENG" Date: Fri, 8 May 2026 16:54:54 +0900 Subject: [PATCH 01/11] fix(sandbox,depfile): harden against attacker-controlled package metadata MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Four defense-in-depth boundaries around inputs derived from attacker-controlled package metadata. None is currently exploitable — they're forward-looking guards that make the trust boundaries explicit so future refactors can't regress silently. 1. npmLifecycleScript now single-quotes the cd target via a new shQuote() helper. The previous double-quoted form still expanded $(...), backticks, and \, so a package.json key like "foo$(id)" could break out of the cd argument and run on the sandbox shell. Single quotes are literal — only ' is special and is escaped via the standard '\'' idiom. 2. depfile.Parse rejects names outside the registry-allowed character sets at parse time (npm: lowercase + optional @scope/, PyPI: PEP 508 ASCII). Hard error so a malformed name surfaces clearly instead of producing a degenerate shell command or KOJUTO_SCAN_PKGS env var. PyPI's ';' is not exercised in tests — the parser already strips PEP 508 environment markers before validation runs. 3. sitecustomize.py's _w() strips \n/\r/\0 from every emission. The Go parser splits on \n, so an attacker-controlled co_filename or compile() filename arg containing \n could smuggle a fake KOJUTO: line onto the wire. _t() already escapes the snippet field; this covers the filename field (concatenated raw from _origin()) plus any future tag. 4. The _FRAMES=8 cap on _origin()'s stack walk is removed. A malicious package burying its own frame under >8 stdlib wrappers would have escaped the "+" user-origin marker, letting the parser drop the resulting dynamic_exec event as benign. The walk is now unbounded; the common-case fast path (early return on the first user frame) is unchanged, so practical cost is the same. Tests: - TestParse_RejectsInvalidNpmName, TestParse_AcceptsScopedNpmName - TestParse_RejectsInvalidPyPIName - TestShQuote, TestNpmLifecycleScript_QuotesShellMetachars - TestNpmLifecycleScript_ScopedPackage updated for single-quote form Co-Authored-By: Claude Opus 4.7 (1M context) --- internal/depfile/depfile.go | 21 ++++++++ internal/depfile/depfile_test.go | 67 +++++++++++++++++++++++++ internal/sandbox/hooks/sitecustomize.py | 32 ++++++++---- internal/sandbox/sandbox.go | 20 ++++++-- internal/sandbox/sandbox_extra_test.go | 35 ++++++++++++- 5 files changed, 159 insertions(+), 16 deletions(-) diff --git a/internal/depfile/depfile.go b/internal/depfile/depfile.go index eae4503..56cc0d8 100644 --- a/internal/depfile/depfile.go +++ b/internal/depfile/depfile.go @@ -7,11 +7,23 @@ import ( "fmt" "os" "path/filepath" + "regexp" "strings" "github.com/RalianENG/kojuto/internal/types" ) +// Package-name validators. Names flow into shell scripts (npm lifecycle +// hooks) and environment variables (KOJUTO_SCAN_PKGS) inside the sandbox, +// so we reject anything outside the registry-allowed character sets at +// parse time. The sandbox also single-quotes names at the use site, but +// validation here gives the user a clear error instead of a silently +// malformed command. +var ( + npmNameRe = regexp.MustCompile(`^(@[a-z0-9][a-z0-9._~-]*\/)?[a-z0-9][a-z0-9._~-]*$`) + pypiNameRe = regexp.MustCompile(`^[A-Za-z0-9][A-Za-z0-9._-]*$`) +) + // Dep represents a single dependency extracted from a file. type Dep struct { Name string @@ -81,6 +93,9 @@ func parseRequirementsTxt(path string) ([]Dep, error) { } if dep.Name != "" { + if !pypiNameRe.MatchString(dep.Name) { + return nil, fmt.Errorf("invalid package name %q in %s", dep.Name, path) + } deps = append(deps, dep) } } @@ -106,9 +121,15 @@ func parsePackageJSON(path string) ([]Dep, error) { var deps []Dep for name, ver := range pkg.Dependencies { + if !npmNameRe.MatchString(name) { + return nil, fmt.Errorf("invalid package name %q in %s", name, path) + } deps = append(deps, Dep{Name: name, Version: cleanNpmVersion(ver)}) } for name, ver := range pkg.DevDependencies { + if !npmNameRe.MatchString(name) { + return nil, fmt.Errorf("invalid package name %q in %s", name, path) + } deps = append(deps, Dep{Name: name, Version: cleanNpmVersion(ver)}) } diff --git a/internal/depfile/depfile_test.go b/internal/depfile/depfile_test.go index 14b761e..78a3f6f 100644 --- a/internal/depfile/depfile_test.go +++ b/internal/depfile/depfile_test.go @@ -167,6 +167,73 @@ func TestParse_InvalidJSON(t *testing.T) { } } +func TestParse_RejectsInvalidNpmName(t *testing.T) { + cases := map[string]string{ + "shell metachars": `{"dependencies": {"foo;rm -rf /": "1.0.0"}}`, + "command sub": `{"dependencies": {"foo$(id)": "1.0.0"}}`, + "backtick": "{\"dependencies\": {\"foo`id`\": \"1.0.0\"}}", + "newline": "{\"dependencies\": {\"foo\\nbar\": \"1.0.0\"}}", + "uppercase": `{"dependencies": {"FooBar": "1.0.0"}}`, + "leading dot": `{"dependencies": {".hidden": "1.0.0"}}`, + "space": `{"dependencies": {"a b": "1.0.0"}}`, + "dev shell metachar": `{"devDependencies": {"foo|cat": "1.0.0"}}`, + } + + for name, content := range cases { + t.Run(name, func(t *testing.T) { + dir := t.TempDir() + path := filepath.Join(dir, "package.json") + if err := os.WriteFile(path, []byte(content), 0o644); err != nil { + t.Fatal(err) + } + if _, _, err := Parse(path); err == nil { + t.Errorf("expected error for invalid name, got nil (content=%q)", content) + } + }) + } +} + +func TestParse_AcceptsScopedNpmName(t *testing.T) { + content := `{"dependencies": {"@scope/foo-bar": "1.0.0", "lodash": "4.0.0"}}` + dir := t.TempDir() + path := filepath.Join(dir, "package.json") + if err := os.WriteFile(path, []byte(content), 0o644); err != nil { + t.Fatal(err) + } + deps, _, err := Parse(path) + if err != nil { + t.Fatalf("Parse failed: %v", err) + } + if len(deps) != 2 { + t.Errorf("expected 2 deps, got %d", len(deps)) + } +} + +func TestParse_RejectsInvalidPyPIName(t *testing.T) { + // Note: `;` is not exercised here because the parser strips PEP 508 + // environment markers (everything after `;`) before validation runs, + // so a shell `;` is never seen as part of the name. + cases := map[string]string{ + "command sub": "foo$(id)==1.0.0\n", + "backtick": "foo`id`==1.0.0\n", + "pipe": "foo|cat==1.0.0\n", + "space": "a b==1.0.0\n", + } + + for name, content := range cases { + t.Run(name, func(t *testing.T) { + dir := t.TempDir() + path := filepath.Join(dir, "requirements.txt") + if err := os.WriteFile(path, []byte(content), 0o644); err != nil { + t.Fatal(err) + } + if _, _, err := Parse(path); err == nil { + t.Errorf("expected error for invalid name, got nil (content=%q)", content) + } + }) + } +} + func TestCleanNpmVersion(t *testing.T) { cases := []struct { input string diff --git a/internal/sandbox/hooks/sitecustomize.py b/internal/sandbox/hooks/sitecustomize.py index 1eff6d7..295a96c 100644 --- a/internal/sandbox/hooks/sitecustomize.py +++ b/internal/sandbox/hooks/sitecustomize.py @@ -8,7 +8,6 @@ import sys _MAX = 200 -_FRAMES = 8 _P = chr(75) + chr(79) + chr(74) + chr(85) + chr(84) + chr(79) + ":" # Frames whose co_filename starts with one of these are treated as @@ -49,6 +48,15 @@ def _t(s): def _w(tag, body): + # Wire-format invariant: each KOJUTO: emission must occupy exactly + # one stderr line. The Go parser splits on '\n' and treats every + # KOJUTO:-prefixed line as an independent event. Strip line and null + # bytes from `body` so an attacker-controlled co_filename or compile + # filename arg cannot smuggle a fake follow-up event onto the wire. + # _t() already escapes newlines in the snippet field, so this is a + # defense for the filename field (which is concatenated raw from + # _origin()) and a belt-and-braces guard for any future tag. + body = body.replace("\n", "").replace("\r", "").replace("\0", "") sys.stderr.write(_P + tag + ":" + body + "\n") sys.stderr.flush() @@ -57,28 +65,32 @@ def _origin(fallback): """Walk the Python call stack and return the .py file responsible for the compile/exec call. - Returns "+" when the deepest user-code frame is found — the - leading "+" tells the Go analyzer this event originates in audited - code and must NOT be filtered by the path-based benign list. When - no user frame appears, returns the deepest non-internal frame so - the existing benign filter (stdlib/site-packages substring match) - still works for compat-library noise. + Returns "+" when a user-code frame is found anywhere on the + stack — the leading "+" tells the Go analyzer this event originates + in audited code and must NOT be filtered by the path-based benign + list. When no user frame appears, returns the first non-internal + frame so the existing benign filter (stdlib/site-packages substring + match) still works for compat-library noise. + + The walk is unbounded by design: a depth cap would let a malicious + package bury its own frame under stdlib wrappers and evade the + user-origin marker. Stack walking is O(depth) on a linked list and + early-returns on the first user frame, so the practical cost is + negligible compared with the surrounding compile/exec work. """ try: f = sys._getframe(2) # skip _origin + _h except ValueError: return fallback me = __file__ - seen = 0 deepest = "" - while f is not None and seen < _FRAMES: + while f is not None: fn = f.f_code.co_filename if fn != me: if _is_user(fn): return "+" + fn if not deepest: deepest = fn - seen += 1 f = f.f_back if deepest: return deepest diff --git a/internal/sandbox/sandbox.go b/internal/sandbox/sandbox.go index 50a3acf..49bd6e4 100644 --- a/internal/sandbox/sandbox.go +++ b/internal/sandbox/sandbox.go @@ -696,17 +696,27 @@ func npmLifecycleScript(pkgs []string) string { if i > 0 { b.WriteString("; ") } - // Quote the path so scoped packages like @scope/pkg work - // without word-splitting. - b.WriteString(`(cd "/install/node_modules/`) - b.WriteString(p) - b.WriteString(`" && `) + // Single-quote the full cd target so any shell metacharacters + // in p (which originates from package.json keys, i.e. attacker- + // controllable input) cannot break out of the argument. depfile + // validates names at parse time; this is defense-in-depth for + // any future code path that populates pkgs from another source. + b.WriteString(`(cd `) + b.WriteString(shQuote("/install/node_modules/" + p)) + b.WriteString(` && `) b.WriteString(hooks) b.WriteString(`) 2>&1`) } return b.String() } +// shQuote wraps s in POSIX single quotes, escaping any embedded single +// quote with the standard '\'' close-escape-reopen idiom. Safe for use +// in /bin/sh, dash, and bash. +func shQuote(s string) string { + return "'" + strings.ReplaceAll(s, "'", `'\''`) + "'" +} + // WriteProbeScriptsMulti writes one combined import probe script per OS identity. // This reduces Python/Node process launches from N*3 to just 3. func (s *Sandbox) WriteProbeScriptsMulti(ctx context.Context, pkgs []string) { diff --git a/internal/sandbox/sandbox_extra_test.go b/internal/sandbox/sandbox_extra_test.go index 0b9546d..3b5a143 100644 --- a/internal/sandbox/sandbox_extra_test.go +++ b/internal/sandbox/sandbox_extra_test.go @@ -347,12 +347,45 @@ func TestNpmLifecycleScript_ScopedPackage(t *testing.T) { // shell command. Regression guard for a bug that would have hidden // scoped-dep lifecycle behavior from strace. got := npmLifecycleScript([]string{"@scope/lib"}) - want := `"/install/node_modules/@scope/lib"` + want := `'/install/node_modules/@scope/lib'` if !strings.Contains(got, want) { t.Errorf("scoped path missing %q in:\n%s", want, got) } } +func TestShQuote(t *testing.T) { + cases := []struct { + in, want string + }{ + {"foo", "'foo'"}, + {"a b", "'a b'"}, + {"@scope/pkg", "'@scope/pkg'"}, + {"$(id)", "'$(id)'"}, + {"`id`", "'`id`'"}, + {"a'b", `'a'\''b'`}, + {"", "''"}, + } + for _, tc := range cases { + if got := shQuote(tc.in); got != tc.want { + t.Errorf("shQuote(%q) = %q, want %q", tc.in, got, tc.want) + } + } +} + +func TestNpmLifecycleScript_QuotesShellMetachars(t *testing.T) { + // Defense-in-depth: even if a malformed name slips past the depfile + // validator, the cd target must be safely single-quoted so shell + // metacharacters cannot break out and execute on the sandbox shell. + got := npmLifecycleScript([]string{`foo$(id)`}) + if !strings.Contains(got, `'/install/node_modules/foo$(id)'`) { + t.Errorf("metachar payload not single-quoted in:\n%s", got) + } + // The bare token must NOT appear unquoted between cd and &&. + if strings.Contains(got, `cd /install/node_modules/foo$(id) `) { + t.Errorf("metachar payload unquoted in:\n%s", got) + } +} + // --------------------------------------------------------------------------- // WriteProbeScriptsMulti // --------------------------------------------------------------------------- From 7269ecf0376ce97315b513de530862bde803cc1c Mon Sep 17 00:00:00 2001 From: "Ralian.ENG" Date: Fri, 8 May 2026 16:58:42 +0900 Subject: [PATCH 02/11] feat(action): expose config, quiet, no-color as Action inputs MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The GitHub Action wrapper was missing wiring for three CLI flags that already work on the binary itself: `--config`, `--quiet`, and `--no-color`. Workflows that wanted these had to drop down to a custom shell step. - `config` — passes through to `--config` so an alternative `kojuto.yml` path can be used (CI matrices with per-language configs, or a vendored config kept under a non-default path). - `quiet` — suppresses phase progress lines so CI logs only contain the final verdict block. Pairs well with PR-comment workflows that post the report as a comment. - `no-color` — disables ANSI escapes; mirrors the `NO_COLOR` environment variable spec, useful for log aggregators that don't strip escapes. `runtime` description is also tightened to drop the now-obsolete "empty string also resolves to auto" parenthetical (the auto path is the documented default since #14) and the YAML default is set to `auto` explicitly so the rendered docs match the CLI. Co-Authored-By: Claude Opus 4.7 (1M context) --- CHANGELOG.md | 1 + action.yml | 28 ++++++++++++++++++++++++++-- 2 files changed, 27 insertions(+), 2 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 7ec3ad3..1c8be19 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -12,6 +12,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 - **System binary write detection** — `openat` with write flags to trusted system binaries (`python3`, `node`, `pip`, `sh`, etc.) in `/usr/local/bin/` or `/usr/bin/` classified as `binary_hijacking`, preventing benignPaths bypass via tmpfs overwrite - **gVisor auto-detection** — `--runtime` default changed from empty (runc) to `auto`: probes `docker info` for runsc availability and uses gVisor if registered, falls back to runc otherwise - **npm test package** — `testdata/probe-npm-0.1.0/` with lifecycle hook payloads (preinstall/postinstall) and require-time import phase covering 31 TTPs across 9 attack categories including audit hook validation +- **GitHub Action inputs** — `config`, `quiet`, and `no-color` exposed as Action inputs to match CLI flags (`--config`, `--quiet`, `--no-color`) ### Changed - Go version requirement lowered from 1.25.0 to 1.24.0 (stable release); `golang.org/x/sys` downgraded from v0.43.0 to v0.41.0 diff --git a/action.yml b/action.yml index 51025ea..a88e97f 100644 --- a/action.yml +++ b/action.yml @@ -29,10 +29,14 @@ inputs: description: 'Scan a local package file (.whl, .tgz) or directory' required: false default: '' - runtime: - description: 'Container runtime: auto (default, use gVisor if available), runsc (force gVisor), runc (force Docker default). Empty string also resolves to auto on the CLI.' + config: + description: 'Path to kojuto.yml config file (default: kojuto.yml in current directory)' required: false default: '' + runtime: + description: 'Container runtime: auto (use gVisor if available), runsc (force gVisor), runc (force Docker default)' + required: false + default: 'auto' probe-method: description: 'Probe method: auto, ebpf, strace, strace-container' required: false @@ -45,6 +49,14 @@ inputs: description: 'Ignore sensitive_paths.exclude from config (recommended for CI)' required: false default: 'true' + quiet: + description: 'Suppress phase progress output; emit only the final verdict block' + required: false + default: 'false' + no-color: + description: 'Disable colored output (NO_COLOR env is also respected)' + required: false + default: 'false' outputs: verdict: @@ -77,10 +89,13 @@ runs: KOJUTO_FILE: ${{ inputs.file }} KOJUTO_PIN: ${{ inputs.pin }} KOJUTO_LOCAL: ${{ inputs.local }} + KOJUTO_CONFIG: ${{ inputs.config }} KOJUTO_RUNTIME: ${{ inputs.runtime }} KOJUTO_PROBE_METHOD: ${{ inputs.probe-method }} KOJUTO_TIMEOUT: ${{ inputs.timeout }} KOJUTO_STRICT: ${{ inputs.strict }} + KOJUTO_QUIET: ${{ inputs.quiet }} + KOJUTO_NO_COLOR: ${{ inputs['no-color'] }} run: | ARGS=() @@ -111,9 +126,18 @@ runs: if [ -n "$KOJUTO_RUNTIME" ]; then ARGS+=("--runtime" "$KOJUTO_RUNTIME") fi + if [ -n "$KOJUTO_CONFIG" ]; then + ARGS+=("--config" "$KOJUTO_CONFIG") + fi if [ "$KOJUTO_STRICT" = "true" ]; then ARGS+=("--strict") fi + if [ "$KOJUTO_QUIET" = "true" ]; then + ARGS+=("--quiet") + fi + if [ "$KOJUTO_NO_COLOR" = "true" ]; then + ARGS+=("--no-color") + fi sudo "${{ github.action_path }}/kojuto" scan "${ARGS[@]}" \ -o /tmp/kojuto-report.json From 02643788a0151b5a42c1587d0a5d7b9878074c9a Mon Sep 17 00:00:00 2001 From: "Ralian.ENG" Date: Fri, 8 May 2026 16:59:10 +0900 Subject: [PATCH 03/11] docs: backfill changelog for severity tiering, caller-aware hook + path count The two unreleased fixes that shipped in #18 (a5cab0e) were missing from CHANGELOG.md, and the README's credential-access table still quoted the older "~50 sensitive paths" figure. This commit catches the docs up to the merged code; no behavior change. - CHANGELOG: add Severity-tiered verdict and Caller-aware audit hook entries under [Unreleased] / Added. - README: bump the sensitive-path count from ~50 to ~60 to reflect the current entry list. Co-Authored-By: Claude Opus 4.7 (1M context) --- CHANGELOG.md | 2 ++ README.md | 2 +- 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 1c8be19..ee840b3 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -9,6 +9,8 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ### Added - **Audit hooks for dynamic code execution detection** — Python PEP 578 hook (`sitecustomize.py`) intercepts `compile`/`exec`/`import`/`ctypes.dlopen`; Node.js `--require` hook (`kojuto-require.js`) intercepts `eval`/`Function`/`vm.runInNewContext`/`vm.runInThisContext`/`vm.Script`. New `dynamic_code_execution` category and `EventDynamicExec` event type +- **Severity-tiered verdict** — `types.CategorySeverity` classifies each detection category as HIGH (one event raises the verdict to SUSPICIOUS), MEDIUM (two-or-more raise it), or LOW (never raises the verdict alone). `dynamic_code_execution` is LOW, `dns_tunneling` and `evasion` are MEDIUM, all other categories stay HIGH. Unmapped categories fail closed (treated as HIGH). LOW events still appear in `report.events` for forensics — verdict reflects severity, not raw event count. Stops legitimate Python compat libraries (`six`, `attrs`, `future`) from flipping to SUSPICIOUS on benign internal `compile`/`exec` calls +- **Caller-aware audit hook** — `sitecustomize.py` now walks the Python call stack and reports the actual `.py` file invoking `compile`/`exec`, not the user-controllable `filename` argument (which `six` deliberately sets to ``). When the deepest frame lives inside the scanned package's `site-packages` directory the hook prefixes the wire payload with `+` so the analyzer bypasses path-based benign filtering. Sandbox passes the scan list via `KOJUTO_SCAN_PKGS` so the hook knows which paths count as "user code" - **System binary write detection** — `openat` with write flags to trusted system binaries (`python3`, `node`, `pip`, `sh`, etc.) in `/usr/local/bin/` or `/usr/bin/` classified as `binary_hijacking`, preventing benignPaths bypass via tmpfs overwrite - **gVisor auto-detection** — `--runtime` default changed from empty (runc) to `auto`: probes `docker info` for runsc availability and uses gVisor if registered, falls back to runc otherwise - **npm test package** — `testdata/probe-npm-0.1.0/` with lifecycle hook payloads (preinstall/postinstall) and require-time import phase covering 31 TTPs across 9 attack categories including audit hook validation diff --git a/README.md b/README.md index 62f9c7b..6a2140f 100644 --- a/README.md +++ b/README.md @@ -253,7 +253,7 @@ Of the 300 malicious samples, 238 failed to install (dependencies already remove |----------|----------|-----------------| | C2 communication (`c2_communication`) | `aiogram-types-v3` → `147.45.124.42:80` | `connect`/`sendto` to non-loopback IPs | | Data exfiltration (`data_exfiltration`) | DNS resolution of Discord/Telegram/Pastebin | `sendto` port 53 resolving known exfil services | -| Credential access (`credential_access`) | `axios-attack-demo` → `.ssh/id_rsa`, `.aws/credentials`, `.solana/id.json` | `openat` on ~50 sensitive paths (SSH, cloud, crypto wallets, browser data) | +| Credential access (`credential_access`) | `axios-attack-demo` → `.ssh/id_rsa`, `.aws/credentials`, `.solana/id.json` | `openat` on ~60 sensitive paths (SSH, cloud, crypto wallets, browser data) | | Code execution (`code_execution`) | `advpruebitaa` → `type nul > prueba11.txt`, `/tmp/ld.py` | `execve` with inline `-c`/`-e` flags or from `/tmp`, `/dev/shm` | | Memory execution (`memory_execution`) | `ctypes.mmap(RWX)` shellcode injection | `mmap`/`mprotect` with simultaneous PROT_WRITE+PROT_EXEC | | Binary hijacking (`binary_hijacking`) | `rename /tmp/evil /usr/local/bin/python3` | `rename` targeting trusted system binaries | From 131bb3f9da42e331b06cadba3cd2de7236a456c1 Mon Sep 17 00:00:00 2001 From: "Ralian.ENG" Date: Fri, 8 May 2026 17:12:48 +0900 Subject: [PATCH 04/11] fix(sandbox,cmd): close root-in-container heredoc injection on --local scans MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The dockerExecRoot heredoc pattern used to write probe scripts and honeypot credentials had a stray-terminator weakness: sh -c "cat > << 'KOJUTO_EOF'\nKOJUTO_EOF" If contained a line consisting only of `KOJUTO_EOF`, the shell terminated the heredoc early and parsed the rest of the body as commands — running as root inside the sandbox via `docker exec --user=root`. includes the package name interpolated into a JS/Python `'%s'` literal in WriteProbeScripts*; a name carrying "\n" + "KOJUTO_EOF" + "\n" + "" produced exactly the lines needed to terminate and execute . Reachability: - `kojuto scan ` gated by downloaderValidate (regex blocks \n) - `kojuto scan -f ` gated by depfile validators (71d6d0a) - `kojuto scan --local ` was NOT gated — pkg derived from filepath.Base(localPath) carried any byte the filesystem allowed, so a malware sample distributed with a crafted filename would have achieved root in the sandbox during WriteProbeScripts. The most damaging payload would `rm /usr/local/lib/python3.12/site-packages/ sitecustomize.py`, neutralising kojuto's own audit hook before the import phase actually ran the malicious code — defeating detection with no surface visibility. Two-layer fix: 1. dockerWriteFile() helper replaces all 9 heredoc sites (5 honeypot credentials + 4 probe scripts). Content is delivered via stdin to `cat`, so the body never appears on the shell command line. Path is single-quoted as defense-in-depth though all current callers pass constants. 2. runLocalScan validates the derived pkg with downloaderValidate before any sandbox interaction. Newlines and shell metacharacters surface as a clear error rather than silently producing a degenerate container command. Tests: - TestDockerWriteFile_StructureNoHeredoc pins that the new helper emits `docker exec -i ... sh -c "cat > ''"` with no `<<` or `KOJUTO_EOF` token anywhere in the args. - TestDockerWriteFile_QuotesPath confirms shQuote is applied to the path slot for future callers. - TestRunLocalScan_RejectsUnsafeFilename creates a `.tgz` whose basename contains "\nKOJUTO_EOF\n" and asserts the scan errors out before reaching Docker. Co-Authored-By: Claude Opus 4.7 (1M context) --- cmd/root.go | 11 +++++ cmd/root_mock_test.go | 34 ++++++++++++++ internal/sandbox/sandbox.go | 61 ++++++++++++++----------- internal/sandbox/sandbox_extra_test.go | 62 ++++++++++++++++++++++++++ 4 files changed, 143 insertions(+), 25 deletions(-) diff --git a/cmd/root.go b/cmd/root.go index aa89a6c..270fadc 100644 --- a/cmd/root.go +++ b/cmd/root.go @@ -646,6 +646,17 @@ func runLocalScan(_ []string) error { pkg = detectPackageName(filepath.Base(localPath)) } + // Validate the derived package name with the same regex as registry + // scans. Local mode is the only path where pkg is built from a user- + // supplied filesystem path, so the name can carry any byte the OS + // allows — including newlines and shell metacharacters that downstream + // sandbox helpers used to interpolate into shell heredocs and Python/ + // JS string literals. The dockerWriteFile refactor closed the heredoc + // path, but defending at the boundary keeps every future use site safe. + if err := downloaderValidate(pkg, ""); err != nil { + return fmt.Errorf("local package name derived from %q is unsafe: %w", localPath, err) + } + // Auto-detect ecosystem from file extension, but only if -e was not explicitly set. // .tgz is always npm; .tar.gz is ambiguous (could be PyPI sdist), so only // override to npm for .tgz, not .tar.gz. diff --git a/cmd/root_mock_test.go b/cmd/root_mock_test.go index 5819caf..37badf0 100644 --- a/cmd/root_mock_test.go +++ b/cmd/root_mock_test.go @@ -443,6 +443,40 @@ func TestRunLocalScan_NpmAutoDetect(t *testing.T) { } } +// TestRunLocalScan_RejectsUnsafeFilename pins the security invariant that +// a filename whose derived package name contains shell metacharacters or +// newlines is rejected at the boundary, before any sandbox interaction. +// Without this gate, a malicious .tgz/.whl filename containing a newline +// followed by "KOJUTO_EOF" used to terminate dockerWriteFile's predecessor +// heredoc and execute arbitrary commands as root inside the container. +// The dockerWriteFile refactor closed the heredoc; this test guards the +// belt-and-braces input check. +func TestRunLocalScan_RejectsUnsafeFilename(t *testing.T) { + saveAndRestoreFlags(t) + + dir := t.TempDir() + // "\n" + "KOJUTO_EOF" payload baked into a .tgz filename. macOS APFS + // and Linux ext4/xfs all permit '\n' in filenames; if the underlying + // FS rejects it, skip rather than fail. + badName := "evil\nKOJUTO_EOF\nrm-1.0.0.tgz" + badFile := filepath.Join(dir, badName) + if err := os.WriteFile(badFile, []byte("x"), 0o644); err != nil { + t.Skipf("filesystem refuses newlines in filenames: %v", err) + } + + flagLocal = badFile + flagEcosystem = types.EcosystemNpm + flagTimeout = 5 * time.Second + + err := runLocalScan(nil) + if err == nil { + t.Fatal("expected validation error for unsafe local filename") + } + if !strings.Contains(err.Error(), "unsafe") { + t.Errorf("expected 'unsafe' in error, got: %v", err) + } +} + // --- prepareLocalNpm tests --- func TestPrepareLocalNpm_NoTgz(t *testing.T) { diff --git a/internal/sandbox/sandbox.go b/internal/sandbox/sandbox.go index 49bd6e4..f12fd3a 100644 --- a/internal/sandbox/sandbox.go +++ b/internal/sandbox/sandbox.go @@ -519,44 +519,39 @@ func (s *Sandbox) plantHoneypotFiles(ctx context.Context) { // SSH key pair. s.dockerExecRoot(ctx, "mkdir", "-p", home+"/.ssh") - s.dockerExecRoot(ctx, "sh", "-c", "cat > "+home+"/.ssh/id_rsa << 'KOJUTO_EOF'\n"+ + s.dockerWriteFile(ctx, home+"/.ssh/id_rsa", "-----BEGIN OPENSSH PRIVATE KEY-----\n"+ - "b3BlbnNzaC1rZXktdjEAAAAAFAAAAAAAAAEAAAAzAAAAC3NzaC1lZDI1NTE5\n"+ - "AAAAI"+sshKeyBody+"\n"+ - "-----END OPENSSH PRIVATE KEY-----\n"+ - "KOJUTO_EOF") + "b3BlbnNzaC1rZXktdjEAAAAAFAAAAAAAAAEAAAAzAAAAC3NzaC1lZDI1NTE5\n"+ + "AAAAI"+sshKeyBody+"\n"+ + "-----END OPENSSH PRIVATE KEY-----\n") s.dockerExecRoot(ctx, "chmod", "600", home+"/.ssh/id_rsa") // AWS credentials. s.dockerExecRoot(ctx, "mkdir", "-p", home+"/.aws") - s.dockerExecRoot(ctx, "sh", "-c", "cat > "+home+"/.aws/credentials << 'KOJUTO_EOF'\n"+ + s.dockerWriteFile(ctx, home+"/.aws/credentials", "[default]\n"+ - "aws_access_key_id = "+awsKey+"\n"+ - "aws_secret_access_key = "+awsSecret+"\n"+ - "KOJUTO_EOF") + "aws_access_key_id = "+awsKey+"\n"+ + "aws_secret_access_key = "+awsSecret+"\n") // Git credentials. - s.dockerExecRoot(ctx, "sh", "-c", "cat > "+home+"/.git-credentials << 'KOJUTO_EOF'\n"+ - "https://dev:"+ghToken+"@github.com\n"+ - "KOJUTO_EOF") + s.dockerWriteFile(ctx, home+"/.git-credentials", + "https://dev:"+ghToken+"@github.com\n") s.dockerExecRoot(ctx, "chmod", "600", home+"/.git-credentials") // Netrc. - s.dockerExecRoot(ctx, "sh", "-c", "cat > "+home+"/.netrc << 'KOJUTO_EOF'\n"+ + s.dockerWriteFile(ctx, home+"/.netrc", "machine github.com\n"+ - "login dev\n"+ - "password "+ghToken+"\n"+ - "KOJUTO_EOF") + "login dev\n"+ + "password "+ghToken+"\n") s.dockerExecRoot(ctx, "chmod", "600", home+"/.netrc") // GitHub CLI config. s.dockerExecRoot(ctx, "mkdir", "-p", home+"/.config/gh") - s.dockerExecRoot(ctx, "sh", "-c", "cat > "+home+"/.config/gh/hosts.yml << 'KOJUTO_EOF'\n"+ + s.dockerWriteFile(ctx, home+"/.config/gh/hosts.yml", "github.com:\n"+ - " oauth_token: "+ghToken+"\n"+ - " user: dev\n"+ - " git_protocol: https\n"+ - "KOJUTO_EOF") + " oauth_token: "+ghToken+"\n"+ + " user: dev\n"+ + " git_protocol: https\n") // Fix ownership so the container user (dev) owns the files. s.dockerExecRoot(ctx, "chown", "-R", "1000:1000", home+"/.ssh", home+"/.aws", @@ -586,6 +581,22 @@ func (s *Sandbox) dockerExecRoot(ctx context.Context, args ...string) { _ = cmd.Run() } +// dockerWriteFile writes content into the container at path as root. +// The body is delivered via stdin to `cat`, which keeps it out of the +// shell command line entirely — unlike a `<< 'EOF'` heredoc, an attacker +// who controls part of the body cannot smuggle a stray terminator line +// to break out of the heredoc and execute arbitrary commands as root in +// the container. path is single-quoted as defense-in-depth even though +// every current caller passes a constant. +func (s *Sandbox) dockerWriteFile(ctx context.Context, path, content string) { + cmdArgs := []string{"exec", "-i", "--user=root", s.containerID, "sh", "-c", "cat > " + shQuote(path)} + cmd := execCommand(ctx, "docker", cmdArgs...) + cmd.Stdin = strings.NewReader(content) + cmd.Stdout = io.Discard + cmd.Stderr = io.Discard + _ = cmd.Run() +} + // Exec runs a command inside the sandbox container and returns the combined output. func (s *Sandbox) Exec(ctx context.Context, command []string) ([]byte, error) { args := append([]string{"exec", s.containerID}, command...) @@ -733,7 +744,7 @@ func (s *Sandbox) WriteProbeScriptsMulti(ctx context.Context, pkgs []string) { p, requires.String(), ) filename := "/tmp/_kojuto_probe_all_" + p + ".js" - s.dockerExecRoot(ctx, "sh", "-c", "cat > "+filename+" << 'KOJUTO_EOF'\n"+script+"KOJUTO_EOF") + s.dockerWriteFile(ctx, filename, script) } return } @@ -775,7 +786,7 @@ func (s *Sandbox) WriteProbeScriptsMulti(ctx context.Context, pkgs []string) { p.sep, p.pathsep, p.linesep, imports.String(), ) filename := "/tmp/_kojuto_probe_all_" + p.sysplatform + ".py" - s.dockerExecRoot(ctx, "sh", "-c", "cat > "+filename+" << 'KOJUTO_EOF'\n"+script+"KOJUTO_EOF") + s.dockerWriteFile(ctx, filename, script) } } @@ -860,7 +871,7 @@ func (s *Sandbox) WriteProbeScripts(ctx context.Context) { p.sep, p.pathsep, p.linesep, importName, ) filename := "/tmp/_kojuto_probe_" + p.sysplatform + ".py" - s.dockerExecRoot(ctx, "sh", "-c", "cat > "+filename+" << 'KOJUTO_EOF'\n"+script+"KOJUTO_EOF") + s.dockerWriteFile(ctx, filename, script) } jsPlatforms := []string{"linux", "win32", "darwin"} @@ -872,7 +883,7 @@ func (s *Sandbox) WriteProbeScripts(ctx context.Context) { p, s.pkg, ) filename := "/tmp/_kojuto_probe_" + p + ".js" - s.dockerExecRoot(ctx, "sh", "-c", "cat > "+filename+" << 'KOJUTO_EOF'\n"+script+"KOJUTO_EOF") + s.dockerWriteFile(ctx, filename, script) } } diff --git a/internal/sandbox/sandbox_extra_test.go b/internal/sandbox/sandbox_extra_test.go index 3b5a143..9735504 100644 --- a/internal/sandbox/sandbox_extra_test.go +++ b/internal/sandbox/sandbox_extra_test.go @@ -2,6 +2,8 @@ package sandbox import ( "context" + "os/exec" + "reflect" "strings" "testing" @@ -539,3 +541,63 @@ func TestFakeTokens_NotHexOnly(t *testing.T) { t.Error("10 consecutive fakeAWSKeyID tokens were all hex-only") } } + +// TestDockerWriteFile_StructureNoHeredoc pins the security invariant that +// dockerWriteFile delivers content via stdin (`docker exec -i`) instead of +// the previous `cat << 'KOJUTO_EOF'` heredoc. The heredoc form was a root- +// in-container injection sink: an attacker-controlled package name with +// an embedded "\nKOJUTO_EOF\n" sequence used to terminate the +// heredoc early and run as root. Stdin delivery removes the body +// from the shell command line entirely. +func TestDockerWriteFile_StructureNoHeredoc(t *testing.T) { + var captured []string + orig := execCommand + execCommand = func(ctx context.Context, name string, args ...string) *exec.Cmd { + captured = append([]string{name}, args...) + return exec.CommandContext(ctx, "true") + } + t.Cleanup(func() { execCommand = orig }) + + sb := &Sandbox{containerID: "test-container"} + sb.dockerWriteFile(context.Background(), "/tmp/probe.js", + "any content with KOJUTO_EOF baked in\nstill not parsed by shell") + + want := []string{ + "docker", "exec", "-i", "--user=root", "test-container", + "sh", "-c", "cat > '/tmp/probe.js'", + } + if !reflect.DeepEqual(captured, want) { + t.Fatalf("docker args mismatch:\n got %q\nwant %q", captured, want) + } + + for _, a := range captured { + if strings.Contains(a, "<<") { + t.Errorf("heredoc syntax leaked into arg %q", a) + } + if strings.Contains(a, "KOJUTO_EOF") { + t.Errorf("KOJUTO_EOF terminator leaked into arg %q", a) + } + } +} + +// TestDockerWriteFile_QuotesPath confirms the path is single-quoted so a +// future caller passing a path with shell metacharacters cannot break the +// command. Every current caller passes a constant path; this test pins +// the defense-in-depth contract for future code. +func TestDockerWriteFile_QuotesPath(t *testing.T) { + var captured []string + orig := execCommand + execCommand = func(ctx context.Context, name string, args ...string) *exec.Cmd { + captured = append([]string{name}, args...) + return exec.CommandContext(ctx, "true") + } + t.Cleanup(func() { execCommand = orig }) + + sb := &Sandbox{containerID: "test-container"} + sb.dockerWriteFile(context.Background(), "/tmp/$(rm -rf /)/x", "x") + + last := captured[len(captured)-1] + if !strings.Contains(last, `'/tmp/$(rm -rf /)/x'`) { + t.Errorf("path not single-quoted in shell arg: %q", last) + } +} From ba9449b66c380bb1609f6e8e63359a5dbc239d56 Mon Sep 17 00:00:00 2001 From: "Ralian.ENG" Date: Fri, 8 May 2026 17:20:59 +0900 Subject: [PATCH 05/11] fix(sandbox): sanitize getHostUsername to match getHostHostname MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit getHostUsername read USER/USERNAME/LOGNAME and returned the raw value; getHostHostname routed os.Hostname through sanitizeDockerArg before returning. The two are used in symmetric slots — s.mountPoint = "/home/" + hostUser + "/projects" hostname = sanitizeDockerArg(hostHostname) — and mountPoint then flows into a `-v ::ro` docker volume spec and a `pip install --no-index --no-deps --no-build-isolation /*` shell glob. A USER value containing `:` would break the volume separator, and one containing shell metacharacters would derail the glob expansion. Env vars are normally trusted, so this is not a concrete vulnerability in any threat model kojuto currently supports — but the inconsistency between the two helpers was load-bearing only because nobody set USER to anything weird. Mirroring the hostname sanitizer removes the implicit trust dependency at zero cost. Side change: sanitizeDockerArg now takes an explicit fallback string instead of hardcoding "localhost". The hostname caller passes "localhost"; the new username caller passes "user". Removes the hostname-specific bias from a function that is otherwise a generic charset filter, and lets the test parameterise both fallback paths. Tests: TestGetHostUsername_Sanitizes pins the new sanitization with the same metacharacter cases the hostname test already covered, plus non-ASCII fallback. TestSanitizeDockerArg expanded to exercise parameterised fallback and the volume-spec / glob metacharacters. Co-Authored-By: Claude Opus 4.7 (1M context) --- internal/sandbox/sandbox.go | 23 +++++++++++------- internal/sandbox/sandbox_extra_test.go | 26 ++++++++++++++++++++ internal/sandbox/sandbox_test.go | 33 ++++++++++++++++---------- 3 files changed, 62 insertions(+), 20 deletions(-) diff --git a/internal/sandbox/sandbox.go b/internal/sandbox/sandbox.go index f12fd3a..f51d4e1 100644 --- a/internal/sandbox/sandbox.go +++ b/internal/sandbox/sandbox.go @@ -223,14 +223,17 @@ func (s *Sandbox) containerArgs() ([]string, error) { // The result is sanitized to prevent Docker flag injection via hostile hostnames. func getHostHostname() string { if h, err := os.Hostname(); err == nil && h != "" { - return sanitizeDockerArg(h) + return sanitizeDockerArg(h, "localhost") } return "localhost" } -// sanitizeDockerArg strips characters that could break Docker CLI arguments. -// Docker hostnames must match [a-zA-Z0-9][a-zA-Z0-9_.-]. -func sanitizeDockerArg(s string) string { +// sanitizeDockerArg strips characters that could break Docker CLI +// arguments. Output is restricted to [a-zA-Z0-9._-] — the intersection +// of the Docker hostname rule and the chars that are safe to interpolate +// into shell command lines and -v : volume specs without +// quoting. fallback is returned when every byte was stripped. +func sanitizeDockerArg(s, fallback string) string { var b strings.Builder for _, c := range s { if (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z') || (c >= '0' && c <= '9') || c == '-' || c == '.' || c == '_' { @@ -238,7 +241,7 @@ func sanitizeDockerArg(s string) string { } } if b.Len() == 0 { - return "localhost" + return fallback } return b.String() } @@ -285,12 +288,16 @@ func getHostResources() (cpus, memory string) { return cpus, memory } -// getHostUsername returns the current OS username. +// getHostUsername returns the current OS username, sanitized to the same +// charset as getHostHostname. The result flows into mountPoint = +// "/home//projects", which is interpolated into a -v ::ro +// volume spec and a `pip install ... /*` shell glob. Even +// though env vars are normally trusted, mirroring the hostname sanitizer +// removes any case where a future runner sets USER to something exotic. func getHostUsername() string { - // Try common env vars (works on Linux, macOS, Windows). for _, key := range []string{"USER", "USERNAME", "LOGNAME"} { if u := os.Getenv(key); u != "" { - return u + return sanitizeDockerArg(u, "user") } } return "user" diff --git a/internal/sandbox/sandbox_extra_test.go b/internal/sandbox/sandbox_extra_test.go index 9735504..8c60b13 100644 --- a/internal/sandbox/sandbox_extra_test.go +++ b/internal/sandbox/sandbox_extra_test.go @@ -176,6 +176,32 @@ func TestGetHostUsername(t *testing.T) { } } +// TestGetHostUsername_Sanitizes pins that an exotic USER value (shell +// metacharacters, path separators, non-ASCII) is normalised before it +// flows into mountPoint and from there into -v volume specs and shell +// command lines. +func TestGetHostUsername_Sanitizes(t *testing.T) { + cases := []struct { + userEnv, want string + }{ + {"alice", "alice"}, + {"a;rm -rf /", "arm-rf"}, + {"$(id)", "id"}, + {"a:b", "ab"}, + {"a/b", "ab"}, + {"日本語", "user"}, + } + + for _, tc := range cases { + t.Setenv("USER", tc.userEnv) + t.Setenv("USERNAME", "") + t.Setenv("LOGNAME", "") + if got := getHostUsername(); got != tc.want { + t.Errorf("USER=%q: getHostUsername() = %q, want %q", tc.userEnv, got, tc.want) + } + } +} + func TestGetHostResources(t *testing.T) { cpus, mem := getHostResources() diff --git a/internal/sandbox/sandbox_test.go b/internal/sandbox/sandbox_test.go index b95f325..1a9779f 100644 --- a/internal/sandbox/sandbox_test.go +++ b/internal/sandbox/sandbox_test.go @@ -128,23 +128,32 @@ func TestHoneypotEnvVars(t *testing.T) { func TestSanitizeDockerArg(t *testing.T) { cases := []struct { - input string - want string + input string + fallback string + want string }{ - {"my-host", "my-host"}, - {"host.local", "host.local"}, - {"host name", "hostname"}, - {"--inject", "--inject"}, - {"$(evil)", "evil"}, - {"", "localhost"}, - {"日本語ホスト", "localhost"}, - {"valid_host-123.local", "valid_host-123.local"}, + {"my-host", "localhost", "my-host"}, + {"host.local", "localhost", "host.local"}, + {"host name", "localhost", "hostname"}, + {"--inject", "localhost", "--inject"}, + {"$(evil)", "localhost", "evil"}, + {"", "localhost", "localhost"}, + {"日本語ホスト", "localhost", "localhost"}, + {"valid_host-123.local", "localhost", "valid_host-123.local"}, + // Verify the fallback is parameterised, not hardcoded to "localhost". + {"", "user", "user"}, + {"$(evil)", "user", "evil"}, + {"日本語", "user", "user"}, + // Shell / volume-spec metacharacters get stripped. + {"a;rm -rf /", "user", "arm-rf"}, + {"a/b", "user", "ab"}, + {"a:b", "user", "ab"}, } for _, tc := range cases { - got := sanitizeDockerArg(tc.input) + got := sanitizeDockerArg(tc.input, tc.fallback) if got != tc.want { - t.Errorf("sanitizeDockerArg(%q) = %q, want %q", tc.input, got, tc.want) + t.Errorf("sanitizeDockerArg(%q, %q) = %q, want %q", tc.input, tc.fallback, got, tc.want) } } } From 9fc198ff632983b4569c69940c25b7d73aed9c2e Mon Sep 17 00:00:00 2001 From: "Ralian.ENG" Date: Fri, 8 May 2026 17:32:13 +0900 Subject: [PATCH 06/11] fix(config): warn on stderr when lenient mode honors kojuto.yml excludes MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit `--strict` defaults to false on the CLI but true in the GitHub Action. Local interactive runs therefore silently honor `sensitive_paths.exclude` in the cwd's kojuto.yml. An untrusted repo (or extracted tarball) can plant a kojuto.yml that shrinks the detection surface without the user noticing — a "clean" verdict from such a run is not actually clean for the excluded categories. `--strict` already exists as the right answer for that threat, but the existing UX only surfaces exclusion activity when --strict is ON (printing the paths it's overriding). The lenient path was silent. Now preRunLoadConfig emits a stderr warning whenever exclusions are honored, listing the count and the actual paths and pointing at --strict. The honoring behavior is unchanged — this is purely a visibility fix so an attacker-planted exclude appears in CI logs and interactive output before the verdict is trusted. Test: TestPreRunLoadConfig_LenientWarnsOnExclude captures stderr across a temp config with two excluded paths and asserts the warning contains the count, both paths, and the --strict pointer. Co-Authored-By: Claude Opus 4.7 (1M context) --- cmd/root.go | 19 +++++++++++++++---- cmd/root_test.go | 49 ++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 64 insertions(+), 4 deletions(-) diff --git a/cmd/root.go b/cmd/root.go index 270fadc..713cbf3 100644 --- a/cmd/root.go +++ b/cmd/root.go @@ -165,10 +165,21 @@ func preRunLoadConfig(_ *cobra.Command, _ []string) error { return fmt.Errorf("loading config %s: %w", cfgPath, err) } - if flagStrict && len(cfg.SensitivePaths.Exclude) > 0 { - fmt.Fprintf(os.Stderr, "warning: --strict ignoring %d excluded path(s) from config: %v\n", - len(cfg.SensitivePaths.Exclude), cfg.SensitivePaths.Exclude) - cfg.SensitivePaths.Exclude = nil + if len(cfg.SensitivePaths.Exclude) > 0 { + if flagStrict { + fmt.Fprintf(os.Stderr, "warning: --strict ignoring %d excluded path(s) from config: %v\n", + len(cfg.SensitivePaths.Exclude), cfg.SensitivePaths.Exclude) + cfg.SensitivePaths.Exclude = nil + } else { + // Without --strict, an attacker-planted or otherwise + // untrusted kojuto.yml in the cwd silently shrinks the + // detection surface. Surface the exclusions on stderr so + // the user can see them in CI logs and interactive output + // before trusting a "clean" verdict. + fmt.Fprintf(os.Stderr, "warning: %s excludes %d sensitive path(s) from monitoring: %v\n", + cfgPath, len(cfg.SensitivePaths.Exclude), cfg.SensitivePaths.Exclude) + fmt.Fprintln(os.Stderr, " pass --strict to ignore these exclusions") + } } paths := config.MergeSensitivePaths(cfg) diff --git a/cmd/root_test.go b/cmd/root_test.go index 7786b62..ec1cfcb 100644 --- a/cmd/root_test.go +++ b/cmd/root_test.go @@ -634,6 +634,55 @@ func TestPreRunLoadConfig_StrictIgnoresExclude(t *testing.T) { } } +// TestPreRunLoadConfig_LenientWarnsOnExclude pins the visibility +// guarantee added for surface C: when --strict is OFF and the loaded +// kojuto.yml carries `exclude` entries, a stderr warning surfaces the +// reduction in coverage so a CI log or interactive run cannot silently +// trust a "clean" verdict produced under a tampered config. Without +// this, an attacker who plants kojuto.yml in the cwd of a target repo +// can shrink the detection surface invisibly. +func TestPreRunLoadConfig_LenientWarnsOnExclude(t *testing.T) { + origConfig := flagConfig + origStrict := flagStrict + defer func() { flagConfig = origConfig; flagStrict = origStrict }() + + dir := t.TempDir() + cfgPath := filepath.Join(dir, "kojuto.yml") + if err := os.WriteFile(cfgPath, []byte("sensitive_paths:\n exclude:\n - \"/.ssh/\"\n - \"/.aws/\"\n"), 0o644); err != nil { + t.Fatal(err) + } + + flagConfig = cfgPath + flagStrict = false + + oldStderr := os.Stderr + r, w, err := os.Pipe() + if err != nil { + t.Fatal(err) + } + os.Stderr = w + t.Cleanup(func() { os.Stderr = oldStderr }) + + runErr := preRunLoadConfig(nil, nil) + w.Close() + + data, _ := io.ReadAll(r) + got := string(data) + + if runErr != nil { + t.Fatalf("preRunLoadConfig returned error: %v", runErr) + } + if !strings.Contains(got, "excludes 2 sensitive path(s)") { + t.Errorf("stderr missing exclusion-count warning, got:\n%s", got) + } + if !strings.Contains(got, "/.ssh/") || !strings.Contains(got, "/.aws/") { + t.Errorf("stderr missing the actual excluded paths, got:\n%s", got) + } + if !strings.Contains(got, "--strict") { + t.Errorf("stderr missing pointer to --strict flag, got:\n%s", got) + } +} + func TestPreRunLoadConfig_InvalidConfig(t *testing.T) { origConfig := flagConfig origStrict := flagStrict From f62339195d3a2abaf239ec56ef1bb558c702b012 Mon Sep 17 00:00:00 2001 From: "Ralian.ENG" Date: Fri, 8 May 2026 17:39:39 +0900 Subject: [PATCH 07/11] fix(report,hooks): escape control bytes in attacker-controllable strings MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit A malicious package could embed ANSI escape sequences (or any C0 control byte) into source compiled via `compile()` or `exec()`. The audit hook captured the snippet via repr() / str() and emitted it on the wire; the Go side put it in `code_snippet` and json.Marshal encoded it as `...`. Structurally the JSON was fine, but a common downstream consumer pattern is `jq -r .events[].code_snippet`, which decodes the JSON string literally — handing a real ESC byte to the user's terminal. The attacker could then clear the screen and paint a fake "verdict: clean" line in green, hiding the real suspicious verdict. Same vector exists for `comm` (a malicious process can `prctl(PR_SET_ NAME, "evil\x1b[31m")` with control bytes) and any future attacker- reachable string field. Two-layer fix: 1. sitecustomize.py's _t() now escapes every C0 control byte and DEL (0x00-0x1f, 0x7f) to a printable `\xNN` form, with the conventional \n / \r / \t shorthand for the common cases. The wire-side _w() already strips line terminators for framing safety; this extends the same idea to the snippet field's content for raw-consumer safety. Closes the gap at the source. 2. report.WriteJSON now walks every event and runs each string field through sanitizeControl before encoding. Reflection picks up every string field of types.SyscallEvent so future fields are covered automatically. kojuto-generated strings (Syscall, Category, Phase, etc.) never contain control bytes, so the pass is a no-op for them. Tests: - TestSanitizeControl exercises the helper across plain ASCII, UTF-8 multibyte, individual control bytes, and a mixed-byte vector. - TestWriteJSON_StripsAnsiFromAttackerFields plants control bytes in CodeSnippet, Comm, Cmdline, and FilePath; round-trips through WriteJSON + json.Unmarshal; asserts no decoded field contains a raw byte below 0x20 or 0x7f, and that the on-the-wire JSON contains a printable backslash-escaped form for the ESC byte. Co-Authored-By: Claude Opus 4.7 (1M context) --- internal/report/report.go | 71 ++++++++++++++++++++++- internal/report/report_test.go | 76 +++++++++++++++++++++++++ internal/sandbox/hooks/sitecustomize.py | 22 ++++++- 3 files changed, 167 insertions(+), 2 deletions(-) diff --git a/internal/report/report.go b/internal/report/report.go index daae717..acb9fc7 100644 --- a/internal/report/report.go +++ b/internal/report/report.go @@ -4,6 +4,8 @@ import ( "encoding/json" "fmt" "io" + "reflect" + "strings" "time" "github.com/RalianENG/kojuto/internal/types" @@ -29,8 +31,19 @@ func Generate(pkg, version, ecosystem, verdict, probeMethod string, events []typ } } -// WriteJSON writes the report as indented JSON to w. +// WriteJSON writes the report as indented JSON to w. Every string field +// of every event is run through sanitizeControl first so a downstream +// consumer that decodes the JSON and prints a string field raw (e.g. +// `jq -r .events[].code_snippet`) cannot be fed an attacker-controlled +// ANSI escape sequence smuggled in via a malicious package's audit-hook +// snippet, prctl-set comm name, or future attacker-reachable string +// slot. The Python audit hook performs the same escape on its own side; +// this is the host-side belt-and-braces guard. func WriteJSON(r *types.Report, w io.Writer) error { + for i := range r.Events { + sanitizeEventStrings(&r.Events[i]) + } + enc := json.NewEncoder(w) enc.SetIndent("", " ") @@ -40,3 +53,59 @@ func WriteJSON(r *types.Report, w io.Writer) error { return nil } + +// sanitizeEventStrings rewrites every string field of e in place so any +// C0 control byte (0x00-0x1f) or DEL (0x7f) that an attacker placed in +// an event field is rendered as a printable `\xNN` escape rather than +// passing through to the report consumer's terminal. +// +// Reflection is used so future string fields added to SyscallEvent are +// covered automatically. Non-string fields are skipped. Fields kojuto +// itself populates (Syscall, Category, Phase, etc.) are no-ops because +// their values never contain control bytes. +func sanitizeEventStrings(e *types.SyscallEvent) { + v := reflect.ValueOf(e).Elem() + for i := 0; i < v.NumField(); i++ { + f := v.Field(i) + if f.Kind() == reflect.String && f.CanSet() { + f.SetString(sanitizeControl(f.String())) + } + } +} + +// sanitizeControl escapes C0 control bytes and DEL to printable forms. +// Common whitespace bytes use the conventional \n / \r / \t shorthand; +// all others render as \xNN. Any other byte (printable ASCII, UTF-8 +// multibyte, etc.) passes through unchanged. +func sanitizeControl(s string) string { + if !needsControlEscape(s) { + return s + } + var b strings.Builder + b.Grow(len(s)) + for _, r := range s { + switch { + case r == '\n': + b.WriteString(`\n`) + case r == '\r': + b.WriteString(`\r`) + case r == '\t': + b.WriteString(`\t`) + case r < 0x20 || r == 0x7f: + fmt.Fprintf(&b, `\x%02x`, r) + default: + b.WriteRune(r) + } + } + return b.String() +} + +func needsControlEscape(s string) bool { + for i := 0; i < len(s); i++ { + c := s[i] + if c < 0x20 || c == 0x7f { + return true + } + } + return false +} diff --git a/internal/report/report_test.go b/internal/report/report_test.go index c757610..88f73d0 100644 --- a/internal/report/report_test.go +++ b/internal/report/report_test.go @@ -85,3 +85,79 @@ func TestWriteJSON(t *testing.T) { t.Errorf("expected clean, got %s", decoded.Verdict) } } + +func TestSanitizeControl(t *testing.T) { + cases := []struct { + in, want string + }{ + {"plain ascii", "plain ascii"}, + {"日本語", "日本語"}, + {"with\nnewline", `with\nnewline`}, + {"with\ttab", `with\ttab`}, + {"esc\x1b[2J", `esc\x1b[2J`}, + {"\x00null", `\x00null`}, + {"\x07bell", `\x07bell`}, + {"del\x7f", `del\x7f`}, + {"all\x00\x01\x1b\x7fclean", `all\x00\x01\x1b\x7fclean`}, + } + for _, tc := range cases { + if got := sanitizeControl(tc.in); got != tc.want { + t.Errorf("sanitizeControl(%q) = %q, want %q", tc.in, got, tc.want) + } + } +} + +// TestWriteJSON_StripsAnsiFromAttackerFields pins the security +// invariant: any attacker-controlled string field (CodeSnippet, Comm, +// Cmdline, etc.) carrying ANSI escape sequences is rendered into the +// JSON as a printable backslash escape, so a downstream `jq -r` style +// raw-string consumer cannot be fed terminal control bytes. +func TestWriteJSON_StripsAnsiFromAttackerFields(t *testing.T) { + events := []types.SyscallEvent{ + { + Timestamp: time.Now(), + PID: 1, + Syscall: types.EventDynamicExec, + AuditEvent: "exec", + CodeSnippet: "payload\x1b[2J\x1b[Hverdict: clean", + Comm: "evil\x1b[31m", + Cmdline: "sh\x07-c\x00boom", + FilePath: "/tmp/\x1bweird", + }, + } + r := Generate("badpkg", "1.0.0", types.EcosystemPyPI, types.VerdictSuspicious, "ebpf", events, 0, 0, nil) + + var buf bytes.Buffer + if err := WriteJSON(&r, &buf); err != nil { + t.Fatalf("WriteJSON failed: %v", err) + } + + // The serialised JSON must not contain a literal ESC byte (0x1b) + // or BEL/NUL anywhere, even after JSON decoding. + var decoded types.Report + if err := json.Unmarshal(buf.Bytes(), &decoded); err != nil { + t.Fatalf("unmarshal: %v", err) + } + + for _, e := range decoded.Events { + fields := []struct{ name, val string }{ + {"CodeSnippet", e.CodeSnippet}, + {"Comm", e.Comm}, + {"Cmdline", e.Cmdline}, + {"FilePath", e.FilePath}, + } + for _, f := range fields { + for i := 0; i < len(f.val); i++ { + c := f.val[i] + if c < 0x20 || c == 0x7f { + t.Errorf("%s contains raw control byte 0x%02x: %q", f.name, c, f.val) + break + } + } + } + // Spot-check the encoded form is human-readable. + if !bytes.Contains(buf.Bytes(), []byte(`\\x1b`)) { + t.Errorf("expected printable backslash-escaped ESC in JSON output, got:\n%s", buf.String()) + } + } +} diff --git a/internal/sandbox/hooks/sitecustomize.py b/internal/sandbox/hooks/sitecustomize.py index 295a96c..a1924f7 100644 --- a/internal/sandbox/hooks/sitecustomize.py +++ b/internal/sandbox/hooks/sitecustomize.py @@ -41,7 +41,27 @@ def _is_user(fn): def _t(s): - s = str(s).replace("\n", "\\n").replace("\r", "\\r") + # Escape every C0 control byte (0x00-0x1f) and DEL (0x7f) to a + # printable form. The wire format only needs newline-stripping for + # framing safety (handled in _w too), but the snippet field + # eventually surfaces in report.json's code_snippet — a downstream + # consumer that decodes the JSON and prints the raw string (e.g. + # `jq -r .events[].code_snippet`) used to receive an unescaped + # ESC (0x1b) byte and let an attacker-supplied payload paint + # arbitrary text on the user's terminal via ANSI sequences. + out = [] + for ch in str(s): + if ch == "\n": + out.append("\\n") + elif ch == "\r": + out.append("\\r") + elif ch == "\t": + out.append("\\t") + elif ord(ch) < 0x20 or ord(ch) == 0x7f: + out.append("\\x%02x" % ord(ch)) + else: + out.append(ch) + s = "".join(out) if len(s) > _MAX: return s[:_MAX] + "..." return s From 09f223ffca4be085d656b999495633cb917588da Mon Sep 17 00:00:00 2001 From: "Ralian.ENG" Date: Fri, 8 May 2026 17:47:23 +0900 Subject: [PATCH 08/11] fix(cmd): owner-only mode for scan report and pinned manifest outputs MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit gosec G306 flagged three output writers using 0o644 (or os.Create's default 0o666 then umask-clipped). All three carry data that shouldn't leak to other accounts on a multi-user host: - openOutput / -o report.json: events embed attacker-supplied code snippets, dst addrs, file paths, and the full dependency tree of the scanned package — anything a malicious package touches at install/import time. Switched os.Create to os.OpenFile with an explicit 0o600 so the report is owner-only by default; users who want to share can chmod after the fact. - writePinnedPyPI / writePinnedNpm: --pin output captures the resolved manifest from the last clean scan. Internal/private registry deps and version metadata can be load-bearing for inferring the user's stack, so default 0o600 too. The other gosec G306 hits (sandbox/staging package.json files, seccomp.json, the in-flight tarball copy) all stay at world-readable intentionally — the docker daemon and the in-container `dev` user need to read them, and the tests would silently miss every install- phase event without the wider mode. Those sites are out of scope here. Tests: TestOpenOutput_File and a new TestOutputFiles_OwnerOnly syscall.Umask(0) the test process and stat the resulting files, asserting Mode().Perm() == 0o600. Co-Authored-By: Claude Opus 4.7 (1M context) --- cmd/root.go | 19 +++++++++++++++--- cmd/root_test.go | 50 ++++++++++++++++++++++++++++++++++++++++++++++-- 2 files changed, 64 insertions(+), 5 deletions(-) diff --git a/cmd/root.go b/cmd/root.go index 713cbf3..1842100 100644 --- a/cmd/root.go +++ b/cmd/root.go @@ -581,7 +581,12 @@ func writePinnedPyPI(path string, deps []pinnedDep) error { fmt.Fprintf(&b, "%s\n", dep.Name) } } - return os.WriteFile(path, []byte(b.String()), 0o644) + // 0o600 — pinned files can carry private package names (internal + // registry deps) and embedded version metadata; default to user- + // only readable so a multi-user host doesn't leak the manifest to + // other accounts. Caller can chmod after the fact if they want to + // share. + return os.WriteFile(path, []byte(b.String()), 0o600) } func writePinnedNpm(path string, deps []pinnedDep) error { @@ -606,7 +611,9 @@ func writePinnedNpm(path string, deps []pinnedDep) error { } jsonBytes = append(jsonBytes, '\n') - return os.WriteFile(path, jsonBytes, 0o644) + // 0o600 for the same reason as writePinnedPyPI — keep the pinned + // manifest from leaking to other users on a shared host. + return os.WriteFile(path, jsonBytes, 0o600) } // runLocalScan scans a local package file (.whl, .tgz) or directory. @@ -1137,7 +1144,13 @@ func openOutput() (*os.File, error) { return os.Stdout, nil } - f, err := os.Create(flagOutput) + // 0o600 — the report can carry attacker-supplied code snippets, + // inferred file paths, and any internal package names from the + // scanned dependency tree. os.Create defaults to 0o666 (then + // umask-clipped, typically 0o644), which would leak the report + // to other users on a multi-user host. Owner-only by default; + // users who explicitly want to share can chmod after the fact. + f, err := os.OpenFile(flagOutput, os.O_WRONLY|os.O_CREATE|os.O_TRUNC, 0o600) if err != nil { return nil, fmt.Errorf("creating output file: %w", err) } diff --git a/cmd/root_test.go b/cmd/root_test.go index ec1cfcb..678d118 100644 --- a/cmd/root_test.go +++ b/cmd/root_test.go @@ -8,6 +8,7 @@ import ( "os" "path/filepath" "strings" + "syscall" "testing" "github.com/RalianENG/kojuto/internal/types" @@ -350,8 +351,53 @@ func TestOpenOutput_File(t *testing.T) { } // Verify the file was actually created. - if _, err := os.Stat(flagOutput); err != nil { - t.Errorf("output file was not created: %v", err) + info, err := os.Stat(flagOutput) + if err != nil { + t.Fatalf("output file was not created: %v", err) + } + // Owner-only mode: gosec G306. Reports embed attacker-supplied + // snippets / paths that shouldn't leak to other users on a shared + // host. Test under a known umask so we measure the requested mode + // rather than the system umask's clipping. + oldUmask := syscall.Umask(0) + defer syscall.Umask(oldUmask) + if mode := info.Mode().Perm(); mode != 0o600 { + t.Errorf("output file mode = %o, want 0o600", mode) + } +} + +// TestOutputFiles_OwnerOnly pins the 0o600 contract for both pinned- +// manifest writers as well — they share the same threat model as the +// main report (multi-user host, package metadata leakage). +func TestOutputFiles_OwnerOnly(t *testing.T) { + oldUmask := syscall.Umask(0) + defer syscall.Umask(oldUmask) + + dir := t.TempDir() + deps := []pinnedDep{{Name: "lodash", Version: "4.17.21"}} + + pyPath := filepath.Join(dir, "requirements.txt") + if err := writePinnedPyPI(pyPath, deps); err != nil { + t.Fatalf("writePinnedPyPI: %v", err) + } + pyInfo, err := os.Stat(pyPath) + if err != nil { + t.Fatal(err) + } + if mode := pyInfo.Mode().Perm(); mode != 0o600 { + t.Errorf("pinned requirements.txt mode = %o, want 0o600", mode) + } + + npmPath := filepath.Join(dir, "package.json") + if err := writePinnedNpm(npmPath, deps); err != nil { + t.Fatalf("writePinnedNpm: %v", err) + } + npmInfo, err := os.Stat(npmPath) + if err != nil { + t.Fatal(err) + } + if mode := npmInfo.Mode().Perm(); mode != 0o600 { + t.Errorf("pinned package.json mode = %o, want 0o600", mode) } } From 64788b29c49d670eecad13a989ea651364c753d6 Mon Sep 17 00:00:00 2001 From: "Ralian.ENG" Date: Fri, 8 May 2026 18:15:17 +0900 Subject: [PATCH 09/11] fix(probe,analyzer): close three detection-bypass paths in strace + eBPF MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Broad re-audit (post-PR-#18 sweep) surfaced three concrete bypasses where a malicious package could escape kojuto's monitoring without tripping any current detector. All three are now closed; each ships with a regression test pinning the security invariant. 1. strace stderr scanner used bufio.Scanner with the default 64 KiB line cap and never checked scanner.Err(). The docker-exec stderr is a multiplex of strace trace lines AND the tracee's own writes to fd 2, so a single `process.stderr.write("A".repeat(70000))` in a preinstall script tripped ErrTooLong, the for-loop exited, and every subsequent rename / openat / connect / execve recorded by strace was silently dropped — yielding a CLEAN verdict on a package that did anything observable after the long write. Both call sites (container_strace.parseStraceOutput and fallback.StartWithPID) now `Buffer()` to a 16 MiB cap and treat any post-loop scanner.Err() as `dropped++`. The dropped counter already converts the verdict to inconclusive in cmd/root, so the bypass is closed without changing the verdict logic. 2. straceRenameRe / straceRenameatRe used `"([^"]+)"` to capture filenames, which fails on strace's C-style backslash-escaped quotes. A package that renames `/tmp/x"y` (a legal Unix filename containing `"`) over `/usr/local/bin/python3` produced an strace line `rename("/tmp/x\"y", "/usr/local/bin/python3") = 0` that the regex did not match — no event reached the analyzer, the binary hijack category never fired, and a subsequent execve of the hijacked path was filtered as benign by `isBenignExec`. The capture is now `((?:\\.|[^"\\])+)` for both `rename` and `renameat[2]` paths. 3. The eBPF vfs_rename probe writes `d_name` (a qstr basename) into evt.path / evt.path2, so eBPF mode delivers a rename event with `DstPath = "python3"` and no parent directory. `isBenignRename` computed `destDir = path.Dir("python3") + "/" = "./"` which never matched the allowed dirs (e.g. `/usr/local/bin/`) — the function fell through to `return true` and treated every rename as benign. Result: in `--probe-method=ebpf`, NO rename was ever flagged, regardless of its target. `isBenignRename` now fail-safes when DstPath has no leading "/": a basename matching a tracked binary stays suspicious. Strace mode emits absolute paths and is unaffected. Tests: - TestParseStraceOutput_LongLineFlipsToInconclusive feeds a straceMaxLine+1024-byte single line via a chunkReader and asserts cs.dropped > 0 after parseStraceOutput returns. - TestParseStraceLine_RenameWithEscapedQuotes exercises rename and renameat lines whose src/dst contain `\"`, asserting the event is produced with the expected SrcPath/DstPath. - TestAnalyze_RenameTrustedBinary gains three basename-only sub-cases covering python3/sh as suspicious and an unknown name as clean. Co-Authored-By: Claude Opus 4.7 (1M context) --- internal/analyzer/analyzer.go | 26 +++++++++--- internal/analyzer/analyzer_test.go | 10 +++++ internal/probe/container_strace.go | 21 +++++++++ internal/probe/container_strace_test.go | 52 +++++++++++++++++++++++ internal/probe/fallback.go | 9 ++++ internal/probe/strace_parse.go | 10 ++++- internal/probe/strace_parse_extra_test.go | 52 +++++++++++++++++++++++ 7 files changed, 172 insertions(+), 8 deletions(-) diff --git a/internal/analyzer/analyzer.go b/internal/analyzer/analyzer.go index c5a27de..9ee4392 100644 --- a/internal/analyzer/analyzer.go +++ b/internal/analyzer/analyzer.go @@ -942,13 +942,27 @@ var fileOpCommands = map[string]bool{ // Renames to other destinations (e.g. pip installing a new CLI script) are benign. func isBenignRename(evt *types.SyscallEvent) bool { destBase := path.Base(evt.DstPath) - destDir := path.Dir(evt.DstPath) + "/" + allowedDirs, ok := benignPaths[destBase] + if !ok { + return true + } - if allowedDirs, ok := benignPaths[destBase]; ok { - for _, d := range allowedDirs { - if destDir == d { - return false - } + // Basename-only DstPath (no leading "/") means the probe captured + // only the dentry name and not the parent directory — the eBPF + // probe takes this shape because vfs_rename's renamedata gives us + // d_name (a qstr basename) rather than a full path. Without the + // parent we cannot prove the rename targets a trusted directory, + // so fail-safe: a rename whose basename matches a tracked binary + // (python3, node, sh, ...) stays suspicious. Strace mode emits + // absolute paths and falls through to the directory check below. + if !strings.HasPrefix(evt.DstPath, "/") { + return false + } + + destDir := path.Dir(evt.DstPath) + "/" + for _, d := range allowedDirs { + if destDir == d { + return false } } diff --git a/internal/analyzer/analyzer_test.go b/internal/analyzer/analyzer_test.go index 047ad55..50aac42 100644 --- a/internal/analyzer/analyzer_test.go +++ b/internal/analyzer/analyzer_test.go @@ -279,6 +279,16 @@ func TestAnalyze_RenameTrustedBinary(t *testing.T) { {"new CLI script", "/usr/local/bin/my-tool", types.VerdictClean}, {"install dir", "/install/lib/module.so", types.VerdictClean}, {"tmp rename", "/tmp/a", types.VerdictClean}, + // Basename-only DstPath: the eBPF probe captures only the + // dentry name from vfs_rename's renamedata, not the parent + // path. Without the parent we can't verify the rename is in + // a trusted dir, so a basename matching a system binary must + // stay suspicious. Otherwise an eBPF-mode scan of a malware + // sample that renames over /usr/local/bin/python3 silently + // reports CLEAN. + {"basename-only python3 (eBPF mode)", "python3", types.VerdictSuspicious}, + {"basename-only sh (eBPF mode)", "sh", types.VerdictSuspicious}, + {"basename-only unknown (eBPF mode)", "myapp", types.VerdictClean}, } for _, tc := range cases { diff --git a/internal/probe/container_strace.go b/internal/probe/container_strace.go index f5655da..b50fc5e 100644 --- a/internal/probe/container_strace.go +++ b/internal/probe/container_strace.go @@ -6,11 +6,21 @@ import ( "errors" "fmt" "io" + "os" "os/exec" "github.com/RalianENG/kojuto/internal/types" ) +// straceMaxLine bounds the size of a single strace stderr line. With +// `-s 256` the per-arg cap is 256 bytes, so realistic execve+argv lines +// stay well under 1 MiB. We allow 16 MiB to accommodate exotic kernels +// or tracee writes interleaved on the shared docker exec stderr; lines +// longer than this are treated as adversarial (a malicious package +// trying to overflow bufio.Scanner's default 64 KiB cap so the scan +// loop exits and subsequent strace events are silently dropped). +const straceMaxLine = 16 * 1024 * 1024 + // ContainerStrace monitors connect(2) syscalls by running strace inside the Docker container. // This works on all platforms where Docker is available (Linux, macOS, Windows). // ContainerStrace monitors connect(2) syscalls by running strace inside the Docker container. @@ -96,6 +106,7 @@ func (c *ContainerStrace) parseStraceOutput(stderr io.ReadCloser, done chan<- st state := NewParseState() scanner := bufio.NewScanner(stderr) + scanner.Buffer(make([]byte, 64*1024), straceMaxLine) for scanner.Scan() { evt, ok := parseStraceLine(scanner.Text(), state) if !ok { @@ -112,6 +123,16 @@ func (c *ContainerStrace) parseStraceOutput(stderr io.ReadCloser, done chan<- st c.dropped++ } } + if err := scanner.Err(); err != nil { + // bufio.ErrTooLong here means the tracee wrote a >16 MiB + // chunk to the shared docker-exec stderr without a newline, + // most plausibly to disable parsing of subsequent strace + // trace lines. Any other error means the pipe died early. + // Either way we lost an unknown number of events, so flip + // the verdict to inconclusive via the dropped counter. + c.dropped++ + fmt.Fprintf(os.Stderr, "warning: strace stderr scanner aborted: %v\n", err) + } } // drainReader reads from r up to 10MB and returns the content. diff --git a/internal/probe/container_strace_test.go b/internal/probe/container_strace_test.go index 92e36e3..77de5e2 100644 --- a/internal/probe/container_strace_test.go +++ b/internal/probe/container_strace_test.go @@ -265,3 +265,55 @@ func TestParseStraceOutput_Done(_ *testing.T) { <-parseDone // should complete without hanging } + +// chunkReader returns the same chunk repeatedly until `remaining` bytes +// have been served, then EOF. Used to feed bufio.Scanner a giant single +// "line" without allocating it all up front. +type chunkReader struct { + chunk []byte + remaining int +} + +func (r *chunkReader) Read(p []byte) (int, error) { + if r.remaining <= 0 { + return 0, io.EOF + } + n := copy(p, r.chunk) + if n > r.remaining { + n = r.remaining + } + r.remaining -= n + return n, nil +} + +func (r *chunkReader) Close() error { return nil } + +// TestParseStraceOutput_LongLineFlipsToInconclusive pins the security +// fix for the prior detection-bypass: a malicious package could write +// >64 KiB without a newline to the shared docker-exec stderr (which +// carries strace trace output too), trip bufio.Scanner's default cap, +// silently terminate the parser loop, and let every subsequent strace +// event evade the analyzer. The fix raises the cap to straceMaxLine +// and turns any scanner error into a `dropped++` so the verdict logic +// flips to inconclusive instead of trusting the truncated trace. +func TestParseStraceOutput_LongLineFlipsToInconclusive(t *testing.T) { + // Slightly more than straceMaxLine, no newlines, so bufio.Scanner + // returns ErrTooLong. Use a chunkReader to avoid allocating the + // full payload — straceMaxLine is 16 MiB. + reader := &chunkReader{ + chunk: bytes.Repeat([]byte{'A'}, 4096), + remaining: straceMaxLine + 1024, + } + + cs := &ContainerStrace{ + events: make(chan types.SyscallEvent, 16), + done: make(chan struct{}), + } + parseDone := make(chan struct{}) + go cs.parseStraceOutput(reader, parseDone) + <-parseDone + + if cs.dropped == 0 { + t.Error("expected dropped > 0 after scanner overflow, got 0") + } +} diff --git a/internal/probe/fallback.go b/internal/probe/fallback.go index 3428967..da1633f 100644 --- a/internal/probe/fallback.go +++ b/internal/probe/fallback.go @@ -6,6 +6,7 @@ import ( "bufio" "errors" "fmt" + "os" "os/exec" "strconv" "sync" @@ -59,6 +60,7 @@ func (s *StraceFallback) StartWithPID(pid uint32) error { defer close(s.events) state := NewParseState() scanner := bufio.NewScanner(stderr) + scanner.Buffer(make([]byte, 64*1024), straceMaxLine) for scanner.Scan() { evt, ok := parseStraceLine(scanner.Text(), state) if ok { @@ -71,6 +73,13 @@ func (s *StraceFallback) StartWithPID(pid uint32) error { } } } + if err := scanner.Err(); err != nil { + // See straceMaxLine: an unrecoverable scanner failure + // here means subsequent trace lines are lost. Flip to + // inconclusive via dropped. + s.dropped++ + fmt.Fprintf(os.Stderr, "warning: strace stderr scanner aborted: %v\n", err) + } }() return nil diff --git a/internal/probe/strace_parse.go b/internal/probe/strace_parse.go index 986e7d5..5e8fd3e 100644 --- a/internal/probe/strace_parse.go +++ b/internal/probe/strace_parse.go @@ -87,13 +87,19 @@ var ( ) // rename("/tmp/evil", "/usr/local/bin/python3") = 0. + // `(?:\\.|[^"\\])+` accepts strace's C-style backslash escapes + // inside the captured filename. The previous `[^"]+` failed on + // strace lines whose filename contained a literal `"` (rendered + // as `\"` by strace), causing the whole regex to drop the event + // — a malicious package could rename `/tmp/x"y` over a trusted + // system binary and leave no rename event for the analyzer. straceRenameRe = regexp.MustCompile( - `rename\("([^"]+)",\s*"([^"]+)"\)`, + `rename\("((?:\\.|[^"\\])+)",\s*"((?:\\.|[^"\\])+)"\)`, ) // renameat(AT_FDCWD, "old", AT_FDCWD, "new") or renameat2(...) straceRenameatRe = regexp.MustCompile( - `renameat2?\([^,]+,\s*"([^"]+)",\s*[^,]+,\s*"([^"]+)"`, + `renameat2?\([^,]+,\s*"((?:\\.|[^"\\])+)",\s*[^,]+,\s*"((?:\\.|[^"\\])+)"`, ) // mmap(NULL, 4096, PROT_READ|PROT_WRITE|PROT_EXEC, MAP_PRIVATE|MAP_ANONYMOUS, -1, 0) = 0x7f... diff --git a/internal/probe/strace_parse_extra_test.go b/internal/probe/strace_parse_extra_test.go index fe11e70..114decd 100644 --- a/internal/probe/strace_parse_extra_test.go +++ b/internal/probe/strace_parse_extra_test.go @@ -869,3 +869,55 @@ func TestParseAuditHook_IntegratedWithParseStraceLine(t *testing.T) { t.Errorf("audit_event = %q, want eval", evt.AuditEvent) } } + + +// TestParseStraceLine_RenameWithEscapedQuotes pins the security fix for +// the prior `"([^"]+)"` capture pattern, which silently dropped the +// rename event whenever a filename contained a literal `"` (escaped by +// strace as `\"`). Without an event the analyzer never sees the hijack +// and the verdict stays clean — a malicious package could rename +// `/tmp/x"y` over `/usr/local/bin/python3` invisibly. +func TestParseStraceLine_RenameWithEscapedQuotes(t *testing.T) { + cases := []struct { + name string + line string + wantSrc string + wantDst string + }{ + { + name: "rename with escaped quote in src", + line: `[pid 100] rename("/tmp/x\"y", "/usr/local/bin/python3") = 0`, + wantSrc: `/tmp/x\"y`, + wantDst: "/usr/local/bin/python3", + }, + { + name: "renameat with escaped quote in dst", + line: `[pid 101] renameat(AT_FDCWD, "/tmp/legit", AT_FDCWD, "/etc/x\"y") = 0`, + wantSrc: "/tmp/legit", + wantDst: `/etc/x\"y`, + }, + { + name: "plain rename still parses", + line: `[pid 102] rename("/tmp/a", "/usr/local/bin/node") = 0`, + wantSrc: "/tmp/a", + wantDst: "/usr/local/bin/node", + }, + } + for _, tc := range cases { + t.Run(tc.name, func(t *testing.T) { + evt, ok := parseStraceLine(tc.line, NewParseState()) + if !ok { + t.Fatalf("expected rename to parse, line=%q", tc.line) + } + if evt.Syscall != types.EventRename { + t.Errorf("syscall = %q, want %q", evt.Syscall, types.EventRename) + } + if evt.SrcPath != tc.wantSrc { + t.Errorf("SrcPath = %q, want %q", evt.SrcPath, tc.wantSrc) + } + if evt.DstPath != tc.wantDst { + t.Errorf("DstPath = %q, want %q", evt.DstPath, tc.wantDst) + } + }) + } +} From e71d423ca9c05a4efdd4ddf44c71275db59895a9 Mon Sep 17 00:00:00 2001 From: "Ralian.ENG" Date: Fri, 8 May 2026 18:28:27 +0900 Subject: [PATCH 10/11] fix: clear golangci-lint violations from the hardening series MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The hardening commits (71d6d0a..64788b2) collected nine linter violations the local go test pass didn't surface. Local CI parity run with golangci-lint v2.1.6 — the version pinned in .github/ workflows/ci.yml — turned them up; this commit clears them all. - gocritic regexpSimplify in depfile.go: drop the unnecessary `\/` escape inside a non-character-class regex. - gofmt in strace_parse_extra_test.go and sandbox.go: trailing blank line / shQuote comment alignment. - govet shadow in cmd/root.go and cmd/root_test.go: rename the inner err binding so it doesn't shadow an outer one. - intrange in report.go (×2) and report_test.go (×1): switch bounded for-loops to `for i := range N` (Go 1.22+). - misspell in report_test.go: serialised → serialized. Re-running `golangci-lint run` reports `0 issues.`. Cross-compile across all 5 GOOS/GOARCH matrix entries, `go test -race ./...`, `go mod verify`, and `govulncheck ./...` all pass locally too. Co-Authored-By: Claude Opus 4.7 (1M context) --- cmd/root.go | 4 ++-- cmd/root_test.go | 4 ++-- internal/depfile/depfile.go | 2 +- internal/probe/strace_parse_extra_test.go | 1 - internal/report/report.go | 4 ++-- internal/report/report_test.go | 4 ++-- internal/sandbox/sandbox.go | 2 +- 7 files changed, 10 insertions(+), 11 deletions(-) diff --git a/cmd/root.go b/cmd/root.go index 1842100..a45e40f 100644 --- a/cmd/root.go +++ b/cmd/root.go @@ -671,8 +671,8 @@ func runLocalScan(_ []string) error { // sandbox helpers used to interpolate into shell heredocs and Python/ // JS string literals. The dockerWriteFile refactor closed the heredoc // path, but defending at the boundary keeps every future use site safe. - if err := downloaderValidate(pkg, ""); err != nil { - return fmt.Errorf("local package name derived from %q is unsafe: %w", localPath, err) + if validateErr := downloaderValidate(pkg, ""); validateErr != nil { + return fmt.Errorf("local package name derived from %q is unsafe: %w", localPath, validateErr) } // Auto-detect ecosystem from file extension, but only if -e was not explicitly set. diff --git a/cmd/root_test.go b/cmd/root_test.go index 678d118..c0fe972 100644 --- a/cmd/root_test.go +++ b/cmd/root_test.go @@ -389,8 +389,8 @@ func TestOutputFiles_OwnerOnly(t *testing.T) { } npmPath := filepath.Join(dir, "package.json") - if err := writePinnedNpm(npmPath, deps); err != nil { - t.Fatalf("writePinnedNpm: %v", err) + if writeErr := writePinnedNpm(npmPath, deps); writeErr != nil { + t.Fatalf("writePinnedNpm: %v", writeErr) } npmInfo, err := os.Stat(npmPath) if err != nil { diff --git a/internal/depfile/depfile.go b/internal/depfile/depfile.go index 56cc0d8..17e85cb 100644 --- a/internal/depfile/depfile.go +++ b/internal/depfile/depfile.go @@ -20,7 +20,7 @@ import ( // validation here gives the user a clear error instead of a silently // malformed command. var ( - npmNameRe = regexp.MustCompile(`^(@[a-z0-9][a-z0-9._~-]*\/)?[a-z0-9][a-z0-9._~-]*$`) + npmNameRe = regexp.MustCompile(`^(@[a-z0-9][a-z0-9._~-]*/)?[a-z0-9][a-z0-9._~-]*$`) pypiNameRe = regexp.MustCompile(`^[A-Za-z0-9][A-Za-z0-9._-]*$`) ) diff --git a/internal/probe/strace_parse_extra_test.go b/internal/probe/strace_parse_extra_test.go index 114decd..f27c6c0 100644 --- a/internal/probe/strace_parse_extra_test.go +++ b/internal/probe/strace_parse_extra_test.go @@ -870,7 +870,6 @@ func TestParseAuditHook_IntegratedWithParseStraceLine(t *testing.T) { } } - // TestParseStraceLine_RenameWithEscapedQuotes pins the security fix for // the prior `"([^"]+)"` capture pattern, which silently dropped the // rename event whenever a filename contained a literal `"` (escaped by diff --git a/internal/report/report.go b/internal/report/report.go index acb9fc7..02de4cb 100644 --- a/internal/report/report.go +++ b/internal/report/report.go @@ -65,7 +65,7 @@ func WriteJSON(r *types.Report, w io.Writer) error { // their values never contain control bytes. func sanitizeEventStrings(e *types.SyscallEvent) { v := reflect.ValueOf(e).Elem() - for i := 0; i < v.NumField(); i++ { + for i := range v.NumField() { f := v.Field(i) if f.Kind() == reflect.String && f.CanSet() { f.SetString(sanitizeControl(f.String())) @@ -101,7 +101,7 @@ func sanitizeControl(s string) string { } func needsControlEscape(s string) bool { - for i := 0; i < len(s); i++ { + for i := range len(s) { c := s[i] if c < 0x20 || c == 0x7f { return true diff --git a/internal/report/report_test.go b/internal/report/report_test.go index 88f73d0..f9e8e71 100644 --- a/internal/report/report_test.go +++ b/internal/report/report_test.go @@ -132,7 +132,7 @@ func TestWriteJSON_StripsAnsiFromAttackerFields(t *testing.T) { t.Fatalf("WriteJSON failed: %v", err) } - // The serialised JSON must not contain a literal ESC byte (0x1b) + // The serialized JSON must not contain a literal ESC byte (0x1b) // or BEL/NUL anywhere, even after JSON decoding. var decoded types.Report if err := json.Unmarshal(buf.Bytes(), &decoded); err != nil { @@ -147,7 +147,7 @@ func TestWriteJSON_StripsAnsiFromAttackerFields(t *testing.T) { {"FilePath", e.FilePath}, } for _, f := range fields { - for i := 0; i < len(f.val); i++ { + for i := range len(f.val) { c := f.val[i] if c < 0x20 || c == 0x7f { t.Errorf("%s contains raw control byte 0x%02x: %q", f.name, c, f.val) diff --git a/internal/sandbox/sandbox.go b/internal/sandbox/sandbox.go index f51d4e1..75da4c0 100644 --- a/internal/sandbox/sandbox.go +++ b/internal/sandbox/sandbox.go @@ -729,7 +729,7 @@ func npmLifecycleScript(pkgs []string) string { } // shQuote wraps s in POSIX single quotes, escaping any embedded single -// quote with the standard '\'' close-escape-reopen idiom. Safe for use +// quote with the standard '\” close-escape-reopen idiom. Safe for use // in /bin/sh, dash, and bash. func shQuote(s string) string { return "'" + strings.ReplaceAll(s, "'", `'\''`) + "'" From 5e3cca3361283353ef33a7e76f4d63f0d841b178 Mon Sep 17 00:00:00 2001 From: "Ralian.ENG" Date: Fri, 8 May 2026 18:37:05 +0900 Subject: [PATCH 11/11] fix(cmd): split file-mode tests behind !windows build tag MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit CI on windows-latest failed to compile cmd/root_test.go because syscall.Umask is undefined on Windows. The umask call exists in TestOpenOutput_File and TestOutputFiles_OwnerOnly so the asserted mode is the requested 0o600 rather than the user's umask-clipped result — Windows has no umask and Mode().Perm() doesn't reflect Unix-style bits there, so the assertion is meaningless on Windows even if it built. Move the perm-check tests to a new cmd/root_unix_test.go behind `//go:build !windows`. TestOpenOutput_File stays in root_test.go in its original cross-platform shape (verifies the file is created and is not os.Stdout); the mode assertion now lives in the new TestOpenOutput_FileMode in the unix file. TestOutputFiles_OwnerOnly moves wholesale. Verified locally: - GOOS=windows GOARCH=amd64 go test -c ./cmd/ now compiles - GOOS=linux GOARCH=amd64 go test -c ./cmd/ still compiles - go test -race ./cmd/ green at 72.6% coverage - golangci-lint v2.1.6 reports 0 issues Co-Authored-By: Claude Opus 4.7 (1M context) --- cmd/root_test.go | 55 ++++------------------------- cmd/root_unix_test.go | 80 +++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 86 insertions(+), 49 deletions(-) create mode 100644 cmd/root_unix_test.go diff --git a/cmd/root_test.go b/cmd/root_test.go index c0fe972..ba56d7f 100644 --- a/cmd/root_test.go +++ b/cmd/root_test.go @@ -8,7 +8,6 @@ import ( "os" "path/filepath" "strings" - "syscall" "testing" "github.com/RalianENG/kojuto/internal/types" @@ -350,54 +349,12 @@ func TestOpenOutput_File(t *testing.T) { t.Error("expected a file, not os.Stdout") } - // Verify the file was actually created. - info, err := os.Stat(flagOutput) - if err != nil { - t.Fatalf("output file was not created: %v", err) - } - // Owner-only mode: gosec G306. Reports embed attacker-supplied - // snippets / paths that shouldn't leak to other users on a shared - // host. Test under a known umask so we measure the requested mode - // rather than the system umask's clipping. - oldUmask := syscall.Umask(0) - defer syscall.Umask(oldUmask) - if mode := info.Mode().Perm(); mode != 0o600 { - t.Errorf("output file mode = %o, want 0o600", mode) - } -} - -// TestOutputFiles_OwnerOnly pins the 0o600 contract for both pinned- -// manifest writers as well — they share the same threat model as the -// main report (multi-user host, package metadata leakage). -func TestOutputFiles_OwnerOnly(t *testing.T) { - oldUmask := syscall.Umask(0) - defer syscall.Umask(oldUmask) - - dir := t.TempDir() - deps := []pinnedDep{{Name: "lodash", Version: "4.17.21"}} - - pyPath := filepath.Join(dir, "requirements.txt") - if err := writePinnedPyPI(pyPath, deps); err != nil { - t.Fatalf("writePinnedPyPI: %v", err) - } - pyInfo, err := os.Stat(pyPath) - if err != nil { - t.Fatal(err) - } - if mode := pyInfo.Mode().Perm(); mode != 0o600 { - t.Errorf("pinned requirements.txt mode = %o, want 0o600", mode) - } - - npmPath := filepath.Join(dir, "package.json") - if writeErr := writePinnedNpm(npmPath, deps); writeErr != nil { - t.Fatalf("writePinnedNpm: %v", writeErr) - } - npmInfo, err := os.Stat(npmPath) - if err != nil { - t.Fatal(err) - } - if mode := npmInfo.Mode().Perm(); mode != 0o600 { - t.Errorf("pinned package.json mode = %o, want 0o600", mode) + // Verify the file was actually created. The owner-only (0o600) + // permission contract is exercised by TestOpenOutput_FileMode and + // TestOutputFiles_OwnerOnly in root_unix_test.go — those tests + // rely on syscall.Umask, which is not defined on Windows. + if _, err := os.Stat(flagOutput); err != nil { + t.Errorf("output file was not created: %v", err) } } diff --git a/cmd/root_unix_test.go b/cmd/root_unix_test.go new file mode 100644 index 0000000..14f0d15 --- /dev/null +++ b/cmd/root_unix_test.go @@ -0,0 +1,80 @@ +//go:build !windows + +package cmd + +import ( + "os" + "path/filepath" + "syscall" + "testing" +) + +// TestOpenOutput_FileMode pins the 0o600 contract for the main scan +// report file. Reports embed attacker-supplied code snippets, file +// paths, and the dependency tree of the scanned package; on a multi- +// user host that data must not be readable by other accounts. The +// test runs under syscall.Umask(0) so the assertion measures the +// requested mode rather than the user's umask-clipped result. +// +// Windows has no umask and no Unix-style permission bits, and Go's +// os.File.Mode().Perm() reports a synthetic 0o666/0o444 there, so +// this test lives in a !windows build-tagged file. +func TestOpenOutput_FileMode(t *testing.T) { + oldUmask := syscall.Umask(0) + defer syscall.Umask(oldUmask) + + original := flagOutput + defer func() { flagOutput = original }() + + dir := t.TempDir() + flagOutput = filepath.Join(dir, "test-output.json") + + f, err := openOutput() + if err != nil { + t.Fatalf("openOutput() error: %v", err) + } + defer f.Close() + + info, err := os.Stat(flagOutput) + if err != nil { + t.Fatalf("output file was not created: %v", err) + } + if mode := info.Mode().Perm(); mode != 0o600 { + t.Errorf("output file mode = %o, want 0o600", mode) + } +} + +// TestOutputFiles_OwnerOnly pins the 0o600 contract for both pinned- +// manifest writers — they share the same threat model as the main +// report (multi-user host, package metadata leakage). +func TestOutputFiles_OwnerOnly(t *testing.T) { + oldUmask := syscall.Umask(0) + defer syscall.Umask(oldUmask) + + dir := t.TempDir() + deps := []pinnedDep{{Name: "lodash", Version: "4.17.21"}} + + pyPath := filepath.Join(dir, "requirements.txt") + if err := writePinnedPyPI(pyPath, deps); err != nil { + t.Fatalf("writePinnedPyPI: %v", err) + } + pyInfo, err := os.Stat(pyPath) + if err != nil { + t.Fatal(err) + } + if mode := pyInfo.Mode().Perm(); mode != 0o600 { + t.Errorf("pinned requirements.txt mode = %o, want 0o600", mode) + } + + npmPath := filepath.Join(dir, "package.json") + if writeErr := writePinnedNpm(npmPath, deps); writeErr != nil { + t.Fatalf("writePinnedNpm: %v", writeErr) + } + npmInfo, err := os.Stat(npmPath) + if err != nil { + t.Fatal(err) + } + if mode := npmInfo.Mode().Perm(); mode != 0o600 { + t.Errorf("pinned package.json mode = %o, want 0o600", mode) + } +}