diff --git a/CHANGELOG.md b/CHANGELOG.md index 7ec3ad3..ee840b3 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -9,9 +9,12 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ### Added - **Audit hooks for dynamic code execution detection** — Python PEP 578 hook (`sitecustomize.py`) intercepts `compile`/`exec`/`import`/`ctypes.dlopen`; Node.js `--require` hook (`kojuto-require.js`) intercepts `eval`/`Function`/`vm.runInNewContext`/`vm.runInThisContext`/`vm.Script`. New `dynamic_code_execution` category and `EventDynamicExec` event type +- **Severity-tiered verdict** — `types.CategorySeverity` classifies each detection category as HIGH (one event raises the verdict to SUSPICIOUS), MEDIUM (two-or-more raise it), or LOW (never raises the verdict alone). `dynamic_code_execution` is LOW, `dns_tunneling` and `evasion` are MEDIUM, all other categories stay HIGH. Unmapped categories fail closed (treated as HIGH). LOW events still appear in `report.events` for forensics — verdict reflects severity, not raw event count. Stops legitimate Python compat libraries (`six`, `attrs`, `future`) from flipping to SUSPICIOUS on benign internal `compile`/`exec` calls +- **Caller-aware audit hook** — `sitecustomize.py` now walks the Python call stack and reports the actual `.py` file invoking `compile`/`exec`, not the user-controllable `filename` argument (which `six` deliberately sets to ``). When the deepest frame lives inside the scanned package's `site-packages` directory the hook prefixes the wire payload with `+` so the analyzer bypasses path-based benign filtering. Sandbox passes the scan list via `KOJUTO_SCAN_PKGS` so the hook knows which paths count as "user code" - **System binary write detection** — `openat` with write flags to trusted system binaries (`python3`, `node`, `pip`, `sh`, etc.) in `/usr/local/bin/` or `/usr/bin/` classified as `binary_hijacking`, preventing benignPaths bypass via tmpfs overwrite - **gVisor auto-detection** — `--runtime` default changed from empty (runc) to `auto`: probes `docker info` for runsc availability and uses gVisor if registered, falls back to runc otherwise - **npm test package** — `testdata/probe-npm-0.1.0/` with lifecycle hook payloads (preinstall/postinstall) and require-time import phase covering 31 TTPs across 9 attack categories including audit hook validation +- **GitHub Action inputs** — `config`, `quiet`, and `no-color` exposed as Action inputs to match CLI flags (`--config`, `--quiet`, `--no-color`) ### Changed - Go version requirement lowered from 1.25.0 to 1.24.0 (stable release); `golang.org/x/sys` downgraded from v0.43.0 to v0.41.0 diff --git a/README.md b/README.md index 62f9c7b..6a2140f 100644 --- a/README.md +++ b/README.md @@ -253,7 +253,7 @@ Of the 300 malicious samples, 238 failed to install (dependencies already remove |----------|----------|-----------------| | C2 communication (`c2_communication`) | `aiogram-types-v3` → `147.45.124.42:80` | `connect`/`sendto` to non-loopback IPs | | Data exfiltration (`data_exfiltration`) | DNS resolution of Discord/Telegram/Pastebin | `sendto` port 53 resolving known exfil services | -| Credential access (`credential_access`) | `axios-attack-demo` → `.ssh/id_rsa`, `.aws/credentials`, `.solana/id.json` | `openat` on ~50 sensitive paths (SSH, cloud, crypto wallets, browser data) | +| Credential access (`credential_access`) | `axios-attack-demo` → `.ssh/id_rsa`, `.aws/credentials`, `.solana/id.json` | `openat` on ~60 sensitive paths (SSH, cloud, crypto wallets, browser data) | | Code execution (`code_execution`) | `advpruebitaa` → `type nul > prueba11.txt`, `/tmp/ld.py` | `execve` with inline `-c`/`-e` flags or from `/tmp`, `/dev/shm` | | Memory execution (`memory_execution`) | `ctypes.mmap(RWX)` shellcode injection | `mmap`/`mprotect` with simultaneous PROT_WRITE+PROT_EXEC | | Binary hijacking (`binary_hijacking`) | `rename /tmp/evil /usr/local/bin/python3` | `rename` targeting trusted system binaries | diff --git a/action.yml b/action.yml index 51025ea..a88e97f 100644 --- a/action.yml +++ b/action.yml @@ -29,10 +29,14 @@ inputs: description: 'Scan a local package file (.whl, .tgz) or directory' required: false default: '' - runtime: - description: 'Container runtime: auto (default, use gVisor if available), runsc (force gVisor), runc (force Docker default). Empty string also resolves to auto on the CLI.' + config: + description: 'Path to kojuto.yml config file (default: kojuto.yml in current directory)' required: false default: '' + runtime: + description: 'Container runtime: auto (use gVisor if available), runsc (force gVisor), runc (force Docker default)' + required: false + default: 'auto' probe-method: description: 'Probe method: auto, ebpf, strace, strace-container' required: false @@ -45,6 +49,14 @@ inputs: description: 'Ignore sensitive_paths.exclude from config (recommended for CI)' required: false default: 'true' + quiet: + description: 'Suppress phase progress output; emit only the final verdict block' + required: false + default: 'false' + no-color: + description: 'Disable colored output (NO_COLOR env is also respected)' + required: false + default: 'false' outputs: verdict: @@ -77,10 +89,13 @@ runs: KOJUTO_FILE: ${{ inputs.file }} KOJUTO_PIN: ${{ inputs.pin }} KOJUTO_LOCAL: ${{ inputs.local }} + KOJUTO_CONFIG: ${{ inputs.config }} KOJUTO_RUNTIME: ${{ inputs.runtime }} KOJUTO_PROBE_METHOD: ${{ inputs.probe-method }} KOJUTO_TIMEOUT: ${{ inputs.timeout }} KOJUTO_STRICT: ${{ inputs.strict }} + KOJUTO_QUIET: ${{ inputs.quiet }} + KOJUTO_NO_COLOR: ${{ inputs['no-color'] }} run: | ARGS=() @@ -111,9 +126,18 @@ runs: if [ -n "$KOJUTO_RUNTIME" ]; then ARGS+=("--runtime" "$KOJUTO_RUNTIME") fi + if [ -n "$KOJUTO_CONFIG" ]; then + ARGS+=("--config" "$KOJUTO_CONFIG") + fi if [ "$KOJUTO_STRICT" = "true" ]; then ARGS+=("--strict") fi + if [ "$KOJUTO_QUIET" = "true" ]; then + ARGS+=("--quiet") + fi + if [ "$KOJUTO_NO_COLOR" = "true" ]; then + ARGS+=("--no-color") + fi sudo "${{ github.action_path }}/kojuto" scan "${ARGS[@]}" \ -o /tmp/kojuto-report.json diff --git a/cmd/root.go b/cmd/root.go index aa89a6c..a45e40f 100644 --- a/cmd/root.go +++ b/cmd/root.go @@ -165,10 +165,21 @@ func preRunLoadConfig(_ *cobra.Command, _ []string) error { return fmt.Errorf("loading config %s: %w", cfgPath, err) } - if flagStrict && len(cfg.SensitivePaths.Exclude) > 0 { - fmt.Fprintf(os.Stderr, "warning: --strict ignoring %d excluded path(s) from config: %v\n", - len(cfg.SensitivePaths.Exclude), cfg.SensitivePaths.Exclude) - cfg.SensitivePaths.Exclude = nil + if len(cfg.SensitivePaths.Exclude) > 0 { + if flagStrict { + fmt.Fprintf(os.Stderr, "warning: --strict ignoring %d excluded path(s) from config: %v\n", + len(cfg.SensitivePaths.Exclude), cfg.SensitivePaths.Exclude) + cfg.SensitivePaths.Exclude = nil + } else { + // Without --strict, an attacker-planted or otherwise + // untrusted kojuto.yml in the cwd silently shrinks the + // detection surface. Surface the exclusions on stderr so + // the user can see them in CI logs and interactive output + // before trusting a "clean" verdict. + fmt.Fprintf(os.Stderr, "warning: %s excludes %d sensitive path(s) from monitoring: %v\n", + cfgPath, len(cfg.SensitivePaths.Exclude), cfg.SensitivePaths.Exclude) + fmt.Fprintln(os.Stderr, " pass --strict to ignore these exclusions") + } } paths := config.MergeSensitivePaths(cfg) @@ -570,7 +581,12 @@ func writePinnedPyPI(path string, deps []pinnedDep) error { fmt.Fprintf(&b, "%s\n", dep.Name) } } - return os.WriteFile(path, []byte(b.String()), 0o644) + // 0o600 — pinned files can carry private package names (internal + // registry deps) and embedded version metadata; default to user- + // only readable so a multi-user host doesn't leak the manifest to + // other accounts. Caller can chmod after the fact if they want to + // share. + return os.WriteFile(path, []byte(b.String()), 0o600) } func writePinnedNpm(path string, deps []pinnedDep) error { @@ -595,7 +611,9 @@ func writePinnedNpm(path string, deps []pinnedDep) error { } jsonBytes = append(jsonBytes, '\n') - return os.WriteFile(path, jsonBytes, 0o644) + // 0o600 for the same reason as writePinnedPyPI — keep the pinned + // manifest from leaking to other users on a shared host. + return os.WriteFile(path, jsonBytes, 0o600) } // runLocalScan scans a local package file (.whl, .tgz) or directory. @@ -646,6 +664,17 @@ func runLocalScan(_ []string) error { pkg = detectPackageName(filepath.Base(localPath)) } + // Validate the derived package name with the same regex as registry + // scans. Local mode is the only path where pkg is built from a user- + // supplied filesystem path, so the name can carry any byte the OS + // allows — including newlines and shell metacharacters that downstream + // sandbox helpers used to interpolate into shell heredocs and Python/ + // JS string literals. The dockerWriteFile refactor closed the heredoc + // path, but defending at the boundary keeps every future use site safe. + if validateErr := downloaderValidate(pkg, ""); validateErr != nil { + return fmt.Errorf("local package name derived from %q is unsafe: %w", localPath, validateErr) + } + // Auto-detect ecosystem from file extension, but only if -e was not explicitly set. // .tgz is always npm; .tar.gz is ambiguous (could be PyPI sdist), so only // override to npm for .tgz, not .tar.gz. @@ -1115,7 +1144,13 @@ func openOutput() (*os.File, error) { return os.Stdout, nil } - f, err := os.Create(flagOutput) + // 0o600 — the report can carry attacker-supplied code snippets, + // inferred file paths, and any internal package names from the + // scanned dependency tree. os.Create defaults to 0o666 (then + // umask-clipped, typically 0o644), which would leak the report + // to other users on a multi-user host. Owner-only by default; + // users who explicitly want to share can chmod after the fact. + f, err := os.OpenFile(flagOutput, os.O_WRONLY|os.O_CREATE|os.O_TRUNC, 0o600) if err != nil { return nil, fmt.Errorf("creating output file: %w", err) } diff --git a/cmd/root_mock_test.go b/cmd/root_mock_test.go index 5819caf..37badf0 100644 --- a/cmd/root_mock_test.go +++ b/cmd/root_mock_test.go @@ -443,6 +443,40 @@ func TestRunLocalScan_NpmAutoDetect(t *testing.T) { } } +// TestRunLocalScan_RejectsUnsafeFilename pins the security invariant that +// a filename whose derived package name contains shell metacharacters or +// newlines is rejected at the boundary, before any sandbox interaction. +// Without this gate, a malicious .tgz/.whl filename containing a newline +// followed by "KOJUTO_EOF" used to terminate dockerWriteFile's predecessor +// heredoc and execute arbitrary commands as root inside the container. +// The dockerWriteFile refactor closed the heredoc; this test guards the +// belt-and-braces input check. +func TestRunLocalScan_RejectsUnsafeFilename(t *testing.T) { + saveAndRestoreFlags(t) + + dir := t.TempDir() + // "\n" + "KOJUTO_EOF" payload baked into a .tgz filename. macOS APFS + // and Linux ext4/xfs all permit '\n' in filenames; if the underlying + // FS rejects it, skip rather than fail. + badName := "evil\nKOJUTO_EOF\nrm-1.0.0.tgz" + badFile := filepath.Join(dir, badName) + if err := os.WriteFile(badFile, []byte("x"), 0o644); err != nil { + t.Skipf("filesystem refuses newlines in filenames: %v", err) + } + + flagLocal = badFile + flagEcosystem = types.EcosystemNpm + flagTimeout = 5 * time.Second + + err := runLocalScan(nil) + if err == nil { + t.Fatal("expected validation error for unsafe local filename") + } + if !strings.Contains(err.Error(), "unsafe") { + t.Errorf("expected 'unsafe' in error, got: %v", err) + } +} + // --- prepareLocalNpm tests --- func TestPrepareLocalNpm_NoTgz(t *testing.T) { diff --git a/cmd/root_test.go b/cmd/root_test.go index 7786b62..ba56d7f 100644 --- a/cmd/root_test.go +++ b/cmd/root_test.go @@ -349,7 +349,10 @@ func TestOpenOutput_File(t *testing.T) { t.Error("expected a file, not os.Stdout") } - // Verify the file was actually created. + // Verify the file was actually created. The owner-only (0o600) + // permission contract is exercised by TestOpenOutput_FileMode and + // TestOutputFiles_OwnerOnly in root_unix_test.go — those tests + // rely on syscall.Umask, which is not defined on Windows. if _, err := os.Stat(flagOutput); err != nil { t.Errorf("output file was not created: %v", err) } @@ -634,6 +637,55 @@ func TestPreRunLoadConfig_StrictIgnoresExclude(t *testing.T) { } } +// TestPreRunLoadConfig_LenientWarnsOnExclude pins the visibility +// guarantee added for surface C: when --strict is OFF and the loaded +// kojuto.yml carries `exclude` entries, a stderr warning surfaces the +// reduction in coverage so a CI log or interactive run cannot silently +// trust a "clean" verdict produced under a tampered config. Without +// this, an attacker who plants kojuto.yml in the cwd of a target repo +// can shrink the detection surface invisibly. +func TestPreRunLoadConfig_LenientWarnsOnExclude(t *testing.T) { + origConfig := flagConfig + origStrict := flagStrict + defer func() { flagConfig = origConfig; flagStrict = origStrict }() + + dir := t.TempDir() + cfgPath := filepath.Join(dir, "kojuto.yml") + if err := os.WriteFile(cfgPath, []byte("sensitive_paths:\n exclude:\n - \"/.ssh/\"\n - \"/.aws/\"\n"), 0o644); err != nil { + t.Fatal(err) + } + + flagConfig = cfgPath + flagStrict = false + + oldStderr := os.Stderr + r, w, err := os.Pipe() + if err != nil { + t.Fatal(err) + } + os.Stderr = w + t.Cleanup(func() { os.Stderr = oldStderr }) + + runErr := preRunLoadConfig(nil, nil) + w.Close() + + data, _ := io.ReadAll(r) + got := string(data) + + if runErr != nil { + t.Fatalf("preRunLoadConfig returned error: %v", runErr) + } + if !strings.Contains(got, "excludes 2 sensitive path(s)") { + t.Errorf("stderr missing exclusion-count warning, got:\n%s", got) + } + if !strings.Contains(got, "/.ssh/") || !strings.Contains(got, "/.aws/") { + t.Errorf("stderr missing the actual excluded paths, got:\n%s", got) + } + if !strings.Contains(got, "--strict") { + t.Errorf("stderr missing pointer to --strict flag, got:\n%s", got) + } +} + func TestPreRunLoadConfig_InvalidConfig(t *testing.T) { origConfig := flagConfig origStrict := flagStrict diff --git a/cmd/root_unix_test.go b/cmd/root_unix_test.go new file mode 100644 index 0000000..14f0d15 --- /dev/null +++ b/cmd/root_unix_test.go @@ -0,0 +1,80 @@ +//go:build !windows + +package cmd + +import ( + "os" + "path/filepath" + "syscall" + "testing" +) + +// TestOpenOutput_FileMode pins the 0o600 contract for the main scan +// report file. Reports embed attacker-supplied code snippets, file +// paths, and the dependency tree of the scanned package; on a multi- +// user host that data must not be readable by other accounts. The +// test runs under syscall.Umask(0) so the assertion measures the +// requested mode rather than the user's umask-clipped result. +// +// Windows has no umask and no Unix-style permission bits, and Go's +// os.File.Mode().Perm() reports a synthetic 0o666/0o444 there, so +// this test lives in a !windows build-tagged file. +func TestOpenOutput_FileMode(t *testing.T) { + oldUmask := syscall.Umask(0) + defer syscall.Umask(oldUmask) + + original := flagOutput + defer func() { flagOutput = original }() + + dir := t.TempDir() + flagOutput = filepath.Join(dir, "test-output.json") + + f, err := openOutput() + if err != nil { + t.Fatalf("openOutput() error: %v", err) + } + defer f.Close() + + info, err := os.Stat(flagOutput) + if err != nil { + t.Fatalf("output file was not created: %v", err) + } + if mode := info.Mode().Perm(); mode != 0o600 { + t.Errorf("output file mode = %o, want 0o600", mode) + } +} + +// TestOutputFiles_OwnerOnly pins the 0o600 contract for both pinned- +// manifest writers — they share the same threat model as the main +// report (multi-user host, package metadata leakage). +func TestOutputFiles_OwnerOnly(t *testing.T) { + oldUmask := syscall.Umask(0) + defer syscall.Umask(oldUmask) + + dir := t.TempDir() + deps := []pinnedDep{{Name: "lodash", Version: "4.17.21"}} + + pyPath := filepath.Join(dir, "requirements.txt") + if err := writePinnedPyPI(pyPath, deps); err != nil { + t.Fatalf("writePinnedPyPI: %v", err) + } + pyInfo, err := os.Stat(pyPath) + if err != nil { + t.Fatal(err) + } + if mode := pyInfo.Mode().Perm(); mode != 0o600 { + t.Errorf("pinned requirements.txt mode = %o, want 0o600", mode) + } + + npmPath := filepath.Join(dir, "package.json") + if writeErr := writePinnedNpm(npmPath, deps); writeErr != nil { + t.Fatalf("writePinnedNpm: %v", writeErr) + } + npmInfo, err := os.Stat(npmPath) + if err != nil { + t.Fatal(err) + } + if mode := npmInfo.Mode().Perm(); mode != 0o600 { + t.Errorf("pinned package.json mode = %o, want 0o600", mode) + } +} diff --git a/internal/analyzer/analyzer.go b/internal/analyzer/analyzer.go index c5a27de..9ee4392 100644 --- a/internal/analyzer/analyzer.go +++ b/internal/analyzer/analyzer.go @@ -942,13 +942,27 @@ var fileOpCommands = map[string]bool{ // Renames to other destinations (e.g. pip installing a new CLI script) are benign. func isBenignRename(evt *types.SyscallEvent) bool { destBase := path.Base(evt.DstPath) - destDir := path.Dir(evt.DstPath) + "/" + allowedDirs, ok := benignPaths[destBase] + if !ok { + return true + } - if allowedDirs, ok := benignPaths[destBase]; ok { - for _, d := range allowedDirs { - if destDir == d { - return false - } + // Basename-only DstPath (no leading "/") means the probe captured + // only the dentry name and not the parent directory — the eBPF + // probe takes this shape because vfs_rename's renamedata gives us + // d_name (a qstr basename) rather than a full path. Without the + // parent we cannot prove the rename targets a trusted directory, + // so fail-safe: a rename whose basename matches a tracked binary + // (python3, node, sh, ...) stays suspicious. Strace mode emits + // absolute paths and falls through to the directory check below. + if !strings.HasPrefix(evt.DstPath, "/") { + return false + } + + destDir := path.Dir(evt.DstPath) + "/" + for _, d := range allowedDirs { + if destDir == d { + return false } } diff --git a/internal/analyzer/analyzer_test.go b/internal/analyzer/analyzer_test.go index 047ad55..50aac42 100644 --- a/internal/analyzer/analyzer_test.go +++ b/internal/analyzer/analyzer_test.go @@ -279,6 +279,16 @@ func TestAnalyze_RenameTrustedBinary(t *testing.T) { {"new CLI script", "/usr/local/bin/my-tool", types.VerdictClean}, {"install dir", "/install/lib/module.so", types.VerdictClean}, {"tmp rename", "/tmp/a", types.VerdictClean}, + // Basename-only DstPath: the eBPF probe captures only the + // dentry name from vfs_rename's renamedata, not the parent + // path. Without the parent we can't verify the rename is in + // a trusted dir, so a basename matching a system binary must + // stay suspicious. Otherwise an eBPF-mode scan of a malware + // sample that renames over /usr/local/bin/python3 silently + // reports CLEAN. + {"basename-only python3 (eBPF mode)", "python3", types.VerdictSuspicious}, + {"basename-only sh (eBPF mode)", "sh", types.VerdictSuspicious}, + {"basename-only unknown (eBPF mode)", "myapp", types.VerdictClean}, } for _, tc := range cases { diff --git a/internal/depfile/depfile.go b/internal/depfile/depfile.go index eae4503..17e85cb 100644 --- a/internal/depfile/depfile.go +++ b/internal/depfile/depfile.go @@ -7,11 +7,23 @@ import ( "fmt" "os" "path/filepath" + "regexp" "strings" "github.com/RalianENG/kojuto/internal/types" ) +// Package-name validators. Names flow into shell scripts (npm lifecycle +// hooks) and environment variables (KOJUTO_SCAN_PKGS) inside the sandbox, +// so we reject anything outside the registry-allowed character sets at +// parse time. The sandbox also single-quotes names at the use site, but +// validation here gives the user a clear error instead of a silently +// malformed command. +var ( + npmNameRe = regexp.MustCompile(`^(@[a-z0-9][a-z0-9._~-]*/)?[a-z0-9][a-z0-9._~-]*$`) + pypiNameRe = regexp.MustCompile(`^[A-Za-z0-9][A-Za-z0-9._-]*$`) +) + // Dep represents a single dependency extracted from a file. type Dep struct { Name string @@ -81,6 +93,9 @@ func parseRequirementsTxt(path string) ([]Dep, error) { } if dep.Name != "" { + if !pypiNameRe.MatchString(dep.Name) { + return nil, fmt.Errorf("invalid package name %q in %s", dep.Name, path) + } deps = append(deps, dep) } } @@ -106,9 +121,15 @@ func parsePackageJSON(path string) ([]Dep, error) { var deps []Dep for name, ver := range pkg.Dependencies { + if !npmNameRe.MatchString(name) { + return nil, fmt.Errorf("invalid package name %q in %s", name, path) + } deps = append(deps, Dep{Name: name, Version: cleanNpmVersion(ver)}) } for name, ver := range pkg.DevDependencies { + if !npmNameRe.MatchString(name) { + return nil, fmt.Errorf("invalid package name %q in %s", name, path) + } deps = append(deps, Dep{Name: name, Version: cleanNpmVersion(ver)}) } diff --git a/internal/depfile/depfile_test.go b/internal/depfile/depfile_test.go index 14b761e..78a3f6f 100644 --- a/internal/depfile/depfile_test.go +++ b/internal/depfile/depfile_test.go @@ -167,6 +167,73 @@ func TestParse_InvalidJSON(t *testing.T) { } } +func TestParse_RejectsInvalidNpmName(t *testing.T) { + cases := map[string]string{ + "shell metachars": `{"dependencies": {"foo;rm -rf /": "1.0.0"}}`, + "command sub": `{"dependencies": {"foo$(id)": "1.0.0"}}`, + "backtick": "{\"dependencies\": {\"foo`id`\": \"1.0.0\"}}", + "newline": "{\"dependencies\": {\"foo\\nbar\": \"1.0.0\"}}", + "uppercase": `{"dependencies": {"FooBar": "1.0.0"}}`, + "leading dot": `{"dependencies": {".hidden": "1.0.0"}}`, + "space": `{"dependencies": {"a b": "1.0.0"}}`, + "dev shell metachar": `{"devDependencies": {"foo|cat": "1.0.0"}}`, + } + + for name, content := range cases { + t.Run(name, func(t *testing.T) { + dir := t.TempDir() + path := filepath.Join(dir, "package.json") + if err := os.WriteFile(path, []byte(content), 0o644); err != nil { + t.Fatal(err) + } + if _, _, err := Parse(path); err == nil { + t.Errorf("expected error for invalid name, got nil (content=%q)", content) + } + }) + } +} + +func TestParse_AcceptsScopedNpmName(t *testing.T) { + content := `{"dependencies": {"@scope/foo-bar": "1.0.0", "lodash": "4.0.0"}}` + dir := t.TempDir() + path := filepath.Join(dir, "package.json") + if err := os.WriteFile(path, []byte(content), 0o644); err != nil { + t.Fatal(err) + } + deps, _, err := Parse(path) + if err != nil { + t.Fatalf("Parse failed: %v", err) + } + if len(deps) != 2 { + t.Errorf("expected 2 deps, got %d", len(deps)) + } +} + +func TestParse_RejectsInvalidPyPIName(t *testing.T) { + // Note: `;` is not exercised here because the parser strips PEP 508 + // environment markers (everything after `;`) before validation runs, + // so a shell `;` is never seen as part of the name. + cases := map[string]string{ + "command sub": "foo$(id)==1.0.0\n", + "backtick": "foo`id`==1.0.0\n", + "pipe": "foo|cat==1.0.0\n", + "space": "a b==1.0.0\n", + } + + for name, content := range cases { + t.Run(name, func(t *testing.T) { + dir := t.TempDir() + path := filepath.Join(dir, "requirements.txt") + if err := os.WriteFile(path, []byte(content), 0o644); err != nil { + t.Fatal(err) + } + if _, _, err := Parse(path); err == nil { + t.Errorf("expected error for invalid name, got nil (content=%q)", content) + } + }) + } +} + func TestCleanNpmVersion(t *testing.T) { cases := []struct { input string diff --git a/internal/probe/container_strace.go b/internal/probe/container_strace.go index f5655da..b50fc5e 100644 --- a/internal/probe/container_strace.go +++ b/internal/probe/container_strace.go @@ -6,11 +6,21 @@ import ( "errors" "fmt" "io" + "os" "os/exec" "github.com/RalianENG/kojuto/internal/types" ) +// straceMaxLine bounds the size of a single strace stderr line. With +// `-s 256` the per-arg cap is 256 bytes, so realistic execve+argv lines +// stay well under 1 MiB. We allow 16 MiB to accommodate exotic kernels +// or tracee writes interleaved on the shared docker exec stderr; lines +// longer than this are treated as adversarial (a malicious package +// trying to overflow bufio.Scanner's default 64 KiB cap so the scan +// loop exits and subsequent strace events are silently dropped). +const straceMaxLine = 16 * 1024 * 1024 + // ContainerStrace monitors connect(2) syscalls by running strace inside the Docker container. // This works on all platforms where Docker is available (Linux, macOS, Windows). // ContainerStrace monitors connect(2) syscalls by running strace inside the Docker container. @@ -96,6 +106,7 @@ func (c *ContainerStrace) parseStraceOutput(stderr io.ReadCloser, done chan<- st state := NewParseState() scanner := bufio.NewScanner(stderr) + scanner.Buffer(make([]byte, 64*1024), straceMaxLine) for scanner.Scan() { evt, ok := parseStraceLine(scanner.Text(), state) if !ok { @@ -112,6 +123,16 @@ func (c *ContainerStrace) parseStraceOutput(stderr io.ReadCloser, done chan<- st c.dropped++ } } + if err := scanner.Err(); err != nil { + // bufio.ErrTooLong here means the tracee wrote a >16 MiB + // chunk to the shared docker-exec stderr without a newline, + // most plausibly to disable parsing of subsequent strace + // trace lines. Any other error means the pipe died early. + // Either way we lost an unknown number of events, so flip + // the verdict to inconclusive via the dropped counter. + c.dropped++ + fmt.Fprintf(os.Stderr, "warning: strace stderr scanner aborted: %v\n", err) + } } // drainReader reads from r up to 10MB and returns the content. diff --git a/internal/probe/container_strace_test.go b/internal/probe/container_strace_test.go index 92e36e3..77de5e2 100644 --- a/internal/probe/container_strace_test.go +++ b/internal/probe/container_strace_test.go @@ -265,3 +265,55 @@ func TestParseStraceOutput_Done(_ *testing.T) { <-parseDone // should complete without hanging } + +// chunkReader returns the same chunk repeatedly until `remaining` bytes +// have been served, then EOF. Used to feed bufio.Scanner a giant single +// "line" without allocating it all up front. +type chunkReader struct { + chunk []byte + remaining int +} + +func (r *chunkReader) Read(p []byte) (int, error) { + if r.remaining <= 0 { + return 0, io.EOF + } + n := copy(p, r.chunk) + if n > r.remaining { + n = r.remaining + } + r.remaining -= n + return n, nil +} + +func (r *chunkReader) Close() error { return nil } + +// TestParseStraceOutput_LongLineFlipsToInconclusive pins the security +// fix for the prior detection-bypass: a malicious package could write +// >64 KiB without a newline to the shared docker-exec stderr (which +// carries strace trace output too), trip bufio.Scanner's default cap, +// silently terminate the parser loop, and let every subsequent strace +// event evade the analyzer. The fix raises the cap to straceMaxLine +// and turns any scanner error into a `dropped++` so the verdict logic +// flips to inconclusive instead of trusting the truncated trace. +func TestParseStraceOutput_LongLineFlipsToInconclusive(t *testing.T) { + // Slightly more than straceMaxLine, no newlines, so bufio.Scanner + // returns ErrTooLong. Use a chunkReader to avoid allocating the + // full payload — straceMaxLine is 16 MiB. + reader := &chunkReader{ + chunk: bytes.Repeat([]byte{'A'}, 4096), + remaining: straceMaxLine + 1024, + } + + cs := &ContainerStrace{ + events: make(chan types.SyscallEvent, 16), + done: make(chan struct{}), + } + parseDone := make(chan struct{}) + go cs.parseStraceOutput(reader, parseDone) + <-parseDone + + if cs.dropped == 0 { + t.Error("expected dropped > 0 after scanner overflow, got 0") + } +} diff --git a/internal/probe/fallback.go b/internal/probe/fallback.go index 3428967..da1633f 100644 --- a/internal/probe/fallback.go +++ b/internal/probe/fallback.go @@ -6,6 +6,7 @@ import ( "bufio" "errors" "fmt" + "os" "os/exec" "strconv" "sync" @@ -59,6 +60,7 @@ func (s *StraceFallback) StartWithPID(pid uint32) error { defer close(s.events) state := NewParseState() scanner := bufio.NewScanner(stderr) + scanner.Buffer(make([]byte, 64*1024), straceMaxLine) for scanner.Scan() { evt, ok := parseStraceLine(scanner.Text(), state) if ok { @@ -71,6 +73,13 @@ func (s *StraceFallback) StartWithPID(pid uint32) error { } } } + if err := scanner.Err(); err != nil { + // See straceMaxLine: an unrecoverable scanner failure + // here means subsequent trace lines are lost. Flip to + // inconclusive via dropped. + s.dropped++ + fmt.Fprintf(os.Stderr, "warning: strace stderr scanner aborted: %v\n", err) + } }() return nil diff --git a/internal/probe/strace_parse.go b/internal/probe/strace_parse.go index 986e7d5..5e8fd3e 100644 --- a/internal/probe/strace_parse.go +++ b/internal/probe/strace_parse.go @@ -87,13 +87,19 @@ var ( ) // rename("/tmp/evil", "/usr/local/bin/python3") = 0. + // `(?:\\.|[^"\\])+` accepts strace's C-style backslash escapes + // inside the captured filename. The previous `[^"]+` failed on + // strace lines whose filename contained a literal `"` (rendered + // as `\"` by strace), causing the whole regex to drop the event + // — a malicious package could rename `/tmp/x"y` over a trusted + // system binary and leave no rename event for the analyzer. straceRenameRe = regexp.MustCompile( - `rename\("([^"]+)",\s*"([^"]+)"\)`, + `rename\("((?:\\.|[^"\\])+)",\s*"((?:\\.|[^"\\])+)"\)`, ) // renameat(AT_FDCWD, "old", AT_FDCWD, "new") or renameat2(...) straceRenameatRe = regexp.MustCompile( - `renameat2?\([^,]+,\s*"([^"]+)",\s*[^,]+,\s*"([^"]+)"`, + `renameat2?\([^,]+,\s*"((?:\\.|[^"\\])+)",\s*[^,]+,\s*"((?:\\.|[^"\\])+)"`, ) // mmap(NULL, 4096, PROT_READ|PROT_WRITE|PROT_EXEC, MAP_PRIVATE|MAP_ANONYMOUS, -1, 0) = 0x7f... diff --git a/internal/probe/strace_parse_extra_test.go b/internal/probe/strace_parse_extra_test.go index fe11e70..f27c6c0 100644 --- a/internal/probe/strace_parse_extra_test.go +++ b/internal/probe/strace_parse_extra_test.go @@ -869,3 +869,54 @@ func TestParseAuditHook_IntegratedWithParseStraceLine(t *testing.T) { t.Errorf("audit_event = %q, want eval", evt.AuditEvent) } } + +// TestParseStraceLine_RenameWithEscapedQuotes pins the security fix for +// the prior `"([^"]+)"` capture pattern, which silently dropped the +// rename event whenever a filename contained a literal `"` (escaped by +// strace as `\"`). Without an event the analyzer never sees the hijack +// and the verdict stays clean — a malicious package could rename +// `/tmp/x"y` over `/usr/local/bin/python3` invisibly. +func TestParseStraceLine_RenameWithEscapedQuotes(t *testing.T) { + cases := []struct { + name string + line string + wantSrc string + wantDst string + }{ + { + name: "rename with escaped quote in src", + line: `[pid 100] rename("/tmp/x\"y", "/usr/local/bin/python3") = 0`, + wantSrc: `/tmp/x\"y`, + wantDst: "/usr/local/bin/python3", + }, + { + name: "renameat with escaped quote in dst", + line: `[pid 101] renameat(AT_FDCWD, "/tmp/legit", AT_FDCWD, "/etc/x\"y") = 0`, + wantSrc: "/tmp/legit", + wantDst: `/etc/x\"y`, + }, + { + name: "plain rename still parses", + line: `[pid 102] rename("/tmp/a", "/usr/local/bin/node") = 0`, + wantSrc: "/tmp/a", + wantDst: "/usr/local/bin/node", + }, + } + for _, tc := range cases { + t.Run(tc.name, func(t *testing.T) { + evt, ok := parseStraceLine(tc.line, NewParseState()) + if !ok { + t.Fatalf("expected rename to parse, line=%q", tc.line) + } + if evt.Syscall != types.EventRename { + t.Errorf("syscall = %q, want %q", evt.Syscall, types.EventRename) + } + if evt.SrcPath != tc.wantSrc { + t.Errorf("SrcPath = %q, want %q", evt.SrcPath, tc.wantSrc) + } + if evt.DstPath != tc.wantDst { + t.Errorf("DstPath = %q, want %q", evt.DstPath, tc.wantDst) + } + }) + } +} diff --git a/internal/report/report.go b/internal/report/report.go index daae717..02de4cb 100644 --- a/internal/report/report.go +++ b/internal/report/report.go @@ -4,6 +4,8 @@ import ( "encoding/json" "fmt" "io" + "reflect" + "strings" "time" "github.com/RalianENG/kojuto/internal/types" @@ -29,8 +31,19 @@ func Generate(pkg, version, ecosystem, verdict, probeMethod string, events []typ } } -// WriteJSON writes the report as indented JSON to w. +// WriteJSON writes the report as indented JSON to w. Every string field +// of every event is run through sanitizeControl first so a downstream +// consumer that decodes the JSON and prints a string field raw (e.g. +// `jq -r .events[].code_snippet`) cannot be fed an attacker-controlled +// ANSI escape sequence smuggled in via a malicious package's audit-hook +// snippet, prctl-set comm name, or future attacker-reachable string +// slot. The Python audit hook performs the same escape on its own side; +// this is the host-side belt-and-braces guard. func WriteJSON(r *types.Report, w io.Writer) error { + for i := range r.Events { + sanitizeEventStrings(&r.Events[i]) + } + enc := json.NewEncoder(w) enc.SetIndent("", " ") @@ -40,3 +53,59 @@ func WriteJSON(r *types.Report, w io.Writer) error { return nil } + +// sanitizeEventStrings rewrites every string field of e in place so any +// C0 control byte (0x00-0x1f) or DEL (0x7f) that an attacker placed in +// an event field is rendered as a printable `\xNN` escape rather than +// passing through to the report consumer's terminal. +// +// Reflection is used so future string fields added to SyscallEvent are +// covered automatically. Non-string fields are skipped. Fields kojuto +// itself populates (Syscall, Category, Phase, etc.) are no-ops because +// their values never contain control bytes. +func sanitizeEventStrings(e *types.SyscallEvent) { + v := reflect.ValueOf(e).Elem() + for i := range v.NumField() { + f := v.Field(i) + if f.Kind() == reflect.String && f.CanSet() { + f.SetString(sanitizeControl(f.String())) + } + } +} + +// sanitizeControl escapes C0 control bytes and DEL to printable forms. +// Common whitespace bytes use the conventional \n / \r / \t shorthand; +// all others render as \xNN. Any other byte (printable ASCII, UTF-8 +// multibyte, etc.) passes through unchanged. +func sanitizeControl(s string) string { + if !needsControlEscape(s) { + return s + } + var b strings.Builder + b.Grow(len(s)) + for _, r := range s { + switch { + case r == '\n': + b.WriteString(`\n`) + case r == '\r': + b.WriteString(`\r`) + case r == '\t': + b.WriteString(`\t`) + case r < 0x20 || r == 0x7f: + fmt.Fprintf(&b, `\x%02x`, r) + default: + b.WriteRune(r) + } + } + return b.String() +} + +func needsControlEscape(s string) bool { + for i := range len(s) { + c := s[i] + if c < 0x20 || c == 0x7f { + return true + } + } + return false +} diff --git a/internal/report/report_test.go b/internal/report/report_test.go index c757610..f9e8e71 100644 --- a/internal/report/report_test.go +++ b/internal/report/report_test.go @@ -85,3 +85,79 @@ func TestWriteJSON(t *testing.T) { t.Errorf("expected clean, got %s", decoded.Verdict) } } + +func TestSanitizeControl(t *testing.T) { + cases := []struct { + in, want string + }{ + {"plain ascii", "plain ascii"}, + {"日本語", "日本語"}, + {"with\nnewline", `with\nnewline`}, + {"with\ttab", `with\ttab`}, + {"esc\x1b[2J", `esc\x1b[2J`}, + {"\x00null", `\x00null`}, + {"\x07bell", `\x07bell`}, + {"del\x7f", `del\x7f`}, + {"all\x00\x01\x1b\x7fclean", `all\x00\x01\x1b\x7fclean`}, + } + for _, tc := range cases { + if got := sanitizeControl(tc.in); got != tc.want { + t.Errorf("sanitizeControl(%q) = %q, want %q", tc.in, got, tc.want) + } + } +} + +// TestWriteJSON_StripsAnsiFromAttackerFields pins the security +// invariant: any attacker-controlled string field (CodeSnippet, Comm, +// Cmdline, etc.) carrying ANSI escape sequences is rendered into the +// JSON as a printable backslash escape, so a downstream `jq -r` style +// raw-string consumer cannot be fed terminal control bytes. +func TestWriteJSON_StripsAnsiFromAttackerFields(t *testing.T) { + events := []types.SyscallEvent{ + { + Timestamp: time.Now(), + PID: 1, + Syscall: types.EventDynamicExec, + AuditEvent: "exec", + CodeSnippet: "payload\x1b[2J\x1b[Hverdict: clean", + Comm: "evil\x1b[31m", + Cmdline: "sh\x07-c\x00boom", + FilePath: "/tmp/\x1bweird", + }, + } + r := Generate("badpkg", "1.0.0", types.EcosystemPyPI, types.VerdictSuspicious, "ebpf", events, 0, 0, nil) + + var buf bytes.Buffer + if err := WriteJSON(&r, &buf); err != nil { + t.Fatalf("WriteJSON failed: %v", err) + } + + // The serialized JSON must not contain a literal ESC byte (0x1b) + // or BEL/NUL anywhere, even after JSON decoding. + var decoded types.Report + if err := json.Unmarshal(buf.Bytes(), &decoded); err != nil { + t.Fatalf("unmarshal: %v", err) + } + + for _, e := range decoded.Events { + fields := []struct{ name, val string }{ + {"CodeSnippet", e.CodeSnippet}, + {"Comm", e.Comm}, + {"Cmdline", e.Cmdline}, + {"FilePath", e.FilePath}, + } + for _, f := range fields { + for i := range len(f.val) { + c := f.val[i] + if c < 0x20 || c == 0x7f { + t.Errorf("%s contains raw control byte 0x%02x: %q", f.name, c, f.val) + break + } + } + } + // Spot-check the encoded form is human-readable. + if !bytes.Contains(buf.Bytes(), []byte(`\\x1b`)) { + t.Errorf("expected printable backslash-escaped ESC in JSON output, got:\n%s", buf.String()) + } + } +} diff --git a/internal/sandbox/hooks/sitecustomize.py b/internal/sandbox/hooks/sitecustomize.py index 1eff6d7..a1924f7 100644 --- a/internal/sandbox/hooks/sitecustomize.py +++ b/internal/sandbox/hooks/sitecustomize.py @@ -8,7 +8,6 @@ import sys _MAX = 200 -_FRAMES = 8 _P = chr(75) + chr(79) + chr(74) + chr(85) + chr(84) + chr(79) + ":" # Frames whose co_filename starts with one of these are treated as @@ -42,13 +41,42 @@ def _is_user(fn): def _t(s): - s = str(s).replace("\n", "\\n").replace("\r", "\\r") + # Escape every C0 control byte (0x00-0x1f) and DEL (0x7f) to a + # printable form. The wire format only needs newline-stripping for + # framing safety (handled in _w too), but the snippet field + # eventually surfaces in report.json's code_snippet — a downstream + # consumer that decodes the JSON and prints the raw string (e.g. + # `jq -r .events[].code_snippet`) used to receive an unescaped + # ESC (0x1b) byte and let an attacker-supplied payload paint + # arbitrary text on the user's terminal via ANSI sequences. + out = [] + for ch in str(s): + if ch == "\n": + out.append("\\n") + elif ch == "\r": + out.append("\\r") + elif ch == "\t": + out.append("\\t") + elif ord(ch) < 0x20 or ord(ch) == 0x7f: + out.append("\\x%02x" % ord(ch)) + else: + out.append(ch) + s = "".join(out) if len(s) > _MAX: return s[:_MAX] + "..." return s def _w(tag, body): + # Wire-format invariant: each KOJUTO: emission must occupy exactly + # one stderr line. The Go parser splits on '\n' and treats every + # KOJUTO:-prefixed line as an independent event. Strip line and null + # bytes from `body` so an attacker-controlled co_filename or compile + # filename arg cannot smuggle a fake follow-up event onto the wire. + # _t() already escapes newlines in the snippet field, so this is a + # defense for the filename field (which is concatenated raw from + # _origin()) and a belt-and-braces guard for any future tag. + body = body.replace("\n", "").replace("\r", "").replace("\0", "") sys.stderr.write(_P + tag + ":" + body + "\n") sys.stderr.flush() @@ -57,28 +85,32 @@ def _origin(fallback): """Walk the Python call stack and return the .py file responsible for the compile/exec call. - Returns "+" when the deepest user-code frame is found — the - leading "+" tells the Go analyzer this event originates in audited - code and must NOT be filtered by the path-based benign list. When - no user frame appears, returns the deepest non-internal frame so - the existing benign filter (stdlib/site-packages substring match) - still works for compat-library noise. + Returns "+" when a user-code frame is found anywhere on the + stack — the leading "+" tells the Go analyzer this event originates + in audited code and must NOT be filtered by the path-based benign + list. When no user frame appears, returns the first non-internal + frame so the existing benign filter (stdlib/site-packages substring + match) still works for compat-library noise. + + The walk is unbounded by design: a depth cap would let a malicious + package bury its own frame under stdlib wrappers and evade the + user-origin marker. Stack walking is O(depth) on a linked list and + early-returns on the first user frame, so the practical cost is + negligible compared with the surrounding compile/exec work. """ try: f = sys._getframe(2) # skip _origin + _h except ValueError: return fallback me = __file__ - seen = 0 deepest = "" - while f is not None and seen < _FRAMES: + while f is not None: fn = f.f_code.co_filename if fn != me: if _is_user(fn): return "+" + fn if not deepest: deepest = fn - seen += 1 f = f.f_back if deepest: return deepest diff --git a/internal/sandbox/sandbox.go b/internal/sandbox/sandbox.go index 50a3acf..75da4c0 100644 --- a/internal/sandbox/sandbox.go +++ b/internal/sandbox/sandbox.go @@ -223,14 +223,17 @@ func (s *Sandbox) containerArgs() ([]string, error) { // The result is sanitized to prevent Docker flag injection via hostile hostnames. func getHostHostname() string { if h, err := os.Hostname(); err == nil && h != "" { - return sanitizeDockerArg(h) + return sanitizeDockerArg(h, "localhost") } return "localhost" } -// sanitizeDockerArg strips characters that could break Docker CLI arguments. -// Docker hostnames must match [a-zA-Z0-9][a-zA-Z0-9_.-]. -func sanitizeDockerArg(s string) string { +// sanitizeDockerArg strips characters that could break Docker CLI +// arguments. Output is restricted to [a-zA-Z0-9._-] — the intersection +// of the Docker hostname rule and the chars that are safe to interpolate +// into shell command lines and -v : volume specs without +// quoting. fallback is returned when every byte was stripped. +func sanitizeDockerArg(s, fallback string) string { var b strings.Builder for _, c := range s { if (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z') || (c >= '0' && c <= '9') || c == '-' || c == '.' || c == '_' { @@ -238,7 +241,7 @@ func sanitizeDockerArg(s string) string { } } if b.Len() == 0 { - return "localhost" + return fallback } return b.String() } @@ -285,12 +288,16 @@ func getHostResources() (cpus, memory string) { return cpus, memory } -// getHostUsername returns the current OS username. +// getHostUsername returns the current OS username, sanitized to the same +// charset as getHostHostname. The result flows into mountPoint = +// "/home//projects", which is interpolated into a -v ::ro +// volume spec and a `pip install ... /*` shell glob. Even +// though env vars are normally trusted, mirroring the hostname sanitizer +// removes any case where a future runner sets USER to something exotic. func getHostUsername() string { - // Try common env vars (works on Linux, macOS, Windows). for _, key := range []string{"USER", "USERNAME", "LOGNAME"} { if u := os.Getenv(key); u != "" { - return u + return sanitizeDockerArg(u, "user") } } return "user" @@ -519,44 +526,39 @@ func (s *Sandbox) plantHoneypotFiles(ctx context.Context) { // SSH key pair. s.dockerExecRoot(ctx, "mkdir", "-p", home+"/.ssh") - s.dockerExecRoot(ctx, "sh", "-c", "cat > "+home+"/.ssh/id_rsa << 'KOJUTO_EOF'\n"+ + s.dockerWriteFile(ctx, home+"/.ssh/id_rsa", "-----BEGIN OPENSSH PRIVATE KEY-----\n"+ - "b3BlbnNzaC1rZXktdjEAAAAAFAAAAAAAAAEAAAAzAAAAC3NzaC1lZDI1NTE5\n"+ - "AAAAI"+sshKeyBody+"\n"+ - "-----END OPENSSH PRIVATE KEY-----\n"+ - "KOJUTO_EOF") + "b3BlbnNzaC1rZXktdjEAAAAAFAAAAAAAAAEAAAAzAAAAC3NzaC1lZDI1NTE5\n"+ + "AAAAI"+sshKeyBody+"\n"+ + "-----END OPENSSH PRIVATE KEY-----\n") s.dockerExecRoot(ctx, "chmod", "600", home+"/.ssh/id_rsa") // AWS credentials. s.dockerExecRoot(ctx, "mkdir", "-p", home+"/.aws") - s.dockerExecRoot(ctx, "sh", "-c", "cat > "+home+"/.aws/credentials << 'KOJUTO_EOF'\n"+ + s.dockerWriteFile(ctx, home+"/.aws/credentials", "[default]\n"+ - "aws_access_key_id = "+awsKey+"\n"+ - "aws_secret_access_key = "+awsSecret+"\n"+ - "KOJUTO_EOF") + "aws_access_key_id = "+awsKey+"\n"+ + "aws_secret_access_key = "+awsSecret+"\n") // Git credentials. - s.dockerExecRoot(ctx, "sh", "-c", "cat > "+home+"/.git-credentials << 'KOJUTO_EOF'\n"+ - "https://dev:"+ghToken+"@github.com\n"+ - "KOJUTO_EOF") + s.dockerWriteFile(ctx, home+"/.git-credentials", + "https://dev:"+ghToken+"@github.com\n") s.dockerExecRoot(ctx, "chmod", "600", home+"/.git-credentials") // Netrc. - s.dockerExecRoot(ctx, "sh", "-c", "cat > "+home+"/.netrc << 'KOJUTO_EOF'\n"+ + s.dockerWriteFile(ctx, home+"/.netrc", "machine github.com\n"+ - "login dev\n"+ - "password "+ghToken+"\n"+ - "KOJUTO_EOF") + "login dev\n"+ + "password "+ghToken+"\n") s.dockerExecRoot(ctx, "chmod", "600", home+"/.netrc") // GitHub CLI config. s.dockerExecRoot(ctx, "mkdir", "-p", home+"/.config/gh") - s.dockerExecRoot(ctx, "sh", "-c", "cat > "+home+"/.config/gh/hosts.yml << 'KOJUTO_EOF'\n"+ + s.dockerWriteFile(ctx, home+"/.config/gh/hosts.yml", "github.com:\n"+ - " oauth_token: "+ghToken+"\n"+ - " user: dev\n"+ - " git_protocol: https\n"+ - "KOJUTO_EOF") + " oauth_token: "+ghToken+"\n"+ + " user: dev\n"+ + " git_protocol: https\n") // Fix ownership so the container user (dev) owns the files. s.dockerExecRoot(ctx, "chown", "-R", "1000:1000", home+"/.ssh", home+"/.aws", @@ -586,6 +588,22 @@ func (s *Sandbox) dockerExecRoot(ctx context.Context, args ...string) { _ = cmd.Run() } +// dockerWriteFile writes content into the container at path as root. +// The body is delivered via stdin to `cat`, which keeps it out of the +// shell command line entirely — unlike a `<< 'EOF'` heredoc, an attacker +// who controls part of the body cannot smuggle a stray terminator line +// to break out of the heredoc and execute arbitrary commands as root in +// the container. path is single-quoted as defense-in-depth even though +// every current caller passes a constant. +func (s *Sandbox) dockerWriteFile(ctx context.Context, path, content string) { + cmdArgs := []string{"exec", "-i", "--user=root", s.containerID, "sh", "-c", "cat > " + shQuote(path)} + cmd := execCommand(ctx, "docker", cmdArgs...) + cmd.Stdin = strings.NewReader(content) + cmd.Stdout = io.Discard + cmd.Stderr = io.Discard + _ = cmd.Run() +} + // Exec runs a command inside the sandbox container and returns the combined output. func (s *Sandbox) Exec(ctx context.Context, command []string) ([]byte, error) { args := append([]string{"exec", s.containerID}, command...) @@ -696,17 +714,27 @@ func npmLifecycleScript(pkgs []string) string { if i > 0 { b.WriteString("; ") } - // Quote the path so scoped packages like @scope/pkg work - // without word-splitting. - b.WriteString(`(cd "/install/node_modules/`) - b.WriteString(p) - b.WriteString(`" && `) + // Single-quote the full cd target so any shell metacharacters + // in p (which originates from package.json keys, i.e. attacker- + // controllable input) cannot break out of the argument. depfile + // validates names at parse time; this is defense-in-depth for + // any future code path that populates pkgs from another source. + b.WriteString(`(cd `) + b.WriteString(shQuote("/install/node_modules/" + p)) + b.WriteString(` && `) b.WriteString(hooks) b.WriteString(`) 2>&1`) } return b.String() } +// shQuote wraps s in POSIX single quotes, escaping any embedded single +// quote with the standard '\” close-escape-reopen idiom. Safe for use +// in /bin/sh, dash, and bash. +func shQuote(s string) string { + return "'" + strings.ReplaceAll(s, "'", `'\''`) + "'" +} + // WriteProbeScriptsMulti writes one combined import probe script per OS identity. // This reduces Python/Node process launches from N*3 to just 3. func (s *Sandbox) WriteProbeScriptsMulti(ctx context.Context, pkgs []string) { @@ -723,7 +751,7 @@ func (s *Sandbox) WriteProbeScriptsMulti(ctx context.Context, pkgs []string) { p, requires.String(), ) filename := "/tmp/_kojuto_probe_all_" + p + ".js" - s.dockerExecRoot(ctx, "sh", "-c", "cat > "+filename+" << 'KOJUTO_EOF'\n"+script+"KOJUTO_EOF") + s.dockerWriteFile(ctx, filename, script) } return } @@ -765,7 +793,7 @@ func (s *Sandbox) WriteProbeScriptsMulti(ctx context.Context, pkgs []string) { p.sep, p.pathsep, p.linesep, imports.String(), ) filename := "/tmp/_kojuto_probe_all_" + p.sysplatform + ".py" - s.dockerExecRoot(ctx, "sh", "-c", "cat > "+filename+" << 'KOJUTO_EOF'\n"+script+"KOJUTO_EOF") + s.dockerWriteFile(ctx, filename, script) } } @@ -850,7 +878,7 @@ func (s *Sandbox) WriteProbeScripts(ctx context.Context) { p.sep, p.pathsep, p.linesep, importName, ) filename := "/tmp/_kojuto_probe_" + p.sysplatform + ".py" - s.dockerExecRoot(ctx, "sh", "-c", "cat > "+filename+" << 'KOJUTO_EOF'\n"+script+"KOJUTO_EOF") + s.dockerWriteFile(ctx, filename, script) } jsPlatforms := []string{"linux", "win32", "darwin"} @@ -862,7 +890,7 @@ func (s *Sandbox) WriteProbeScripts(ctx context.Context) { p, s.pkg, ) filename := "/tmp/_kojuto_probe_" + p + ".js" - s.dockerExecRoot(ctx, "sh", "-c", "cat > "+filename+" << 'KOJUTO_EOF'\n"+script+"KOJUTO_EOF") + s.dockerWriteFile(ctx, filename, script) } } diff --git a/internal/sandbox/sandbox_extra_test.go b/internal/sandbox/sandbox_extra_test.go index 0b9546d..8c60b13 100644 --- a/internal/sandbox/sandbox_extra_test.go +++ b/internal/sandbox/sandbox_extra_test.go @@ -2,6 +2,8 @@ package sandbox import ( "context" + "os/exec" + "reflect" "strings" "testing" @@ -174,6 +176,32 @@ func TestGetHostUsername(t *testing.T) { } } +// TestGetHostUsername_Sanitizes pins that an exotic USER value (shell +// metacharacters, path separators, non-ASCII) is normalised before it +// flows into mountPoint and from there into -v volume specs and shell +// command lines. +func TestGetHostUsername_Sanitizes(t *testing.T) { + cases := []struct { + userEnv, want string + }{ + {"alice", "alice"}, + {"a;rm -rf /", "arm-rf"}, + {"$(id)", "id"}, + {"a:b", "ab"}, + {"a/b", "ab"}, + {"日本語", "user"}, + } + + for _, tc := range cases { + t.Setenv("USER", tc.userEnv) + t.Setenv("USERNAME", "") + t.Setenv("LOGNAME", "") + if got := getHostUsername(); got != tc.want { + t.Errorf("USER=%q: getHostUsername() = %q, want %q", tc.userEnv, got, tc.want) + } + } +} + func TestGetHostResources(t *testing.T) { cpus, mem := getHostResources() @@ -347,12 +375,45 @@ func TestNpmLifecycleScript_ScopedPackage(t *testing.T) { // shell command. Regression guard for a bug that would have hidden // scoped-dep lifecycle behavior from strace. got := npmLifecycleScript([]string{"@scope/lib"}) - want := `"/install/node_modules/@scope/lib"` + want := `'/install/node_modules/@scope/lib'` if !strings.Contains(got, want) { t.Errorf("scoped path missing %q in:\n%s", want, got) } } +func TestShQuote(t *testing.T) { + cases := []struct { + in, want string + }{ + {"foo", "'foo'"}, + {"a b", "'a b'"}, + {"@scope/pkg", "'@scope/pkg'"}, + {"$(id)", "'$(id)'"}, + {"`id`", "'`id`'"}, + {"a'b", `'a'\''b'`}, + {"", "''"}, + } + for _, tc := range cases { + if got := shQuote(tc.in); got != tc.want { + t.Errorf("shQuote(%q) = %q, want %q", tc.in, got, tc.want) + } + } +} + +func TestNpmLifecycleScript_QuotesShellMetachars(t *testing.T) { + // Defense-in-depth: even if a malformed name slips past the depfile + // validator, the cd target must be safely single-quoted so shell + // metacharacters cannot break out and execute on the sandbox shell. + got := npmLifecycleScript([]string{`foo$(id)`}) + if !strings.Contains(got, `'/install/node_modules/foo$(id)'`) { + t.Errorf("metachar payload not single-quoted in:\n%s", got) + } + // The bare token must NOT appear unquoted between cd and &&. + if strings.Contains(got, `cd /install/node_modules/foo$(id) `) { + t.Errorf("metachar payload unquoted in:\n%s", got) + } +} + // --------------------------------------------------------------------------- // WriteProbeScriptsMulti // --------------------------------------------------------------------------- @@ -506,3 +567,63 @@ func TestFakeTokens_NotHexOnly(t *testing.T) { t.Error("10 consecutive fakeAWSKeyID tokens were all hex-only") } } + +// TestDockerWriteFile_StructureNoHeredoc pins the security invariant that +// dockerWriteFile delivers content via stdin (`docker exec -i`) instead of +// the previous `cat << 'KOJUTO_EOF'` heredoc. The heredoc form was a root- +// in-container injection sink: an attacker-controlled package name with +// an embedded "\nKOJUTO_EOF\n" sequence used to terminate the +// heredoc early and run as root. Stdin delivery removes the body +// from the shell command line entirely. +func TestDockerWriteFile_StructureNoHeredoc(t *testing.T) { + var captured []string + orig := execCommand + execCommand = func(ctx context.Context, name string, args ...string) *exec.Cmd { + captured = append([]string{name}, args...) + return exec.CommandContext(ctx, "true") + } + t.Cleanup(func() { execCommand = orig }) + + sb := &Sandbox{containerID: "test-container"} + sb.dockerWriteFile(context.Background(), "/tmp/probe.js", + "any content with KOJUTO_EOF baked in\nstill not parsed by shell") + + want := []string{ + "docker", "exec", "-i", "--user=root", "test-container", + "sh", "-c", "cat > '/tmp/probe.js'", + } + if !reflect.DeepEqual(captured, want) { + t.Fatalf("docker args mismatch:\n got %q\nwant %q", captured, want) + } + + for _, a := range captured { + if strings.Contains(a, "<<") { + t.Errorf("heredoc syntax leaked into arg %q", a) + } + if strings.Contains(a, "KOJUTO_EOF") { + t.Errorf("KOJUTO_EOF terminator leaked into arg %q", a) + } + } +} + +// TestDockerWriteFile_QuotesPath confirms the path is single-quoted so a +// future caller passing a path with shell metacharacters cannot break the +// command. Every current caller passes a constant path; this test pins +// the defense-in-depth contract for future code. +func TestDockerWriteFile_QuotesPath(t *testing.T) { + var captured []string + orig := execCommand + execCommand = func(ctx context.Context, name string, args ...string) *exec.Cmd { + captured = append([]string{name}, args...) + return exec.CommandContext(ctx, "true") + } + t.Cleanup(func() { execCommand = orig }) + + sb := &Sandbox{containerID: "test-container"} + sb.dockerWriteFile(context.Background(), "/tmp/$(rm -rf /)/x", "x") + + last := captured[len(captured)-1] + if !strings.Contains(last, `'/tmp/$(rm -rf /)/x'`) { + t.Errorf("path not single-quoted in shell arg: %q", last) + } +} diff --git a/internal/sandbox/sandbox_test.go b/internal/sandbox/sandbox_test.go index b95f325..1a9779f 100644 --- a/internal/sandbox/sandbox_test.go +++ b/internal/sandbox/sandbox_test.go @@ -128,23 +128,32 @@ func TestHoneypotEnvVars(t *testing.T) { func TestSanitizeDockerArg(t *testing.T) { cases := []struct { - input string - want string + input string + fallback string + want string }{ - {"my-host", "my-host"}, - {"host.local", "host.local"}, - {"host name", "hostname"}, - {"--inject", "--inject"}, - {"$(evil)", "evil"}, - {"", "localhost"}, - {"日本語ホスト", "localhost"}, - {"valid_host-123.local", "valid_host-123.local"}, + {"my-host", "localhost", "my-host"}, + {"host.local", "localhost", "host.local"}, + {"host name", "localhost", "hostname"}, + {"--inject", "localhost", "--inject"}, + {"$(evil)", "localhost", "evil"}, + {"", "localhost", "localhost"}, + {"日本語ホスト", "localhost", "localhost"}, + {"valid_host-123.local", "localhost", "valid_host-123.local"}, + // Verify the fallback is parameterised, not hardcoded to "localhost". + {"", "user", "user"}, + {"$(evil)", "user", "evil"}, + {"日本語", "user", "user"}, + // Shell / volume-spec metacharacters get stripped. + {"a;rm -rf /", "user", "arm-rf"}, + {"a/b", "user", "ab"}, + {"a:b", "user", "ab"}, } for _, tc := range cases { - got := sanitizeDockerArg(tc.input) + got := sanitizeDockerArg(tc.input, tc.fallback) if got != tc.want { - t.Errorf("sanitizeDockerArg(%q) = %q, want %q", tc.input, got, tc.want) + t.Errorf("sanitizeDockerArg(%q, %q) = %q, want %q", tc.input, tc.fallback, got, tc.want) } } }