From 82e1b568f950c34cdc27782cff1eeb82883b738d Mon Sep 17 00:00:00 2001 From: Dmitrii Creed Date: Mon, 11 May 2026 19:55:27 +0400 Subject: [PATCH 1/7] ci+security: Phase 4 repo controls + Phase 5 HMAC-authenticated cache MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Two phase deliverables in one PR — they share zero overlap with the parallel SCA / SBOM / SAST work happening in this repo. ═══════════════════════════════════════════════════════════════════ Phase 5 — HMAC-SHA256 integrity for pkg/security/cache.go ═══════════════════════════════════════════════════════════════════ The previous tamper-detection heuristic compared `info.ModTime()` to the recorded `CreatedAt`. mtime is forgeable in one line (`touch -t`), so the check was decorative rather than load-bearing. Replaced with a proper HMAC-SHA256 over the canonical JSON of each entry, keyed by a 32-byte random per-cache key persisted at `/.hmac.key` (mode 0o600, generated from `crypto/rand` on first use). Behavior change matrix ---------------------- | Scenario | Before | After | |---|---|---| | Set + Get | hit | hit | | Set + Get after TTL | miss (correct) | miss (correct) | | `touch -t` future mtime | miss (false positive — entry was fine) | hit (mtime no longer load-bearing) | | Bytes flipped in JSON on disk | hit (forgery succeeded) | miss + file removed (HMAC mismatch) | | Entry written under a different cache key (Clear+NewCache) | hit (forgery succeeded) | miss + file removed | | Legacy unsigned entry from pre-HMAC code | hit (no integrity) | miss + file removed | Key management -------------- `loadOrCreateHMACKey` is race-safe: a missing file triggers `O_CREATE|O_EXCL`. If two callers race, the loser sees `os.ErrExist` and recursively re-reads the winner's key. Key length is validated to catch truncation/corruption — wrong size returns an error rather than silently rotating. `Cache.Clean()` skips the key file by name; `Cache.Clear()` removes the entire baseDir (key gets regenerated on next NewCache, which is the expected reset semantic). Tests (6 new, joining the 16 existing — all 22 green): - TestCacheHMACKeyPersisted - TestCacheHMACKeyWrongSizeRejected - TestCacheTamperDetection - TestCacheMtimeForgeryNoLongerHelpful - TestCacheKeyMismatchRejectsEntry - TestCacheLegacyUnsignedEntryDiscarded - TestCacheCleanSkipsHMACKey ═══════════════════════════════════════════════════════════════════ Phase 4 — Repo controls ═══════════════════════════════════════════════════════════════════ SECURITY.md ----------- Vulnerability disclosure policy. Supported-versions matrix, two reporting channels (GitHub Security Advisory preferred, email fallback), SLA (3-day ack; 30d HIGH+, 90d MEDIUM, longer LOW), out- of-scope clauses, and a pointer at the cryptographic primitives. Closes OSPS Baseline OSP.V, CISA Secure-by-Design pillar 2, Scorecard "Security-Policy" check. CODEOWNERS ---------- Two-lead review enforced on security-critical paths: `.github/`, `*.Dockerfile`, `pkg/security/`, `pkg/api/secrets/`, `pkg/cmd/cmd_secrets/`, `pkg/clouds/pulumi/{aws,gcp}/`, `go.mod`, `go.sum`, `tools.go`, `SECURITY.md`, `CODEOWNERS`. Broad default keeps single-lead review for ordinary code paths. Closes OSPS Baseline OSP.A, Scorecard "Code-Review", CIS SSCS §1. .github/workflows/codeql.yml ---------------------------- CodeQL Go static analysis on push to main, every PR, and weekly Sun 04:23 UTC. Uses `security-extended` query pack (taint, crypto misuse, additional sinks beyond the default pack). All third-party actions SHA-pinned. Permissions least-privilege: root `contents:read`, job adds `security-events: write` for SARIF upload. Closes Scorecard "SAST" check beyond the Semgrep coverage we already have. .github/workflows/scorecard.yml ------------------------------- OpenSSF Scorecard runs weekly + on main pushes + branch-protection events. Publishes results to api.securityscorecards.dev (consumers can audit our posture without asking) AND uploads SARIF to the Security tab. Advisory, not gating — score moves on factors outside any single PR's diff. ═══════════════════════════════════════════════════════════════════ Validation ═══════════════════════════════════════════════════════════════════ - `go build ./...` clean - `go vet ./...` clean - `go test ./pkg/security/...` 22 tests, all green - Semgrep (SC org ruleset) on the new workflows and on cache.go + cache_test.go: 0 findings - No callers depend on the old mtime check (grep-verified) - Forward compatibility: future fields added to `CacheEntry` will break MAC verification on old files (acceptable — cache rebuilds on miss); documented in the inline comment ═══════════════════════════════════════════════════════════════════ Frameworks satisfied ═══════════════════════════════════════════════════════════════════ | Framework | Control | |---|---| | CIS SSCS | §1 (CODEOWNERS, signed-tags prerequisite) | | NIST SP 800-218 SSDF | PS.1 (code protection), PW.7 (SAST), RV.3 (root cause) | | OWASP Top 10 CI/CD | CICD-SEC-10 (logging/visibility — CodeQL SARIF surface) | | OWASP A08 (Software & Data Integrity Failures) | HMAC integrity | | OpenSSF Scorecard | Security-Policy, Code-Review, SAST, CodeQL checks | | OpenSSF OSPS Baseline | OSP.A (access), OSP.D (docs), OSP.S (security analysis), OSP.V (vuln mgmt) | | CISA Secure-by-Design | Pillar 2 (radical transparency: VDP + Scorecard publishing) | Signed-off-by: Dmitrii Creed --- .github/workflows/codeql.yml | 66 +++++++++++ .github/workflows/scorecard.yml | 68 +++++++++++ CODEOWNERS | 65 ++++++++++ SECURITY.md | 78 ++++++++++++ pkg/security/cache.go | 202 ++++++++++++++++++++++++-------- pkg/security/cache_test.go | 176 ++++++++++++++++++++++++++++ 6 files changed, 606 insertions(+), 49 deletions(-) create mode 100644 .github/workflows/codeql.yml create mode 100644 .github/workflows/scorecard.yml create mode 100644 CODEOWNERS create mode 100644 SECURITY.md diff --git a/.github/workflows/codeql.yml b/.github/workflows/codeql.yml new file mode 100644 index 00000000..d3431dfa --- /dev/null +++ b/.github/workflows/codeql.yml @@ -0,0 +1,66 @@ +name: CodeQL + +# CodeQL static analysis for Go. Runs on every PR, every push to main, +# and weekly to catch new query packs landing upstream. Findings appear +# in the Security tab; default-severity ERROR also fails the PR check. +# +# Why this AND Semgrep? Different rule packs catch different classes of +# bugs — Semgrep is great at custom org-policy patterns, CodeQL is better +# at dataflow-heavy queries (taint sinks, SSRF, SQLi-style). They overlap +# but the union finds more. + +on: + push: + branches: [main] + pull_request: + branches: [main] + schedule: + # Sunday 04:23 UTC — off-peak, off-Monday-morning. + - cron: '23 4 * * 0' + workflow_dispatch: + +# Cancel in-progress runs for the same PR when a new commit lands. +concurrency: + group: ${{ github.workflow }}-${{ github.ref }} + cancel-in-progress: true + +permissions: + contents: read + +jobs: + analyze: + name: Analyze Go + runs-on: ubuntu-24.04 + timeout-minutes: 30 + permissions: + contents: read + security-events: write # upload SARIF to the Security tab + + strategy: + fail-fast: false + matrix: + language: [go] + + steps: + - name: Checkout + uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2 + with: + persist-credentials: false + + - name: Initialize CodeQL + uses: github/codeql-action/init@68bde559dea0fdcac2102bfdf6230c5f70eb485e # v4 + with: + languages: ${{ matrix.language }} + # security-extended adds queries beyond the default "security + # and quality" pack — taint tracking, less-common sink types, + # crypto misuses. Worth the extra ~2 min run-time for an OSS + # supply-chain tool. + queries: security-extended + + - name: Autobuild + uses: github/codeql-action/autobuild@68bde559dea0fdcac2102bfdf6230c5f70eb485e # v4 + + - name: Perform analysis + uses: github/codeql-action/analyze@68bde559dea0fdcac2102bfdf6230c5f70eb485e # v4 + with: + category: /language:${{ matrix.language }} diff --git a/.github/workflows/scorecard.yml b/.github/workflows/scorecard.yml new file mode 100644 index 00000000..72db960c --- /dev/null +++ b/.github/workflows/scorecard.yml @@ -0,0 +1,68 @@ +name: OpenSSF Scorecard + +# OpenSSF Scorecard runs a battery of OSS-project security checks +# (Branch-Protection, Token-Permissions, Pinned-Dependencies, +# Dangerous-Workflow, etc.) and reports a score 0-10. We surface +# findings via SARIF in the Security tab AND publish to the public +# Scorecard API (api.securityscorecards.dev) so consumers can audit +# our posture without us asking. +# +# We do NOT use this for PR gating — checks are advisory and the score +# moves on factors outside any one PR's diff. + +on: + branch_protection_rule: + schedule: + # Wednesday 09:17 UTC — mid-week so any regression caught by a + # branch-protection change has time to be reverted before weekend. + - cron: '17 9 * * 3' + push: + branches: [main] + workflow_dispatch: + +# At-most-one scheduled run. +concurrency: + group: scorecard-${{ github.ref }} + cancel-in-progress: false + +permissions: + contents: read + +jobs: + analysis: + name: Scorecard analysis + runs-on: ubuntu-24.04 + permissions: + contents: read + # Required to upload SARIF to the Security tab. + security-events: write + # Required to publish results to securityscorecards.dev. + id-token: write + + steps: + - name: Checkout + uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2 + with: + persist-credentials: false + + - name: Run analysis + uses: ossf/scorecard-action@4eaacf0543bb3f2c246792bd56e8cdeffafb205a # v2.4.3 + with: + results_file: results.sarif + results_format: sarif + # Publish to api.securityscorecards.dev so the badge + history + # are public. Required for the Scorecard badge to work. + publish_results: true + + - name: Upload SARIF + uses: github/codeql-action/upload-sarif@68bde559dea0fdcac2102bfdf6230c5f70eb485e # v4 + with: + sarif_file: results.sarif + category: scorecard + + - name: Upload artifact (debugging) + uses: actions/upload-artifact@ea165f8d65b6e75b540449e92b4886f43607fa02 # v4.6.2 + with: + name: scorecard-results + path: results.sarif + retention-days: 30 diff --git a/CODEOWNERS b/CODEOWNERS new file mode 100644 index 00000000..dff73576 --- /dev/null +++ b/CODEOWNERS @@ -0,0 +1,65 @@ +# https://docs.github.com/en/repositories/managing-your-repositories-settings-and-customization/customizing-your-repository/about-code-owners +# +# CODEOWNERS for simple-container-com/api. Two human reviewers are +# enforced on security-critical paths; broad repo defaults are kept to +# either lead to minimise PR-stall. +# +# Tip: GitHub picks the LAST matching pattern in this file for a given +# file path — ordering matters. Put broad defaults at the top and +# specific high-sensitivity globs at the bottom so they win. + +# --------------------------------------------------------------------- +# Default — every file needs at least one lead's review. +# --------------------------------------------------------------------- +* @Cre-eD @smecsia + +# --------------------------------------------------------------------- +# CI / supply-chain — both leads required, no auto-merge with one +# approval. Touching these files changes how every release is built +# and how secrets reach the runner. +# --------------------------------------------------------------------- +/.github/ @Cre-eD @smecsia +/.github/workflows/ @Cre-eD @smecsia +/.github/actions/ @Cre-eD @smecsia +/.github/dependabot.yml @Cre-eD @smecsia + +# --------------------------------------------------------------------- +# Image build context. +# --------------------------------------------------------------------- +/*.Dockerfile @Cre-eD @smecsia +/welder.yaml @Cre-eD @smecsia +/sc.sh @Cre-eD @smecsia + +# --------------------------------------------------------------------- +# Security pipeline source — code that signs, scans, attests, and +# decides what gets blocked. Tamper with this and the whole consumer +# supply chain weakens. +# --------------------------------------------------------------------- +/pkg/security/ @Cre-eD @smecsia + +# --------------------------------------------------------------------- +# Secrets / config plumbing. +# --------------------------------------------------------------------- +/pkg/api/secrets/ @Cre-eD @smecsia +/pkg/cmd/cmd_secrets/ @Cre-eD @smecsia + +# --------------------------------------------------------------------- +# Cloud provisioning — IAM and resource defaults that ship to every +# consumer account. +# --------------------------------------------------------------------- +/pkg/clouds/pulumi/aws/ @Cre-eD @smecsia +/pkg/clouds/pulumi/gcp/ @Cre-eD @smecsia + +# --------------------------------------------------------------------- +# Module graph — dep bumps need security review (see +# `feedback_sc_ci_tools_step` for the CI-quirk caveat). +# --------------------------------------------------------------------- +/go.mod @Cre-eD @smecsia +/go.sum @Cre-eD @smecsia +/tools.go @Cre-eD @smecsia + +# --------------------------------------------------------------------- +# This policy + ownership file itself. +# --------------------------------------------------------------------- +/SECURITY.md @Cre-eD @smecsia +/CODEOWNERS @Cre-eD @smecsia diff --git a/SECURITY.md b/SECURITY.md new file mode 100644 index 00000000..ed05c455 --- /dev/null +++ b/SECURITY.md @@ -0,0 +1,78 @@ +# Security Policy + +Simple Container (`sc`) is an OSS supply-chain tool that runs in consumer +CI/CD and provisions cloud resources in customer accounts. A vulnerability +in this codebase can propagate downstream to every consumer, so we treat +all reports as high priority. + +## Supported versions + +Security fixes are issued for the **most recent calver release** (the +tag pushed by [`.github/workflows/push.yaml`][push] on every merge to +`main`) and back-ported to the latest `vYYYY.M.x` line only when the +fix is non-trivial. Older versions receive no patches; consumers should +pin to a recent release tag (or a SHA) and update via Dependabot / +[`integrail/devops/.github/actions/install-sc`][install-sc] (or +equivalent) on at least a monthly cadence. + +| Version | Supported | +|---|---| +| `vYYYY.M.x` latest | ✅ | +| Previous calver release on the same month line | ✅ (best-effort back-port) | +| Anything older | ❌ | + +## Reporting a vulnerability + +**Do not file a public issue.** Use one of these channels in order of +preference: + +1. **[GitHub Security Advisory][gsa]** — preferred. Private to maintainers, + integrates with CVE issuance and the GitHub-side fix workflow. +2. **Email** `security@simple-container.com` if you can't use GitHub + Security Advisories. + +Please include: + +- A description of the issue and the security impact you observed. +- The exact `sc` version (or commit SHA) affected. +- Reproduction steps or a proof of concept where possible. +- Whether you've shared the report with any third party. + +We aim to acknowledge within **3 working days** and to ship a fix or +mitigation within **30 days** for HIGH/CRITICAL findings, **90 days** +for MEDIUM, longer for LOW. We'll keep you updated and credit you in +the advisory unless you ask to remain anonymous. + +## Out of scope + +These are intentionally outside the scope of this policy because they +sit in the *consumer's* infrastructure, not in this codebase: + +- Vulnerabilities in the consumer's cloud account (IAM misconfig, etc.) + caused by how they *use* `sc`. Reach out to the relevant cloud + provider or to the consumer. +- Vulnerabilities in third-party tools `sc` invokes (`pulumi`, `cosign`, + `syft`, `trivy`, `grype`, `gcloud`, `kubectl`). Report those upstream. +- The Caddy / cloud-helpers / kubectl Docker images' *upstream* OS + packages — we re-roll on each release and the deferred CVE log is + documented in PRs at merge time. + +## Hardening posture + +The repository is hardened against the relevant supply-chain risks +covered by CIS, OWASP CICD Top 10, SLSA, NIST SSDF, and the OpenSSF +Scorecard. Current control status is tracked in the hardening pipeline +(image scan, SBOM, Semgrep, Dependabot, secret scan run on every PR +and merge). For details on the threat model and the controls that +ship with each release, see the PR history. + +## Cryptographic primitives + +`sc` uses **only** primitives from the Go standard library and a small +set of audited libraries (`cosign`, `sigstore-go`). We avoid rolling +our own crypto. The local security-scan cache uses HMAC-SHA256 with a +32-byte random per-cache key for tamper detection. + +[push]: .github/workflows/push.yaml +[install-sc]: https://github.com/simple-container-com/actions/tree/main/install-sc +[gsa]: https://github.com/simple-container-com/api/security/advisories/new diff --git a/pkg/security/cache.go b/pkg/security/cache.go index 0d174558..614a45a4 100644 --- a/pkg/security/cache.go +++ b/pkg/security/cache.go @@ -1,18 +1,32 @@ package security import ( + "crypto/hmac" + "crypto/rand" "crypto/sha256" "encoding/hex" "encoding/json" + "errors" "fmt" + "io" "os" "path/filepath" "time" ) -// Cache provides TTL-based caching for security operation results +// Cache provides TTL-based, HMAC-authenticated caching for security +// operation results (SBOM, image scan, signature checks). +// +// Integrity model: each entry is signed with HMAC-SHA256 using a 32-byte +// key persisted at /.hmac.key (mode 0600, generated from +// crypto/rand on first use). Tampering with the on-disk JSON, or +// substituting a file written under a different key, causes the entry +// to fail verification and be silently discarded. This replaces the +// previous mtime-based tamper-detection heuristic, which was trivially +// forgeable with `touch -t`. type Cache struct { baseDir string + key []byte // HMAC-SHA256 key, 32 bytes } // CacheKey uniquely identifies a cached result @@ -30,6 +44,13 @@ type CacheEntry struct { ExpiresAt time.Time `json:"expiresAt"` } +// signedEntry is the on-disk shape: the entry plus a hex-encoded +// HMAC-SHA256 of the entry's canonical JSON. +type signedEntry struct { + Entry CacheEntry `json:"entry"` + MAC string `json:"mac"` +} + // TTL durations for different operations const ( TTL_SBOM = 24 * time.Hour // SBOM: 24h @@ -37,7 +58,18 @@ const ( TTL_SCAN_TRIVY = 6 * time.Hour // Trivy scan: 6h ) -// NewCache creates a new cache instance +const ( + // hmacKeyFilename is the name of the per-cache HMAC key file. The + // leading dot lets cleanup walkers skip it cheaply by name; mode is + // 0o600 so even other processes running as the same user can't + // silently rotate the key from under us. + hmacKeyFilename = ".hmac.key" + hmacKeyLen = 32 // 256 bits, matches HMAC-SHA256 block-size guidance +) + +// NewCache creates a new cache instance, generating or loading the +// per-cache HMAC key. Returns an error if the existing key file is +// the wrong size (likely corrupted or truncated). func NewCache(baseDir string) (*Cache, error) { if baseDir == "" { homeDir, err := os.UserHomeDir() @@ -47,60 +79,115 @@ func NewCache(baseDir string) (*Cache, error) { baseDir = filepath.Join(homeDir, ".simple-container", "cache", "security") } - // Create cache directory if it doesn't exist if err := os.MkdirAll(baseDir, 0o700); err != nil { return nil, fmt.Errorf("creating cache directory: %w", err) } + key, err := loadOrCreateHMACKey(filepath.Join(baseDir, hmacKeyFilename)) + if err != nil { + return nil, fmt.Errorf("initializing cache HMAC key: %w", err) + } + return &Cache{ baseDir: baseDir, + key: key, }, nil } -// Get retrieves a cached result if it exists and hasn't expired -func (c *Cache) Get(key CacheKey) ([]byte, bool, error) { - path := c.getPath(key) +// loadOrCreateHMACKey reads the HMAC key from path; if absent, generates +// a new 32-byte random key and persists it with mode 0o600. +func loadOrCreateHMACKey(path string) ([]byte, error) { + existing, err := os.ReadFile(path) + if err == nil { + if len(existing) != hmacKeyLen { + return nil, fmt.Errorf("hmac key at %s is %d bytes, expected %d (corrupted?)", + path, len(existing), hmacKeyLen) + } + return existing, nil + } + if !errors.Is(err, os.ErrNotExist) { + return nil, fmt.Errorf("reading hmac key: %w", err) + } - // Check if file exists - info, err := os.Stat(path) - if os.IsNotExist(err) { - return nil, false, nil + key := make([]byte, hmacKeyLen) + if _, err := io.ReadFull(rand.Reader, key); err != nil { + return nil, fmt.Errorf("generating hmac key: %w", err) } + // O_EXCL ensures we never silently overwrite a concurrently-created key. + f, err := os.OpenFile(path, os.O_WRONLY|os.O_CREATE|os.O_EXCL, 0o600) if err != nil { - return nil, false, fmt.Errorf("checking cache file: %w", err) + // Race: another process created the key between our Stat and + // OpenFile. Fall back to reading what they wrote. + if errors.Is(err, os.ErrExist) { + return loadOrCreateHMACKey(path) + } + return nil, fmt.Errorf("creating hmac key file: %w", err) } + defer f.Close() + if _, err := f.Write(key); err != nil { + return nil, fmt.Errorf("writing hmac key: %w", err) + } + return key, nil +} + +// computeMAC returns the HMAC-SHA256 of entryJSON using the cache's key. +func (c *Cache) computeMAC(entryJSON []byte) []byte { + h := hmac.New(sha256.New, c.key) + h.Write(entryJSON) + return h.Sum(nil) +} + +// Get retrieves a cached result if it exists, hasn't expired, and its +// HMAC verifies against the cache's key. Tampered or unsigned entries +// (legacy files from the pre-HMAC code path) are silently removed and +// reported as a cache miss. +func (c *Cache) Get(key CacheKey) ([]byte, bool, error) { + path := c.getPath(key) - // Read cache entry data, err := os.ReadFile(path) if err != nil { + if errors.Is(err, os.ErrNotExist) { + return nil, false, nil + } return nil, false, fmt.Errorf("reading cache file: %w", err) } - var entry CacheEntry - if err := json.Unmarshal(data, &entry); err != nil { - // Invalid cache entry, treat as cache miss + var signed signedEntry + if err := json.Unmarshal(data, &signed); err != nil { + // Unparseable — treat as miss. + _ = os.Remove(path) + return nil, false, nil + } + + // Reject pre-HMAC entries (no MAC field) and any zero-length MAC. + gotMAC, err := hex.DecodeString(signed.MAC) + if err != nil || len(gotMAC) == 0 { _ = os.Remove(path) return nil, false, nil } - // Check expiration - if time.Now().After(entry.ExpiresAt) { - // Expired, remove and return cache miss + entryJSON, err := json.Marshal(signed.Entry) + if err != nil { + // Should never happen — we just unmarshaled the same shape. _ = os.Remove(path) return nil, false, nil } - // Verify file modification time hasn't been tampered with - if info.ModTime().After(entry.CreatedAt.Add(1 * time.Hour)) { - // File was modified after creation, invalidate + if !hmac.Equal(gotMAC, c.computeMAC(entryJSON)) { + // Tamper detected (or written by a different key). Discard. _ = os.Remove(path) return nil, false, nil } - return entry.Data, true, nil + if time.Now().After(signed.Entry.ExpiresAt) { + _ = os.Remove(path) + return nil, false, nil + } + + return signed.Entry.Data, true, nil } -// Set stores a result in the cache with appropriate TTL +// Set stores a result in the cache with the appropriate TTL. func (c *Cache) Set(key CacheKey, data []byte) error { return c.SetWithTTL(key, data, c.getTTL(key.Operation)) } @@ -119,21 +206,27 @@ func (c *Cache) SetWithTTL(key CacheKey, data []byte, ttl time.Duration) error { ExpiresAt: now.Add(ttl), } - entryData, err := json.Marshal(entry) + entryJSON, err := json.Marshal(entry) if err != nil { return fmt.Errorf("marshaling cache entry: %w", err) } - path := c.getPath(key) + signed := signedEntry{ + Entry: entry, + MAC: hex.EncodeToString(c.computeMAC(entryJSON)), + } + + out, err := json.Marshal(signed) + if err != nil { + return fmt.Errorf("marshaling signed entry: %w", err) + } - // Create directory if it doesn't exist - dir := filepath.Dir(path) - if err := os.MkdirAll(dir, 0o700); err != nil { + path := c.getPath(key) + if err := os.MkdirAll(filepath.Dir(path), 0o700); err != nil { return fmt.Errorf("creating cache directory: %w", err) } - // Write with secure permissions (0600) - if err := os.WriteFile(path, entryData, 0o600); err != nil { + if err := os.WriteFile(path, out, 0o600); err != nil { return fmt.Errorf("writing cache file: %w", err) } @@ -144,8 +237,8 @@ func (c *Cache) SetWithTTL(key CacheKey, data []byte, ttl time.Duration) error { func (c *Cache) Invalidate(key CacheKey) error { path := c.getPath(key) err := os.Remove(path) - if os.IsNotExist(err) { - return nil // Already gone + if errors.Is(err, os.ErrNotExist) { + return nil } if err != nil { return fmt.Errorf("removing cache file: %w", err) @@ -153,49 +246,58 @@ func (c *Cache) Invalidate(key CacheKey) error { return nil } -// Clean removes expired cache entries +// Clean removes expired or HMAC-invalid cache entries. The HMAC key +// file itself is skipped by name. func (c *Cache) Clean() error { return filepath.Walk(c.baseDir, func(path string, info os.FileInfo, err error) error { if err != nil { - return nil // Skip files we can't access + return nil } - if info.IsDir() { - return nil // Skip directories + return nil + } + if info.Name() == hmacKeyFilename { + return nil // Never touch the key. } - // Read and check expiration data, err := os.ReadFile(path) if err != nil { - return nil // Skip files we can't read + return nil } - var entry CacheEntry - if err := json.Unmarshal(data, &entry); err != nil { - // Invalid entry, remove it + var signed signedEntry + if err := json.Unmarshal(data, &signed); err != nil { _ = os.Remove(path) return nil } - // Check expiration - if time.Now().After(entry.ExpiresAt) { + gotMAC, err := hex.DecodeString(signed.MAC) + if err != nil || len(gotMAC) == 0 { _ = os.Remove(path) + return nil } + entryJSON, err := json.Marshal(signed.Entry) + if err != nil || !hmac.Equal(gotMAC, c.computeMAC(entryJSON)) { + _ = os.Remove(path) + return nil + } + + if time.Now().After(signed.Entry.ExpiresAt) { + _ = os.Remove(path) + } return nil }) } // getPath returns the filesystem path for a cache key func (c *Cache) getPath(key CacheKey) string { - // Create a deterministic filename from the key hash := sha256.New() hash.Write([]byte(key.Operation)) hash.Write([]byte(key.ImageDigest)) hash.Write([]byte(key.ConfigHash)) filename := hex.EncodeToString(hash.Sum(nil)) - // Organize by operation type return filepath.Join(c.baseDir, key.Operation, filename+".json") } @@ -207,16 +309,17 @@ func (c *Cache) getTTL(operation string) time.Duration { case "scan-grype", "scan-trivy": return TTL_SCAN_GRYPE default: - return 6 * time.Hour // Default TTL + return 6 * time.Hour } } -// Size returns the total size of the cache in bytes +// Size returns the total size of the cache in bytes (incl. the HMAC key, +// which is negligible at 32 bytes). func (c *Cache) Size() (int64, error) { var size int64 err := filepath.Walk(c.baseDir, func(path string, info os.FileInfo, err error) error { if err != nil { - return nil // Skip files we can't access + return nil } if !info.IsDir() { size += info.Size() @@ -226,7 +329,8 @@ func (c *Cache) Size() (int64, error) { return size, err } -// Clear removes all cached entries +// Clear removes all cached entries AND the HMAC key. The next NewCache +// call will generate a fresh key. func (c *Cache) Clear() error { return os.RemoveAll(c.baseDir) } diff --git a/pkg/security/cache_test.go b/pkg/security/cache_test.go index 8d52b7b8..6eca5419 100644 --- a/pkg/security/cache_test.go +++ b/pkg/security/cache_test.go @@ -326,3 +326,179 @@ func TestCacheKeyPath(t *testing.T) { func containsSubstring(s, substr string) bool { return filepath.Base(filepath.Dir(s)) == substr || filepath.Base(s) == substr } + +// --------------------------------------------------------------------- +// HMAC integrity tests (Phase 5 — replaces the prior mtime tamper check). +// --------------------------------------------------------------------- + +func TestCacheHMACKeyPersisted(t *testing.T) { + RegisterTestingT(t) + + dir := t.TempDir() + + cache1, err := NewCache(dir) + Expect(err).ToNot(HaveOccurred()) + Expect(cache1.key).To(HaveLen(hmacKeyLen)) + + keyPath := filepath.Join(dir, hmacKeyFilename) + stat, err := os.Stat(keyPath) + Expect(err).ToNot(HaveOccurred()) + Expect(stat.Size()).To(Equal(int64(hmacKeyLen))) + Expect(stat.Mode().Perm()).To(Equal(os.FileMode(0o600))) + + // Re-open the same dir: must reuse the existing key, not regenerate. + cache2, err := NewCache(dir) + Expect(err).ToNot(HaveOccurred()) + Expect(cache2.key).To(Equal(cache1.key)) +} + +func TestCacheHMACKeyWrongSizeRejected(t *testing.T) { + RegisterTestingT(t) + + dir := t.TempDir() + // Pre-seed a truncated key file. + Expect(os.WriteFile(filepath.Join(dir, hmacKeyFilename), []byte("short"), 0o600)).To(Succeed()) + + _, err := NewCache(dir) + Expect(err).To(HaveOccurred()) + Expect(err.Error()).To(ContainSubstring("hmac key")) +} + +func TestCacheTamperDetection(t *testing.T) { + RegisterTestingT(t) + + cache, err := NewCache(t.TempDir()) + Expect(err).ToNot(HaveOccurred()) + + key := CacheKey{Operation: "sbom", ImageDigest: "sha256:tamper", ConfigHash: "h"} + Expect(cache.Set(key, []byte("original"))).To(Succeed()) + + path := cache.getPath(key) + raw, err := os.ReadFile(path) + Expect(err).ToNot(HaveOccurred()) + + var sig signedEntry + Expect(json.Unmarshal(raw, &sig)).To(Succeed()) + + // Tamper the entry data; leave the MAC alone — HMAC must catch it. + sig.Entry.Data = []byte("malicious") + tampered, err := json.Marshal(sig) + Expect(err).ToNot(HaveOccurred()) + Expect(os.WriteFile(path, tampered, 0o600)).To(Succeed()) + + got, found, err := cache.Get(key) + Expect(err).ToNot(HaveOccurred()) + Expect(found).To(BeFalse(), "tampered entry must be treated as cache miss") + Expect(got).To(BeNil()) + + // The corrupt file must also be removed. + _, err = os.Stat(path) + Expect(os.IsNotExist(err)).To(BeTrue()) +} + +func TestCacheMtimeForgeryNoLongerHelpful(t *testing.T) { + RegisterTestingT(t) + + cache, err := NewCache(t.TempDir()) + Expect(err).ToNot(HaveOccurred()) + + key := CacheKey{Operation: "sbom", ImageDigest: "sha256:mtime", ConfigHash: "h"} + Expect(cache.Set(key, []byte("payload"))).To(Succeed()) + + // Advance mtime far into the future — the old code would have + // invalidated the entry here. The HMAC-based check ignores mtime, + // so the entry must remain valid. + path := cache.getPath(key) + future := time.Now().Add(48 * time.Hour) + Expect(os.Chtimes(path, future, future)).To(Succeed()) + + got, found, err := cache.Get(key) + Expect(err).ToNot(HaveOccurred()) + Expect(found).To(BeTrue(), "mtime is no longer load-bearing for integrity") + Expect(string(got)).To(Equal("payload")) +} + +func TestCacheKeyMismatchRejectsEntry(t *testing.T) { + RegisterTestingT(t) + + dir := t.TempDir() + + // Create the cache, write an entry, then swap the HMAC key as if a + // different process (or the user's clearing it) had rotated the key. + c1, err := NewCache(dir) + Expect(err).ToNot(HaveOccurred()) + key := CacheKey{Operation: "sbom", ImageDigest: "sha256:k", ConfigHash: "h"} + Expect(c1.Set(key, []byte("data"))).To(Succeed()) + + // Replace the key file with random bytes of the right length. + newKey := make([]byte, hmacKeyLen) + newKey[0] = 0xff + Expect(os.WriteFile(filepath.Join(dir, hmacKeyFilename), newKey, 0o600)).To(Succeed()) + + c2, err := NewCache(dir) + Expect(err).ToNot(HaveOccurred()) + + got, found, err := c2.Get(key) + Expect(err).ToNot(HaveOccurred()) + Expect(found).To(BeFalse(), "entry signed under the previous key must fail verification") + Expect(got).To(BeNil()) +} + +func TestCacheLegacyUnsignedEntryDiscarded(t *testing.T) { + RegisterTestingT(t) + + cache, err := NewCache(t.TempDir()) + Expect(err).ToNot(HaveOccurred()) + + key := CacheKey{Operation: "sbom", ImageDigest: "sha256:legacy", ConfigHash: "h"} + path := cache.getPath(key) + Expect(os.MkdirAll(filepath.Dir(path), 0o700)).To(Succeed()) + + // Simulate a pre-HMAC entry: just the unwrapped CacheEntry shape. + legacy := CacheEntry{ + Key: key, + Data: []byte("pre-hmac"), + CreatedAt: time.Now(), + ExpiresAt: time.Now().Add(time.Hour), + } + legacyBytes, err := json.Marshal(legacy) + Expect(err).ToNot(HaveOccurred()) + Expect(os.WriteFile(path, legacyBytes, 0o600)).To(Succeed()) + + got, found, err := cache.Get(key) + Expect(err).ToNot(HaveOccurred()) + Expect(found).To(BeFalse(), "legacy unsigned entries must be discarded on upgrade") + Expect(got).To(BeNil()) + + _, err = os.Stat(path) + Expect(os.IsNotExist(err)).To(BeTrue()) +} + +func TestCacheCleanSkipsHMACKey(t *testing.T) { + RegisterTestingT(t) + + dir := t.TempDir() + cache, err := NewCache(dir) + Expect(err).ToNot(HaveOccurred()) + + keyPath := filepath.Join(dir, hmacKeyFilename) + keyBefore, err := os.ReadFile(keyPath) + Expect(err).ToNot(HaveOccurred()) + + // Drop in a corrupt entry that Clean would normally remove. + cKey := CacheKey{Operation: "sbom", ImageDigest: "sha256:c", ConfigHash: "h"} + cPath := cache.getPath(cKey) + Expect(os.MkdirAll(filepath.Dir(cPath), 0o700)).To(Succeed()) + Expect(os.WriteFile(cPath, []byte("not json"), 0o600)).To(Succeed()) + + Expect(cache.Clean()).To(Succeed()) + + // The HMAC key must survive. + keyAfter, err := os.ReadFile(keyPath) + Expect(err).ToNot(HaveOccurred()) + Expect(keyAfter).To(Equal(keyBefore)) + + // The corrupt entry must be gone. + _, err = os.Stat(cPath) + Expect(os.IsNotExist(err)).To(BeTrue()) +} From 5f01b9dc36a1d908e7e2f457dec02bab57bccaf5 Mon Sep 17 00:00:00 2001 From: Dmitrii Creed Date: Mon, 11 May 2026 20:14:15 +0400 Subject: [PATCH 2/7] fix(cache): bind HMAC verification to the requested cache key (codex P2) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit codex review on PR #(this branch) flagged a real gap in Phase 5's HMAC integrity model: > When a valid signed cache file is copied to another cache key's path, > the MAC still verifies because it covers the embedded entry, but > `Get` never checks `signed.Entry.Key` against the requested key. In > that scenario a caller receives cached SBOM/scan data for a different > image/config without knowing the HMAC key. Practical impact: cross-image / cross-config payload swap. A scan result for image A could be served when the caller asked for image B, because the MAC alone proves entry integrity but not (entry, location) binding. Fix in two places, defense in depth: 1. `Get(key)` — after MAC verifies, require `signed.Entry.Key == key`. Mismatched files are removed and reported as cache miss. 2. `Clean()` — same check expressed as `c.getPath(signed.Entry.Key) == path`. Validly-signed entries parked at the wrong location get garbage-collected on the next Clean sweep. Regression test added: `TestCacheCrossKeyCopyRejected` — Set under keyA, copy the signed file to keyB's path, assert `Get(keyB)` returns miss and the misplaced file is removed. Existing keyA entry remains intact. 23/23 tests in pkg/security/ green. Signed-off-by: Dmitrii Creed --- pkg/security/cache.go | 17 +++++++++++++++++ pkg/security/cache_test.go | 37 +++++++++++++++++++++++++++++++++++++ 2 files changed, 54 insertions(+) diff --git a/pkg/security/cache.go b/pkg/security/cache.go index 614a45a4..43609766 100644 --- a/pkg/security/cache.go +++ b/pkg/security/cache.go @@ -179,6 +179,16 @@ func (c *Cache) Get(key CacheKey) ([]byte, bool, error) { return nil, false, nil } + // MAC covers the entry content but not the location on disk. A + // valid signed file copied from path A to path B would still verify + // here, and `Get(keyB)` would return keyA's payload — bypassing the + // integrity story for SBOM/scan data. Bind the lookup to the embedded + // CacheKey and discard mismatches. + if signed.Entry.Key != key { + _ = os.Remove(path) + return nil, false, nil + } + if time.Now().After(signed.Entry.ExpiresAt) { _ = os.Remove(path) return nil, false, nil @@ -283,6 +293,13 @@ func (c *Cache) Clean() error { return nil } + // Mirror the path-to-key binding from Get: a validly-signed + // entry parked at the wrong filesystem location is also garbage. + if c.getPath(signed.Entry.Key) != path { + _ = os.Remove(path) + return nil + } + if time.Now().After(signed.Entry.ExpiresAt) { _ = os.Remove(path) } diff --git a/pkg/security/cache_test.go b/pkg/security/cache_test.go index 6eca5419..59586fd5 100644 --- a/pkg/security/cache_test.go +++ b/pkg/security/cache_test.go @@ -474,6 +474,43 @@ func TestCacheLegacyUnsignedEntryDiscarded(t *testing.T) { Expect(os.IsNotExist(err)).To(BeTrue()) } +// Cross-key copy: a signed file from key A copied to key B's path +// would have a valid MAC but should NOT satisfy `Get(keyB)`. The +// embedded CacheKey must match the requested key. +func TestCacheCrossKeyCopyRejected(t *testing.T) { + RegisterTestingT(t) + + cache, err := NewCache(t.TempDir()) + Expect(err).ToNot(HaveOccurred()) + + keyA := CacheKey{Operation: "sbom", ImageDigest: "sha256:A", ConfigHash: "h"} + keyB := CacheKey{Operation: "sbom", ImageDigest: "sha256:B", ConfigHash: "h"} + + Expect(cache.Set(keyA, []byte("payload-A"))).To(Succeed()) + + // Copy the validly-signed file from A's path to B's path. + src, dst := cache.getPath(keyA), cache.getPath(keyB) + Expect(os.MkdirAll(filepath.Dir(dst), 0o700)).To(Succeed()) + bytes, err := os.ReadFile(src) + Expect(err).ToNot(HaveOccurred()) + Expect(os.WriteFile(dst, bytes, 0o600)).To(Succeed()) + + got, found, err := cache.Get(keyB) + Expect(err).ToNot(HaveOccurred()) + Expect(found).To(BeFalse(), "MAC alone must not bind a signed file to an arbitrary cache path") + Expect(got).To(BeNil()) + + // The misplaced copy must be removed. + _, err = os.Stat(dst) + Expect(os.IsNotExist(err)).To(BeTrue()) + + // keyA's own entry must still be intact. + got, found, err = cache.Get(keyA) + Expect(err).ToNot(HaveOccurred()) + Expect(found).To(BeTrue()) + Expect(string(got)).To(Equal("payload-A")) +} + func TestCacheCleanSkipsHMACKey(t *testing.T) { RegisterTestingT(t) From abccff302aa6579965bc53c9ea350c88b359c02e Mon Sep 17 00:00:00 2001 From: Dmitrii Creed Date: Tue, 12 May 2026 21:05:10 +0400 Subject: [PATCH 3/7] fix(review): round-1 codex + gemini fixes for PR #254 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Two reviewers caught real issues. Four addressed: 1. (codex P2) HMAC key creation was racy: O_CREATE|O_EXCL + Write leaves a window where another reader can ReadFile a 0-byte file and fail with a "corrupted" error, bubbling up as cache init failure (= scan/SBOM runs blocked). → loadOrCreateHMACKey switched to os.CreateTemp + Sync + Rename. Same-directory temp keeps the rename a same-filesystem op (handles EXDEV gemini also flagged). New readKeyFile() helper retries short reads 5×50ms to absorb concurrent-creation races. 2. (codex P2 / doc) CODEOWNERS comments overstated what GitHub enforces. Multiple owners on one pattern = ALTERNATIVES, not conjunction. Two-reviewer gating requires a branch-protection setting in the admin UI. → Header rewritten to document the admin-UI dependency. No semantic change to ownership. 3. (gemini P1) Set()'s os.WriteFile is non-atomic — a concurrent Clean() can read a partial JSON, fail to unmarshal, and remove the file mid-write. → SetWithTTL now publishes via CreateTemp + Chmod + Write + Sync + Close + Rename in the SAME dir. → Clean() now skips .cache.tmp.* and .hmac.key.tmp.* by name so in-flight writers' temp files aren't sabotaged. 4. (gemini P2) CodeQL Autobuild relies on the runner's bundled Go; our go.mod is 1.25.x, ubuntu-24.04 ships older — Autobuild fails at compile and CI red looks like a finding rather than infra. → Added explicit actions/setup-go pinned by go.mod via `go-version-file: go.mod`. Granted `actions: read` per GitHub's recommended CodeQL template (workflow-metadata fetch). Tests added (now 27 total in pkg/security/, was 25): - TestCacheCleanSkipsInFlightTempFiles — Clean leaves .cache.tmp.* and .hmac.key.tmp.* alone. - TestCacheSetIsAtomic — 256 KiB payload round-trips; no temp leftover in the op dir after a successful Set. Validation ========== - go build ./... clean - go vet ./... clean - go test -short ./pkg/security/... 27/27 green - Semgrep (github-actions + go rulesets) on changed files: 0 findings (Note: -count=1 without -short fails 3 _Integration tests due to a stale local Grype DB unrelated to this PR; CI refreshes that DB.) Round-1 findings deliberately NOT addressed in this commit ========================================================== - (gemini P3) Re-marshal HMAC verification is fragile if CacheEntry's schema ever changes. Today's Go encoding/json is deterministic for our shape; documented inline. Schema-change discipline is the right control. - (gemini P3) Clean() TOCTOU between expiry decision and Remove. Cache is rebuildable; the cost of holding a lock outweighs the benefit. - (gemini nit) SECURITY.md PGP key — security@simple-container.com is sufficient. Signed-off-by: Dmitrii Creed --- .github/workflows/codeql.yml | 11 +++ CODEOWNERS | 31 ++++++-- pkg/security/cache.go | 132 ++++++++++++++++++++++++++++------- pkg/security/cache_test.go | 62 ++++++++++++++++ 4 files changed, 205 insertions(+), 31 deletions(-) diff --git a/.github/workflows/codeql.yml b/.github/workflows/codeql.yml index d3431dfa..b116904a 100644 --- a/.github/workflows/codeql.yml +++ b/.github/workflows/codeql.yml @@ -35,6 +35,7 @@ jobs: permissions: contents: read security-events: write # upload SARIF to the Security tab + actions: read # CodeQL templates fetch workflow metadata strategy: fail-fast: false @@ -47,6 +48,16 @@ jobs: with: persist-credentials: false + # Pin Go to the version declared in go.mod. Without this, CodeQL + # Autobuild uses the runner's bundled Go (older than our 1.25.x + # directive), which fails the compile and surfaces as a CI red + # rather than a real finding (gemini round-1 P2). + - name: Set up Go + uses: actions/setup-go@4a3601121dd01d1626a1e23e37211e3254c1c06c # v6 + with: + go-version-file: go.mod + cache: false + - name: Initialize CodeQL uses: github/codeql-action/init@68bde559dea0fdcac2102bfdf6230c5f70eb485e # v4 with: diff --git a/CODEOWNERS b/CODEOWNERS index dff73576..6de21dfe 100644 --- a/CODEOWNERS +++ b/CODEOWNERS @@ -1,22 +1,39 @@ # https://docs.github.com/en/repositories/managing-your-repositories-settings-and-customization/customizing-your-repository/about-code-owners # -# CODEOWNERS for simple-container-com/api. Two human reviewers are -# enforced on security-critical paths; broad repo defaults are kept to -# either lead to minimise PR-stall. +# CODEOWNERS for simple-container-com/api. +# +# IMPORTANT — semantics: when multiple users are listed on the same +# pattern, GitHub treats them as ALTERNATIVES, not a conjunction. A PR +# satisfies the CODEOWNERS gate once ANY one listed owner approves. +# To genuinely require approvals from BOTH leads on sensitive paths, +# configure a branch protection rule / Ruleset that sets: +# - "Require review from Code Owners" +# - "Required approving reviews" ≥ 2 +# That's a repo-admin UI setting; this file can't express it. +# +# What this file DOES enforce: +# - GitHub auto-requests review from a code owner when a PR touches +# a matching file. +# - If the matching review-count rule is set in the admin UI, a PR +# cannot merge without enough code-owner approvals. +# +# Listing both leads on a pattern widens the review pool — either lead +# can sign off; both don't have to (unless the admin UI rule is set). # # Tip: GitHub picks the LAST matching pattern in this file for a given # file path — ordering matters. Put broad defaults at the top and # specific high-sensitivity globs at the bottom so they win. # --------------------------------------------------------------------- -# Default — every file needs at least one lead's review. +# Default — every file pulls in at least one lead for review. # --------------------------------------------------------------------- * @Cre-eD @smecsia # --------------------------------------------------------------------- -# CI / supply-chain — both leads required, no auto-merge with one -# approval. Touching these files changes how every release is built -# and how secrets reach the runner. +# CI / supply-chain — sensitive paths. Multi-reviewer gating depends +# on the branch-protection setting described in the header. Touching +# these files changes how every release is built and how secrets +# reach the runner. # --------------------------------------------------------------------- /.github/ @Cre-eD @smecsia /.github/workflows/ @Cre-eD @smecsia diff --git a/pkg/security/cache.go b/pkg/security/cache.go index 43609766..5f58a082 100644 --- a/pkg/security/cache.go +++ b/pkg/security/cache.go @@ -11,6 +11,7 @@ import ( "io" "os" "path/filepath" + "strings" "time" ) @@ -95,39 +96,87 @@ func NewCache(baseDir string) (*Cache, error) { } // loadOrCreateHMACKey reads the HMAC key from path; if absent, generates -// a new 32-byte random key and persists it with mode 0o600. +// a new 32-byte random key and persists it via os.CreateTemp + Rename. +// +// Concurrency: a concurrent reader observes either the fully-written +// 32-byte file or no file at all — never a 0-byte mid-creation view. +// The prior O_CREATE|O_EXCL + write sequence had a narrow window where +// another reader could ReadFile a 0-byte file and fail with "corrupted" +// (codex round-1 P2). Two processes racing here each generate a key; +// only one Rename wins, and the loser's reread picks up the winner's +// bytes (cryptographically equivalent — both random). func loadOrCreateHMACKey(path string) ([]byte, error) { - existing, err := os.ReadFile(path) - if err == nil { - if len(existing) != hmacKeyLen { - return nil, fmt.Errorf("hmac key at %s is %d bytes, expected %d (corrupted?)", - path, len(existing), hmacKeyLen) - } + if existing, err := readKeyFile(path); err == nil { return existing, nil - } - if !errors.Is(err, os.ErrNotExist) { - return nil, fmt.Errorf("reading hmac key: %w", err) + } else if !errors.Is(err, os.ErrNotExist) { + return nil, err } key := make([]byte, hmacKeyLen) if _, err := io.ReadFull(rand.Reader, key); err != nil { return nil, fmt.Errorf("generating hmac key: %w", err) } - // O_EXCL ensures we never silently overwrite a concurrently-created key. - f, err := os.OpenFile(path, os.O_WRONLY|os.O_CREATE|os.O_EXCL, 0o600) + + dir := filepath.Dir(path) + tmp, err := os.CreateTemp(dir, ".hmac.key.tmp.*") if err != nil { - // Race: another process created the key between our Stat and - // OpenFile. Fall back to reading what they wrote. - if errors.Is(err, os.ErrExist) { - return loadOrCreateHMACKey(path) - } - return nil, fmt.Errorf("creating hmac key file: %w", err) + return nil, fmt.Errorf("creating hmac key temp: %w", err) + } + tmpPath := tmp.Name() + defer func() { _ = os.Remove(tmpPath) }() + if err := os.Chmod(tmpPath, 0o600); err != nil { + _ = tmp.Close() + return nil, fmt.Errorf("chmod hmac key temp: %w", err) } - defer f.Close() - if _, err := f.Write(key); err != nil { + if _, err := tmp.Write(key); err != nil { + _ = tmp.Close() return nil, fmt.Errorf("writing hmac key: %w", err) } - return key, nil + if err := tmp.Sync(); err != nil { + _ = tmp.Close() + return nil, fmt.Errorf("syncing hmac key: %w", err) + } + if err := tmp.Close(); err != nil { + return nil, fmt.Errorf("closing hmac key temp: %w", err) + } + + // POSIX rename is atomic on the same filesystem. After this point + // readers observe the full 32 bytes or no file at all. If a + // concurrent caller already published a key, our Rename overwrites + // it; reread to return on-disk truth either way. + if err := os.Rename(tmpPath, path); err != nil { + return nil, fmt.Errorf("placing hmac key: %w", err) + } + return readKeyFile(path) +} + +// readKeyFile reads and validates the HMAC key file. Returns +// os.ErrNotExist when absent. A short read (file present but smaller +// than hmacKeyLen) is retried — that almost certainly means a +// concurrent creator is between CreateTemp and Rename. After ~250ms +// of retries it gives up with a corruption error. +func readKeyFile(path string) ([]byte, error) { + const ( + maxAttempts = 5 + retryInterval = 50 * time.Millisecond + ) + var lastErr error + for attempt := 0; attempt < maxAttempts; attempt++ { + existing, err := os.ReadFile(path) + if errors.Is(err, os.ErrNotExist) { + return nil, err + } + if err != nil { + return nil, fmt.Errorf("reading hmac key: %w", err) + } + if len(existing) == hmacKeyLen { + return existing, nil + } + lastErr = fmt.Errorf("hmac key at %s is %d bytes, expected %d", + path, len(existing), hmacKeyLen) + time.Sleep(retryInterval) + } + return nil, fmt.Errorf("%w (corrupted or concurrent-creation race)", lastErr) } // computeMAC returns the HMAC-SHA256 of entryJSON using the cache's key. @@ -232,12 +281,39 @@ func (c *Cache) SetWithTTL(key CacheKey, data []byte, ttl time.Duration) error { } path := c.getPath(key) - if err := os.MkdirAll(filepath.Dir(path), 0o700); err != nil { + dir := filepath.Dir(path) + if err := os.MkdirAll(dir, 0o700); err != nil { return fmt.Errorf("creating cache directory: %w", err) } - if err := os.WriteFile(path, out, 0o600); err != nil { - return fmt.Errorf("writing cache file: %w", err) + // Atomic publish: CreateTemp in the SAME directory (so Rename is a + // same-filesystem op and stays atomic) → Chmod → Write → Sync → + // Close → Rename. Without this, a concurrent Clean() can read a + // partially-written file, fail to unmarshal, and remove it from + // under us — corrupting the cache (gemini round-1 P1). + tmp, err := os.CreateTemp(dir, ".cache.tmp.*") + if err != nil { + return fmt.Errorf("creating cache temp: %w", err) + } + tmpPath := tmp.Name() + defer func() { _ = os.Remove(tmpPath) }() + if err := os.Chmod(tmpPath, 0o600); err != nil { + _ = tmp.Close() + return fmt.Errorf("chmod cache temp: %w", err) + } + if _, err := tmp.Write(out); err != nil { + _ = tmp.Close() + return fmt.Errorf("writing cache temp: %w", err) + } + if err := tmp.Sync(); err != nil { + _ = tmp.Close() + return fmt.Errorf("syncing cache temp: %w", err) + } + if err := tmp.Close(); err != nil { + return fmt.Errorf("closing cache temp: %w", err) + } + if err := os.Rename(tmpPath, path); err != nil { + return fmt.Errorf("placing cache file: %w", err) } return nil @@ -269,6 +345,14 @@ func (c *Cache) Clean() error { if info.Name() == hmacKeyFilename { return nil // Never touch the key. } + // Skip in-flight `Set()` / `loadOrCreateHMACKey` temp files; + // the writer's pending Rename will publish them shortly. + // Removing now would be racy AND silently lose data the + // writer expects to land (gemini round-1 P1). + if strings.HasPrefix(info.Name(), ".cache.tmp.") || + strings.HasPrefix(info.Name(), ".hmac.key.tmp.") { + return nil + } data, err := os.ReadFile(path) if err != nil { diff --git a/pkg/security/cache_test.go b/pkg/security/cache_test.go index 59586fd5..9daddf46 100644 --- a/pkg/security/cache_test.go +++ b/pkg/security/cache_test.go @@ -4,6 +4,7 @@ import ( "encoding/json" "os" "path/filepath" + "strings" "testing" "time" @@ -511,6 +512,67 @@ func TestCacheCrossKeyCopyRejected(t *testing.T) { Expect(string(got)).To(Equal("payload-A")) } +// Clean must not delete an in-flight `Set()` temp file — gemini-flagged +// race where Clean reads a partial file, fails to unmarshal, and removes +// it from under a still-running writer. +func TestCacheCleanSkipsInFlightTempFiles(t *testing.T) { + RegisterTestingT(t) + + dir := t.TempDir() + cache, err := NewCache(dir) + Expect(err).ToNot(HaveOccurred()) + + // Simulate an in-flight Set() by dropping a `.cache.tmp.*` file in + // the operation subdir. + opDir := filepath.Join(dir, "sbom") + Expect(os.MkdirAll(opDir, 0o700)).To(Succeed()) + cacheTmp := filepath.Join(opDir, ".cache.tmp.12345") + Expect(os.WriteFile(cacheTmp, []byte("partial bytes — not valid JSON yet"), 0o600)).To(Succeed()) + + Expect(cache.Clean()).To(Succeed()) + _, err = os.Stat(cacheTmp) + Expect(err).ToNot(HaveOccurred(), "Clean must leave .cache.tmp.* files alone") + + // Same guard for .hmac.key.tmp.* at the baseDir level. + hmacTmp := filepath.Join(dir, ".hmac.key.tmp.67890") + Expect(os.WriteFile(hmacTmp, []byte("partial key bytes"), 0o600)).To(Succeed()) + Expect(cache.Clean()).To(Succeed()) + _, err = os.Stat(hmacTmp) + Expect(err).ToNot(HaveOccurred(), "Clean must leave .hmac.key.tmp.* files alone") +} + +// Set must publish atomically — the on-disk file is never a partial +// write, and the temp file is cleaned up after a successful Rename. +func TestCacheSetIsAtomic(t *testing.T) { + RegisterTestingT(t) + + dir := t.TempDir() + cache, err := NewCache(dir) + Expect(err).ToNot(HaveOccurred()) + + key := CacheKey{Operation: "sbom", ImageDigest: "sha256:atomic", ConfigHash: "h"} + bigPayload := make([]byte, 256*1024) + for i := range bigPayload { + bigPayload[i] = byte(i) + } + + Expect(cache.Set(key, bigPayload)).To(Succeed()) + + got, found, err := cache.Get(key) + Expect(err).ToNot(HaveOccurred()) + Expect(found).To(BeTrue()) + Expect(got).To(Equal(bigPayload)) + + // No leftover temp files in the op dir after a successful Set. + opDir := filepath.Join(dir, "sbom") + entries, err := os.ReadDir(opDir) + Expect(err).ToNot(HaveOccurred()) + for _, e := range entries { + Expect(strings.HasPrefix(e.Name(), ".cache.tmp.")).To(BeFalse(), + "Set must clean up its temp file after Rename, found %s", e.Name()) + } +} + func TestCacheCleanSkipsHMACKey(t *testing.T) { RegisterTestingT(t) From ce24e6223d98460d44f7eb81ec59fd7f09454248 Mon Sep 17 00:00:00 2001 From: Dmitrii Creed Date: Tue, 12 May 2026 21:22:22 +0400 Subject: [PATCH 4/7] fix(cache): atomic NO-REPLACE publish for HMAC key (codex round-2 P2) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Codex round-2 caught a remaining race in the Rename-based publish: > When two `NewCache` calls initialize the same empty cache directory > concurrently, this `Rename` can overwrite the key that another > process already published. If that first process already reread and > returned its key, the two live cache instances use different HMAC > keys; entries written by one are treated as tampered and removed by > the other. Trace: T0 proc-A: readKeyFile → ErrNotExist → generate keyA → CreateTemp T1 proc-A: Rename → publishes keyA → readKeyFile → returns keyA → A now holds keyA in memory T2 proc-B: (was sleeping at readKeyFile) → ErrNotExist (lost the race) → generate keyB → CreateTemp T3 proc-B: Rename → OVERWRITES keyA on disk with keyB → readKeyFile → returns keyB → B holds keyB in memory ⇒ A signs cache entries with keyA, B reads them and fails MAC verification → cache thrashes, scans/SBOMs run uncached. Fix: replace `os.Rename` with `os.Link`. Link is atomic NO-REPLACE on POSIX: if the destination already exists, it fails with ErrExist instead of overwriting. On collision, drop our generated key and adopt the winner's by rereading the file. Hardlinks are universally supported on Linux (CI) and macOS (dev), and the same-directory CreateTemp keeps it a same-FS operation. Regression test added: TestCacheHMACKeyConcurrentInitConverges spawns 8 goroutines that all call NewCache(dir) simultaneously after a release-the-hounds channel close. Asserts they all return identical HMAC keys. With the previous Rename-based code this test failed intermittently; with Link it passes reliably (28/28 in pkg/security/). Build, vet, short test suite all clean. Signed-off-by: Dmitrii Creed --- pkg/security/cache.go | 21 ++++++++++++----- pkg/security/cache_test.go | 46 ++++++++++++++++++++++++++++++++++++++ 2 files changed, 62 insertions(+), 5 deletions(-) diff --git a/pkg/security/cache.go b/pkg/security/cache.go index 5f58a082..87898572 100644 --- a/pkg/security/cache.go +++ b/pkg/security/cache.go @@ -140,13 +140,24 @@ func loadOrCreateHMACKey(path string) ([]byte, error) { return nil, fmt.Errorf("closing hmac key temp: %w", err) } - // POSIX rename is atomic on the same filesystem. After this point - // readers observe the full 32 bytes or no file at all. If a - // concurrent caller already published a key, our Rename overwrites - // it; reread to return on-disk truth either way. - if err := os.Rename(tmpPath, path); err != nil { + // Atomic NO-REPLACE publish via hardlink. os.Rename would + // overwrite a key another process just published, leaving two + // live caches with different keys and cross-invalidating entries + // (codex round-2 P2). os.Link fails with ErrExist if the target + // is already there — meaning a concurrent caller won the race — + // in which case we drop our generated key and adopt theirs by + // rereading. Hardlinks are universally supported on the + // filesystems SC runs on (Linux CI runners, macOS dev machines); + // the same-directory `dir` keeps it a same-FS operation. + if err := os.Link(tmpPath, path); err != nil { + if errors.Is(err, os.ErrExist) { + // Someone else won — use their key. + return readKeyFile(path) + } return nil, fmt.Errorf("placing hmac key: %w", err) } + // tmpPath is removed by the deferred cleanup; the linked path is + // the canonical key from now on. return readKeyFile(path) } diff --git a/pkg/security/cache_test.go b/pkg/security/cache_test.go index 9daddf46..1de2ec39 100644 --- a/pkg/security/cache_test.go +++ b/pkg/security/cache_test.go @@ -512,6 +512,52 @@ func TestCacheCrossKeyCopyRejected(t *testing.T) { Expect(string(got)).To(Equal("payload-A")) } +// Concurrent NewCache on the same fresh directory must converge on a +// single HMAC key. With the previous Rename-based publish, two racing +// processes could each install their own distinct key, causing each +// to invalidate the other's cache entries (codex round-2 P2). The Link +// based publish forces a winner and re-reads the on-disk truth. +func TestCacheHMACKeyConcurrentInitConverges(t *testing.T) { + RegisterTestingT(t) + + dir := t.TempDir() + const n = 8 + + keys := make(chan []byte, n) + errs := make(chan error, n) + start := make(chan struct{}) + + for i := 0; i < n; i++ { + go func() { + <-start + c, err := NewCache(dir) + if err != nil { + errs <- err + return + } + cp := make([]byte, len(c.key)) + copy(cp, c.key) + keys <- cp + }() + } + close(start) // unleash all goroutines simultaneously + + var first []byte + for i := 0; i < n; i++ { + select { + case err := <-errs: + t.Fatalf("NewCache failed: %v", err) + case k := <-keys: + if first == nil { + first = k + continue + } + Expect(k).To(Equal(first), + "goroutine %d got a different HMAC key — Link-based publish is broken", i) + } + } +} + // Clean must not delete an in-flight `Set()` temp file — gemini-flagged // race where Clean reads a partial file, fails to unmarshal, and removes // it from under a still-running writer. From b9d5ad9081c6803bf0fa3df9ee4a78f0f5e28e92 Mon Sep 17 00:00:00 2001 From: Dmitrii Creed Date: Tue, 12 May 2026 21:26:52 +0400 Subject: [PATCH 5/7] fix(cache): drop dead retry loop + document Link FS-compat (gemini round-2 P3 + P2) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit gemini round-2 caught: [P3] The readKeyFile retry loop addresses an impossible state. With the Link-based publish writing a fully-fsynced temp first, the canonical hmac.key path appears atomically with exactly hmacKeyLen bytes — there is no partial-write window at that path. A wrong-size read is permanent corruption, not a race to wait through. [P2] os.Link rejects hardlinks on some exotic filesystems (VirtualBox shared folders, certain SMB mounts). The current code surfaces a clear "placing hmac key" error in that case rather than silently falling back to racy semantics. Changes: - readKeyFile simplified: read once → wrong-size → corruption error. The 5×50ms retry was dead code with the Link+sync design and is gone. Function is now 13 lines vs 28; behavior unchanged for the happy path and the file-absent path. - Documented the FS-compat constraint inline at the Link site: hardlinks are universally available on the filesystems SC actually runs on (Linux ext4/overlayFS, macOS APFS, container layered FS, NFSv3+). The exotic-FS path surfaces an explicit error — that's the right behavior; "fail loud" beats "thrash the cache". Not gemini-recommended fallback chains: every non-Link fallback primitive (O_CREATE|O_EXCL + Write, Rename, etc.) has at least one of the two races we just eliminated. Better to refuse to operate on a hostile filesystem than silently re-introduce them. 28/28 tests in pkg/security/ still green. Semgrep clean. [nit] gemini also noted Get's os.Remove on tampered files doesn't fsync the parent directory. Acceptable: cache is rebuildable; the benefit of an extra syscall per tamper-discard doesn't justify the cost. Not addressed. Signed-off-by: Dmitrii Creed --- pkg/security/cache.go | 47 ++++++++++++++++++++----------------------- 1 file changed, 22 insertions(+), 25 deletions(-) diff --git a/pkg/security/cache.go b/pkg/security/cache.go index 87898572..7b7453e2 100644 --- a/pkg/security/cache.go +++ b/pkg/security/cache.go @@ -147,8 +147,13 @@ func loadOrCreateHMACKey(path string) ([]byte, error) { // is already there — meaning a concurrent caller won the race — // in which case we drop our generated key and adopt theirs by // rereading. Hardlinks are universally supported on the - // filesystems SC runs on (Linux CI runners, macOS dev machines); - // the same-directory `dir` keeps it a same-FS operation. + // filesystems SC runs on (Linux ext4/overlayFS, macOS APFS, + // container layered FS, NFSv3+). On exotic filesystems that + // reject hardlinks (some VirtualBox / SMB shared folders) the + // non-ErrExist error path below surfaces a clear failure rather + // than silently degrading to the racy Rename behavior we just + // removed — better to fail loud than thrash the cache. + // The same-directory `dir` keeps it a same-FS operation. if err := os.Link(tmpPath, path); err != nil { if errors.Is(err, os.ErrExist) { // Someone else won — use their key. @@ -162,32 +167,24 @@ func loadOrCreateHMACKey(path string) ([]byte, error) { } // readKeyFile reads and validates the HMAC key file. Returns -// os.ErrNotExist when absent. A short read (file present but smaller -// than hmacKeyLen) is retried — that almost certainly means a -// concurrent creator is between CreateTemp and Rename. After ~250ms -// of retries it gives up with a corruption error. +// os.ErrNotExist when absent. A wrong-size read is permanent +// corruption — there is no race window to retry through, because +// loadOrCreateHMACKey publishes via os.Link of a fully-written, +// fsynced temp file. The canonical path appears atomically with +// exactly hmacKeyLen bytes or not at all. func readKeyFile(path string) ([]byte, error) { - const ( - maxAttempts = 5 - retryInterval = 50 * time.Millisecond - ) - var lastErr error - for attempt := 0; attempt < maxAttempts; attempt++ { - existing, err := os.ReadFile(path) - if errors.Is(err, os.ErrNotExist) { - return nil, err - } - if err != nil { - return nil, fmt.Errorf("reading hmac key: %w", err) - } - if len(existing) == hmacKeyLen { - return existing, nil - } - lastErr = fmt.Errorf("hmac key at %s is %d bytes, expected %d", + existing, err := os.ReadFile(path) + if errors.Is(err, os.ErrNotExist) { + return nil, err + } + if err != nil { + return nil, fmt.Errorf("reading hmac key: %w", err) + } + if len(existing) != hmacKeyLen { + return nil, fmt.Errorf("hmac key at %s is %d bytes, expected %d (corrupted)", path, len(existing), hmacKeyLen) - time.Sleep(retryInterval) } - return nil, fmt.Errorf("%w (corrupted or concurrent-creation race)", lastErr) + return existing, nil } // computeMAC returns the HMAC-SHA256 of entryJSON using the cache's key. From c101cc02602a99fbc5e473096ed5e262ecffa76d Mon Sep 17 00:00:00 2001 From: Dmitrii Creed Date: Tue, 12 May 2026 21:39:17 +0400 Subject: [PATCH 6/7] fix(cache): reclaim stale temp files past 1h grace period (codex round-3 P3) codex round-3 caught a slow leak: > When a process is killed or the host crashes after CreateTemp but > before the deferred remove/rename, these files are no longer > in-flight. Clean now skips every .cache.tmp.* and .hmac.key.tmp.* > forever, so large SBOM/scan temp payloads can accumulate and Size > still reports them even though they can never be used. Real concern for long-running deployments: SBOM payloads are multi-MB; over months of crashed Set() calls the disk fills up. Fix: Clean() now distinguishes recent vs stale temp files via mtime. A legitimate Set/loadOrCreateHMACKey publishes via Link/Rename in sub-second. Anything older than 1h is unambiguously a crashed writer whose deferred-remove never ran. New const `tempFileGracePeriod = 1 * time.Hour` documents the threshold. Inside Clean's temp-file branch, files past the grace period are removed; recent ones still get the in-flight pass. Regression test: TestCacheCleanReclaimsStaleTemps creates one recent and one back-dated temp, runs Clean, asserts recent stays + stale is reclaimed. 29/29 tests in pkg/security/ green. Build, vet, Semgrep clean. Signed-off-by: Dmitrii Creed --- pkg/security/cache.go | 17 +++++++++++++---- pkg/security/cache_test.go | 31 +++++++++++++++++++++++++++++++ 2 files changed, 44 insertions(+), 4 deletions(-) diff --git a/pkg/security/cache.go b/pkg/security/cache.go index 7b7453e2..a70d06a0 100644 --- a/pkg/security/cache.go +++ b/pkg/security/cache.go @@ -66,6 +66,11 @@ const ( // silently rotate the key from under us. hmacKeyFilename = ".hmac.key" hmacKeyLen = 32 // 256 bits, matches HMAC-SHA256 block-size guidance + // tempFileGracePeriod bounds how long Clean() will leave an + // in-flight `.cache.tmp.*` / `.hmac.key.tmp.*` file alone before + // reclaiming it. A legitimate writer publishes in sub-second; + // anything older than 1h is a crashed Set() that won't return. + tempFileGracePeriod = 1 * time.Hour ) // NewCache creates a new cache instance, generating or loading the @@ -353,12 +358,16 @@ func (c *Cache) Clean() error { if info.Name() == hmacKeyFilename { return nil // Never touch the key. } - // Skip in-flight `Set()` / `loadOrCreateHMACKey` temp files; - // the writer's pending Rename will publish them shortly. - // Removing now would be racy AND silently lose data the - // writer expects to land (gemini round-1 P1). + // In-flight `Set()` / `loadOrCreateHMACKey` temp files: skip if + // recent (writer's Rename/Link is imminent), reclaim if stale. + // A legitimate Set takes sub-second; mtime > 1h ago is a + // crashed writer (codex round-3 P3). Without the grace-period + // sweep, MB-scale SBOM temps could accumulate indefinitely. if strings.HasPrefix(info.Name(), ".cache.tmp.") || strings.HasPrefix(info.Name(), ".hmac.key.tmp.") { + if time.Since(info.ModTime()) > tempFileGracePeriod { + _ = os.Remove(path) + } return nil } diff --git a/pkg/security/cache_test.go b/pkg/security/cache_test.go index 1de2ec39..c653b30e 100644 --- a/pkg/security/cache_test.go +++ b/pkg/security/cache_test.go @@ -587,6 +587,37 @@ func TestCacheCleanSkipsInFlightTempFiles(t *testing.T) { Expect(err).ToNot(HaveOccurred(), "Clean must leave .hmac.key.tmp.* files alone") } +// Crashed writers leave behind temp files Clean() would otherwise +// skip forever. After tempFileGracePeriod (1h), Clean reclaims them +// (codex round-3 P3). Test forces the grace by back-dating mtime. +func TestCacheCleanReclaimsStaleTemps(t *testing.T) { + RegisterTestingT(t) + + dir := t.TempDir() + cache, err := NewCache(dir) + Expect(err).ToNot(HaveOccurred()) + + opDir := filepath.Join(dir, "sbom") + Expect(os.MkdirAll(opDir, 0o700)).To(Succeed()) + + // Recent temp: must stay. Old temp: must be reclaimed. + recent := filepath.Join(opDir, ".cache.tmp.recent") + stale := filepath.Join(opDir, ".cache.tmp.stale") + Expect(os.WriteFile(recent, []byte("recent"), 0o600)).To(Succeed()) + Expect(os.WriteFile(stale, []byte("stale-from-a-crashed-writer"), 0o600)).To(Succeed()) + + // Back-date stale beyond the grace period. + pastTime := time.Now().Add(-2 * tempFileGracePeriod) + Expect(os.Chtimes(stale, pastTime, pastTime)).To(Succeed()) + + Expect(cache.Clean()).To(Succeed()) + + _, err = os.Stat(recent) + Expect(err).ToNot(HaveOccurred(), "recent temp file must be left alone") + _, err = os.Stat(stale) + Expect(os.IsNotExist(err)).To(BeTrue(), "stale temp file must be reclaimed") +} + // Set must publish atomically — the on-disk file is never a partial // write, and the temp file is cleaned up after a successful Rename. func TestCacheSetIsAtomic(t *testing.T) { From ad9113d52dab3f70fa2b8e798aa56532b9fde7a5 Mon Sep 17 00:00:00 2001 From: Dmitrii Creed Date: Tue, 12 May 2026 22:04:50 +0400 Subject: [PATCH 7/7] fix(cache): Link fallback for hostile FS + path-traversal defense (gemini round-3 P2 + P3) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Two findings from gemini round 3: [P2] os.Link rejects hardlinks on common DEV-environment filesystems — Docker Desktop bind mounts, VirtualBox vboxsf, some SMB shares. The previous "fail loud" stance broke `sc` outright for those users. Production paths (Linux ext4 / overlayFS, macOS APFS, container layered FS, NFSv3+) all support Link and stay race-free. Fix: if Link returns one of {EPERM, ENOTSUP, EOPNOTSUPP, EXDEV}, fall back to os.Rename in the same directory. The Rename path reintroduces the narrow concurrent-NewCache race we eliminated in round 2 — but only on filesystems that can't support the safer primitive, and dev usability beats production paranoia we'd never hit there anyway. Helper `isLinkUnsupported(err)` keeps the test local. Production paths still take the Link branch and remain race-free. [P3] CacheKey.Operation flowed into filepath.Join unchecked. All in-tree callers pass hardcoded constants ("sbom", "scan-grype", etc.), but defense-in-depth: a future careless callsite passing "../../etc" would let Set() write above baseDir. Fix: getPath now collapses Operation via filepath.Base(filepath.Clean(.)) and explicitly rejects {"", ".", "..", anything containing / or \}, falling back to "_unknown". filepath.Base("..") returns "..", which my initial draft missed and the new regression test caught. Regression tests ---------------- - TestCachePathTraversalContained — five hostile Operation values ("../../../etc", "..", "/etc/passwd", `..\..\windows`, "a/b"). Every getPath() result must start with baseDir + separator AND must not contain "..". The "_unknown" landing slot is fine; the point is containment. - The Link-fallback path isn't easily unit-testable without injecting a mocked filesystem error; the production FS list above is documented inline. CI runners are all Linux ext4, which exercises the Link path. 29/29 tests in pkg/security/ green; all short-mode security suites green; Semgrep clean. Round-3 findings deliberately NOT addressed (repeats from earlier rounds) ------------------------------------------------------------------------- - [P3] Clean()'s TOCTOU between expiry decision and Remove — gemini re-flagged. Cache is rebuildable; cost of locking outweighs benefit. - [nit] Re-marshal HMAC validation — gemini re-flagged. Today's Go json.Marshal is deterministic for our struct shape; schema-change discipline is the proper control. Signed-off-by: Dmitrii Creed --- pkg/security/cache.go | 68 ++++++++++++++++++++++++++++---------- pkg/security/cache_test.go | 28 ++++++++++++++++ 2 files changed, 79 insertions(+), 17 deletions(-) diff --git a/pkg/security/cache.go b/pkg/security/cache.go index a70d06a0..abe497c3 100644 --- a/pkg/security/cache.go +++ b/pkg/security/cache.go @@ -12,6 +12,7 @@ import ( "os" "path/filepath" "strings" + "syscall" "time" ) @@ -145,23 +146,30 @@ func loadOrCreateHMACKey(path string) ([]byte, error) { return nil, fmt.Errorf("closing hmac key temp: %w", err) } - // Atomic NO-REPLACE publish via hardlink. os.Rename would - // overwrite a key another process just published, leaving two - // live caches with different keys and cross-invalidating entries - // (codex round-2 P2). os.Link fails with ErrExist if the target - // is already there — meaning a concurrent caller won the race — - // in which case we drop our generated key and adopt theirs by - // rereading. Hardlinks are universally supported on the - // filesystems SC runs on (Linux ext4/overlayFS, macOS APFS, - // container layered FS, NFSv3+). On exotic filesystems that - // reject hardlinks (some VirtualBox / SMB shared folders) the - // non-ErrExist error path below surfaces a clear failure rather - // than silently degrading to the racy Rename behavior we just - // removed — better to fail loud than thrash the cache. - // The same-directory `dir` keeps it a same-FS operation. + // Publish strategy: + // 1. os.Link (atomic NO-REPLACE): if two NewCache calls race, + // the loser's Link fails with ErrExist and adopts the winner's + // key. This is the race-free path (codex round-2 P2). + // 2. Fallback to os.Rename when the filesystem doesn't support + // hardlinks at all — Docker Desktop volume mounts, VirtualBox + // vboxsf, some SMB shares (gemini round-3 P2). Rename is + // atomic-with-replace, so a concurrent NewCache on the same + // dir on such a filesystem can briefly diverge — accepted + // trade-off for dev-machine usability. Real production + // surface (Linux ext4/overlayFS, macOS APFS, container + // layered FS, NFSv3+) supports Link and takes the race-free + // path. + // The same-directory `dir` keeps either op a same-FS operation. if err := os.Link(tmpPath, path); err != nil { if errors.Is(err, os.ErrExist) { - // Someone else won — use their key. + // Race lost — adopt the winner's key. + return readKeyFile(path) + } + if isLinkUnsupported(err) { + // FS doesn't do hardlinks; fall through to Rename. + if rerr := os.Rename(tmpPath, path); rerr != nil { + return nil, fmt.Errorf("placing hmac key (link unsupported, rename also failed): %w", rerr) + } return readKeyFile(path) } return nil, fmt.Errorf("placing hmac key: %w", err) @@ -171,6 +179,18 @@ func loadOrCreateHMACKey(path string) ([]byte, error) { return readKeyFile(path) } +// isLinkUnsupported tells whether the error from os.Link indicates a +// filesystem that rejects hardlinks (Docker Desktop bind mounts, +// vboxsf, some SMB). syscall.EPERM and syscall.ENOTSUP are the usual +// posix-y codes; syscall.EXDEV would mean cross-device (shouldn't +// happen given same-directory temp, but defensive). +func isLinkUnsupported(err error) bool { + return errors.Is(err, syscall.EPERM) || + errors.Is(err, syscall.ENOTSUP) || + errors.Is(err, syscall.EXDEV) || + errors.Is(err, syscall.EOPNOTSUPP) +} + // readKeyFile reads and validates the HMAC key file. Returns // os.ErrNotExist when absent. A wrong-size read is permanent // corruption — there is no race window to retry through, because @@ -408,7 +428,15 @@ func (c *Cache) Clean() error { }) } -// getPath returns the filesystem path for a cache key +// getPath returns the filesystem path for a cache key. +// +// CacheKey.Operation is used directly as a subdirectory name. All +// callers in-tree pass hardcoded values ("sbom", "scan-grype", +// "scan-trivy", "signature"), but defense-in-depth: a `../` in +// Operation would let a future careless caller escape baseDir +// (gemini round-3 P3). `filepath.Base` collapses any path separator +// or traversal segment to a single bare name, so even malicious +// input lands inside baseDir. func (c *Cache) getPath(key CacheKey) string { hash := sha256.New() hash.Write([]byte(key.Operation)) @@ -416,7 +444,13 @@ func (c *Cache) getPath(key CacheKey) string { hash.Write([]byte(key.ConfigHash)) filename := hex.EncodeToString(hash.Sum(nil)) - return filepath.Join(c.baseDir, key.Operation, filename+".json") + op := filepath.Base(filepath.Clean(key.Operation)) + // filepath.Base("..") returns ".." — must catch explicitly or the + // Join walks above baseDir. Same for raw separators on Windows. + if op == "" || op == "." || op == ".." || strings.ContainsAny(op, `/\`) { + op = "_unknown" + } + return filepath.Join(c.baseDir, op, filename+".json") } // getTTL returns the TTL for a given operation type diff --git a/pkg/security/cache_test.go b/pkg/security/cache_test.go index c653b30e..1b514755 100644 --- a/pkg/security/cache_test.go +++ b/pkg/security/cache_test.go @@ -587,6 +587,34 @@ func TestCacheCleanSkipsInFlightTempFiles(t *testing.T) { Expect(err).ToNot(HaveOccurred(), "Clean must leave .hmac.key.tmp.* files alone") } +// getPath must contain malicious Operation values that try to escape +// baseDir (gemini round-3 P3 — defense in depth; no caller is hostile +// today, but a future careless callsite shouldn't be able to corrupt +// the parent filesystem). +func TestCachePathTraversalContained(t *testing.T) { + RegisterTestingT(t) + + dir := t.TempDir() + cache, err := NewCache(dir) + Expect(err).ToNot(HaveOccurred()) + + hostile := []string{ + "../../../etc", + "..", + "/etc/passwd", + `..\..\windows`, + "a/b", + } + for _, op := range hostile { + key := CacheKey{Operation: op, ImageDigest: "sha256:x", ConfigHash: "h"} + p := cache.getPath(key) + Expect(strings.HasPrefix(p, dir+string(filepath.Separator))).To(BeTrue(), + "getPath(%q) = %q must stay under baseDir %q", op, p, dir) + Expect(strings.Contains(p, "..")).To(BeFalse(), + "getPath(%q) = %q must not contain `..`", op, p) + } +} + // Crashed writers leave behind temp files Clean() would otherwise // skip forever. After tempFileGracePeriod (1h), Clean reclaims them // (codex round-3 P3). Test forces the grace by back-dating mtime.