diff --git a/.github/workflows/generate-clike-stdlib.yml b/.github/workflows/generate-clike-stdlib.yml new file mode 100644 index 00000000..e50e13b4 --- /dev/null +++ b/.github/workflows/generate-clike-stdlib.yml @@ -0,0 +1,218 @@ +name: Generate C/C++ Stdlib Registries + +# Generates C and C++ stdlib registries for Linux, Windows, and Darwin and +# uploads them to Cloudflare R2 (the same backend that serves the Go stdlib +# registries). Three jobs run in parallel: +# +# - generate-linux ubuntu-latest with libc6-dev + libstdc++-13-dev +# - generate-windows ubuntu-latest with mingw-w64 (Win32 + mingw libstdc++) +# - generate-darwin macos-latest using xcrun's Command Line Tools SDK +# +# The publish job collects the six manifest sets and uploads them to R2 only +# when all three R2 secrets are present — running this workflow on a fork without +# secrets still validates that generation works. +# +# Trigger: push to main when generator code or overlays change, or on demand +# via workflow_dispatch. 
+ +on: + push: + branches: [main] + paths: + - 'sast-engine/tools/generate_clike_stdlib_registry.go' + - 'sast-engine/tools/internal/clikeextract/**' + - 'sast-engine/tools/c_stdlib_overlay.yaml' + - 'sast-engine/tools/cpp_stdlib_overlay.yaml' + - '.github/workflows/generate-clike-stdlib.yml' + workflow_dispatch: + +permissions: + contents: read + +jobs: + generate-linux: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v6 + - uses: actions/setup-go@v6 + with: + go-version: '1.23' + cache: false + + - name: Install Linux stdlib headers + run: | + sudo apt-get update + sudo apt-get install -y libc6-dev libstdc++-13-dev + + - name: Generate Linux C registry + run: | + cd sast-engine + go run -tags cpf_generate_stdlib_registry ./tools/generate_clike_stdlib_registry.go \ + --target=linux --language=c \ + --output-dir=/tmp/clike-out/linux/c/v1 + + - name: Generate Linux C++ registry + run: | + cd sast-engine + go run -tags cpf_generate_stdlib_registry ./tools/generate_clike_stdlib_registry.go \ + --target=linux --language=cpp \ + --output-dir=/tmp/clike-out/linux/cpp/v1 + + - name: Upload artifact + uses: actions/upload-artifact@v4 + with: + name: clike-registries-linux + path: /tmp/clike-out/linux/ + retention-days: 7 + + generate-windows: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v6 + - uses: actions/setup-go@v6 + with: + go-version: '1.23' + cache: false + + - name: Install mingw-w64 (Win32 + libstdc++ headers on Linux) + run: | + sudo apt-get update + sudo apt-get install -y mingw-w64 g++-mingw-w64 + + - name: Generate Windows C registry + run: | + cd sast-engine + go run -tags cpf_generate_stdlib_registry ./tools/generate_clike_stdlib_registry.go \ + --target=windows --language=c \ + --output-dir=/tmp/clike-out/windows/c/v1 + + - name: Generate Windows C++ registry + run: | + cd sast-engine + go run -tags cpf_generate_stdlib_registry ./tools/generate_clike_stdlib_registry.go \ + --target=windows --language=cpp \ + 
--output-dir=/tmp/clike-out/windows/cpp/v1 + + - name: Upload artifact + uses: actions/upload-artifact@v4 + with: + name: clike-registries-windows + path: /tmp/clike-out/windows/ + retention-days: 7 + + generate-darwin: + runs-on: macos-latest + steps: + - uses: actions/checkout@v6 + - uses: actions/setup-go@v6 + with: + go-version: '1.23' + cache: false + + - name: Verify SDK availability + run: | + xcrun --show-sdk-path + ls /Library/Developer/CommandLineTools/SDKs/MacOSX.sdk/usr/include/stdio.h + ls /Library/Developer/CommandLineTools/usr/include/c++/v1/vector + + - name: Generate Darwin C registry + run: | + cd sast-engine + go run -tags cpf_generate_stdlib_registry ./tools/generate_clike_stdlib_registry.go \ + --target=darwin --language=c \ + --output-dir=/tmp/clike-out/darwin/c/v1 + + - name: Generate Darwin C++ registry + run: | + cd sast-engine + go run -tags cpf_generate_stdlib_registry ./tools/generate_clike_stdlib_registry.go \ + --target=darwin --language=cpp \ + --output-dir=/tmp/clike-out/darwin/cpp/v1 + + - name: Upload artifact + uses: actions/upload-artifact@v4 + with: + name: clike-registries-darwin + path: /tmp/clike-out/darwin/ + retention-days: 7 + + publish: + needs: [generate-linux, generate-windows, generate-darwin] + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v6 + + - name: Download all artifacts + uses: actions/download-artifact@v4 + with: + path: /tmp/clike-out + pattern: clike-registries-* + merge-multiple: false + + - name: Stage registries into the expected layout + run: | + mkdir -p /tmp/clike-staged + for plat in linux windows darwin; do + mkdir -p /tmp/clike-staged/$plat + if [ -d "/tmp/clike-out/clike-registries-$plat/$plat" ]; then + cp -R /tmp/clike-out/clike-registries-$plat/$plat/* /tmp/clike-staged/$plat/ + fi + done + + - name: Verify all 6 manifest.json files + run: | + for combo in linux/c linux/cpp windows/c windows/cpp darwin/c darwin/cpp; do + path="/tmp/clike-staged/$combo/v1/manifest.json" + test -f 
"$path" || { echo "Missing: $path"; exit 1; } + echo "OK $combo/v1/manifest.json ($(wc -c < "$path") bytes)" + done + + # Cloudflare R2 upload runs only when both secrets are configured. On a + # fork or before the secrets are provisioned the workflow still passes + # generation + verification and skips the upload — keeps the pipeline + # green during operator setup. + - name: Upload to Cloudflare R2 + if: ${{ env.HAS_R2_CREDS == 'true' }} + env: + HAS_R2_CREDS: ${{ secrets.R2_ACCOUNT_ID != '' && secrets.R2_ACCESS_KEY_ID != '' && secrets.R2_SECRET_ACCESS_KEY != '' }} + R2_ACCOUNT_ID: ${{ secrets.R2_ACCOUNT_ID }} + R2_ACCESS_KEY_ID: ${{ secrets.R2_ACCESS_KEY_ID }} + R2_SECRET_ACCESS_KEY: ${{ secrets.R2_SECRET_ACCESS_KEY }} + run: | + export AWS_ACCESS_KEY_ID="$R2_ACCESS_KEY_ID" + export AWS_SECRET_ACCESS_KEY="$R2_SECRET_ACCESS_KEY" + R2_ENDPOINT="https://${R2_ACCOUNT_ID}.r2.cloudflarestorage.com" + + for combo in linux/c linux/cpp windows/c windows/cpp darwin/c darwin/cpp; do + echo "Uploading $combo..." 
+ aws s3 sync "/tmp/clike-staged/$combo/v1/" \ + "s3://code-pathfinder-assets/registries/$combo/v1/" \ + --endpoint-url "$R2_ENDPOINT" \ + --delete \ + --content-type "application/json" \ + --cache-control "public, max-age=86400" + done + + - name: Verify CDN URLs + if: ${{ env.HAS_R2_CREDS == 'true' }} + env: + HAS_R2_CREDS: ${{ secrets.R2_ACCOUNT_ID != '' && secrets.R2_ACCESS_KEY_ID != '' && secrets.R2_SECRET_ACCESS_KEY != '' }} + run: | + for combo in linux/c linux/cpp windows/c windows/cpp darwin/c darwin/cpp; do + url="https://assets.codepathfinder.dev/registries/$combo/v1/manifest.json" + status=$(curl -s -o /dev/null -w "%{http_code}" "$url") + echo "$status $url" + test "$status" = "200" || { echo "URL not reachable: $url"; exit 1; } + done + + - name: Summary + run: | + echo "## C/C++ Stdlib Registry Generation" >> "$GITHUB_STEP_SUMMARY" + echo "" >> "$GITHUB_STEP_SUMMARY" + for combo in linux/c linux/cpp windows/c windows/cpp darwin/c darwin/cpp; do + mf="/tmp/clike-staged/$combo/v1/manifest.json" + if [ -f "$mf" ]; then + size=$(wc -c < "$mf") + echo "- ${combo} manifest.json: ${size} bytes" >> "$GITHUB_STEP_SUMMARY" + fi + done diff --git a/sast-engine/cmd/scan_stdlib_test.go b/sast-engine/cmd/scan_stdlib_test.go index 811fe9b5..9630fa55 100644 --- a/sast-engine/cmd/scan_stdlib_test.go +++ b/sast-engine/cmd/scan_stdlib_test.go @@ -119,11 +119,19 @@ func TestInitClikeStdlib_BarePathTreatedAsFile(t *testing.T) { require.NotNil(t, cfg.cppLoader) } -func TestInitClikeStdlib_HTTPSchemeIsStubbed(t *testing.T) { - // HTTP path returns a constructed loader, but LoadManifest fails - // with the PR-03 stub error. Both loaders should be nil after the - // failed load. 
- cfg, wired := initClikeStdlib(t.TempDir(), "linux", "https://example.test/registries", newTestLogger()) +func TestInitClikeStdlib_HTTPSchemeFailsGracefullyOnUnreachableHost(t *testing.T) { + // PR-03 wires HTTP up — when the URL doesn't resolve and there's no + // disk cache to fall back on, both loaders fail to load and stay + // nil. The scan continues under Phase 1 behavior. + // + // Point the cache at a fresh temp dir so a previously populated + // developer cache (e.g. from running another test) cannot serve a + // stale manifest and turn this into a false-positive success. + t.Setenv("XDG_CACHE_HOME", t.TempDir()) + t.Setenv("HOME", t.TempDir()) + t.Setenv("LOCALAPPDATA", t.TempDir()) + + cfg, wired := initClikeStdlib(t.TempDir(), "linux", "http://127.0.0.1:1/registries", newTestLogger()) assert.False(t, wired) assert.Nil(t, cfg.cLoader) assert.Nil(t, cfg.cppLoader) diff --git a/sast-engine/graph/callgraph/registry/c_stdlib_remote.go b/sast-engine/graph/callgraph/registry/c_stdlib_remote.go index d2fdc71d..0e9dfc74 100644 --- a/sast-engine/graph/callgraph/registry/c_stdlib_remote.go +++ b/sast-engine/graph/callgraph/registry/c_stdlib_remote.go @@ -107,11 +107,57 @@ func (r *CStdlibRegistryRemote) loadManifestFromFile(logger core.CStdlibLogger) return nil } -// loadManifestFromHTTP is the PR-03 hook. PR-02 ships it as a deliberate stub -// so the type satisfies CStdlibLoader without any half-built network code -// shipping early. -func (r *CStdlibRegistryRemote) loadManifestFromHTTP(_ core.CStdlibLogger) error { - return errors.New("CStdlibRegistryRemote: HTTP loader not yet implemented; tracked in PR-03") +// loadManifestFromHTTP downloads the top-level manifest.json over HTTP, with +// fallback to a stale on-disk cache when the network is unreachable. 
Layout: +// +// GET //c/v1/manifest.json +// └─ on success: parse JSON, write to disk cache, populate r.manifest +// └─ on network failure: read disk cache (regardless of TTL), warn, continue +// └─ on cache miss too: surface the original network error +// +// Disk-cache writes are best-effort: a failed write logs a warning but does +// not block in-memory population (the scan still benefits from this run, the +// next run just has to re-fetch). +func (r *CStdlibRegistryRemote) loadManifestFromHTTP(logger core.CStdlibLogger) error { + url := joinURL(r.baseURL, r.platform, "c", "v1", "manifest.json") + if logger != nil { + logger.Debug("Downloading C stdlib manifest: %s", url) + } + + data, err := fetchURL(r.httpClient, url) + if err != nil { + // Network failed — try disk cache irrespective of freshness so a + // scan in a no-network environment still resolves stdlib calls. + if cached, cerr := r.diskCache.GetManifest(); cerr == nil { + if logger != nil { + logger.Warning("Network failed for %s; serving cached manifest. 
Underlying: %v", url, err) + } + r.cacheMutex.Lock() + r.manifest = cached + r.cacheMutex.Unlock() + return nil + } + return fmt.Errorf("loadManifestFromHTTP: %w", err) + } + + var manifest core.CStdlibManifest + if err := json.Unmarshal(data, &manifest); err != nil { + return fmt.Errorf("loadManifestFromHTTP: parsing manifest from %s: %w", url, err) + } + + if cerr := r.diskCache.SaveManifest(data); cerr != nil && logger != nil { + logger.Warning("Failed to save C manifest to disk cache: %v", cerr) + } + + r.cacheMutex.Lock() + r.manifest = &manifest + r.cacheMutex.Unlock() + + if logger != nil { + logger.Statistic("Loaded C stdlib manifest over HTTP: %d headers for %s", + len(manifest.Headers), r.platform) + } + return nil } // GetHeader retrieves the per-header content, fetching on first reference and @@ -179,10 +225,55 @@ func (r *CStdlibRegistryRemote) fetchHeaderFromFile(entry *core.CStdlibHeaderEnt return &h, nil } -// fetchHeaderFromHTTP is the PR-03 hook. PR-02 stub keeps the type -// satisfying its interface contract without shipping half-built network code. -func (r *CStdlibRegistryRemote) fetchHeaderFromHTTP(_ *core.CStdlibHeaderEntry) (*core.CStdlibHeader, error) { - return nil, errors.New("CStdlibRegistryRemote: HTTP fetch not yet implemented; tracked in PR-03") +// fetchHeaderFromHTTP downloads one per-header JSON over HTTP, with disk-cache +// freshness checks on the way in and stale-cache fallback on network failure. +// +// The lookup chain: +// 1. Disk cache hit AND fresh (< 24h) → return cached, no network. +// 2. Otherwise GET the entry's URL (or construct one from baseURL + entry.File +// when the manifest predates URL embedding). +// 3. On 200 OK: verify checksum (when present in the manifest), parse JSON, +// persist to disk cache, return. +// 4. On any network or parse failure: try the on-disk cache irrespective of +// freshness — a stale registry beats no resolution at all. 
+func (r *CStdlibRegistryRemote) fetchHeaderFromHTTP(entry *core.CStdlibHeaderEntry) (*core.CStdlibHeader, error) { + if r.diskCache.IsFresh(entry.File, stdlibCacheTTL) { + if cached, err := r.diskCache.GetHeader(entry.File); err == nil { + return cached, nil + } + } + + url := r.headerURL(entry) + data, err := fetchURL(r.httpClient, url) + if err != nil { + if cached, cerr := r.diskCache.GetHeader(entry.File); cerr == nil { + return cached, nil + } + return nil, fmt.Errorf("fetchHeaderFromHTTP: %w", err) + } + + if err := verifyChecksum(data, entry.Checksum); err != nil { + return nil, fmt.Errorf("fetchHeaderFromHTTP: %s: %w", entry.Header, err) + } + + var h core.CStdlibHeader + if err := json.Unmarshal(data, &h); err != nil { + return nil, fmt.Errorf("fetchHeaderFromHTTP: parsing %s: %w", url, err) + } + + _ = r.diskCache.SaveHeader(entry.File, data) // best-effort + + return &h, nil +} + +// headerURL prefers the manifest-embedded URL when present (lets the registry +// publisher point individual files at a different host or a versioned path) +// and otherwise constructs one from the loader's baseURL + entry.File. 
+func (r *CStdlibRegistryRemote) headerURL(entry *core.CStdlibHeaderEntry) string { + if entry.URL != "" { + return entry.URL + } + return joinURL(r.baseURL, r.platform, "c", "v1", entry.File) } // GetFunction is a convenience accessor: GetHeader followed by a function diff --git a/sast-engine/graph/callgraph/registry/c_stdlib_remote_test.go b/sast-engine/graph/callgraph/registry/c_stdlib_remote_test.go index 9a572705..fc5367af 100644 --- a/sast-engine/graph/callgraph/registry/c_stdlib_remote_test.go +++ b/sast-engine/graph/callgraph/registry/c_stdlib_remote_test.go @@ -233,27 +233,16 @@ func TestCStdlibRegistry_DoubleCheckLocking(t *testing.T) { } } -func TestCStdlibRegistry_HTTPMode_Stub(t *testing.T) { - r := NewCStdlibRegistryRemote("https://example.com/registries", core.PlatformLinux) +// TestCStdlibRegistry_HTTPMode_NetworkFailureNoCacheSurfacesError verifies +// that an HTTP-only loader with no on-disk cache surfaces the underlying +// network error rather than swallowing it. Uses an unreachable port on +// localhost so the test never depends on external connectivity. +func TestCStdlibRegistry_HTTPMode_NetworkFailureNoCacheSurfacesError(t *testing.T) { + r := NewCStdlibRegistryRemote("http://127.0.0.1:1/registries", core.PlatformLinux) + r.diskCache = nil // no cache → no fallback path err := r.LoadManifest(noopLogger{}) require.Error(t, err) - assert.Contains(t, err.Error(), "PR-03") -} - -func TestCStdlibRegistry_HTTPMode_FetchHeaderStub(t *testing.T) { - // Construct the HTTP loader with an in-memory manifest by going through - // a file:// loader first, then forcing the fetch path to HTTP. - dir := t.TempDir() - writeCRegistry(t, dir) - r := NewCStdlibRegistryFile(dir, core.PlatformLinux) - require.NoError(t, r.LoadManifest(noopLogger{})) - - // Switch to HTTP mode mid-flight by clearing fileBase. Tests-only — - // production code never does this. 
- r.fileBase = "" - _, err := r.GetHeader("stdio.h") // fresh header (not yet cached) - require.Error(t, err) - assert.Contains(t, err.Error(), "PR-03") + assert.Contains(t, err.Error(), "loadManifestFromHTTP") } func TestCStdlibRegistry_RemoteCtor_TrimsTrailingSlash(t *testing.T) { diff --git a/sast-engine/graph/callgraph/registry/clike_http.go b/sast-engine/graph/callgraph/registry/clike_http.go new file mode 100644 index 00000000..8a75a669 --- /dev/null +++ b/sast-engine/graph/callgraph/registry/clike_http.go @@ -0,0 +1,90 @@ +package registry + +import ( + "context" + "crypto/sha256" + "encoding/hex" + "fmt" + "io" + "net/http" + "strings" +) + +// httpFetchTimeout is the upper bound on a single HTTP request. The 30-second +// value matches the Go stdlib loader; CDN responses are typically <100ms but +// flaky networks need slack to avoid spurious cache misses. +// +// The constant lives at package scope (not on the loader struct) so test +// servers and benchmarks can rely on a deterministic value. + +// fetchURL performs a single HTTP GET and returns the response body. Non-200 +// responses are turned into errors so callers can branch on a single failure +// path. The caller-supplied client carries timeout + retry policy. +// +// Defined at package scope so both the C and C++ remote loaders share the +// same wire-format behavior — the only thing that differs between them is the +// URL path (`/c/v1/` vs `/cpp/v1/`). 
func fetchURL(client *http.Client, url string) ([]byte, error) {
	if client == nil {
		return nil, fmt.Errorf("fetchURL: nil HTTP client")
	}
	req, err := http.NewRequestWithContext(context.Background(), http.MethodGet, url, nil)
	if err != nil {
		return nil, fmt.Errorf("fetchURL: building request for %s: %w", url, err)
	}
	resp, err := client.Do(req)
	if err != nil {
		return nil, fmt.Errorf("fetchURL: GET %s: %w", url, err)
	}
	defer func() { _ = resp.Body.Close() }()

	// Check the status before buffering anything: an error page is never
	// useful to the caller, and a read failure on a non-200 response must not
	// hide the HTTP status that explains what went wrong.
	if resp.StatusCode != http.StatusOK {
		// Drain a small, bounded amount so the transport can reuse the
		// connection; the result is irrelevant because we are already failing.
		_, _ = io.CopyN(io.Discard, resp.Body, 4<<10)
		return nil, fmt.Errorf("fetchURL: GET %s: HTTP %d", url, resp.StatusCode)
	}

	body, err := io.ReadAll(resp.Body)
	if err != nil {
		return nil, fmt.Errorf("fetchURL: reading body from %s: %w", url, err)
	}
	return body, nil
}

// verifyChecksum confirms that data hashes to the expected sha256 digest.
// expected is the manifest's `checksum` field and must have the form
// "sha256:<hex>". An empty expected string disables verification — useful for
// manifests generated before the checksum field was populated, and for tests
// that don't want to compute hashes by hand.
//
// Returns an error tagged with both expected + actual so log lines stay
// useful for diagnosing CDN tampering or stale-cache + new-checksum drift.
func verifyChecksum(data []byte, expected string) error {
	if expected == "" {
		return nil
	}
	want, ok := strings.CutPrefix(expected, "sha256:")
	if !ok {
		return fmt.Errorf("verifyChecksum: unsupported checksum format %q (want sha256:)", expected)
	}
	sum := sha256.Sum256(data)
	if got := hex.EncodeToString(sum[:]); got != want {
		return fmt.Errorf("verifyChecksum: digest mismatch (want %s, got %s)", want, got)
	}
	return nil
}

// joinURL concatenates a base URL with one or more path segments using a
// single forward slash as separator.
Stripping leading/trailing slashes on +// the input avoids the duplicate-slash artifacts a naive join would emit +// (e.g. "https://x/" + "/foo" → "https://x//foo"). +func joinURL(base string, segments ...string) string { + parts := make([]string, 0, len(segments)+1) + parts = append(parts, strings.TrimRight(base, "/")) + for _, s := range segments { + s = strings.Trim(s, "/") + if s == "" { + continue + } + parts = append(parts, s) + } + return strings.Join(parts, "/") +} diff --git a/sast-engine/graph/callgraph/registry/clike_http_test.go b/sast-engine/graph/callgraph/registry/clike_http_test.go new file mode 100644 index 00000000..66a37397 --- /dev/null +++ b/sast-engine/graph/callgraph/registry/clike_http_test.go @@ -0,0 +1,394 @@ +package registry + +import ( + "crypto/sha256" + "encoding/hex" + "encoding/json" + "net/http" + "net/http/httptest" + "testing" + + "github.com/shivasurya/code-pathfinder/sast-engine/graph/callgraph/core" + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" +) + +// stdlibFixture is the canonical small registry the HTTP tests reuse. Each +// test that needs a server can call serveFixture(t, fixture) to get a live +// httptest.Server and a baseURL that the loader recognizes. 
+type stdlibFixture struct { + manifestC *core.CStdlibManifest + headerC *core.CStdlibHeader + manifestCpp *core.CStdlibManifest + headerCpp *core.CStdlibHeader +} + +func newCFixture() *stdlibFixture { + return &stdlibFixture{ + manifestC: &core.CStdlibManifest{ + SchemaVersion: "1.0.0", + RegistryVersion: "v1", + Platform: core.PlatformLinux, + Language: core.LanguageC, + Headers: []*core.CStdlibHeaderEntry{ + {Header: "stdio.h", ModuleID: "c::stdio", File: "stdio_stdlib.json"}, + }, + Statistics: &core.CStdlibStatistics{TotalHeaders: 1, TotalFunctions: 1}, + }, + headerC: &core.CStdlibHeader{ + SchemaVersion: "1.0.0", + Header: "stdio.h", + ModuleID: "c::stdio", + Language: core.LanguageC, + Functions: map[string]*core.CStdlibFunction{ + "printf": {FQN: "c::stdio::printf", ReturnType: "int", Source: core.SourceOverlay, Confidence: 1.0}, + }, + }, + manifestCpp: &core.CStdlibManifest{ + SchemaVersion: "1.0.0", + RegistryVersion: "v1", + Platform: core.PlatformLinux, + Language: core.LanguageCpp, + Headers: []*core.CStdlibHeaderEntry{ + {Header: "vector", ModuleID: "std::vector", File: "vector_stdlib.json"}, + }, + Statistics: &core.CStdlibStatistics{TotalHeaders: 1, TotalClasses: 1, TotalFunctions: 1}, + }, + headerCpp: &core.CStdlibHeader{ + SchemaVersion: "1.0.0", + Header: "vector", + ModuleID: "std::vector", + Language: core.LanguageCpp, + Classes: map[string]*core.CppStdlibClass{ + "std::vector": { + FQN: "std::vector", TypeParams: []string{"T"}, + Methods: map[string]*core.CStdlibFunction{ + "push_back": {FQN: "std::vector::push_back", ReturnType: "void", Source: core.SourceOverlay, Confidence: 1.0}, + }, + }, + }, + }, + } +} + +// serveFixture stands up a test server that mimics the CDN's URL layout: +// +// GET /registries/linux/c/v1/manifest.json +// GET /registries/linux/c/v1/stdio_stdlib.json +// GET /registries/linux/cpp/v1/manifest.json +// GET /registries/linux/cpp/v1/vector_stdlib.json +// +// The returned baseURL ends in "/registries" so the 
loader's joinURL builds +// exactly the paths above. +func serveFixture(t *testing.T, f *stdlibFixture) *httptest.Server { + t.Helper() + srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, req *http.Request) { + switch req.URL.Path { + case "/registries/linux/c/v1/manifest.json": + writeJSON(t, w, f.manifestC) + case "/registries/linux/c/v1/stdio_stdlib.json": + writeJSON(t, w, f.headerC) + case "/registries/linux/cpp/v1/manifest.json": + writeJSON(t, w, f.manifestCpp) + case "/registries/linux/cpp/v1/vector_stdlib.json": + writeJSON(t, w, f.headerCpp) + default: + http.NotFound(w, req) + } + })) + t.Cleanup(srv.Close) + return srv +} + +func writeJSON(t *testing.T, w http.ResponseWriter, v any) { + t.Helper() + data, err := json.Marshal(v) + require.NoError(t, err) + w.Header().Set("Content-Type", "application/json") + _, _ = w.Write(data) +} + +// withTempCacheRoot points the loader's disk cache at a t.TempDir for the +// duration of the test. The default $HOME/.cache root is unsuitable because +// tests must not pollute the developer's real cache. +func withTempCacheRoot(t *testing.T, r *CStdlibRegistryRemote) { + t.Helper() + r.diskCache = newDiskCacheStore(t.TempDir()) +} + +func withTempCacheRootCpp(t *testing.T, r *CppStdlibRegistryRemote) { + t.Helper() + r.diskCache = newDiskCacheStore(t.TempDir()) +} + +func TestCStdlibRegistry_HTTP_LoadManifestAndHeader(t *testing.T) { + srv := serveFixture(t, newCFixture()) + r := NewCStdlibRegistryRemote(srv.URL+"/registries", core.PlatformLinux) + withTempCacheRoot(t, r) + + require.NoError(t, r.LoadManifest(noopLogger{})) + assert.Equal(t, 1, r.HeaderCount()) + + h, err := r.GetHeader("stdio.h") + require.NoError(t, err) + require.Contains(t, h.Functions, "printf") + assert.Equal(t, "c::stdio::printf", h.Functions["printf"].FQN) + + // Second call must hit the in-memory cache (same pointer). 
+ h2, err := r.GetHeader("stdio.h") + require.NoError(t, err) + assert.Same(t, h, h2) +} + +func TestCStdlibRegistry_HTTP_NetworkFailureFallsBackToCachedManifest(t *testing.T) { + // First successful fetch writes the manifest to disk cache. + srv := serveFixture(t, newCFixture()) + cacheDir := t.TempDir() + r := NewCStdlibRegistryRemote(srv.URL+"/registries", core.PlatformLinux) + r.diskCache = newDiskCacheStore(cacheDir) + require.NoError(t, r.LoadManifest(noopLogger{})) + + // Spin up a second loader pointed at a dead URL but sharing the same cache. + r2 := NewCStdlibRegistryRemote("http://127.0.0.1:1/registries", core.PlatformLinux) + r2.diskCache = newDiskCacheStore(cacheDir) + require.NoError(t, r2.LoadManifest(noopLogger{}), "stale cache must serve when network is down") + assert.Equal(t, 1, r2.HeaderCount()) +} + +func TestCStdlibRegistry_HTTP_HeaderFallsBackToCachedOnNetworkFailure(t *testing.T) { + srv := serveFixture(t, newCFixture()) + cacheDir := t.TempDir() + r := NewCStdlibRegistryRemote(srv.URL+"/registries", core.PlatformLinux) + r.diskCache = newDiskCacheStore(cacheDir) + require.NoError(t, r.LoadManifest(noopLogger{})) + _, err := r.GetHeader("stdio.h") // populates header cache on disk + require.NoError(t, err) + + // Tear the server down — subsequent loaders must serve from cache only. 
+ srv.Close() + + r2 := NewCStdlibRegistryRemote(srv.URL+"/registries", core.PlatformLinux) + r2.diskCache = newDiskCacheStore(cacheDir) + require.NoError(t, r2.LoadManifest(noopLogger{}), "loader should pick up cached manifest") + h, err := r2.GetHeader("stdio.h") + require.NoError(t, err) + require.Contains(t, h.Functions, "printf") +} + +func TestCStdlibRegistry_HTTP_404Surfaces(t *testing.T) { + srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, _ *http.Request) { + http.NotFound(w, nil) + })) + t.Cleanup(srv.Close) + + r := NewCStdlibRegistryRemote(srv.URL+"/registries", core.PlatformLinux) + r.diskCache = nil + err := r.LoadManifest(noopLogger{}) + require.Error(t, err) + assert.Contains(t, err.Error(), "404") +} + +func TestCStdlibRegistry_HTTP_ChecksumValid(t *testing.T) { + f := newCFixture() + headerBytes, err := json.Marshal(f.headerC) + require.NoError(t, err) + sum := sha256.Sum256(headerBytes) + f.manifestC.Headers[0].Checksum = "sha256:" + hex.EncodeToString(sum[:]) + + srv := serveFixture(t, f) + r := NewCStdlibRegistryRemote(srv.URL+"/registries", core.PlatformLinux) + withTempCacheRoot(t, r) + require.NoError(t, r.LoadManifest(noopLogger{})) + _, err = r.GetHeader("stdio.h") + require.NoError(t, err) +} + +func TestCStdlibRegistry_HTTP_ChecksumMismatch(t *testing.T) { + f := newCFixture() + f.manifestC.Headers[0].Checksum = "sha256:deadbeef" // wrong hash + + srv := serveFixture(t, f) + r := NewCStdlibRegistryRemote(srv.URL+"/registries", core.PlatformLinux) + withTempCacheRoot(t, r) + require.NoError(t, r.LoadManifest(noopLogger{})) + _, err := r.GetHeader("stdio.h") + require.Error(t, err) + assert.Contains(t, err.Error(), "digest mismatch") +} + +func TestCStdlibRegistry_HTTP_ChecksumUnsupportedFormat(t *testing.T) { + f := newCFixture() + f.manifestC.Headers[0].Checksum = "md5:abcdef" + + srv := serveFixture(t, f) + r := NewCStdlibRegistryRemote(srv.URL+"/registries", core.PlatformLinux) + withTempCacheRoot(t, r) + 
require.NoError(t, r.LoadManifest(noopLogger{})) + _, err := r.GetHeader("stdio.h") + require.Error(t, err) + assert.Contains(t, err.Error(), "unsupported checksum format") +} + +func TestCStdlibRegistry_HTTP_ManifestEmbeddedURL(t *testing.T) { + f := newCFixture() + srv := serveFixture(t, f) + + // Override the entry's URL to something at a different path; the loader + // must follow it instead of constructing one from baseURL. + f.manifestC.Headers[0].URL = srv.URL + "/registries/linux/c/v1/stdio_stdlib.json" + + r := NewCStdlibRegistryRemote(srv.URL+"/registries", core.PlatformLinux) + withTempCacheRoot(t, r) + require.NoError(t, r.LoadManifest(noopLogger{})) + h, err := r.GetHeader("stdio.h") + require.NoError(t, err) + require.Contains(t, h.Functions, "printf") +} + +func TestCStdlibRegistry_HTTP_ParallelHeaderFetches(t *testing.T) { + srv := serveFixture(t, newCFixture()) + + r := NewCStdlibRegistryRemote(srv.URL+"/registries", core.PlatformLinux) + withTempCacheRoot(t, r) + require.NoError(t, r.LoadManifest(noopLogger{})) + + const workers = 50 + results := make([]*core.CStdlibHeader, workers) + errs := make([]error, workers) + done := make(chan struct{}) + for i := range workers { + go func(idx int) { + results[idx], errs[idx] = r.GetHeader("stdio.h") + done <- struct{}{} + }(i) + } + for range workers { + <-done + } + for i := range workers { + require.NoError(t, errs[i]) + } + for i := 1; i < workers; i++ { + assert.Same(t, results[0], results[i], "worker %d saw different pointer", i) + } +} + +func TestCStdlibRegistry_HTTP_NoCachePropagatesHeaderError(t *testing.T) { + srv := serveFixture(t, newCFixture()) + r := NewCStdlibRegistryRemote(srv.URL+"/registries", core.PlatformLinux) + require.NoError(t, r.LoadManifest(noopLogger{})) + srv.Close() // cause subsequent fetches to fail + r.diskCache = nil + + _, err := r.GetHeader("stdio.h") + require.Error(t, err) + assert.Contains(t, err.Error(), "fetchHeaderFromHTTP") +} + +func 
TestCStdlibRegistry_HTTP_ParseErrorOnGarbageBody(t *testing.T) { + srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, _ *http.Request) { + _, _ = w.Write([]byte("not json")) + })) + t.Cleanup(srv.Close) + + r := NewCStdlibRegistryRemote(srv.URL+"/registries", core.PlatformLinux) + withTempCacheRoot(t, r) + err := r.LoadManifest(noopLogger{}) + require.Error(t, err) + assert.Contains(t, err.Error(), "parsing manifest") +} + +// --- C++ HTTP loader ------------------------------------------------------- + +func TestCppStdlibRegistry_HTTP_LoadManifestAndHeader(t *testing.T) { + srv := serveFixture(t, newCFixture()) + r := NewCppStdlibRegistryRemote(srv.URL+"/registries", core.PlatformLinux) + withTempCacheRootCpp(t, r) + + require.NoError(t, r.LoadManifest(noopLogger{})) + assert.Equal(t, 1, r.HeaderCount()) + + cls, err := r.GetClass("vector", "std::vector") + require.NoError(t, err) + assert.Equal(t, []string{"T"}, cls.TypeParams) +} + +func TestCppStdlibRegistry_HTTP_NetworkFailureFallsBackToCachedHeader(t *testing.T) { + srv := serveFixture(t, newCFixture()) + cacheDir := t.TempDir() + r := NewCppStdlibRegistryRemote(srv.URL+"/registries", core.PlatformLinux) + r.diskCache = newDiskCacheStore(cacheDir) + require.NoError(t, r.LoadManifest(noopLogger{})) + _, err := r.GetClass("vector", "std::vector") + require.NoError(t, err) + + srv.Close() + + r2 := NewCppStdlibRegistryRemote(srv.URL+"/registries", core.PlatformLinux) + r2.diskCache = newDiskCacheStore(cacheDir) + require.NoError(t, r2.LoadManifest(noopLogger{})) + cls, err := r2.GetClass("vector", "std::vector") + require.NoError(t, err) + assert.Equal(t, []string{"T"}, cls.TypeParams) +} + +func TestCppStdlibRegistry_HTTP_ChecksumMismatch(t *testing.T) { + f := newCFixture() + f.manifestCpp.Headers[0].Checksum = "sha256:deadbeef" + + srv := serveFixture(t, f) + r := NewCppStdlibRegistryRemote(srv.URL+"/registries", core.PlatformLinux) + withTempCacheRootCpp(t, r) + require.NoError(t, 
r.LoadManifest(noopLogger{})) + _, err := r.GetClass("vector", "std::vector") + require.Error(t, err) + assert.Contains(t, err.Error(), "digest mismatch") +} + +func TestCppStdlibRegistry_HTTP_ManifestEmbeddedURL(t *testing.T) { + f := newCFixture() + srv := serveFixture(t, f) + f.manifestCpp.Headers[0].URL = srv.URL + "/registries/linux/cpp/v1/vector_stdlib.json" + + r := NewCppStdlibRegistryRemote(srv.URL+"/registries", core.PlatformLinux) + withTempCacheRootCpp(t, r) + require.NoError(t, r.LoadManifest(noopLogger{})) + _, err := r.GetClass("vector", "std::vector") + require.NoError(t, err) +} + +// --- helpers -------------------------------------------------------------- + +func TestVerifyChecksum_EmptyExpectedSkips(t *testing.T) { + require.NoError(t, verifyChecksum([]byte("anything"), "")) +} + +func TestJoinURL(t *testing.T) { + tests := []struct { + name string + base string + segs []string + want string + }{ + {"trailing slash on base", "https://x/", []string{"a", "b"}, "https://x/a/b"}, + {"leading slash on segment", "https://x", []string{"/a", "/b"}, "https://x/a/b"}, + {"empty segments dropped", "https://x", []string{"", "a", ""}, "https://x/a"}, + {"no segments", "https://x", nil, "https://x"}, + } + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + assert.Equal(t, tt.want, joinURL(tt.base, tt.segs...)) + }) + } +} + +func TestFetchURL_NilClient(t *testing.T) { + _, err := fetchURL(nil, "http://x") + require.Error(t, err) + assert.Contains(t, err.Error(), "nil HTTP client") +} + +func TestFetchURL_BadRequestURL(t *testing.T) { + _, err := fetchURL(&http.Client{}, "http://[::1:bad") + require.Error(t, err) +} diff --git a/sast-engine/graph/callgraph/registry/cpp_stdlib_remote.go b/sast-engine/graph/callgraph/registry/cpp_stdlib_remote.go index 093182f7..ce2d7ff3 100644 --- a/sast-engine/graph/callgraph/registry/cpp_stdlib_remote.go +++ b/sast-engine/graph/callgraph/registry/cpp_stdlib_remote.go @@ -64,13 +64,58 @@ func 
NewCppStdlibRegistryRemote(baseURL, platform string) *CppStdlibRegistryRemo } } -// LoadManifest reads the top-level manifest.json. file:// path is wired in -// PR-02; HTTP path is the PR-03 stub. +// LoadManifest reads the top-level manifest.json over file:// or HTTPS. +// HTTP failures fall back to the on-disk cache regardless of TTL so an +// offline scan still resolves stdlib calls when a previous run populated +// the cache. func (r *CppStdlibRegistryRemote) LoadManifest(logger core.CStdlibLogger) error { if r.fileBase != "" { return r.loadManifestFromFile(logger) } - return errors.New("CppStdlibRegistryRemote: HTTP loader not yet implemented; tracked in PR-03") + return r.loadManifestFromHTTP(logger) +} + +// loadManifestFromHTTP is the C++ counterpart to the C loader's HTTP path. +// The URL layout is identical except for the language segment ("/cpp/v1/" +// instead of "/c/v1/"). +func (r *CppStdlibRegistryRemote) loadManifestFromHTTP(logger core.CStdlibLogger) error { + url := joinURL(r.baseURL, r.platform, "cpp", "v1", "manifest.json") + if logger != nil { + logger.Debug("Downloading C++ stdlib manifest: %s", url) + } + + data, err := fetchURL(r.httpClient, url) + if err != nil { + if cached, cerr := r.diskCache.GetManifest(); cerr == nil { + if logger != nil { + logger.Warning("Network failed for %s; serving cached manifest. 
Underlying: %v", url, err) + } + r.cacheMutex.Lock() + r.manifest = cached + r.cacheMutex.Unlock() + return nil + } + return fmt.Errorf("loadManifestFromHTTP: %w", err) + } + + var manifest core.CStdlibManifest + if err := json.Unmarshal(data, &manifest); err != nil { + return fmt.Errorf("loadManifestFromHTTP: parsing manifest from %s: %w", url, err) + } + + if cerr := r.diskCache.SaveManifest(data); cerr != nil && logger != nil { + logger.Warning("Failed to save C++ manifest to disk cache: %v", cerr) + } + + r.cacheMutex.Lock() + r.manifest = &manifest + r.cacheMutex.Unlock() + + if logger != nil { + logger.Statistic("Loaded C++ stdlib manifest over HTTP: %d headers for %s", + len(manifest.Headers), r.platform) + } + return nil } func (r *CppStdlibRegistryRemote) loadManifestFromFile(logger core.CStdlibLogger) error { @@ -132,7 +177,50 @@ func (r *CppStdlibRegistryRemote) fetchHeaderLocked(name string) (*core.CStdlibH if r.fileBase != "" { return r.fetchHeaderFromFile(entry) } - return nil, errors.New("CppStdlibRegistryRemote: HTTP fetch not yet implemented; tracked in PR-03") + return r.fetchHeaderFromHTTP(entry) +} + +// fetchHeaderFromHTTP downloads one per-header JSON over HTTP. Mirrors +// the C loader's strategy: disk-cache freshness on the way in, stale-cache +// fallback on network failure, optional checksum verification. 
+func (r *CppStdlibRegistryRemote) fetchHeaderFromHTTP(entry *core.CStdlibHeaderEntry) (*core.CStdlibHeader, error) { + if r.diskCache.IsFresh(entry.File, stdlibCacheTTL) { + if cached, err := r.diskCache.GetHeader(entry.File); err == nil { + return cached, nil + } + } + + url := r.headerURL(entry) + data, err := fetchURL(r.httpClient, url) + if err != nil { + if cached, cerr := r.diskCache.GetHeader(entry.File); cerr == nil { + return cached, nil + } + return nil, fmt.Errorf("fetchHeaderFromHTTP: %w", err) + } + + if err := verifyChecksum(data, entry.Checksum); err != nil { + return nil, fmt.Errorf("fetchHeaderFromHTTP: %s: %w", entry.Header, err) + } + + var h core.CStdlibHeader + if err := json.Unmarshal(data, &h); err != nil { + return nil, fmt.Errorf("fetchHeaderFromHTTP: parsing %s: %w", url, err) + } + + _ = r.diskCache.SaveHeader(entry.File, data) // best-effort + + return &h, nil +} + +// headerURL prefers the manifest-embedded URL when present and falls back +// to //cpp/v1/ for manifests that predate the +// per-entry URL field. 
+func (r *CppStdlibRegistryRemote) headerURL(entry *core.CStdlibHeaderEntry) string { + if entry.URL != "" { + return entry.URL + } + return joinURL(r.baseURL, r.platform, "cpp", "v1", entry.File) } func (r *CppStdlibRegistryRemote) fetchHeaderFromFile(entry *core.CStdlibHeaderEntry) (*core.CStdlibHeader, error) { diff --git a/sast-engine/graph/callgraph/registry/cpp_stdlib_remote_test.go b/sast-engine/graph/callgraph/registry/cpp_stdlib_remote_test.go index e7c75cd3..a4b13361 100644 --- a/sast-engine/graph/callgraph/registry/cpp_stdlib_remote_test.go +++ b/sast-engine/graph/callgraph/registry/cpp_stdlib_remote_test.go @@ -148,22 +148,15 @@ func TestCppStdlibRegistry_GetFunctionMissing(t *testing.T) { require.Error(t, err) } -func TestCppStdlibRegistry_HTTPStub(t *testing.T) { - r := NewCppStdlibRegistryRemote("https://x/", core.PlatformLinux) +// TestCppStdlibRegistry_HTTPMode_NetworkFailureNoCacheSurfacesError mirrors +// the C-loader test: with the disk cache explicitly disabled, an HTTP-only +// loader pointed at an unreachable port must surface the error. 
+func TestCppStdlibRegistry_HTTPMode_NetworkFailureNoCacheSurfacesError(t *testing.T) { + r := NewCppStdlibRegistryRemote("http://127.0.0.1:1/registries", core.PlatformLinux) + r.diskCache = nil err := r.LoadManifest(noopLogger{}) require.Error(t, err) - assert.Contains(t, err.Error(), "PR-03") -} - -func TestCppStdlibRegistry_HTTPFetchStub(t *testing.T) { - dir := t.TempDir() - writeCppRegistry(t, dir) - r := NewCppStdlibRegistryFile(dir, core.PlatformLinux) - require.NoError(t, r.LoadManifest(noopLogger{})) - r.fileBase = "" // simulate HTTP-only mode - _, err := r.GetHeader("vector") - require.Error(t, err) - assert.Contains(t, err.Error(), "PR-03") + assert.Contains(t, err.Error(), "loadManifestFromHTTP") } func TestCppStdlibRegistry_HeaderCountBeforeLoad(t *testing.T) { diff --git a/sast-engine/tools/internal/clikeextract/extractor_test.go b/sast-engine/tools/internal/clikeextract/extractor_test.go index 7e19221c..cdf3175e 100644 --- a/sast-engine/tools/internal/clikeextract/extractor_test.go +++ b/sast-engine/tools/internal/clikeextract/extractor_test.go @@ -140,14 +140,18 @@ func TestRun_OverlayLoadError(t *testing.T) { } func TestRun_DiscoveryError(t *testing.T) { + // Windows + missing mingw toolchain → discovery error with remediation + // hint. Pre-PR-03 this asserted the stub message; now it asserts the + // "headers not found" path. 
+ withTempMingwRoot(t, "/definitely/missing") cfg := Config{ - Target: core.PlatformWindows, // PR-01 doesn't ship windows + Target: core.PlatformWindows, Language: core.LanguageC, OutputDir: t.TempDir(), } err := NewExtractor(cfg).Run() require.Error(t, err) - assert.Contains(t, err.Error(), "PR-03") + assert.Contains(t, err.Error(), "mingw-w64") } func TestRun_WalkError(t *testing.T) { diff --git a/sast-engine/tools/internal/clikeextract/walker.go b/sast-engine/tools/internal/clikeextract/walker.go index 6e144a36..7cc5e445 100644 --- a/sast-engine/tools/internal/clikeextract/walker.go +++ b/sast-engine/tools/internal/clikeextract/walker.go @@ -77,13 +77,33 @@ func DiscoverHeaderSources(target, language string) ([]HeaderSource, error) { } return []HeaderSource{src}, nil - case core.PlatformWindows + "/" + core.LanguageC, - core.PlatformWindows + "/" + core.LanguageCpp, - core.PlatformDarwin + "/" + core.LanguageC, - core.PlatformDarwin + "/" + core.LanguageCpp: - return nil, fmt.Errorf("DiscoverHeaderSources: target %q language %q is scheduled for PR-03; "+ - "PR-01 only ships %s/%s and %s/%s", target, language, - core.PlatformLinux, core.LanguageC, core.PlatformLinux, core.LanguageCpp) + case core.PlatformWindows + "/" + core.LanguageC: + src, err := windowsCSource() + if err != nil { + return nil, err + } + return []HeaderSource{src}, nil + + case core.PlatformWindows + "/" + core.LanguageCpp: + src, err := windowsCppSource() + if err != nil { + return nil, err + } + return []HeaderSource{src}, nil + + case core.PlatformDarwin + "/" + core.LanguageC: + src, err := darwinCSource() + if err != nil { + return nil, err + } + return []HeaderSource{src}, nil + + case core.PlatformDarwin + "/" + core.LanguageCpp: + src, err := darwinCppSource() + if err != nil { + return nil, err + } + return []HeaderSource{src}, nil default: return nil, fmt.Errorf("DiscoverHeaderSources: unknown target+language combination %q+%q", target, language) diff --git 
a/sast-engine/tools/internal/clikeextract/walker_test.go b/sast-engine/tools/internal/clikeextract/walker_test.go index 21dfdd60..0a6be958 100644 --- a/sast-engine/tools/internal/clikeextract/walker_test.go +++ b/sast-engine/tools/internal/clikeextract/walker_test.go @@ -41,19 +41,26 @@ func TestDiscoverHeaderSources_LinuxCpp_NotInstalled(t *testing.T) { assert.Contains(t, err.Error(), "libstdc++") } -func TestDiscoverHeaderSources_NotImplementedTargets(t *testing.T) { - deferred := []struct { - platform, language string +// TestDiscoverHeaderSources_CrossPlatformHeadersMissing verifies that when +// the cross-platform toolchains aren't installed on the host, each target +// surfaces a remediation hint instead of crashing. PR-03 implements the +// dispatch; the host-installation case is covered by walker_xplat_test.go. +func TestDiscoverHeaderSources_CrossPlatformHeadersMissing(t *testing.T) { + withTempMingwRoot(t, "/definitely/missing") + withTempDarwinRoots(t, []string{"/missing/sdk"}, []string{"/missing/cpp"}) + + cases := []struct { + platform, language, fragment string }{ - {core.PlatformWindows, core.LanguageC}, - {core.PlatformWindows, core.LanguageCpp}, - {core.PlatformDarwin, core.LanguageC}, - {core.PlatformDarwin, core.LanguageCpp}, + {core.PlatformWindows, core.LanguageC, "mingw-w64"}, + {core.PlatformWindows, core.LanguageCpp, "mingw libstdc++"}, + {core.PlatformDarwin, core.LanguageC, "macOS SDK"}, + {core.PlatformDarwin, core.LanguageCpp, "libc++"}, } - for _, tt := range deferred { + for _, tt := range cases { _, err := DiscoverHeaderSources(tt.platform, tt.language) require.Error(t, err) - assert.Contains(t, err.Error(), "PR-03") + assert.Contains(t, err.Error(), tt.fragment) } } diff --git a/sast-engine/tools/internal/clikeextract/walker_xplat.go b/sast-engine/tools/internal/clikeextract/walker_xplat.go new file mode 100644 index 00000000..3f0c15e6 --- /dev/null +++ b/sast-engine/tools/internal/clikeextract/walker_xplat.go @@ -0,0 +1,202 @@ 
+package clikeextract + +import ( + "errors" + "fmt" + "os" + "path/filepath" + "sort" + + "github.com/shivasurya/code-pathfinder/sast-engine/graph/callgraph/core" +) + +// Windows headers, accessed cross-platform via mingw-w64 on Ubuntu. +// +// `apt install mingw-w64` on ubuntu-latest places the Win32 + MSVCRT headers +// under /usr/x86_64-w64-mingw32/include and the mingw libstdc++ tree at +// /usr/x86_64-w64-mingw32/include/c++/. Using mingw on Linux beats +// running a Windows GitHub Actions runner for cost and simplicity, and gives +// us a faithful Win32 surface for stdlib resolution. +// +// All paths exposed as package vars (rather than literals) so tests can +// override them to exercise both the hit and miss branches without depending +// on whether the host actually has mingw installed. +var ( + // windowsMingwRoot is the canonical mingw-w64 install root. Subdirectories + // `include` (C) and `include/c++/` (C++) live underneath. + windowsMingwRoot = "/usr/x86_64-w64-mingw32" + + // darwinSDKRoots is the ordered list of macOS SDK include directories the + // generator probes. Command Line Tools first because that's the lighter + // install used in CI; Xcode.app is a fallback for full developer setups. + darwinSDKRoots = []string{ + "/Library/Developer/CommandLineTools/SDKs/MacOSX.sdk/usr/include", + "/Applications/Xcode.app/Contents/Developer/Platforms/MacOSX.platform/Developer/SDKs/MacOSX.sdk/usr/include", + } + + // darwinCppRoots is the ordered list of clang-shipped libc++ trees under + // the Apple toolchain. macos-latest's xcrun typically lands the headers + // under Command Line Tools. + darwinCppRoots = []string{ + "/Library/Developer/CommandLineTools/usr/include/c++/v1", + "/Applications/Xcode.app/Contents/Developer/Toolchains/XcodeDefault.xctoolchain/usr/include/c++/v1", + } +) + +// windowsCSource constructs the HeaderSource for Win32 + MSVCRT C headers. 
+// Probes the canonical mingw include dir; on miss, returns an error with a +// concrete remediation hint so CI logs make the cause obvious. +func windowsCSource() (HeaderSource, error) { + dir := filepath.Join(windowsMingwRoot, "include") + if !dirExists(dir) { + return HeaderSource{}, fmt.Errorf("windowsCSource: mingw-w64 headers not found at %s "+ + "(install with: apt install mingw-w64)", dir) + } + return HeaderSource{ + Platform: core.PlatformWindows, + Language: core.LanguageC, + SearchDirs: []string{dir}, + HeaderExts: []string{".h"}, + SystemTag: "mingw-w64-" + detectMingwVersion(), + }, nil +} + +// windowsCppSource finds the mingw libstdc++ header tree. Returns an error +// when no version directory exists — without one, the C++ surface is +// unrecoverable (the directory name encodes the version). +// +// The walk lists the C++ STL tree only. Win32 C headers are exposed via +// windowsCSource; mixing both into one source would conflate languages. +func windowsCppSource() (HeaderSource, error) { + root := filepath.Join(windowsMingwRoot, "include", "c++") + dir, version := findVersionedDir(root) + if dir == "" { + return HeaderSource{}, fmt.Errorf("windowsCppSource: no mingw libstdc++ headers under %s "+ + "(install with: apt install g++-mingw-w64)", root) + } + return HeaderSource{ + Platform: core.PlatformWindows, + Language: core.LanguageCpp, + SearchDirs: []string{dir}, + HeaderExts: []string{".h", ".hpp", ".hxx", ""}, + SystemTag: "mingw-w64-libstdc++-" + version, + }, nil +} + +// darwinCSource probes the canonical macOS SDK include locations and uses +// the first one that exists. Apple ships Command Line Tools and full Xcode +// installs in different subtrees; the loader tries CLT first because that's +// the cheaper macos-latest layout. 
+func darwinCSource() (HeaderSource, error) { + dir := firstExistingDir(darwinSDKRoots) + if dir == "" { + return HeaderSource{}, errors.New("darwinCSource: macOS SDK headers not found at any of " + + fmt.Sprint(darwinSDKRoots) + " (install Command Line Tools: xcode-select --install)") + } + return HeaderSource{ + Platform: core.PlatformDarwin, + Language: core.LanguageC, + SearchDirs: []string{dir}, + HeaderExts: []string{".h"}, + SystemTag: "darwin-" + detectDarwinSDKTag(dir), + }, nil +} + +// darwinCppSource probes the libc++ tree shipped with the Apple toolchain. +// Apple's libc++ has a notably different surface from libstdc++ — different +// container ABI, different `__1::` inline namespace — but the same public +// API; the manifest is generated against the actual installed headers so +// resolution stays correct on the host platform. +func darwinCppSource() (HeaderSource, error) { + dir := firstExistingDir(darwinCppRoots) + if dir == "" { + return HeaderSource{}, errors.New("darwinCppSource: libc++ headers not found at any of " + + fmt.Sprint(darwinCppRoots) + " (install Xcode or Command Line Tools)") + } + return HeaderSource{ + Platform: core.PlatformDarwin, + Language: core.LanguageCpp, + SearchDirs: []string{dir}, + HeaderExts: []string{".h", ".hpp", ".hxx", ""}, + SystemTag: "libc++-darwin-" + detectDarwinCppTag(dir), + }, nil +} + +// findVersionedDir lists root and returns the lexically-largest entry name +// containing a digit (canonical "13", "13.2.0", "v1") together with its +// version. Returns ("","") on missing root or empty result. +// +// Shared between windowsCppSource (looks for c++/) and the +// darwin probes (looks for c++/v) — both want the freshest version dir +// without parsing semver explicitly. 
+func findVersionedDir(root string) (dir, version string) { + entries, err := os.ReadDir(root) + if err != nil { + return "", "" + } + var versions []string + for _, e := range entries { + if !e.IsDir() { + continue + } + name := e.Name() + if !containsDigit(name) { + continue + } + versions = append(versions, name) + } + if len(versions) == 0 { + return "", "" + } + sort.Strings(versions) + v := versions[len(versions)-1] + return filepath.Join(root, v), v +} + +// firstExistingDir returns the first directory in the list that exists on +// disk, or "" if none do. Order matters — callers list the cheaper / more +// likely option first. +func firstExistingDir(candidates []string) string { + for _, c := range candidates { + if dirExists(c) { + return c + } + } + return "" +} + +// detectMingwVersion derives a version string from the libstdc++ directory +// name embedded under the mingw root. Returns "unknown" when the tree has +// not been probed yet (windowsCppSource hits this path before the C source +// builder runs). Lightweight on purpose: parsing `gcc --version` would add +// an exec dependency that complicates testing for marginal accuracy gain. +func detectMingwVersion() string { + root := filepath.Join(windowsMingwRoot, "include", "c++") + _, v := findVersionedDir(root) + if v == "" { + return "unknown" + } + return v +} + +// detectDarwinSDKTag returns a short identifier for the SDK whose include +// dir was selected. Currently uses the parent directory name (e.g. +// "MacOSX.sdk") which is enough to tell CommandLineTools apart from Xcode. +func detectDarwinSDKTag(includeDir string) string { + // includeDir ends in `.../MacOSX.sdk/usr/include`. + parent := filepath.Base(filepath.Dir(filepath.Dir(includeDir))) + if parent == "" { + return "unknown" + } + return parent +} + +// detectDarwinCppTag returns the libc++ version directory name (typically +// "v1") so the manifest can distinguish future ABI bumps. 
+func detectDarwinCppTag(includeDir string) string { + base := filepath.Base(includeDir) + if base == "" { + return "unknown" + } + return base +} diff --git a/sast-engine/tools/internal/clikeextract/walker_xplat_test.go b/sast-engine/tools/internal/clikeextract/walker_xplat_test.go new file mode 100644 index 00000000..f952dd1f --- /dev/null +++ b/sast-engine/tools/internal/clikeextract/walker_xplat_test.go @@ -0,0 +1,223 @@ +package clikeextract + +import ( + "os" + "path/filepath" + "testing" + + "github.com/shivasurya/code-pathfinder/sast-engine/graph/callgraph/core" + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" +) + +// withTempMingwRoot wires a temporary directory in as the mingw root for +// the duration of t. Restores the original on cleanup so tests are +// independent of order. +func withTempMingwRoot(t *testing.T, root string) { + t.Helper() + orig := windowsMingwRoot + windowsMingwRoot = root + t.Cleanup(func() { windowsMingwRoot = orig }) +} + +// withTempDarwinRoots replaces both the C and C++ Darwin probe lists for t. 
+func withTempDarwinRoots(t *testing.T, sdkRoots, cppRoots []string) { + t.Helper() + origC := darwinSDKRoots + origCpp := darwinCppRoots + darwinSDKRoots = sdkRoots + darwinCppRoots = cppRoots + t.Cleanup(func() { + darwinSDKRoots = origC + darwinCppRoots = origCpp + }) +} + +func TestWindowsCSource_Found(t *testing.T) { + root := t.TempDir() + require.NoError(t, os.MkdirAll(filepath.Join(root, "include"), 0o755)) + require.NoError(t, os.MkdirAll(filepath.Join(root, "include", "c++", "13"), 0o755)) + withTempMingwRoot(t, root) + + src, err := windowsCSource() + require.NoError(t, err) + assert.Equal(t, core.PlatformWindows, src.Platform) + assert.Equal(t, core.LanguageC, src.Language) + assert.Equal(t, []string{filepath.Join(root, "include")}, src.SearchDirs) + assert.Equal(t, "mingw-w64-13", src.SystemTag) +} + +func TestWindowsCSource_Missing(t *testing.T) { + withTempMingwRoot(t, filepath.Join(t.TempDir(), "absent")) + _, err := windowsCSource() + require.Error(t, err) + assert.Contains(t, err.Error(), "mingw-w64 headers not found") +} + +func TestWindowsCSource_VersionUnknownWhenCppTreeMissing(t *testing.T) { + root := t.TempDir() + require.NoError(t, os.MkdirAll(filepath.Join(root, "include"), 0o755)) + withTempMingwRoot(t, root) + + src, err := windowsCSource() + require.NoError(t, err) + assert.Equal(t, "mingw-w64-unknown", src.SystemTag) +} + +func TestWindowsCppSource_Found(t *testing.T) { + root := t.TempDir() + require.NoError(t, os.MkdirAll(filepath.Join(root, "include", "c++", "13"), 0o755)) + require.NoError(t, os.MkdirAll(filepath.Join(root, "include", "c++", "12"), 0o755)) + withTempMingwRoot(t, root) + + src, err := windowsCppSource() + require.NoError(t, err) + assert.Equal(t, core.PlatformWindows, src.Platform) + assert.Equal(t, core.LanguageCpp, src.Language) + assert.Equal(t, []string{filepath.Join(root, "include", "c++", "13")}, src.SearchDirs, + "freshest version directory must win") + assert.Equal(t, "mingw-w64-libstdc++-13", src.SystemTag) 
+} + +func TestWindowsCppSource_Missing(t *testing.T) { + root := t.TempDir() + require.NoError(t, os.MkdirAll(filepath.Join(root, "include"), 0o755)) + withTempMingwRoot(t, root) + + _, err := windowsCppSource() + require.Error(t, err) + assert.Contains(t, err.Error(), "no mingw libstdc++ headers") +} + +func TestDarwinCSource_Found(t *testing.T) { + sdkInclude := filepath.Join(t.TempDir(), "MacOSX.sdk", "usr", "include") + require.NoError(t, os.MkdirAll(sdkInclude, 0o755)) + withTempDarwinRoots(t, []string{sdkInclude}, []string{"/nope"}) + + src, err := darwinCSource() + require.NoError(t, err) + assert.Equal(t, core.PlatformDarwin, src.Platform) + assert.Equal(t, core.LanguageC, src.Language) + assert.Equal(t, "darwin-MacOSX.sdk", src.SystemTag) +} + +func TestDarwinCSource_Missing(t *testing.T) { + withTempDarwinRoots(t, []string{"/no", "/where"}, []string{"/no"}) + _, err := darwinCSource() + require.Error(t, err) + assert.Contains(t, err.Error(), "macOS SDK headers not found") +} + +func TestDarwinCppSource_Found(t *testing.T) { + cppInclude := filepath.Join(t.TempDir(), "v1") + require.NoError(t, os.MkdirAll(cppInclude, 0o755)) + withTempDarwinRoots(t, []string{"/no"}, []string{cppInclude}) + + src, err := darwinCppSource() + require.NoError(t, err) + assert.Equal(t, core.PlatformDarwin, src.Platform) + assert.Equal(t, core.LanguageCpp, src.Language) + assert.Equal(t, "libc++-darwin-v1", src.SystemTag) +} + +func TestDarwinCppSource_Missing(t *testing.T) { + withTempDarwinRoots(t, []string{"/no"}, []string{"/no", "/where"}) + _, err := darwinCppSource() + require.Error(t, err) + assert.Contains(t, err.Error(), "libc++ headers not found") +} + +func TestFindVersionedDir(t *testing.T) { + root := t.TempDir() + require.NoError(t, os.MkdirAll(filepath.Join(root, "13"), 0o755)) + require.NoError(t, os.MkdirAll(filepath.Join(root, "12"), 0o755)) + require.NoError(t, os.MkdirAll(filepath.Join(root, "experimental"), 0o755)) + + dir, ver := findVersionedDir(root) + 
assert.Equal(t, filepath.Join(root, "13"), dir) + assert.Equal(t, "13", ver) +} + +func TestFindVersionedDir_MissingRoot(t *testing.T) { + dir, ver := findVersionedDir(filepath.Join(t.TempDir(), "missing")) + assert.Empty(t, dir) + assert.Empty(t, ver) +} + +func TestFindVersionedDir_NoVersionedEntries(t *testing.T) { + root := t.TempDir() + require.NoError(t, os.MkdirAll(filepath.Join(root, "experimental"), 0o755)) + require.NoError(t, os.MkdirAll(filepath.Join(root, "internal"), 0o755)) + dir, ver := findVersionedDir(root) + assert.Empty(t, dir) + assert.Empty(t, ver) +} + +func TestFirstExistingDir(t *testing.T) { + good := t.TempDir() + assert.Equal(t, good, firstExistingDir([]string{"/missing", good})) + assert.Empty(t, firstExistingDir([]string{"/missing", "/also-missing"})) +} + +func TestDetectDarwinSDKTag(t *testing.T) { + assert.Equal(t, "MacOSX.sdk", + detectDarwinSDKTag("/Library/Developer/CommandLineTools/SDKs/MacOSX.sdk/usr/include")) +} + +func TestDetectDarwinCppTag(t *testing.T) { + assert.Equal(t, "v1", detectDarwinCppTag("/Library/Developer/CommandLineTools/usr/include/c++/v1")) +} + +// TestDiscoverHeaderSources_DarwinAndWindowsDispatched is the integration +// test for DiscoverHeaderSources: it confirms each (target, language) reaches +// the right per-platform source builder when fixture trees exist on disk. 
+func TestDiscoverHeaderSources_DarwinAndWindowsDispatched(t *testing.T) { + mingw := t.TempDir() + require.NoError(t, os.MkdirAll(filepath.Join(mingw, "include"), 0o755)) + require.NoError(t, os.MkdirAll(filepath.Join(mingw, "include", "c++", "13"), 0o755)) + withTempMingwRoot(t, mingw) + + sdkInclude := filepath.Join(t.TempDir(), "MacOSX.sdk", "usr", "include") + require.NoError(t, os.MkdirAll(sdkInclude, 0o755)) + cppInclude := filepath.Join(t.TempDir(), "v1") + require.NoError(t, os.MkdirAll(cppInclude, 0o755)) + withTempDarwinRoots(t, []string{sdkInclude}, []string{cppInclude}) + + for _, tt := range []struct { + target, language, wantTag string + }{ + {core.PlatformWindows, core.LanguageC, "mingw-w64-13"}, + {core.PlatformWindows, core.LanguageCpp, "mingw-w64-libstdc++-13"}, + {core.PlatformDarwin, core.LanguageC, "darwin-MacOSX.sdk"}, + {core.PlatformDarwin, core.LanguageCpp, "libc++-darwin-v1"}, + } { + t.Run(tt.target+"/"+tt.language, func(t *testing.T) { + sources, err := DiscoverHeaderSources(tt.target, tt.language) + require.NoError(t, err) + require.Len(t, sources, 1) + assert.Equal(t, tt.wantTag, sources[0].SystemTag) + }) + } +} + +// TestDiscoverHeaderSources_DarwinAndWindowsErrorWhenAbsent surfaces the +// error path: missing toolchain → clear error message. 
+func TestDiscoverHeaderSources_DarwinAndWindowsErrorWhenAbsent(t *testing.T) { + withTempMingwRoot(t, "/definitely/missing") + withTempDarwinRoots(t, []string{"/missing/sdk"}, []string{"/missing/cpp"}) + + for _, tc := range []struct { + target, language, fragment string + }{ + {core.PlatformWindows, core.LanguageC, "mingw-w64 headers not found"}, + {core.PlatformWindows, core.LanguageCpp, "no mingw libstdc++"}, + {core.PlatformDarwin, core.LanguageC, "macOS SDK headers not found"}, + {core.PlatformDarwin, core.LanguageCpp, "libc++ headers not found"}, + } { + t.Run(tc.target+"/"+tc.language, func(t *testing.T) { + _, err := DiscoverHeaderSources(tc.target, tc.language) + require.Error(t, err) + assert.Contains(t, err.Error(), tc.fragment) + }) + } +}