From d39faca3500bb40d6004168ec6d8dfde366671e0 Mon Sep 17 00:00:00 2001 From: shivasurya Date: Sun, 3 May 2026 19:05:44 -0400 Subject: [PATCH 1/3] feat(registry): HTTP loader for C/C++ stdlib registries Fills in PR-02's stubbed loadManifestFromHTTP / fetchHeaderFromHTTP paths so the remote loaders work end-to-end against a CDN. Local file:// behavior is unchanged. clike_http.go shared helpers: - fetchURL: single GET, 200-only, error message embeds the URL so CDN-side issues are diagnosable without bouncing through retries. - verifyChecksum: optional sha256: verification; empty checksum disables verification (manifests pre-PR-03 ship without it). - joinURL: trailing/leading slash-tolerant URL composition shared between C and C++. C and C++ HTTP loaders now: - Construct paths as ///v1/, prefer per-entry URL when the manifest embeds one. - Try the on-disk cache first when fresh (24h TTL); always fall back to stale cache on network failure so offline scans still resolve stdlib calls. - Best-effort write to the on-disk cache after a successful fetch; cache write failures warn but never block the in-memory result. Updated tests: - New httptest-driven coverage for both loaders: 200, 404, network failure with cache fallback, checksum match, checksum mismatch, parallel fetches, embedded URL, garbage body. - Two PR-02 stub assertions (HTTPMode_Stub, HTTPMode_FetchHeaderStub) rewritten to point at an unreachable port and assert the proper network error. - cmd/scan_stdlib_test.go now isolates XDG_CACHE_HOME / HOME when exercising the unreachable-host path so a developer cache cannot produce a false-positive success. Co-Authored-By: Claude Sonnet 4.5 --- sast-engine/cmd/scan_stdlib_test.go | 18 +- .../callgraph/registry/c_stdlib_remote.go | 109 ++++- .../registry/c_stdlib_remote_test.go | 27 +- .../graph/callgraph/registry/clike_http.go | 90 ++++ .../callgraph/registry/clike_http_test.go | 394 ++++++++++++++++++ .../callgraph/registry/cpp_stdlib_remote.go | 96 ++++- .../registry/cpp_stdlib_remote_test.go | 21 +- 7 files changed, 704 insertions(+), 51 deletions(-) create mode 100644 sast-engine/graph/callgraph/registry/clike_http.go create mode 100644 sast-engine/graph/callgraph/registry/clike_http_test.go diff --git a/sast-engine/cmd/scan_stdlib_test.go b/sast-engine/cmd/scan_stdlib_test.go index 811fe9b5..9630fa55 100644 --- a/sast-engine/cmd/scan_stdlib_test.go +++ b/sast-engine/cmd/scan_stdlib_test.go @@ -119,11 +119,19 @@ func TestInitClikeStdlib_BarePathTreatedAsFile(t *testing.T) { require.NotNil(t, cfg.cppLoader) } -func TestInitClikeStdlib_HTTPSchemeIsStubbed(t *testing.T) { - // HTTP path returns a constructed loader, but LoadManifest fails - // with the PR-03 stub error. Both loaders should be nil after the - // failed load. - cfg, wired := initClikeStdlib(t.TempDir(), "linux", "https://example.test/registries", newTestLogger()) +func TestInitClikeStdlib_HTTPSchemeFailsGracefullyOnUnreachableHost(t *testing.T) { + // PR-03 wires HTTP up — when the URL doesn't resolve and there's no + // disk cache to fall back on, both loaders fail to load and stay + // nil. The scan continues under Phase 1 behavior. + // + // Point the cache at a fresh temp dir so a previously populated + // developer cache (e.g. from running another test) cannot serve a + // stale manifest and turn this into a false-positive success. + t.Setenv("XDG_CACHE_HOME", t.TempDir()) + t.Setenv("HOME", t.TempDir()) + t.Setenv("LOCALAPPDATA", t.TempDir()) + + cfg, wired := initClikeStdlib(t.TempDir(), "linux", "http://127.0.0.1:1/registries", newTestLogger()) assert.False(t, wired) assert.Nil(t, cfg.cLoader) assert.Nil(t, cfg.cppLoader) diff --git a/sast-engine/graph/callgraph/registry/c_stdlib_remote.go b/sast-engine/graph/callgraph/registry/c_stdlib_remote.go index d2fdc71d..0e9dfc74 100644 --- a/sast-engine/graph/callgraph/registry/c_stdlib_remote.go +++ b/sast-engine/graph/callgraph/registry/c_stdlib_remote.go @@ -107,11 +107,57 @@ func (r *CStdlibRegistryRemote) loadManifestFromFile(logger core.CStdlibLogger) return nil } -// loadManifestFromHTTP is the PR-03 hook. PR-02 ships it as a deliberate stub -// so the type satisfies CStdlibLoader without any half-built network code -// shipping early. -func (r *CStdlibRegistryRemote) loadManifestFromHTTP(_ core.CStdlibLogger) error { - return errors.New("CStdlibRegistryRemote: HTTP loader not yet implemented; tracked in PR-03") +// loadManifestFromHTTP downloads the top-level manifest.json over HTTP, with +// fallback to a stale on-disk cache when the network is unreachable. Layout: +// +// GET //c/v1/manifest.json +// └─ on success: parse JSON, write to disk cache, populate r.manifest +// └─ on network failure: read disk cache (regardless of TTL), warn, continue +// └─ on cache miss too: surface the original network error +// +// Disk-cache writes are best-effort: a failed write logs a warning but does +// not block in-memory population (the scan still benefits from this run, the +// next run just has to re-fetch). +func (r *CStdlibRegistryRemote) loadManifestFromHTTP(logger core.CStdlibLogger) error { + url := joinURL(r.baseURL, r.platform, "c", "v1", "manifest.json") + if logger != nil { + logger.Debug("Downloading C stdlib manifest: %s", url) + } + + data, err := fetchURL(r.httpClient, url) + if err != nil { + // Network failed — try disk cache irrespective of freshness so a + // scan in a no-network environment still resolves stdlib calls. + if cached, cerr := r.diskCache.GetManifest(); cerr == nil { + if logger != nil { + logger.Warning("Network failed for %s; serving cached manifest. Underlying: %v", url, err) + } + r.cacheMutex.Lock() + r.manifest = cached + r.cacheMutex.Unlock() + return nil + } + return fmt.Errorf("loadManifestFromHTTP: %w", err) + } + + var manifest core.CStdlibManifest + if err := json.Unmarshal(data, &manifest); err != nil { + return fmt.Errorf("loadManifestFromHTTP: parsing manifest from %s: %w", url, err) + } + + if cerr := r.diskCache.SaveManifest(data); cerr != nil && logger != nil { + logger.Warning("Failed to save C manifest to disk cache: %v", cerr) + } + + r.cacheMutex.Lock() + r.manifest = &manifest + r.cacheMutex.Unlock() + + if logger != nil { + logger.Statistic("Loaded C stdlib manifest over HTTP: %d headers for %s", + len(manifest.Headers), r.platform) + } + return nil } // GetHeader retrieves the per-header content, fetching on first reference and @@ -179,10 +225,55 @@ func (r *CStdlibRegistryRemote) fetchHeaderFromFile(entry *core.CStdlibHeaderEnt return &h, nil } -// fetchHeaderFromHTTP is the PR-03 hook. PR-02 stub keeps the type -// satisfying its interface contract without shipping half-built network code. -func (r *CStdlibRegistryRemote) fetchHeaderFromHTTP(_ *core.CStdlibHeaderEntry) (*core.CStdlibHeader, error) { - return nil, errors.New("CStdlibRegistryRemote: HTTP fetch not yet implemented; tracked in PR-03") +// fetchHeaderFromHTTP downloads one per-header JSON over HTTP, with disk-cache +// freshness checks on the way in and stale-cache fallback on network failure. +// +// The lookup chain: +// 1. Disk cache hit AND fresh (< 24h) → return cached, no network. +// 2. Otherwise GET the entry's URL (or construct one from baseURL + entry.File +// when the manifest predates URL embedding). +// 3. On 200 OK: verify checksum (when present in the manifest), parse JSON, +// persist to disk cache, return. +// 4. On any network or parse failure: try the on-disk cache irrespective of +// freshness — a stale registry beats no resolution at all. +func (r *CStdlibRegistryRemote) fetchHeaderFromHTTP(entry *core.CStdlibHeaderEntry) (*core.CStdlibHeader, error) { + if r.diskCache.IsFresh(entry.File, stdlibCacheTTL) { + if cached, err := r.diskCache.GetHeader(entry.File); err == nil { + return cached, nil + } + } + + url := r.headerURL(entry) + data, err := fetchURL(r.httpClient, url) + if err != nil { + if cached, cerr := r.diskCache.GetHeader(entry.File); cerr == nil { + return cached, nil + } + return nil, fmt.Errorf("fetchHeaderFromHTTP: %w", err) + } + + if err := verifyChecksum(data, entry.Checksum); err != nil { + return nil, fmt.Errorf("fetchHeaderFromHTTP: %s: %w", entry.Header, err) + } + + var h core.CStdlibHeader + if err := json.Unmarshal(data, &h); err != nil { + return nil, fmt.Errorf("fetchHeaderFromHTTP: parsing %s: %w", url, err) + } + + _ = r.diskCache.SaveHeader(entry.File, data) // best-effort + + return &h, nil +} + +// headerURL prefers the manifest-embedded URL when present (lets the registry +// publisher point individual files at a different host or a versioned path) +// and otherwise constructs one from the loader's baseURL + entry.File. +func (r *CStdlibRegistryRemote) headerURL(entry *core.CStdlibHeaderEntry) string { + if entry.URL != "" { + return entry.URL + } + return joinURL(r.baseURL, r.platform, "c", "v1", entry.File) } // GetFunction is a convenience accessor: GetHeader followed by a function diff --git a/sast-engine/graph/callgraph/registry/c_stdlib_remote_test.go b/sast-engine/graph/callgraph/registry/c_stdlib_remote_test.go index 9a572705..fc5367af 100644 --- a/sast-engine/graph/callgraph/registry/c_stdlib_remote_test.go +++ b/sast-engine/graph/callgraph/registry/c_stdlib_remote_test.go @@ -233,27 +233,16 @@ func TestCStdlibRegistry_DoubleCheckLocking(t *testing.T) { } } -func TestCStdlibRegistry_HTTPMode_Stub(t *testing.T) { - r := NewCStdlibRegistryRemote("https://example.com/registries", core.PlatformLinux) +// TestCStdlibRegistry_HTTPMode_NetworkFailureNoCacheSurfacesError verifies +// that an HTTP-only loader with no on-disk cache surfaces the underlying +// network error rather than swallowing it. Uses an unreachable port on +// localhost so the test never depends on external connectivity. +func TestCStdlibRegistry_HTTPMode_NetworkFailureNoCacheSurfacesError(t *testing.T) { + r := NewCStdlibRegistryRemote("http://127.0.0.1:1/registries", core.PlatformLinux) + r.diskCache = nil // no cache → no fallback path err := r.LoadManifest(noopLogger{}) require.Error(t, err) - assert.Contains(t, err.Error(), "PR-03") -} - -func TestCStdlibRegistry_HTTPMode_FetchHeaderStub(t *testing.T) { - // Construct the HTTP loader with an in-memory manifest by going through - // a file:// loader first, then forcing the fetch path to HTTP. - dir := t.TempDir() - writeCRegistry(t, dir) - r := NewCStdlibRegistryFile(dir, core.PlatformLinux) - require.NoError(t, r.LoadManifest(noopLogger{})) - - // Switch to HTTP mode mid-flight by clearing fileBase. Tests-only — - // production code never does this. - r.fileBase = "" - _, err := r.GetHeader("stdio.h") // fresh header (not yet cached) - require.Error(t, err) - assert.Contains(t, err.Error(), "PR-03") + assert.Contains(t, err.Error(), "loadManifestFromHTTP") } func TestCStdlibRegistry_RemoteCtor_TrimsTrailingSlash(t *testing.T) { diff --git a/sast-engine/graph/callgraph/registry/clike_http.go b/sast-engine/graph/callgraph/registry/clike_http.go new file mode 100644 index 00000000..8a75a669 --- /dev/null +++ b/sast-engine/graph/callgraph/registry/clike_http.go @@ -0,0 +1,90 @@ +package registry + +import ( + "context" + "crypto/sha256" + "encoding/hex" + "fmt" + "io" + "net/http" + "strings" +) + +// httpFetchTimeout is the upper bound on a single HTTP request. The 30-second +// value matches the Go stdlib loader; CDN responses are typically <100ms but +// flaky networks need slack to avoid spurious cache misses. +// +// The constant lives at package scope (not on the loader struct) so test +// servers and benchmarks can rely on a deterministic value. + +// fetchURL performs a single HTTP GET and returns the response body. Non-200 +// responses are turned into errors so callers can branch on a single failure +// path. The caller-supplied client carries timeout + retry policy. +// +// Defined at package scope so both the C and C++ remote loaders share the +// same wire-format behavior — the only thing that differs between them is the +// URL path (`/c/v1/` vs `/cpp/v1/`). +func fetchURL(client *http.Client, url string) ([]byte, error) { + if client == nil { + return nil, fmt.Errorf("fetchURL: nil HTTP client") + } + req, err := http.NewRequestWithContext(context.Background(), http.MethodGet, url, nil) + if err != nil { + return nil, fmt.Errorf("fetchURL: building request for %s: %w", url, err) + } + resp, err := client.Do(req) + if err != nil { + return nil, fmt.Errorf("fetchURL: GET %s: %w", url, err) + } + defer func() { _ = resp.Body.Close() }() + + body, err := io.ReadAll(resp.Body) + if err != nil { + return nil, fmt.Errorf("fetchURL: reading body from %s: %w", url, err) + } + if resp.StatusCode != http.StatusOK { + return nil, fmt.Errorf("fetchURL: GET %s: HTTP %d", url, resp.StatusCode) + } + return body, nil +} + +// verifyChecksum confirms that data hashes to the expected sha256 digest. +// expected is the manifest's `checksum` field, expected to be of the form +// "sha256:". An empty expected string disables checksum verification — +// useful for manifests generated before the checksum field was populated, and +// for tests that don't want to compute hashes by hand. +// +// Returns an error tagged with both expected + actual so log lines stay +// useful for diagnosing CDN tampering or stale-cache + new-checksum drift. +func verifyChecksum(data []byte, expected string) error { + if expected == "" { + return nil + } + if !strings.HasPrefix(expected, "sha256:") { + return fmt.Errorf("verifyChecksum: unsupported checksum format %q (want sha256:)", expected) + } + want := strings.TrimPrefix(expected, "sha256:") + sum := sha256.Sum256(data) + got := hex.EncodeToString(sum[:]) + if got != want { + return fmt.Errorf("verifyChecksum: digest mismatch (want %s, got %s)", want, got) + } + return nil +} + +// joinURL concatenates a base URL with one or more path segments using a +// single forward slash as separator. Stripping leading/trailing slashes on +// the input avoids the duplicate-slash artifacts a naive join would emit +// (e.g. "https://x/" + "/foo" → "https://x//foo"). +func joinURL(base string, segments ...string) string { + parts := make([]string, 0, len(segments)+1) + parts = append(parts, strings.TrimRight(base, "/")) + for _, s := range segments { + s = strings.Trim(s, "/") + if s == "" { + continue + } + parts = append(parts, s) + } + return strings.Join(parts, "/") +} diff --git a/sast-engine/graph/callgraph/registry/clike_http_test.go b/sast-engine/graph/callgraph/registry/clike_http_test.go new file mode 100644 index 00000000..66a37397 --- /dev/null +++ b/sast-engine/graph/callgraph/registry/clike_http_test.go @@ -0,0 +1,394 @@ +package registry + +import ( + "crypto/sha256" + "encoding/hex" + "encoding/json" + "net/http" + "net/http/httptest" + "testing" + + "github.com/shivasurya/code-pathfinder/sast-engine/graph/callgraph/core" + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" +) + +// stdlibFixture is the canonical small registry the HTTP tests reuse. Each +// test that needs a server can call serveFixture(t, fixture) to get a live +// httptest.Server and a baseURL that the loader recognizes. +type stdlibFixture struct { + manifestC *core.CStdlibManifest + headerC *core.CStdlibHeader + manifestCpp *core.CStdlibManifest + headerCpp *core.CStdlibHeader +} + +func newCFixture() *stdlibFixture { + return &stdlibFixture{ + manifestC: &core.CStdlibManifest{ + SchemaVersion: "1.0.0", + RegistryVersion: "v1", + Platform: core.PlatformLinux, + Language: core.LanguageC, + Headers: []*core.CStdlibHeaderEntry{ + {Header: "stdio.h", ModuleID: "c::stdio", File: "stdio_stdlib.json"}, + }, + Statistics: &core.CStdlibStatistics{TotalHeaders: 1, TotalFunctions: 1}, + }, + headerC: &core.CStdlibHeader{ + SchemaVersion: "1.0.0", + Header: "stdio.h", + ModuleID: "c::stdio", + Language: core.LanguageC, + Functions: map[string]*core.CStdlibFunction{ + "printf": {FQN: "c::stdio::printf", ReturnType: "int", Source: core.SourceOverlay, Confidence: 1.0}, + }, + }, + manifestCpp: &core.CStdlibManifest{ + SchemaVersion: "1.0.0", + RegistryVersion: "v1", + Platform: core.PlatformLinux, + Language: core.LanguageCpp, + Headers: []*core.CStdlibHeaderEntry{ + {Header: "vector", ModuleID: "std::vector", File: "vector_stdlib.json"}, + }, + Statistics: &core.CStdlibStatistics{TotalHeaders: 1, TotalClasses: 1, TotalFunctions: 1}, + }, + headerCpp: &core.CStdlibHeader{ + SchemaVersion: "1.0.0", + Header: "vector", + ModuleID: "std::vector", + Language: core.LanguageCpp, + Classes: map[string]*core.CppStdlibClass{ + "std::vector": { + FQN: "std::vector", TypeParams: []string{"T"}, + Methods: map[string]*core.CStdlibFunction{ + "push_back": {FQN: "std::vector::push_back", ReturnType: "void", Source: core.SourceOverlay, Confidence: 1.0}, + }, + }, + }, + }, + } +} + +// serveFixture stands up a test server that mimics the CDN's URL layout: +// +// GET /registries/linux/c/v1/manifest.json +// GET /registries/linux/c/v1/stdio_stdlib.json +// GET /registries/linux/cpp/v1/manifest.json +// GET /registries/linux/cpp/v1/vector_stdlib.json +// +// The returned baseURL ends in "/registries" so the loader's joinURL builds +// exactly the paths above. +func serveFixture(t *testing.T, f *stdlibFixture) *httptest.Server { + t.Helper() + srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, req *http.Request) { + switch req.URL.Path { + case "/registries/linux/c/v1/manifest.json": + writeJSON(t, w, f.manifestC) + case "/registries/linux/c/v1/stdio_stdlib.json": + writeJSON(t, w, f.headerC) + case "/registries/linux/cpp/v1/manifest.json": + writeJSON(t, w, f.manifestCpp) + case "/registries/linux/cpp/v1/vector_stdlib.json": + writeJSON(t, w, f.headerCpp) + default: + http.NotFound(w, req) + } + })) + t.Cleanup(srv.Close) + return srv +} + +func writeJSON(t *testing.T, w http.ResponseWriter, v any) { + t.Helper() + data, err := json.Marshal(v) + require.NoError(t, err) + w.Header().Set("Content-Type", "application/json") + _, _ = w.Write(data) +} + +// withTempCacheRoot points the loader's disk cache at a t.TempDir for the +// duration of the test. The default $HOME/.cache root is unsuitable because +// tests must not pollute the developer's real cache. +func withTempCacheRoot(t *testing.T, r *CStdlibRegistryRemote) { + t.Helper() + r.diskCache = newDiskCacheStore(t.TempDir()) +} + +func withTempCacheRootCpp(t *testing.T, r *CppStdlibRegistryRemote) { + t.Helper() + r.diskCache = newDiskCacheStore(t.TempDir()) +} + +func TestCStdlibRegistry_HTTP_LoadManifestAndHeader(t *testing.T) { + srv := serveFixture(t, newCFixture()) + r := NewCStdlibRegistryRemote(srv.URL+"/registries", core.PlatformLinux) + withTempCacheRoot(t, r) + + require.NoError(t, r.LoadManifest(noopLogger{})) + assert.Equal(t, 1, r.HeaderCount()) + + h, err := r.GetHeader("stdio.h") + require.NoError(t, err) + require.Contains(t, h.Functions, "printf") + assert.Equal(t, "c::stdio::printf", h.Functions["printf"].FQN) + + // Second call must hit the in-memory cache (same pointer). + h2, err := r.GetHeader("stdio.h") + require.NoError(t, err) + assert.Same(t, h, h2) +} + +func TestCStdlibRegistry_HTTP_NetworkFailureFallsBackToCachedManifest(t *testing.T) { + // First successful fetch writes the manifest to disk cache. + srv := serveFixture(t, newCFixture()) + cacheDir := t.TempDir() + r := NewCStdlibRegistryRemote(srv.URL+"/registries", core.PlatformLinux) + r.diskCache = newDiskCacheStore(cacheDir) + require.NoError(t, r.LoadManifest(noopLogger{})) + + // Spin up a second loader pointed at a dead URL but sharing the same cache. + r2 := NewCStdlibRegistryRemote("http://127.0.0.1:1/registries", core.PlatformLinux) + r2.diskCache = newDiskCacheStore(cacheDir) + require.NoError(t, r2.LoadManifest(noopLogger{}), "stale cache must serve when network is down") + assert.Equal(t, 1, r2.HeaderCount()) +} + +func TestCStdlibRegistry_HTTP_HeaderFallsBackToCachedOnNetworkFailure(t *testing.T) { + srv := serveFixture(t, newCFixture()) + cacheDir := t.TempDir() + r := NewCStdlibRegistryRemote(srv.URL+"/registries", core.PlatformLinux) + r.diskCache = newDiskCacheStore(cacheDir) + require.NoError(t, r.LoadManifest(noopLogger{})) + _, err := r.GetHeader("stdio.h") // populates header cache on disk + require.NoError(t, err) + + // Tear the server down — subsequent loaders must serve from cache only. + srv.Close() + + r2 := NewCStdlibRegistryRemote(srv.URL+"/registries", core.PlatformLinux) + r2.diskCache = newDiskCacheStore(cacheDir) + require.NoError(t, r2.LoadManifest(noopLogger{}), "loader should pick up cached manifest") + h, err := r2.GetHeader("stdio.h") + require.NoError(t, err) + require.Contains(t, h.Functions, "printf") +} + +func TestCStdlibRegistry_HTTP_404Surfaces(t *testing.T) { + srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, _ *http.Request) { + http.NotFound(w, nil) + })) + t.Cleanup(srv.Close) + + r := NewCStdlibRegistryRemote(srv.URL+"/registries", core.PlatformLinux) + r.diskCache = nil + err := r.LoadManifest(noopLogger{}) + require.Error(t, err) + assert.Contains(t, err.Error(), "404") +} + +func TestCStdlibRegistry_HTTP_ChecksumValid(t *testing.T) { + f := newCFixture() + headerBytes, err := json.Marshal(f.headerC) + require.NoError(t, err) + sum := sha256.Sum256(headerBytes) + f.manifestC.Headers[0].Checksum = "sha256:" + hex.EncodeToString(sum[:]) + + srv := serveFixture(t, f) + r := NewCStdlibRegistryRemote(srv.URL+"/registries", core.PlatformLinux) + withTempCacheRoot(t, r) + require.NoError(t, r.LoadManifest(noopLogger{})) + _, err = r.GetHeader("stdio.h") + require.NoError(t, err) +} + +func TestCStdlibRegistry_HTTP_ChecksumMismatch(t *testing.T) { + f := newCFixture() + f.manifestC.Headers[0].Checksum = "sha256:deadbeef" // wrong hash + + srv := serveFixture(t, f) + r := NewCStdlibRegistryRemote(srv.URL+"/registries", core.PlatformLinux) + withTempCacheRoot(t, r) + require.NoError(t, r.LoadManifest(noopLogger{})) + _, err := r.GetHeader("stdio.h") + require.Error(t, err) + assert.Contains(t, err.Error(), "digest mismatch") +} + +func TestCStdlibRegistry_HTTP_ChecksumUnsupportedFormat(t *testing.T) { + f := newCFixture() + f.manifestC.Headers[0].Checksum = "md5:abcdef" + + srv := serveFixture(t, f) + r := NewCStdlibRegistryRemote(srv.URL+"/registries", core.PlatformLinux) + withTempCacheRoot(t, r) + require.NoError(t, r.LoadManifest(noopLogger{})) + _, err := r.GetHeader("stdio.h") + require.Error(t, err) + assert.Contains(t, err.Error(), "unsupported checksum format") +} + +func TestCStdlibRegistry_HTTP_ManifestEmbeddedURL(t *testing.T) { + f := newCFixture() + srv := serveFixture(t, f) + + // Override the entry's URL to something at a different path; the loader + // must follow it instead of constructing one from baseURL. + f.manifestC.Headers[0].URL = srv.URL + "/registries/linux/c/v1/stdio_stdlib.json" + + r := NewCStdlibRegistryRemote(srv.URL+"/registries", core.PlatformLinux) + withTempCacheRoot(t, r) + require.NoError(t, r.LoadManifest(noopLogger{})) + h, err := r.GetHeader("stdio.h") + require.NoError(t, err) + require.Contains(t, h.Functions, "printf") +} + +func TestCStdlibRegistry_HTTP_ParallelHeaderFetches(t *testing.T) { + srv := serveFixture(t, newCFixture()) + + r := NewCStdlibRegistryRemote(srv.URL+"/registries", core.PlatformLinux) + withTempCacheRoot(t, r) + require.NoError(t, r.LoadManifest(noopLogger{})) + + const workers = 50 + results := make([]*core.CStdlibHeader, workers) + errs := make([]error, workers) + done := make(chan struct{}) + for i := range workers { + go func(idx int) { + results[idx], errs[idx] = r.GetHeader("stdio.h") + done <- struct{}{} + }(i) + } + for range workers { + <-done + } + for i := range workers { + require.NoError(t, errs[i]) + } + for i := 1; i < workers; i++ { + assert.Same(t, results[0], results[i], "worker %d saw different pointer", i) + } +} + +func TestCStdlibRegistry_HTTP_NoCachePropagatesHeaderError(t *testing.T) { + srv := serveFixture(t, newCFixture()) + r := NewCStdlibRegistryRemote(srv.URL+"/registries", core.PlatformLinux) + require.NoError(t, r.LoadManifest(noopLogger{})) + srv.Close() // cause subsequent fetches to fail + r.diskCache = nil + + _, err := r.GetHeader("stdio.h") + require.Error(t, err) + assert.Contains(t, err.Error(), "fetchHeaderFromHTTP") +} + +func TestCStdlibRegistry_HTTP_ParseErrorOnGarbageBody(t *testing.T) { + srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, _ *http.Request) { + _, _ = w.Write([]byte("not json")) + })) + t.Cleanup(srv.Close) + + r := NewCStdlibRegistryRemote(srv.URL+"/registries", core.PlatformLinux) + withTempCacheRoot(t, r) + err := r.LoadManifest(noopLogger{}) + require.Error(t, err) + assert.Contains(t, err.Error(), "parsing manifest") +} + +// --- C++ HTTP loader ------------------------------------------------------- + +func TestCppStdlibRegistry_HTTP_LoadManifestAndHeader(t *testing.T) { + srv := serveFixture(t, newCFixture()) + r := NewCppStdlibRegistryRemote(srv.URL+"/registries", core.PlatformLinux) + withTempCacheRootCpp(t, r) + + require.NoError(t, r.LoadManifest(noopLogger{})) + assert.Equal(t, 1, r.HeaderCount()) + + cls, err := r.GetClass("vector", "std::vector") + require.NoError(t, err) + assert.Equal(t, []string{"T"}, cls.TypeParams) +} + +func TestCppStdlibRegistry_HTTP_NetworkFailureFallsBackToCachedHeader(t *testing.T) { + srv := serveFixture(t, newCFixture()) + cacheDir := t.TempDir() + r := NewCppStdlibRegistryRemote(srv.URL+"/registries", core.PlatformLinux) + r.diskCache = newDiskCacheStore(cacheDir) + require.NoError(t, r.LoadManifest(noopLogger{})) + _, err := r.GetClass("vector", "std::vector") + require.NoError(t, err) + + srv.Close() + + r2 := NewCppStdlibRegistryRemote(srv.URL+"/registries", core.PlatformLinux) + r2.diskCache = newDiskCacheStore(cacheDir) + require.NoError(t, r2.LoadManifest(noopLogger{})) + cls, err := r2.GetClass("vector", "std::vector") + require.NoError(t, err) + assert.Equal(t, []string{"T"}, cls.TypeParams) +} + +func TestCppStdlibRegistry_HTTP_ChecksumMismatch(t *testing.T) { + f := newCFixture() + f.manifestCpp.Headers[0].Checksum = "sha256:deadbeef" + + srv := serveFixture(t, f) + r := NewCppStdlibRegistryRemote(srv.URL+"/registries", core.PlatformLinux) + withTempCacheRootCpp(t, r) + require.NoError(t, r.LoadManifest(noopLogger{})) + _, err := r.GetClass("vector", "std::vector") + require.Error(t, err) + assert.Contains(t, err.Error(), "digest mismatch") +} + +func TestCppStdlibRegistry_HTTP_ManifestEmbeddedURL(t *testing.T) { + f := newCFixture() + srv := serveFixture(t, f) + f.manifestCpp.Headers[0].URL = srv.URL + "/registries/linux/cpp/v1/vector_stdlib.json" + + r := NewCppStdlibRegistryRemote(srv.URL+"/registries", core.PlatformLinux) + withTempCacheRootCpp(t, r) + require.NoError(t, r.LoadManifest(noopLogger{})) + _, err := r.GetClass("vector", "std::vector") + require.NoError(t, err) +} + +// --- helpers -------------------------------------------------------------- + +func TestVerifyChecksum_EmptyExpectedSkips(t *testing.T) { + require.NoError(t, verifyChecksum([]byte("anything"), "")) +} + +func TestJoinURL(t *testing.T) { + tests := []struct { + name string + base string + segs []string + want string + }{ + {"trailing slash on base", "https://x/", []string{"a", "b"}, "https://x/a/b"}, + {"leading slash on segment", "https://x", []string{"/a", "/b"}, "https://x/a/b"}, + {"empty segments dropped", "https://x", []string{"", "a", ""}, "https://x/a"}, + {"no segments", "https://x", nil, "https://x"}, + } + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + assert.Equal(t, tt.want, joinURL(tt.base, tt.segs...)) + }) + } +} + +func TestFetchURL_NilClient(t *testing.T) { + _, err := fetchURL(nil, "http://x") + require.Error(t, err) + assert.Contains(t, err.Error(), "nil HTTP client") +} + +func TestFetchURL_BadRequestURL(t *testing.T) { + _, err := fetchURL(&http.Client{}, "http://[::1:bad") + require.Error(t, err) +} diff --git a/sast-engine/graph/callgraph/registry/cpp_stdlib_remote.go b/sast-engine/graph/callgraph/registry/cpp_stdlib_remote.go index 093182f7..ce2d7ff3 100644 --- a/sast-engine/graph/callgraph/registry/cpp_stdlib_remote.go +++ b/sast-engine/graph/callgraph/registry/cpp_stdlib_remote.go @@ -64,13 +64,58 @@ func NewCppStdlibRegistryRemote(baseURL, platform string) *CppStdlibRegistryRemo } } -// LoadManifest reads the top-level manifest.json. file:// path is wired in -// PR-02; HTTP path is the PR-03 stub. +// LoadManifest reads the top-level manifest.json over file:// or HTTPS. +// HTTP failures fall back to the on-disk cache regardless of TTL so an +// offline scan still resolves stdlib calls when a previous run populated +// the cache. func (r *CppStdlibRegistryRemote) LoadManifest(logger core.CStdlibLogger) error { if r.fileBase != "" { return r.loadManifestFromFile(logger) } - return errors.New("CppStdlibRegistryRemote: HTTP loader not yet implemented; tracked in PR-03") + return r.loadManifestFromHTTP(logger) +} + +// loadManifestFromHTTP is the C++ counterpart to the C loader's HTTP path. +// The URL layout is identical except for the language segment ("/cpp/v1/" +// instead of "/c/v1/"). +func (r *CppStdlibRegistryRemote) loadManifestFromHTTP(logger core.CStdlibLogger) error { + url := joinURL(r.baseURL, r.platform, "cpp", "v1", "manifest.json") + if logger != nil { + logger.Debug("Downloading C++ stdlib manifest: %s", url) + } + + data, err := fetchURL(r.httpClient, url) + if err != nil { + if cached, cerr := r.diskCache.GetManifest(); cerr == nil { + if logger != nil { + logger.Warning("Network failed for %s; serving cached manifest. Underlying: %v", url, err) + } + r.cacheMutex.Lock() + r.manifest = cached + r.cacheMutex.Unlock() + return nil + } + return fmt.Errorf("loadManifestFromHTTP: %w", err) + } + + var manifest core.CStdlibManifest + if err := json.Unmarshal(data, &manifest); err != nil { + return fmt.Errorf("loadManifestFromHTTP: parsing manifest from %s: %w", url, err) + } + + if cerr := r.diskCache.SaveManifest(data); cerr != nil && logger != nil { + logger.Warning("Failed to save C++ manifest to disk cache: %v", cerr) + } + + r.cacheMutex.Lock() + r.manifest = &manifest + r.cacheMutex.Unlock() + + if logger != nil { + logger.Statistic("Loaded C++ stdlib manifest over HTTP: %d headers for %s", + len(manifest.Headers), r.platform) + } + return nil } func (r *CppStdlibRegistryRemote) loadManifestFromFile(logger core.CStdlibLogger) error { @@ -132,7 +177,50 @@ func (r *CppStdlibRegistryRemote) fetchHeaderLocked(name string) (*core.CStdlibH if r.fileBase != "" { return r.fetchHeaderFromFile(entry) } - return nil, errors.New("CppStdlibRegistryRemote: HTTP fetch not yet implemented; tracked in PR-03") + return r.fetchHeaderFromHTTP(entry) +} + +// fetchHeaderFromHTTP downloads one per-header JSON over HTTP. Mirrors +// the C loader's strategy: disk-cache freshness on the way in, stale-cache +// fallback on network failure, optional checksum verification. +func (r *CppStdlibRegistryRemote) fetchHeaderFromHTTP(entry *core.CStdlibHeaderEntry) (*core.CStdlibHeader, error) { + if r.diskCache.IsFresh(entry.File, stdlibCacheTTL) { + if cached, err := r.diskCache.GetHeader(entry.File); err == nil { + return cached, nil + } + } + + url := r.headerURL(entry) + data, err := fetchURL(r.httpClient, url) + if err != nil { + if cached, cerr := r.diskCache.GetHeader(entry.File); cerr == nil { + return cached, nil + } + return nil, fmt.Errorf("fetchHeaderFromHTTP: %w", err) + } + + if err := verifyChecksum(data, entry.Checksum); err != nil { + return nil, fmt.Errorf("fetchHeaderFromHTTP: %s: %w", entry.Header, err) + } + + var h core.CStdlibHeader + if err := json.Unmarshal(data, &h); err != nil { + return nil, fmt.Errorf("fetchHeaderFromHTTP: parsing %s: %w", url, err) + } + + _ = r.diskCache.SaveHeader(entry.File, data) // best-effort + + return &h, nil +} + +// headerURL prefers the manifest-embedded URL when present and falls back +// to //cpp/v1/ for manifests that predate the +// per-entry URL field. +func (r *CppStdlibRegistryRemote) headerURL(entry *core.CStdlibHeaderEntry) string { + if entry.URL != "" { + return entry.URL + } + return joinURL(r.baseURL, r.platform, "cpp", "v1", entry.File) } func (r *CppStdlibRegistryRemote) fetchHeaderFromFile(entry *core.CStdlibHeaderEntry) (*core.CStdlibHeader, error) { diff --git a/sast-engine/graph/callgraph/registry/cpp_stdlib_remote_test.go b/sast-engine/graph/callgraph/registry/cpp_stdlib_remote_test.go index e7c75cd3..a4b13361 100644 --- a/sast-engine/graph/callgraph/registry/cpp_stdlib_remote_test.go +++ b/sast-engine/graph/callgraph/registry/cpp_stdlib_remote_test.go @@ -148,22 +148,15 @@ func TestCppStdlibRegistry_GetFunctionMissing(t *testing.T) { require.Error(t, err) } -func TestCppStdlibRegistry_HTTPStub(t *testing.T) { - r := NewCppStdlibRegistryRemote("https://x/", core.PlatformLinux) +// TestCppStdlibRegistry_HTTPMode_NetworkFailureNoCacheSurfacesError mirrors +// the C-loader test: with the disk cache explicitly disabled, an HTTP-only +// loader pointed at an unreachable port must surface the error. +func TestCppStdlibRegistry_HTTPMode_NetworkFailureNoCacheSurfacesError(t *testing.T) { + r := NewCppStdlibRegistryRemote("http://127.0.0.1:1/registries", core.PlatformLinux) + r.diskCache = nil err := r.LoadManifest(noopLogger{}) require.Error(t, err) - assert.Contains(t, err.Error(), "PR-03") -} - -func TestCppStdlibRegistry_HTTPFetchStub(t *testing.T) { - dir := t.TempDir() - writeCppRegistry(t, dir) - r := NewCppStdlibRegistryFile(dir, core.PlatformLinux) - require.NoError(t, r.LoadManifest(noopLogger{})) - r.fileBase = "" // simulate HTTP-only mode - _, err := r.GetHeader("vector") - require.Error(t, err) - assert.Contains(t, err.Error(), "PR-03") + assert.Contains(t, err.Error(), "loadManifestFromHTTP") } func TestCppStdlibRegistry_HeaderCountBeforeLoad(t *testing.T) { From 016b18744b96ede41fd493dd9829a93dba511375 Mon Sep 17 00:00:00 2001 From: shivasurya Date: Sun, 3 May 2026 19:05:57 -0400 Subject: [PATCH 2/3] feat(generator): Windows + Darwin walker dispatch Replaces the PR-01 PR-03-error stubs in DiscoverHeaderSources with working source-tree probes for the four cross-platform combinations the workflow targets next. walker_xplat.go: - windowsCSource / windowsCppSource probe the mingw-w64 install at /usr/x86_64-w64-mingw32 (the layout `apt install mingw-w64` produces on ubuntu-latest). The C source covers Win32 + MSVCRT; the C++ source picks the freshest libstdc++ version directory. - darwinCSource / darwinCppSource probe Apple's Command Line Tools first and fall back to a full Xcode install. The C source uses the macOS SDK; the C++ source uses the clang-shipped libc++ tree. - findVersionedDir, firstExistingDir, detectMingwVersion, detectDarwinSDKTag, detectDarwinCppTag are small helpers that derive the version tag stamped into manifest.SystemTag. All probe paths are package vars rather than literals so tests can override them to exercise both hit and miss branches without depending on whether the host actually has mingw/Xcode installed. Two PR-01 tests rewritten to assert the new "headers not found" error messages instead of the deferred-PR-03 stub. Coverage on the new helpers: 75-100%. Co-Authored-By: Claude Sonnet 4.5 --- .../internal/clikeextract/extractor_test.go | 8 +- .../tools/internal/clikeextract/walker.go | 34 ++- .../internal/clikeextract/walker_test.go | 25 +- .../internal/clikeextract/walker_xplat.go | 202 ++++++++++++++++ .../clikeextract/walker_xplat_test.go | 223 ++++++++++++++++++ 5 files changed, 474 insertions(+), 18 deletions(-) create mode 100644 sast-engine/tools/internal/clikeextract/walker_xplat.go create mode 100644 sast-engine/tools/internal/clikeextract/walker_xplat_test.go diff --git a/sast-engine/tools/internal/clikeextract/extractor_test.go b/sast-engine/tools/internal/clikeextract/extractor_test.go index 7e19221c..cdf3175e 100644 --- a/sast-engine/tools/internal/clikeextract/extractor_test.go +++ b/sast-engine/tools/internal/clikeextract/extractor_test.go @@ -140,14 +140,18 @@ func TestRun_OverlayLoadError(t *testing.T) { } func TestRun_DiscoveryError(t *testing.T) { + // Windows + missing mingw toolchain → discovery error with remediation + // hint. Pre-PR-03 this asserted the stub message; now it asserts the + // "headers not found" path. + withTempMingwRoot(t, "/definitely/missing") cfg := Config{ - Target: core.PlatformWindows, // PR-01 doesn't ship windows + Target: core.PlatformWindows, Language: core.LanguageC, OutputDir: t.TempDir(), } err := NewExtractor(cfg).Run() require.Error(t, err) - assert.Contains(t, err.Error(), "PR-03") + assert.Contains(t, err.Error(), "mingw-w64") } func TestRun_WalkError(t *testing.T) { diff --git a/sast-engine/tools/internal/clikeextract/walker.go b/sast-engine/tools/internal/clikeextract/walker.go index 6e144a36..7cc5e445 100644 --- a/sast-engine/tools/internal/clikeextract/walker.go +++ b/sast-engine/tools/internal/clikeextract/walker.go @@ -77,13 +77,33 @@ func DiscoverHeaderSources(target, language string) ([]HeaderSource, error) { } return []HeaderSource{src}, nil - case core.PlatformWindows + "/" + core.LanguageC, - core.PlatformWindows + "/" + core.LanguageCpp, - core.PlatformDarwin + "/" + core.LanguageC, - core.PlatformDarwin + "/" + core.LanguageCpp: - return nil, fmt.Errorf("DiscoverHeaderSources: target %q language %q is scheduled for PR-03; "+ - "PR-01 only ships %s/%s and %s/%s", target, language, - core.PlatformLinux, core.LanguageC, core.PlatformLinux, core.LanguageCpp) + case core.PlatformWindows + "/" + core.LanguageC: + src, err := windowsCSource() + if err != nil { + return nil, err + } + return []HeaderSource{src}, nil + + case core.PlatformWindows + "/" + core.LanguageCpp: + src, err := windowsCppSource() + if err != nil { + return nil, err + } + return []HeaderSource{src}, nil + + case core.PlatformDarwin + "/" + core.LanguageC: + src, err := darwinCSource() + if err != nil { + return nil, err + } + return []HeaderSource{src}, nil + + case core.PlatformDarwin + "/" + core.LanguageCpp: + src, err := darwinCppSource() + if err != nil { + return nil, err + } + return []HeaderSource{src}, nil default: return nil, fmt.Errorf("DiscoverHeaderSources: unknown target+language combination %q+%q", target, language) diff --git a/sast-engine/tools/internal/clikeextract/walker_test.go b/sast-engine/tools/internal/clikeextract/walker_test.go index 21dfdd60..0a6be958 100644 --- a/sast-engine/tools/internal/clikeextract/walker_test.go +++ b/sast-engine/tools/internal/clikeextract/walker_test.go @@ -41,19 +41,26 @@ func TestDiscoverHeaderSources_LinuxCpp_NotInstalled(t *testing.T) { assert.Contains(t, err.Error(), "libstdc++") } -func TestDiscoverHeaderSources_NotImplementedTargets(t *testing.T) { - deferred := []struct { - platform, language string +// TestDiscoverHeaderSources_CrossPlatformHeadersMissing verifies that when +// the cross-platform toolchains aren't installed on the host, each target +// surfaces a remediation hint instead of crashing. PR-03 implements the +// dispatch; the host-installation case is covered by walker_xplat_test.go. +func TestDiscoverHeaderSources_CrossPlatformHeadersMissing(t *testing.T) { + withTempMingwRoot(t, "/definitely/missing") + withTempDarwinRoots(t, []string{"/missing/sdk"}, []string{"/missing/cpp"}) + + cases := []struct { + platform, language, fragment string }{ - {core.PlatformWindows, core.LanguageC}, - {core.PlatformWindows, core.LanguageCpp}, - {core.PlatformDarwin, core.LanguageC}, - {core.PlatformDarwin, core.LanguageCpp}, + {core.PlatformWindows, core.LanguageC, "mingw-w64"}, + {core.PlatformWindows, core.LanguageCpp, "mingw libstdc++"}, + {core.PlatformDarwin, core.LanguageC, "macOS SDK"}, + {core.PlatformDarwin, core.LanguageCpp, "libc++"}, } - for _, tt := range deferred { + for _, tt := range cases { _, err := DiscoverHeaderSources(tt.platform, tt.language) require.Error(t, err) - assert.Contains(t, err.Error(), "PR-03") + assert.Contains(t, err.Error(), tt.fragment) } } diff --git a/sast-engine/tools/internal/clikeextract/walker_xplat.go b/sast-engine/tools/internal/clikeextract/walker_xplat.go new file mode 100644 index 00000000..3f0c15e6 --- /dev/null +++ b/sast-engine/tools/internal/clikeextract/walker_xplat.go @@ -0,0 +1,202 @@ +package clikeextract + +import ( + "errors" + "fmt" + "os" + "path/filepath" + "sort" + + "github.com/shivasurya/code-pathfinder/sast-engine/graph/callgraph/core" +) + +// Windows headers, accessed cross-platform via mingw-w64 on Ubuntu. +// +// `apt install mingw-w64` on ubuntu-latest places the Win32 + MSVCRT headers +// under /usr/x86_64-w64-mingw32/include and the mingw libstdc++ tree at +// /usr/x86_64-w64-mingw32/include/c++/. Using mingw on Linux beats +// running a Windows GitHub Actions runner for cost and simplicity, and gives +// us a faithful Win32 surface for stdlib resolution. +// +// All paths exposed as package vars (rather than literals) so tests can +// override them to exercise both the hit and miss branches without depending +// on whether the host actually has mingw installed. +var ( + // windowsMingwRoot is the canonical mingw-w64 install root. Subdirectories + // `include` (C) and `include/c++/` (C++) live underneath. + windowsMingwRoot = "/usr/x86_64-w64-mingw32" + + // darwinSDKRoots is the ordered list of macOS SDK include directories the + // generator probes. Command Line Tools first because that's the lighter + // install used in CI; Xcode.app is a fallback for full developer setups. + darwinSDKRoots = []string{ + "/Library/Developer/CommandLineTools/SDKs/MacOSX.sdk/usr/include", + "/Applications/Xcode.app/Contents/Developer/Platforms/MacOSX.platform/Developer/SDKs/MacOSX.sdk/usr/include", + } + + // darwinCppRoots is the ordered list of clang-shipped libc++ trees under + // the Apple toolchain. macos-latest's xcrun typically lands the headers + // under Command Line Tools. + darwinCppRoots = []string{ + "/Library/Developer/CommandLineTools/usr/include/c++/v1", + "/Applications/Xcode.app/Contents/Developer/Toolchains/XcodeDefault.xctoolchain/usr/include/c++/v1", + } +) + +// windowsCSource constructs the HeaderSource for Win32 + MSVCRT C headers. +// Probes the canonical mingw include dir; on miss, returns an error with a +// concrete remediation hint so CI logs make the cause obvious. +func windowsCSource() (HeaderSource, error) { + dir := filepath.Join(windowsMingwRoot, "include") + if !dirExists(dir) { + return HeaderSource{}, fmt.Errorf("windowsCSource: mingw-w64 headers not found at %s "+ + "(install with: apt install mingw-w64)", dir) + } + return HeaderSource{ + Platform: core.PlatformWindows, + Language: core.LanguageC, + SearchDirs: []string{dir}, + HeaderExts: []string{".h"}, + SystemTag: "mingw-w64-" + detectMingwVersion(), + }, nil +} + +// windowsCppSource finds the mingw libstdc++ header tree. Returns an error +// when no version directory exists — without one, the C++ surface is +// unrecoverable (the directory name encodes the version). +// +// The walk lists the C++ STL tree only. Win32 C headers are exposed via +// windowsCSource; mixing both into one source would conflate languages. +func windowsCppSource() (HeaderSource, error) { + root := filepath.Join(windowsMingwRoot, "include", "c++") + dir, version := findVersionedDir(root) + if dir == "" { + return HeaderSource{}, fmt.Errorf("windowsCppSource: no mingw libstdc++ headers under %s "+ + "(install with: apt install g++-mingw-w64)", root) + } + return HeaderSource{ + Platform: core.PlatformWindows, + Language: core.LanguageCpp, + SearchDirs: []string{dir}, + HeaderExts: []string{".h", ".hpp", ".hxx", ""}, + SystemTag: "mingw-w64-libstdc++-" + version, + }, nil +} + +// darwinCSource probes the canonical macOS SDK include locations and uses +// the first one that exists. Apple ships Command Line Tools and full Xcode +// installs in different subtrees; the loader tries CLT first because that's +// the cheaper macos-latest layout. +func darwinCSource() (HeaderSource, error) { + dir := firstExistingDir(darwinSDKRoots) + if dir == "" { + return HeaderSource{}, errors.New("darwinCSource: macOS SDK headers not found at any of " + + fmt.Sprint(darwinSDKRoots) + " (install Command Line Tools: xcode-select --install)") + } + return HeaderSource{ + Platform: core.PlatformDarwin, + Language: core.LanguageC, + SearchDirs: []string{dir}, + HeaderExts: []string{".h"}, + SystemTag: "darwin-" + detectDarwinSDKTag(dir), + }, nil +} + +// darwinCppSource probes the libc++ tree shipped with the Apple toolchain. +// Apple's libc++ has a notably different surface from libstdc++ — different +// container ABI, different `__1::` inline namespace — but the same public +// API; the manifest is generated against the actual installed headers so +// resolution stays correct on the host platform. +func darwinCppSource() (HeaderSource, error) { + dir := firstExistingDir(darwinCppRoots) + if dir == "" { + return HeaderSource{}, errors.New("darwinCppSource: libc++ headers not found at any of " + + fmt.Sprint(darwinCppRoots) + " (install Xcode or Command Line Tools)") + } + return HeaderSource{ + Platform: core.PlatformDarwin, + Language: core.LanguageCpp, + SearchDirs: []string{dir}, + HeaderExts: []string{".h", ".hpp", ".hxx", ""}, + SystemTag: "libc++-darwin-" + detectDarwinCppTag(dir), + }, nil +} + +// findVersionedDir lists root and returns the lexically-largest entry name +// containing a digit (canonical "13", "13.2.0", "v1") together with its +// version. Returns ("","") on missing root or empty result. +// +// Shared between windowsCppSource (looks for c++/) and the +// darwin probes (looks for c++/v) — both want the freshest version dir +// without parsing semver explicitly. +func findVersionedDir(root string) (dir, version string) { + entries, err := os.ReadDir(root) + if err != nil { + return "", "" + } + var versions []string + for _, e := range entries { + if !e.IsDir() { + continue + } + name := e.Name() + if !containsDigit(name) { + continue + } + versions = append(versions, name) + } + if len(versions) == 0 { + return "", "" + } + sort.Strings(versions) + v := versions[len(versions)-1] + return filepath.Join(root, v), v +} + +// firstExistingDir returns the first directory in the list that exists on +// disk, or "" if none do. Order matters — callers list the cheaper / more +// likely option first. +func firstExistingDir(candidates []string) string { + for _, c := range candidates { + if dirExists(c) { + return c + } + } + return "" +} + +// detectMingwVersion derives a version string from the libstdc++ directory +// name embedded under the mingw root. Returns "unknown" when the tree has +// not been probed yet (windowsCppSource hits this path before the C source +// builder runs). Lightweight on purpose: parsing `gcc --version` would add +// an exec dependency that complicates testing for marginal accuracy gain. +func detectMingwVersion() string { + root := filepath.Join(windowsMingwRoot, "include", "c++") + _, v := findVersionedDir(root) + if v == "" { + return "unknown" + } + return v +} + +// detectDarwinSDKTag returns a short identifier for the SDK whose include +// dir was selected. Currently uses the parent directory name (e.g. +// "MacOSX.sdk") which is enough to tell CommandLineTools apart from Xcode. +func detectDarwinSDKTag(includeDir string) string { + // includeDir ends in `.../MacOSX.sdk/usr/include`. + parent := filepath.Base(filepath.Dir(filepath.Dir(includeDir))) + if parent == "" { + return "unknown" + } + return parent +} + +// detectDarwinCppTag returns the libc++ version directory name (typically +// "v1") so the manifest can distinguish future ABI bumps. +func detectDarwinCppTag(includeDir string) string { + base := filepath.Base(includeDir) + if base == "" { + return "unknown" + } + return base +} diff --git a/sast-engine/tools/internal/clikeextract/walker_xplat_test.go b/sast-engine/tools/internal/clikeextract/walker_xplat_test.go new file mode 100644 index 00000000..f952dd1f --- /dev/null +++ b/sast-engine/tools/internal/clikeextract/walker_xplat_test.go @@ -0,0 +1,223 @@ +package clikeextract + +import ( + "os" + "path/filepath" + "testing" + + "github.com/shivasurya/code-pathfinder/sast-engine/graph/callgraph/core" + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" +) + +// withTempMingwRoot wires a temporary directory in as the mingw root for +// the duration of t. Restores the original on cleanup so tests are +// independent of order. +func withTempMingwRoot(t *testing.T, root string) { + t.Helper() + orig := windowsMingwRoot + windowsMingwRoot = root + t.Cleanup(func() { windowsMingwRoot = orig }) +} + +// withTempDarwinRoots replaces both the C and C++ Darwin probe lists for t. +func withTempDarwinRoots(t *testing.T, sdkRoots, cppRoots []string) { + t.Helper() + origC := darwinSDKRoots + origCpp := darwinCppRoots + darwinSDKRoots = sdkRoots + darwinCppRoots = cppRoots + t.Cleanup(func() { + darwinSDKRoots = origC + darwinCppRoots = origCpp + }) +} + +func TestWindowsCSource_Found(t *testing.T) { + root := t.TempDir() + require.NoError(t, os.MkdirAll(filepath.Join(root, "include"), 0o755)) + require.NoError(t, os.MkdirAll(filepath.Join(root, "include", "c++", "13"), 0o755)) + withTempMingwRoot(t, root) + + src, err := windowsCSource() + require.NoError(t, err) + assert.Equal(t, core.PlatformWindows, src.Platform) + assert.Equal(t, core.LanguageC, src.Language) + assert.Equal(t, []string{filepath.Join(root, "include")}, src.SearchDirs) + assert.Equal(t, "mingw-w64-13", src.SystemTag) +} + +func TestWindowsCSource_Missing(t *testing.T) { + withTempMingwRoot(t, filepath.Join(t.TempDir(), "absent")) + _, err := windowsCSource() + require.Error(t, err) + assert.Contains(t, err.Error(), "mingw-w64 headers not found") +} + +func TestWindowsCSource_VersionUnknownWhenCppTreeMissing(t *testing.T) { + root := t.TempDir() + require.NoError(t, os.MkdirAll(filepath.Join(root, "include"), 0o755)) + withTempMingwRoot(t, root) + + src, err := windowsCSource() + require.NoError(t, err) + assert.Equal(t, "mingw-w64-unknown", src.SystemTag) +} + +func TestWindowsCppSource_Found(t *testing.T) { + root := t.TempDir() + require.NoError(t, os.MkdirAll(filepath.Join(root, "include", "c++", "13"), 0o755)) + require.NoError(t, os.MkdirAll(filepath.Join(root, "include", "c++", "12"), 0o755)) + withTempMingwRoot(t, root) + + src, err := windowsCppSource() + require.NoError(t, err) + assert.Equal(t, core.PlatformWindows, src.Platform) + assert.Equal(t, core.LanguageCpp, src.Language) + assert.Equal(t, []string{filepath.Join(root, "include", "c++", "13")}, src.SearchDirs, + "freshest version directory must win") + assert.Equal(t, "mingw-w64-libstdc++-13", src.SystemTag) +} + +func TestWindowsCppSource_Missing(t *testing.T) { + root := t.TempDir() + require.NoError(t, os.MkdirAll(filepath.Join(root, "include"), 0o755)) + withTempMingwRoot(t, root) + + _, err := windowsCppSource() + require.Error(t, err) + assert.Contains(t, err.Error(), "no mingw libstdc++ headers") +} + +func TestDarwinCSource_Found(t *testing.T) { + sdkInclude := filepath.Join(t.TempDir(), "MacOSX.sdk", "usr", "include") + require.NoError(t, os.MkdirAll(sdkInclude, 0o755)) + withTempDarwinRoots(t, []string{sdkInclude}, []string{"/nope"}) + + src, err := darwinCSource() + require.NoError(t, err) + assert.Equal(t, core.PlatformDarwin, src.Platform) + assert.Equal(t, core.LanguageC, src.Language) + assert.Equal(t, "darwin-MacOSX.sdk", src.SystemTag) +} + +func TestDarwinCSource_Missing(t *testing.T) { + withTempDarwinRoots(t, []string{"/no", "/where"}, []string{"/no"}) + _, err := darwinCSource() + require.Error(t, err) + assert.Contains(t, err.Error(), "macOS SDK headers not found") +} + +func TestDarwinCppSource_Found(t *testing.T) { + cppInclude := filepath.Join(t.TempDir(), "v1") + require.NoError(t, os.MkdirAll(cppInclude, 0o755)) + withTempDarwinRoots(t, []string{"/no"}, []string{cppInclude}) + + src, err := darwinCppSource() + require.NoError(t, err) + assert.Equal(t, core.PlatformDarwin, src.Platform) + assert.Equal(t, core.LanguageCpp, src.Language) + assert.Equal(t, "libc++-darwin-v1", src.SystemTag) +} + +func TestDarwinCppSource_Missing(t *testing.T) { + withTempDarwinRoots(t, []string{"/no"}, []string{"/no", "/where"}) + _, err := darwinCppSource() + require.Error(t, err) + assert.Contains(t, err.Error(), "libc++ headers not found") +} + +func TestFindVersionedDir(t *testing.T) { + root := t.TempDir() + require.NoError(t, os.MkdirAll(filepath.Join(root, "13"), 0o755)) + require.NoError(t, os.MkdirAll(filepath.Join(root, "12"), 0o755)) + require.NoError(t, os.MkdirAll(filepath.Join(root, "experimental"), 0o755)) + + dir, ver := findVersionedDir(root) + assert.Equal(t, filepath.Join(root, "13"), dir) + assert.Equal(t, "13", ver) +} + +func TestFindVersionedDir_MissingRoot(t *testing.T) { + dir, ver := findVersionedDir(filepath.Join(t.TempDir(), "missing")) + assert.Empty(t, dir) + assert.Empty(t, ver) +} + +func TestFindVersionedDir_NoVersionedEntries(t *testing.T) { + root := t.TempDir() + require.NoError(t, os.MkdirAll(filepath.Join(root, "experimental"), 0o755)) + require.NoError(t, os.MkdirAll(filepath.Join(root, "internal"), 0o755)) + dir, ver := findVersionedDir(root) + assert.Empty(t, dir) + assert.Empty(t, ver) +} + +func TestFirstExistingDir(t *testing.T) { + good := t.TempDir() + assert.Equal(t, good, firstExistingDir([]string{"/missing", good})) + assert.Empty(t, firstExistingDir([]string{"/missing", "/also-missing"})) +} + +func TestDetectDarwinSDKTag(t *testing.T) { + assert.Equal(t, "MacOSX.sdk", + detectDarwinSDKTag("/Library/Developer/CommandLineTools/SDKs/MacOSX.sdk/usr/include")) +} + +func TestDetectDarwinCppTag(t *testing.T) { + assert.Equal(t, "v1", detectDarwinCppTag("/Library/Developer/CommandLineTools/usr/include/c++/v1")) +} + +// TestDiscoverHeaderSources_DarwinAndWindowsDispatched is the integration +// test for DiscoverHeaderSources: it confirms each (target, language) reaches +// the right per-platform source builder when fixture trees exist on disk. +func TestDiscoverHeaderSources_DarwinAndWindowsDispatched(t *testing.T) { + mingw := t.TempDir() + require.NoError(t, os.MkdirAll(filepath.Join(mingw, "include"), 0o755)) + require.NoError(t, os.MkdirAll(filepath.Join(mingw, "include", "c++", "13"), 0o755)) + withTempMingwRoot(t, mingw) + + sdkInclude := filepath.Join(t.TempDir(), "MacOSX.sdk", "usr", "include") + require.NoError(t, os.MkdirAll(sdkInclude, 0o755)) + cppInclude := filepath.Join(t.TempDir(), "v1") + require.NoError(t, os.MkdirAll(cppInclude, 0o755)) + withTempDarwinRoots(t, []string{sdkInclude}, []string{cppInclude}) + + for _, tt := range []struct { + target, language, wantTag string + }{ + {core.PlatformWindows, core.LanguageC, "mingw-w64-13"}, + {core.PlatformWindows, core.LanguageCpp, "mingw-w64-libstdc++-13"}, + {core.PlatformDarwin, core.LanguageC, "darwin-MacOSX.sdk"}, + {core.PlatformDarwin, core.LanguageCpp, "libc++-darwin-v1"}, + } { + t.Run(tt.target+"/"+tt.language, func(t *testing.T) { + sources, err := DiscoverHeaderSources(tt.target, tt.language) + require.NoError(t, err) + require.Len(t, sources, 1) + assert.Equal(t, tt.wantTag, sources[0].SystemTag) + }) + } +} + +// TestDiscoverHeaderSources_DarwinAndWindowsErrorWhenAbsent surfaces the +// error path: missing toolchain → clear error message. +func TestDiscoverHeaderSources_DarwinAndWindowsErrorWhenAbsent(t *testing.T) { + withTempMingwRoot(t, "/definitely/missing") + withTempDarwinRoots(t, []string{"/missing/sdk"}, []string{"/missing/cpp"}) + + for _, tc := range []struct { + target, language, fragment string + }{ + {core.PlatformWindows, core.LanguageC, "mingw-w64 headers not found"}, + {core.PlatformWindows, core.LanguageCpp, "no mingw libstdc++"}, + {core.PlatformDarwin, core.LanguageC, "macOS SDK headers not found"}, + {core.PlatformDarwin, core.LanguageCpp, "libc++ headers not found"}, + } { + t.Run(tc.target+"/"+tc.language, func(t *testing.T) { + _, err := DiscoverHeaderSources(tc.target, tc.language) + require.Error(t, err) + assert.Contains(t, err.Error(), tc.fragment) + }) + } +} From 2d5c1493a07d519f8a92f05e282a81ed46d7db67 Mon Sep 17 00:00:00 2001 From: shivasurya Date: Sun, 3 May 2026 19:06:07 -0400 Subject: [PATCH 3/3] ci: workflow to generate C/C++ stdlib registries MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Three-runner matrix that produces the six manifest sets the loader expects (linux/{c,cpp}, windows/{c,cpp}, darwin/{c,cpp}): - generate-linux ubuntu-latest with libc6-dev + libstdc++-13-dev - generate-windows ubuntu-latest with mingw-w64 (Win32 + libstdc++) - generate-darwin macos-latest using Command Line Tools' SDK + libc++ tree The publish job collects the artifacts, asserts all 6 manifest.json files are present, and uploads to Cloudflare R2 under the same bucket/prefix scheme the existing Go stdlib pipeline uses (s3://code-pathfinder-assets/registries///v1/). The R2 upload + CDN verification steps run only when both R2_ACCOUNT_ID and R2_ACCESS_KEY_ID are configured — keeps the workflow green on forks and during operator setup. Triggered on push to main when generator code or overlays change, plus manual workflow_dispatch. Co-Authored-By: Claude Sonnet 4.5 --- .github/workflows/generate-clike-stdlib.yml | 218 ++++++++++++++++++++ 1 file changed, 218 insertions(+) create mode 100644 .github/workflows/generate-clike-stdlib.yml diff --git a/.github/workflows/generate-clike-stdlib.yml b/.github/workflows/generate-clike-stdlib.yml new file mode 100644 index 00000000..e50e13b4 --- /dev/null +++ b/.github/workflows/generate-clike-stdlib.yml @@ -0,0 +1,218 @@ +name: Generate C/C++ Stdlib Registries + +# Generates C and C++ stdlib registries for Linux, Windows, and Darwin and +# uploads them to Cloudflare R2 (the same backend that serves the Go stdlib +# registries). Three jobs run in parallel: +# +# - generate-linux ubuntu-latest with libc6-dev + libstdc++-13-dev +# - generate-windows ubuntu-latest with mingw-w64 (Win32 + mingw libstdc++) +# - generate-darwin macos-latest using xcrun's Command Line Tools SDK +# +# The publish job collects the six manifest sets and uploads them to R2 only +# when both R2 secrets are present — running this workflow on a fork without +# secrets still validates that generation works. +# +# Trigger: push to main when generator code or overlays change, or on demand +# via workflow_dispatch. + +on: + push: + branches: [main] + paths: + - 'sast-engine/tools/generate_clike_stdlib_registry.go' + - 'sast-engine/tools/internal/clikeextract/**' + - 'sast-engine/tools/c_stdlib_overlay.yaml' + - 'sast-engine/tools/cpp_stdlib_overlay.yaml' + - '.github/workflows/generate-clike-stdlib.yml' + workflow_dispatch: + +permissions: + contents: read + +jobs: + generate-linux: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v6 + - uses: actions/setup-go@v6 + with: + go-version: '1.23' + cache: false + + - name: Install Linux stdlib headers + run: | + sudo apt-get update + sudo apt-get install -y libc6-dev libstdc++-13-dev + + - name: Generate Linux C registry + run: | + cd sast-engine + go run -tags cpf_generate_stdlib_registry ./tools/generate_clike_stdlib_registry.go \ + --target=linux --language=c \ + --output-dir=/tmp/clike-out/linux/c/v1 + + - name: Generate Linux C++ registry + run: | + cd sast-engine + go run -tags cpf_generate_stdlib_registry ./tools/generate_clike_stdlib_registry.go \ + --target=linux --language=cpp \ + --output-dir=/tmp/clike-out/linux/cpp/v1 + + - name: Upload artifact + uses: actions/upload-artifact@v4 + with: + name: clike-registries-linux + path: /tmp/clike-out/linux/ + retention-days: 7 + + generate-windows: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v6 + - uses: actions/setup-go@v6 + with: + go-version: '1.23' + cache: false + + - name: Install mingw-w64 (Win32 + libstdc++ headers on Linux) + run: | + sudo apt-get update + sudo apt-get install -y mingw-w64 g++-mingw-w64 + + - name: Generate Windows C registry + run: | + cd sast-engine + go run -tags cpf_generate_stdlib_registry ./tools/generate_clike_stdlib_registry.go \ + --target=windows --language=c \ + --output-dir=/tmp/clike-out/windows/c/v1 + + - name: Generate Windows C++ registry + run: | + cd sast-engine + go run -tags cpf_generate_stdlib_registry ./tools/generate_clike_stdlib_registry.go \ + --target=windows --language=cpp \ + --output-dir=/tmp/clike-out/windows/cpp/v1 + + - name: Upload artifact + uses: actions/upload-artifact@v4 + with: + name: clike-registries-windows + path: /tmp/clike-out/windows/ + retention-days: 7 + + generate-darwin: + runs-on: macos-latest + steps: + - uses: actions/checkout@v6 + - uses: actions/setup-go@v6 + with: + go-version: '1.23' + cache: false + + - name: Verify SDK availability + run: | + xcrun --show-sdk-path + ls /Library/Developer/CommandLineTools/SDKs/MacOSX.sdk/usr/include/stdio.h + ls /Library/Developer/CommandLineTools/usr/include/c++/v1/vector + + - name: Generate Darwin C registry + run: | + cd sast-engine + go run -tags cpf_generate_stdlib_registry ./tools/generate_clike_stdlib_registry.go \ + --target=darwin --language=c \ + --output-dir=/tmp/clike-out/darwin/c/v1 + + - name: Generate Darwin C++ registry + run: | + cd sast-engine + go run -tags cpf_generate_stdlib_registry ./tools/generate_clike_stdlib_registry.go \ + --target=darwin --language=cpp \ + --output-dir=/tmp/clike-out/darwin/cpp/v1 + + - name: Upload artifact + uses: actions/upload-artifact@v4 + with: + name: clike-registries-darwin + path: /tmp/clike-out/darwin/ + retention-days: 7 + + publish: + needs: [generate-linux, generate-windows, generate-darwin] + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v6 + + - name: Download all artifacts + uses: actions/download-artifact@v4 + with: + path: /tmp/clike-out + pattern: clike-registries-* + merge-multiple: false + + - name: Stage registries into the expected layout + run: | + mkdir -p /tmp/clike-staged + for plat in linux windows darwin; do + mkdir -p /tmp/clike-staged/$plat + if [ -d "/tmp/clike-out/clike-registries-$plat/$plat" ]; then + cp -R /tmp/clike-out/clike-registries-$plat/$plat/* /tmp/clike-staged/$plat/ + fi + done + + - name: Verify all 6 manifest.json files + run: | + for combo in linux/c linux/cpp windows/c windows/cpp darwin/c darwin/cpp; do + path="/tmp/clike-staged/$combo/v1/manifest.json" + test -f "$path" || { echo "Missing: $path"; exit 1; } + echo "OK $combo/v1/manifest.json ($(wc -c < "$path") bytes)" + done + + # Cloudflare R2 upload runs only when both secrets are configured. On a + # fork or before the secrets are provisioned the workflow still passes + # generation + verification and skips the upload — keeps the pipeline + # green during operator setup. + - name: Upload to Cloudflare R2 + if: ${{ env.HAS_R2_CREDS == 'true' }} + env: + HAS_R2_CREDS: ${{ secrets.R2_ACCOUNT_ID != '' && secrets.R2_ACCESS_KEY_ID != '' && secrets.R2_SECRET_ACCESS_KEY != '' }} + R2_ACCOUNT_ID: ${{ secrets.R2_ACCOUNT_ID }} + R2_ACCESS_KEY_ID: ${{ secrets.R2_ACCESS_KEY_ID }} + R2_SECRET_ACCESS_KEY: ${{ secrets.R2_SECRET_ACCESS_KEY }} + run: | + export AWS_ACCESS_KEY_ID="$R2_ACCESS_KEY_ID" + export AWS_SECRET_ACCESS_KEY="$R2_SECRET_ACCESS_KEY" + R2_ENDPOINT="https://${R2_ACCOUNT_ID}.r2.cloudflarestorage.com" + + for combo in linux/c linux/cpp windows/c windows/cpp darwin/c darwin/cpp; do + echo "Uploading $combo..." + aws s3 sync "/tmp/clike-staged/$combo/v1/" \ + "s3://code-pathfinder-assets/registries/$combo/v1/" \ + --endpoint-url "$R2_ENDPOINT" \ + --delete \ + --content-type "application/json" \ + --cache-control "public, max-age=86400" + done + + - name: Verify CDN URLs + if: ${{ env.HAS_R2_CREDS == 'true' }} + env: + HAS_R2_CREDS: ${{ secrets.R2_ACCOUNT_ID != '' && secrets.R2_ACCESS_KEY_ID != '' && secrets.R2_SECRET_ACCESS_KEY != '' }} + run: | + for combo in linux/c linux/cpp windows/c windows/cpp darwin/c darwin/cpp; do + url="https://assets.codepathfinder.dev/registries/$combo/v1/manifest.json" + status=$(curl -s -o /dev/null -w "%{http_code}" "$url") + echo "$status $url" + test "$status" = "200" || { echo "URL not reachable: $url"; exit 1; } + done + + - name: Summary + run: | + echo "## C/C++ Stdlib Registry Generation" >> "$GITHUB_STEP_SUMMARY" + echo "" >> "$GITHUB_STEP_SUMMARY" + for combo in linux/c linux/cpp windows/c windows/cpp darwin/c darwin/cpp; do + mf="/tmp/clike-staged/$combo/v1/manifest.json" + if [ -f "$mf" ]; then + size=$(wc -c < "$mf") + echo "- ${combo} manifest.json: ${size} bytes" >> "$GITHUB_STEP_SUMMARY" + fi + done