From 8278ab206d6f7f5af4b7569602a43858b77d2e31 Mon Sep 17 00:00:00 2001 From: shivasurya Date: Sun, 3 May 2026 18:38:08 -0400 Subject: [PATCH 1/4] feat(core+registry): C/C++ stdlib loaders, platform detector, disk cache Introduces the loader infrastructure that PR-02 will plug into the Phase 1 call-graph resolvers: - core: CStdlibLoader / CppStdlibLoader interfaces, SecurityTag on CallSite, and StdlibRegistry / StdlibCppRegistry hooks on the C and C++ module registries plus a SystemIncludes index. - registry/c_stdlib_remote.go + cpp_stdlib_remote.go: dual-mode loaders (file:// active, HTTP stubbed for PR-03) with double-check locked header caches mirroring the Go stdlib loader. - registry/clike_platform_detector.go: macro + path-hint based linux/darwin/windows detection, host-platform fallback. - registry/clike_disk_cache.go: 24h-TTL on-disk cache wired for the PR-03 HTTP path; tested in isolation here. - registry/c_module.go: BuildCSystemIncludeMap so the resolver can walk a caller file's
`#include <...>` list. Coverage: 91.4% on registry, 94.7% on core. HTTP fetch paths return explicit "PR-03" errors and stay tested via stub assertions. Co-Authored-By: Claude Sonnet 4.5 --- .../graph/callgraph/core/c_module_types.go | 38 ++- .../callgraph/core/clike_stdlib_types.go | 36 +++ sast-engine/graph/callgraph/core/types.go | 7 + .../graph/callgraph/registry/c_module.go | 48 ++++ .../callgraph/registry/c_stdlib_remote.go | 223 +++++++++++++++ .../registry/c_stdlib_remote_test.go | 267 ++++++++++++++++++ .../callgraph/registry/clike_disk_cache.go | 150 ++++++++++ .../registry/clike_disk_cache_test.go | 148 ++++++++++ .../registry/clike_platform_detector.go | 209 ++++++++++++++ .../registry/clike_platform_detector_test.go | 197 +++++++++++++ .../callgraph/registry/cpp_stdlib_remote.go | 227 +++++++++++++++ .../registry/cpp_stdlib_remote_test.go | 236 ++++++++++++++++ 12 files changed, 1782 insertions(+), 4 deletions(-) create mode 100644 sast-engine/graph/callgraph/registry/c_stdlib_remote.go create mode 100644 sast-engine/graph/callgraph/registry/c_stdlib_remote_test.go create mode 100644 sast-engine/graph/callgraph/registry/clike_disk_cache.go create mode 100644 sast-engine/graph/callgraph/registry/clike_disk_cache_test.go create mode 100644 sast-engine/graph/callgraph/registry/clike_platform_detector.go create mode 100644 sast-engine/graph/callgraph/registry/clike_platform_detector_test.go create mode 100644 sast-engine/graph/callgraph/registry/cpp_stdlib_remote.go create mode 100644 sast-engine/graph/callgraph/registry/cpp_stdlib_remote_test.go diff --git a/sast-engine/graph/callgraph/core/c_module_types.go b/sast-engine/graph/callgraph/core/c_module_types.go index 0c2e1675..ab041532 100644 --- a/sast-engine/graph/callgraph/core/c_module_types.go +++ b/sast-engine/graph/callgraph/core/c_module_types.go @@ -51,6 +51,26 @@ type CModuleRegistry struct { // computation. Stored so consumers can re-derive prefixes for ad-hoc // files (e.g. an include resolved at query time). 
ProjectRoot string + + // SystemIncludes maps a project-relative file path to the system header + // names (e.g., "stdio.h", "sys/socket.h", "vector") referenced via + // `#include <...>` from that file. Populated alongside Includes by the + // registry builder. + // + // Phase 2 (C/C++ stdlib): the call-graph builder consults this when an + // unresolved call site needs to be matched against the stdlib registry. + // Kept separate from Includes (which holds project-local headers only) + // so Phase 1's project-internal resolution stays untouched. + SystemIncludes map[string][]string + + // StdlibRegistry is the stdlib loader injected by cmd/scan.go before the + // builder runs. nil when stdlib resolution is disabled (manifest failed + // to load, the --target platform isn't supported yet, or the user passed + // --no-stdlib). Builders MUST nil-check before consulting. + // + // Phase 2 — added in PR-02; populated by registry.NewCStdlibRegistryFile + // (or its HTTP counterpart in PR-03). + StdlibRegistry CStdlibLoader } // NewCModuleRegistry returns an empty CModuleRegistry rooted at projectRoot. @@ -58,10 +78,11 @@ type CModuleRegistry struct { // checks. func NewCModuleRegistry(projectRoot string) *CModuleRegistry { return &CModuleRegistry{ - FileToPrefix: make(map[string]string), - Includes: make(map[string][]string), - FunctionIndex: make(map[string][]string), - ProjectRoot: projectRoot, + FileToPrefix: make(map[string]string), + Includes: make(map[string][]string), + FunctionIndex: make(map[string][]string), + SystemIncludes: make(map[string][]string), + ProjectRoot: projectRoot, } } @@ -104,6 +125,15 @@ type CppModuleRegistry struct { // "include/socket.hpp::mylib::Socket", // ] ClassIndex map[string][]string + + // StdlibCppRegistry is the C++-specific stdlib loader (vector, string, + // std::move, …). Distinct from the embedded CModuleRegistry.StdlibRegistry + // because C++ has class methods + free functions in namespaces that C + // cannot reach. 
Both fields may be set on a single CppModuleRegistry — + // one for C-shape headers (e.g. stdio.h) and one for C++-shape (e.g. vector). + // + // Phase 2 — added in PR-02. + StdlibCppRegistry CppStdlibLoader } // NewCppModuleRegistry returns an empty CppModuleRegistry rooted at diff --git a/sast-engine/graph/callgraph/core/clike_stdlib_types.go b/sast-engine/graph/callgraph/core/clike_stdlib_types.go index 78a04714..63ffdf6e 100644 --- a/sast-engine/graph/callgraph/core/clike_stdlib_types.go +++ b/sast-engine/graph/callgraph/core/clike_stdlib_types.go @@ -1,5 +1,41 @@ package core +// CStdlibLoader is the interface the C call-graph builder uses to query the +// stdlib registry. The PR-02 file:// loader and the upcoming PR-03 HTTP loader +// both satisfy it; tests substitute fakes. +// +// The interface deliberately keeps Logger out of the per-symbol accessors: +// LoadManifest is the one-shot startup operation that needs progress logging, +// and the per-symbol queries happen in tight resolver loops where logging +// would be noise. +type CStdlibLoader interface { + LoadManifest(logger CStdlibLogger) error + GetHeader(name string) (*CStdlibHeader, error) + GetFunction(headerName, funcName string) (*CStdlibFunction, error) + Platform() string + HeaderCount() int +} + +// CppStdlibLoader extends CStdlibLoader with C++-specific accessors. Free +// functions like std::move that live in a namespace map are looked up via +// GetFreeFunction; class methods (vector::push_back) via GetMethod. +type CppStdlibLoader interface { + CStdlibLoader + GetClass(headerName, classFQN string) (*CppStdlibClass, error) + GetMethod(headerName, classFQN, methodName string) (*CStdlibFunction, error) + GetFreeFunction(headerName, fqn string) (*CStdlibFunction, error) +} + +// CStdlibLogger is the subset of *output.Logger that loaders need. 
Defining a +// narrow interface here (rather than importing the full output package into +// core/) keeps the dependency direction clean — core has no upstream +// dependencies on output. +type CStdlibLogger interface { + Debug(format string, args ...any) + Statistic(format string, args ...any) + Warning(format string, args ...any) +} + // CStdlibRegistry is the root in-memory container for C/C++ stdlib data on a single // (platform, language) axis (e.g. linux/c, linux/cpp, windows/c). It is populated by // the loader (PR-02) from registry JSON hosted on the CDN and consulted by the call diff --git a/sast-engine/graph/callgraph/core/types.go b/sast-engine/graph/callgraph/core/types.go index 17a1178e..e580086d 100644 --- a/sast-engine/graph/callgraph/core/types.go +++ b/sast-engine/graph/callgraph/core/types.go @@ -34,6 +34,13 @@ type CallSite struct { // IsStdlib is true when the resolved target is a Go standard library function. // Set during Go call graph construction when StdlibLoader is available. IsStdlib bool + + // Phase 2 (C/C++ stdlib): SecurityTag carries an overlay-curated marker + // (e.g. "command_injection_sink", "format_string_sink") propagated from the + // stdlib registry. Empty when the resolved target has no security tag or + // the call did not resolve via stdlib. Consumed by future C/C++ rules and + // by `pathfinder resolution-report --diagnose-stdlib` (PR-04). + SecurityTag string } // Resolution failure reason categories for diagnostics: diff --git a/sast-engine/graph/callgraph/registry/c_module.go b/sast-engine/graph/callgraph/registry/c_module.go index f1e39661..0b79bf5e 100644 --- a/sast-engine/graph/callgraph/registry/c_module.go +++ b/sast-engine/graph/callgraph/registry/c_module.go @@ -80,6 +80,7 @@ func BuildCModuleRegistry(projectPath string, codeGraph *graph.CodeGraph) *core. 
indexFilesAndFunctions(codeGraph, projectPath, languageC, registry, nil) registry.Includes = BuildCIncludeMap(projectPath, codeGraph, languageC) + registry.SystemIncludes = BuildCSystemIncludeMap(projectPath, codeGraph, languageC) return registry } @@ -118,6 +119,7 @@ func BuildCppModuleRegistry(projectPath string, codeGraph *graph.CodeGraph) *cor }) registry.Includes = BuildCIncludeMap(projectPath, codeGraph, languageCpp) + registry.SystemIncludes = BuildCSystemIncludeMap(projectPath, codeGraph, languageCpp) return registry } @@ -172,6 +174,52 @@ func BuildCIncludeMap(projectPath string, codeGraph *graph.CodeGraph, language s return includes } +// BuildCSystemIncludeMap collects system-include directives (`#include <...>`) +// per source file. The returned map is keyed by project-relative source path, +// values are the bare header names (`stdio.h`, `vector`, `sys/socket.h`). +// +// This is the data structure the C/C++ stdlib resolver (PR-02 c_builder / +// cpp_builder fallback) consults to decide which registry header to query +// when a call goes unresolved. Counterpart to BuildCIncludeMap, which holds +// project-local includes only. +// +// Files with no system includes are absent from the map (not present with an +// empty slice) so callers can iterate without nil checks. The function never +// returns nil; an empty map signals "no system includes" rather than "registry +// not built". 
+func BuildCSystemIncludeMap(projectPath string, codeGraph *graph.CodeGraph, language string) map[string][]string { + systems := make(map[string][]string) + if codeGraph == nil { + return systems + } + + for _, node := range codeGraph.Nodes { + if !isSystemInclude(node, language) { + continue + } + relSource, ok := relativeProjectPath(projectPath, node.File) + if !ok { + continue + } + systems[relSource] = appendUnique(systems[relSource], node.Name) + } + return systems +} + +// isSystemInclude is the mirror of isProjectInclude: returns true exactly when +// the node is an `#include <...>` for the given language. Empty header names +// and nodes from other languages are excluded. +func isSystemInclude(node *graph.Node, language string) bool { + if node == nil || node.Language != language || node.Type != cNodeIncludeStatement { + return false + } + if node.Name == "" || node.File == "" { + return false + } + v, ok := node.Metadata[metaSystemInclude].(bool) + return ok && v +} + // indexFilesAndFunctions populates FileToPrefix and FunctionIndex on the // supplied CModuleRegistry. The optional onFunction callback fires for // every indexed function node with the file's project-relative prefix diff --git a/sast-engine/graph/callgraph/registry/c_stdlib_remote.go b/sast-engine/graph/callgraph/registry/c_stdlib_remote.go new file mode 100644 index 00000000..d2fdc71d --- /dev/null +++ b/sast-engine/graph/callgraph/registry/c_stdlib_remote.go @@ -0,0 +1,223 @@ +package registry + +import ( + "encoding/json" + "errors" + "fmt" + "net/http" + "os" + "path/filepath" + "strings" + "sync" + "time" + + "github.com/shivasurya/code-pathfinder/sast-engine/graph/callgraph/core" +) + +// CStdlibRegistryRemote is the loader behind core.CStdlibLoader for C stdlib +// registries. It supports two source modes — file:// (PR-02; used by tests +// and local-validation) and HTTP (stub here, full implementation in PR-03). 
+// +// The struct intentionally serves both modes through one type: file:// vs +// HTTP differs only in fetch (network vs filesystem read), and everything +// else (manifest validation, in-memory cache, double-check locking) is the +// same. This mirrors the existing GoStdlibRegistryRemote pattern. +type CStdlibRegistryRemote struct { + baseURL string // for HTTP path; empty in file:// mode + platform string // "linux" | "windows" | "darwin" + + // fileBase is set on construction via NewCStdlibRegistryFile; non-empty + // indicates file:// mode (skip HTTP machinery, read from disk). + fileBase string + + manifest *core.CStdlibManifest + headerCache map[string]*core.CStdlibHeader + cacheMutex sync.RWMutex + + httpClient *http.Client // unused in file:// mode + diskCache *diskCacheStore // unused in file:// mode (no point caching local reads) +} + +// NewCStdlibRegistryFile constructs a file:// loader rooted at localPath. +// localPath should point at the directory that contains manifest.json plus +// the per-header *_stdlib.json files (the output of the PR-01 generator). +// +// PR-02 uses this constructor exclusively. PR-03 will switch the production +// default to NewCStdlibRegistryRemote. +func NewCStdlibRegistryFile(localPath, platform string) *CStdlibRegistryRemote { + return &CStdlibRegistryRemote{ + platform: platform, + fileBase: localPath, + headerCache: make(map[string]*core.CStdlibHeader), + } +} + +// NewCStdlibRegistryRemote constructs an HTTP loader. The path under baseURL +// is `{platform}/c/v1/{file}` — matching the URL layout the manifest entries +// embed. The HTTP fetch implementation is stubbed in PR-02 and lands in PR-03. 
+func NewCStdlibRegistryRemote(baseURL, platform string) *CStdlibRegistryRemote { + cacheRoot := getStdlibCacheRoot() + var dc *diskCacheStore + if cacheRoot != "" { + dc = newDiskCacheStore(filepath.Join(cacheRoot, platform, "c", "v1")) + } + return &CStdlibRegistryRemote{ + baseURL: strings.TrimSuffix(baseURL, "/"), + platform: platform, + headerCache: make(map[string]*core.CStdlibHeader), + httpClient: &http.Client{Timeout: 30 * time.Second}, + diskCache: dc, + } +} + +// LoadManifest is the one-shot startup operation. It reads (or fetches) the +// top-level manifest.json and populates the in-memory pointer; subsequent +// per-header GetHeader calls consult that manifest for the URL/path of each +// per-header JSON. +// +// Logger is optional but expected — pass nil only when wiring the loader +// from tests where logging would be noise. +func (r *CStdlibRegistryRemote) LoadManifest(logger core.CStdlibLogger) error { + if r.fileBase != "" { + return r.loadManifestFromFile(logger) + } + return r.loadManifestFromHTTP(logger) +} + +func (r *CStdlibRegistryRemote) loadManifestFromFile(logger core.CStdlibLogger) error { + path := filepath.Join(r.fileBase, "manifest.json") + data, err := os.ReadFile(path) //nolint:gosec // path is operator-supplied via CLI flag + if err != nil { + return fmt.Errorf("loadManifestFromFile: reading %s: %w", path, err) + } + + var manifest core.CStdlibManifest + if err := json.Unmarshal(data, &manifest); err != nil { + return fmt.Errorf("loadManifestFromFile: parsing %s: %w", path, err) + } + + r.cacheMutex.Lock() + r.manifest = &manifest + r.cacheMutex.Unlock() + + if logger != nil { + logger.Statistic("Loaded C stdlib manifest from file: %d headers for %s", + len(manifest.Headers), r.platform) + } + return nil +} + +// loadManifestFromHTTP is the PR-03 hook. PR-02 ships it as a deliberate stub +// so the type satisfies CStdlibLoader without any half-built network code +// shipping early. 
+func (r *CStdlibRegistryRemote) loadManifestFromHTTP(_ core.CStdlibLogger) error { + return errors.New("CStdlibRegistryRemote: HTTP loader not yet implemented; tracked in PR-03") +} + +// GetHeader retrieves the per-header content, fetching on first reference and +// caching afterward. Concurrent access is safe — the double-check pattern +// guarantees at most one fetch per header even under heavy parallelism. The +// implementation mirrors GoStdlibRegistryRemote.GetPackage at +// graph/callgraph/registry/go_stdlib_remote.go:96-160. +func (r *CStdlibRegistryRemote) GetHeader(name string) (*core.CStdlibHeader, error) { + // Fast path: read lock + cache hit. + r.cacheMutex.RLock() + if h, ok := r.headerCache[name]; ok { + r.cacheMutex.RUnlock() + return h, nil + } + r.cacheMutex.RUnlock() + + // Slow path: write lock, double-check, then fetch. + r.cacheMutex.Lock() + defer r.cacheMutex.Unlock() + + if h, ok := r.headerCache[name]; ok { + return h, nil + } + + h, err := r.fetchHeaderLocked(name) + if err != nil { + return nil, err + } + r.headerCache[name] = h + return h, nil +} + +// fetchHeaderLocked is the inner fetch routine — caller already holds the +// write lock. Re-checks the cache (in case another goroutine populated it), +// then dispatches to file:// or HTTP based on mode. 
+func (r *CStdlibRegistryRemote) fetchHeaderLocked(name string) (*core.CStdlibHeader, error) { + if h, ok := r.headerCache[name]; ok { + return h, nil + } + if r.manifest == nil { + return nil, errors.New("fetchHeaderLocked: manifest not loaded; call LoadManifest first") + } + + entry := r.manifest.GetHeaderEntry(name) + if entry == nil { + return nil, fmt.Errorf("fetchHeaderLocked: header %q not in stdlib manifest", name) + } + + if r.fileBase != "" { + return r.fetchHeaderFromFile(entry) + } + return r.fetchHeaderFromHTTP(entry) +} + +func (r *CStdlibRegistryRemote) fetchHeaderFromFile(entry *core.CStdlibHeaderEntry) (*core.CStdlibHeader, error) { + path := filepath.Join(r.fileBase, entry.File) + data, err := os.ReadFile(path) //nolint:gosec // path is operator-supplied via CLI flag + if err != nil { + return nil, fmt.Errorf("fetchHeaderFromFile: reading %s: %w", path, err) + } + var h core.CStdlibHeader + if err := json.Unmarshal(data, &h); err != nil { + return nil, fmt.Errorf("fetchHeaderFromFile: parsing %s: %w", path, err) + } + return &h, nil +} + +// fetchHeaderFromHTTP is the PR-03 hook. PR-02 stub keeps the type +// satisfying its interface contract without shipping half-built network code. +func (r *CStdlibRegistryRemote) fetchHeaderFromHTTP(_ *core.CStdlibHeaderEntry) (*core.CStdlibHeader, error) { + return nil, errors.New("CStdlibRegistryRemote: HTTP fetch not yet implemented; tracked in PR-03") +} + +// GetFunction is a convenience accessor: GetHeader followed by a function +// lookup. Returns the same error shapes the underlying calls produce. 
+func (r *CStdlibRegistryRemote) GetFunction(headerName, funcName string) (*core.CStdlibFunction, error) { + h, err := r.GetHeader(headerName) + if err != nil { + return nil, err + } + if f, ok := h.Functions[funcName]; ok { + return f, nil + } + if f, ok := h.FreeFunctions[funcName]; ok { + return f, nil + } + return nil, fmt.Errorf("GetFunction: %q not in header %q", funcName, headerName) +} + +// Platform returns the platform tag this registry was configured for. +// Used by the resolver to log diagnostics; not part of the cache key. +func (r *CStdlibRegistryRemote) Platform() string { + return r.platform +} + +// HeaderCount returns the total number of headers listed in the loaded +// manifest. Returns 0 before LoadManifest has been called. +func (r *CStdlibRegistryRemote) HeaderCount() int { + r.cacheMutex.RLock() + defer r.cacheMutex.RUnlock() + if r.manifest == nil { + return 0 + } + return len(r.manifest.Headers) +} + +// Compile-time interface checks — fail at build time if the struct ever +// drifts from the contract. +var _ core.CStdlibLoader = (*CStdlibRegistryRemote)(nil) diff --git a/sast-engine/graph/callgraph/registry/c_stdlib_remote_test.go b/sast-engine/graph/callgraph/registry/c_stdlib_remote_test.go new file mode 100644 index 00000000..9a572705 --- /dev/null +++ b/sast-engine/graph/callgraph/registry/c_stdlib_remote_test.go @@ -0,0 +1,267 @@ +package registry + +import ( + "encoding/json" + "os" + "path/filepath" + "sync" + "testing" + + "github.com/shivasurya/code-pathfinder/sast-engine/graph/callgraph/core" + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" +) + +// writeCRegistry materialises a manifest plus per-header JSON files in dir +// (it returns nothing). Reused by both C and C++ loader tests via copies in +// each test file. 
+func writeCRegistry(t *testing.T, dir string) { + t.Helper() + + stdio := &core.CStdlibHeader{ + SchemaVersion: "1.0.0", + Header: "stdio.h", + ModuleID: "c::stdio", + Language: core.LanguageC, + Platform: core.PlatformLinux, + Functions: map[string]*core.CStdlibFunction{ + "printf": { + FQN: "c::stdio::printf", + ReturnType: "int", + Source: core.SourceOverlay, + Confidence: 1.0, + }, + }, + } + stdioBytes, err := json.Marshal(stdio) + require.NoError(t, err) + require.NoError(t, os.WriteFile(filepath.Join(dir, "stdio_stdlib.json"), stdioBytes, 0o644)) + + stdlib := &core.CStdlibHeader{ + SchemaVersion: "1.0.0", + Header: "stdlib.h", + ModuleID: "c::stdlib", + Language: core.LanguageC, + Functions: map[string]*core.CStdlibFunction{ + "malloc": {FQN: "c::stdlib::malloc", ReturnType: "void*", Source: core.SourceHeader, Confidence: 1.0}, + "system": { + FQN: "c::stdlib::system", ReturnType: "int", + SecurityTag: "command_injection_sink", + Source: core.SourceOverlay, Confidence: 1.0, + }, + }, + } + stdlibBytes, err := json.Marshal(stdlib) + require.NoError(t, err) + require.NoError(t, os.WriteFile(filepath.Join(dir, "stdlib_stdlib.json"), stdlibBytes, 0o644)) + + manifest := &core.CStdlibManifest{ + SchemaVersion: "1.0.0", + RegistryVersion: "v1", + Platform: core.PlatformLinux, + Language: core.LanguageC, + Headers: []*core.CStdlibHeaderEntry{ + {Header: "stdio.h", ModuleID: "c::stdio", File: "stdio_stdlib.json"}, + {Header: "stdlib.h", ModuleID: "c::stdlib", File: "stdlib_stdlib.json"}, + }, + Statistics: &core.CStdlibStatistics{TotalHeaders: 2, TotalFunctions: 3}, + } + mBytes, err := json.Marshal(manifest) + require.NoError(t, err) + require.NoError(t, os.WriteFile(filepath.Join(dir, "manifest.json"), mBytes, 0o644)) +} + +// noopLogger satisfies core.CStdlibLogger without writing anywhere. 
+type noopLogger struct{} + +func (noopLogger) Debug(string, ...any) {} +func (noopLogger) Statistic(string, ...any) {} +func (noopLogger) Warning(string, ...any) {} + +func TestCStdlibRegistry_FileMode_LoadManifest(t *testing.T) { + dir := t.TempDir() + writeCRegistry(t, dir) + + r := NewCStdlibRegistryFile(dir, core.PlatformLinux) + require.NoError(t, r.LoadManifest(noopLogger{})) + + assert.Equal(t, core.PlatformLinux, r.Platform()) + assert.Equal(t, 2, r.HeaderCount()) +} + +func TestCStdlibRegistry_FileMode_LoadManifest_NilLogger(t *testing.T) { + dir := t.TempDir() + writeCRegistry(t, dir) + + r := NewCStdlibRegistryFile(dir, core.PlatformLinux) + // nil logger is allowed. + require.NoError(t, r.LoadManifest(nil)) +} + +func TestCStdlibRegistry_FileMode_MissingManifest(t *testing.T) { + r := NewCStdlibRegistryFile(filepath.Join(t.TempDir(), "absent"), core.PlatformLinux) + err := r.LoadManifest(noopLogger{}) + require.Error(t, err) + assert.Contains(t, err.Error(), "reading") +} + +func TestCStdlibRegistry_FileMode_CorruptManifest(t *testing.T) { + dir := t.TempDir() + require.NoError(t, os.WriteFile(filepath.Join(dir, "manifest.json"), []byte("{not json"), 0o644)) + r := NewCStdlibRegistryFile(dir, core.PlatformLinux) + err := r.LoadManifest(noopLogger{}) + require.Error(t, err) + assert.Contains(t, err.Error(), "parsing") +} + +func TestCStdlibRegistry_FileMode_GetHeader(t *testing.T) { + dir := t.TempDir() + writeCRegistry(t, dir) + + r := NewCStdlibRegistryFile(dir, core.PlatformLinux) + require.NoError(t, r.LoadManifest(noopLogger{})) + + h, err := r.GetHeader("stdio.h") + require.NoError(t, err) + assert.Equal(t, "stdio.h", h.Header) + require.Contains(t, h.Functions, "printf") + + // Same header again — should hit cache. 
+ h2, err := r.GetHeader("stdio.h") + require.NoError(t, err) + assert.Same(t, h, h2, "second GetHeader should return cached pointer") +} + +func TestCStdlibRegistry_FileMode_GetHeaderUnknown(t *testing.T) { + dir := t.TempDir() + writeCRegistry(t, dir) + r := NewCStdlibRegistryFile(dir, core.PlatformLinux) + require.NoError(t, r.LoadManifest(noopLogger{})) + + _, err := r.GetHeader("missing.h") + require.Error(t, err) + assert.Contains(t, err.Error(), "not in stdlib manifest") +} + +func TestCStdlibRegistry_FileMode_GetHeader_BeforeLoad(t *testing.T) { + r := NewCStdlibRegistryFile(t.TempDir(), core.PlatformLinux) + _, err := r.GetHeader("stdio.h") + require.Error(t, err) + assert.Contains(t, err.Error(), "manifest not loaded") +} + +func TestCStdlibRegistry_FileMode_GetHeaderMissingFile(t *testing.T) { + dir := t.TempDir() + writeCRegistry(t, dir) + // Delete the per-header file; manifest still references it. + require.NoError(t, os.Remove(filepath.Join(dir, "stdio_stdlib.json"))) + + r := NewCStdlibRegistryFile(dir, core.PlatformLinux) + require.NoError(t, r.LoadManifest(noopLogger{})) + _, err := r.GetHeader("stdio.h") + require.Error(t, err) + assert.Contains(t, err.Error(), "reading") +} + +func TestCStdlibRegistry_FileMode_GetHeaderCorruptFile(t *testing.T) { + dir := t.TempDir() + writeCRegistry(t, dir) + require.NoError(t, os.WriteFile(filepath.Join(dir, "stdio_stdlib.json"), []byte("garbage"), 0o644)) + + r := NewCStdlibRegistryFile(dir, core.PlatformLinux) + require.NoError(t, r.LoadManifest(noopLogger{})) + _, err := r.GetHeader("stdio.h") + require.Error(t, err) + assert.Contains(t, err.Error(), "parsing") +} + +func TestCStdlibRegistry_FileMode_GetFunction(t *testing.T) { + dir := t.TempDir() + writeCRegistry(t, dir) + r := NewCStdlibRegistryFile(dir, core.PlatformLinux) + require.NoError(t, r.LoadManifest(noopLogger{})) + + got, err := r.GetFunction("stdio.h", "printf") + require.NoError(t, err) + assert.Equal(t, "c::stdio::printf", got.FQN) + + // 
Function in a different header. + got, err = r.GetFunction("stdlib.h", "system") + require.NoError(t, err) + assert.Equal(t, "command_injection_sink", got.SecurityTag) + + // Missing function in present header. + _, err = r.GetFunction("stdio.h", "scanf") + require.Error(t, err) + + // Missing header. + _, err = r.GetFunction("missing.h", "x") + require.Error(t, err) +} + +func TestCStdlibRegistry_HeaderCount_BeforeLoad(t *testing.T) { + r := NewCStdlibRegistryFile(t.TempDir(), core.PlatformLinux) + assert.Equal(t, 0, r.HeaderCount()) +} + +func TestCStdlibRegistry_DoubleCheckLocking(t *testing.T) { + dir := t.TempDir() + writeCRegistry(t, dir) + r := NewCStdlibRegistryFile(dir, core.PlatformLinux) + require.NoError(t, r.LoadManifest(noopLogger{})) + + // Hammer the loader from many goroutines and confirm no panic / no + // data corruption. + var wg sync.WaitGroup + const workers = 100 + results := make([]*core.CStdlibHeader, workers) + for i := range workers { + wg.Add(1) + go func(idx int) { + defer wg.Done() + h, err := r.GetHeader("stdio.h") + if err == nil { + results[idx] = h + } + }(i) + } + wg.Wait() + + // All goroutines should see the same cached pointer. + for i := 1; i < workers; i++ { + assert.Same(t, results[0], results[i], "worker %d saw different pointer", i) + } +} + +func TestCStdlibRegistry_HTTPMode_Stub(t *testing.T) { + r := NewCStdlibRegistryRemote("https://example.com/registries", core.PlatformLinux) + err := r.LoadManifest(noopLogger{}) + require.Error(t, err) + assert.Contains(t, err.Error(), "PR-03") +} + +func TestCStdlibRegistry_HTTPMode_FetchHeaderStub(t *testing.T) { + // Construct the HTTP loader with an in-memory manifest by going through + // a file:// loader first, then forcing the fetch path to HTTP. + dir := t.TempDir() + writeCRegistry(t, dir) + r := NewCStdlibRegistryFile(dir, core.PlatformLinux) + require.NoError(t, r.LoadManifest(noopLogger{})) + + // Switch to HTTP mode mid-flight by clearing fileBase. 
Tests-only — + // production code never does this. + r.fileBase = "" + _, err := r.GetHeader("stdio.h") // fresh header (not yet cached) + require.Error(t, err) + assert.Contains(t, err.Error(), "PR-03") +} + +func TestCStdlibRegistry_RemoteCtor_TrimsTrailingSlash(t *testing.T) { + r := NewCStdlibRegistryRemote("https://example.com/registries/", core.PlatformLinux) + assert.Equal(t, "https://example.com/registries", r.baseURL) +} + +func TestCStdlibRegistry_ImplementsInterface(t *testing.T) { + var _ core.CStdlibLoader = NewCStdlibRegistryFile(t.TempDir(), core.PlatformLinux) + var _ core.CStdlibLoader = NewCStdlibRegistryRemote("https://x", core.PlatformLinux) +} diff --git a/sast-engine/graph/callgraph/registry/clike_disk_cache.go b/sast-engine/graph/callgraph/registry/clike_disk_cache.go new file mode 100644 index 00000000..944dc181 --- /dev/null +++ b/sast-engine/graph/callgraph/registry/clike_disk_cache.go @@ -0,0 +1,150 @@ +package registry + +import ( + "encoding/json" + "errors" + "os" + "path/filepath" + "runtime" + "time" + + "github.com/shivasurya/code-pathfinder/sast-engine/graph/callgraph/core" +) + +// diskCacheStore is a thin on-disk cache for stdlib registry JSON files. It +// mirrors the cache shape used by the Python and Go stdlib loaders: per-file +// entries with a 24h TTL and a "fall back to stale data on network failure" +// rule (the latter is enforced by the loader, not the cache). +// +// The cache is filesystem-backed. Each cached file is stored as a regular +// JSON file inside rootDir; freshness is determined by file mtime. On Linux +// and macOS the canonical rootDir is `~/.cache/pathfinder/registries/...`; +// on Windows we fall back to `%LOCALAPPDATA%\pathfinder\registries\...`. +// +// PR-02 ships the structure but the file:// loader does NOT use it (no point +// caching a local-file read). The HTTP loader in PR-03 will exercise it. 
+type diskCacheStore struct { + rootDir string +} + +// newDiskCacheStore returns a store rooted at rootDir. The directory is not +// created eagerly — Save* methods MkdirAll on first write so cold-start +// scans without write permission still succeed (with cache silently +// disabled). +func newDiskCacheStore(rootDir string) *diskCacheStore { + return &diskCacheStore{rootDir: rootDir} +} + +// SaveManifest writes the manifest JSON bytes to disk. Errors are non-fatal: +// returning err lets the caller log a warning, but loaders should still serve +// the manifest from memory and continue scanning. +func (s *diskCacheStore) SaveManifest(data []byte) error { + if s == nil || s.rootDir == "" { + return errors.New("diskCacheStore: not configured") + } + if err := os.MkdirAll(s.rootDir, 0o755); err != nil { + return err + } + return os.WriteFile(filepath.Join(s.rootDir, "manifest.json"), data, 0o644) +} + +// GetManifest reads the cached manifest JSON. Returns ErrCacheMiss when no +// cached copy exists. +func (s *diskCacheStore) GetManifest() (*core.CStdlibManifest, error) { + if s == nil || s.rootDir == "" { + return nil, ErrCacheMiss + } + path := filepath.Join(s.rootDir, "manifest.json") + data, err := os.ReadFile(path) //nolint:gosec // path is under our managed cache dir + if err != nil { + if os.IsNotExist(err) { + return nil, ErrCacheMiss + } + return nil, err + } + var m core.CStdlibManifest + if err := json.Unmarshal(data, &m); err != nil { + return nil, err + } + return &m, nil +} + +// SaveHeader writes one per-header JSON file. filename is the manifest's +// File field (e.g., "stdio_stdlib.json") — the cache uses the same filename +// to keep on-disk layout identical to the CDN layout. 
+func (s *diskCacheStore) SaveHeader(filename string, data []byte) error { + if s == nil || s.rootDir == "" { + return errors.New("diskCacheStore: not configured") + } + if err := os.MkdirAll(s.rootDir, 0o755); err != nil { + return err + } + return os.WriteFile(filepath.Join(s.rootDir, filename), data, 0o644) +} + +// GetHeader reads a cached per-header JSON. Returns ErrCacheMiss when the +// file is absent. +func (s *diskCacheStore) GetHeader(filename string) (*core.CStdlibHeader, error) { + if s == nil || s.rootDir == "" { + return nil, ErrCacheMiss + } + path := filepath.Join(s.rootDir, filename) + data, err := os.ReadFile(path) //nolint:gosec // path is under our managed cache dir + if err != nil { + if os.IsNotExist(err) { + return nil, ErrCacheMiss + } + return nil, err + } + var h core.CStdlibHeader + if err := json.Unmarshal(data, &h); err != nil { + return nil, err + } + return &h, nil +} + +// IsFresh reports whether the on-disk file at filename was written within +// the given TTL. A file that doesn't exist is always stale; the test is +// inclusive of the TTL boundary (file mtime == now-TTL is still fresh). +func (s *diskCacheStore) IsFresh(filename string, ttl time.Duration) bool { + if s == nil || s.rootDir == "" { + return false + } + info, err := os.Stat(filepath.Join(s.rootDir, filename)) + if err != nil { + return false + } + return time.Since(info.ModTime()) <= ttl +} + +// ErrCacheMiss is returned by GetManifest / GetHeader when no cached copy is +// present. Loaders distinguish ErrCacheMiss from other errors so they can +// fall through to a network fetch without surfacing the miss as a warning. +var ErrCacheMiss = errors.New("cache miss") + +// stdlibCacheTTL is the freshness window for cached registry files. Stdlib +// libraries change slowly so a 24h cache is a good balance between fresh +// content and avoiding network calls on every scan. 
+const stdlibCacheTTL = 24 * time.Hour + +// getStdlibCacheRoot returns the platform-conventional cache root for stdlib +// registries. On Linux/macOS: $XDG_CACHE_HOME/pathfinder/registries (falling +// back to $HOME/.cache/pathfinder/registries). On Windows: $LOCALAPPDATA/ +// pathfinder/registries. +// +// Returns "" if no usable directory can be discovered — callers should treat +// that as "cache disabled" and continue without one. +func getStdlibCacheRoot() string { + if runtime.GOOS == "windows" { + if appdata := os.Getenv("LOCALAPPDATA"); appdata != "" { + return filepath.Join(appdata, "pathfinder", "registries") + } + } + if xdg := os.Getenv("XDG_CACHE_HOME"); xdg != "" { + return filepath.Join(xdg, "pathfinder", "registries") + } + if home, err := os.UserHomeDir(); err == nil && home != "" { + return filepath.Join(home, ".cache", "pathfinder", "registries") + } + return "" +} diff --git a/sast-engine/graph/callgraph/registry/clike_disk_cache_test.go b/sast-engine/graph/callgraph/registry/clike_disk_cache_test.go new file mode 100644 index 00000000..e160991c --- /dev/null +++ b/sast-engine/graph/callgraph/registry/clike_disk_cache_test.go @@ -0,0 +1,148 @@ +package registry + +import ( + "encoding/json" + "errors" + "os" + "path/filepath" + "testing" + "time" + + "github.com/shivasurya/code-pathfinder/sast-engine/graph/callgraph/core" + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" +) + +func TestDiskCache_ManifestRoundTrip(t *testing.T) { + c := newDiskCacheStore(t.TempDir()) + m := &core.CStdlibManifest{ + SchemaVersion: "1.0.0", + Platform: core.PlatformLinux, + Language: core.LanguageC, + } + data, err := json.Marshal(m) + require.NoError(t, err) + + require.NoError(t, c.SaveManifest(data)) + + got, err := c.GetManifest() + require.NoError(t, err) + assert.Equal(t, "1.0.0", got.SchemaVersion) + assert.Equal(t, core.PlatformLinux, got.Platform) +} + +func TestDiskCache_ManifestMiss(t *testing.T) { + c := 
newDiskCacheStore(t.TempDir()) + _, err := c.GetManifest() + require.Error(t, err) + assert.True(t, errors.Is(err, ErrCacheMiss)) +} + +func TestDiskCache_ManifestCorrupt(t *testing.T) { + root := t.TempDir() + require.NoError(t, os.WriteFile(filepath.Join(root, "manifest.json"), []byte("{not json"), 0o644)) + c := newDiskCacheStore(root) + _, err := c.GetManifest() + require.Error(t, err) + assert.False(t, errors.Is(err, ErrCacheMiss)) +} + +func TestDiskCache_HeaderRoundTrip(t *testing.T) { + c := newDiskCacheStore(t.TempDir()) + h := &core.CStdlibHeader{Header: "stdio.h", Language: core.LanguageC} + data, err := json.Marshal(h) + require.NoError(t, err) + + require.NoError(t, c.SaveHeader("stdio_stdlib.json", data)) + + got, err := c.GetHeader("stdio_stdlib.json") + require.NoError(t, err) + assert.Equal(t, "stdio.h", got.Header) +} + +func TestDiskCache_HeaderMiss(t *testing.T) { + c := newDiskCacheStore(t.TempDir()) + _, err := c.GetHeader("absent.json") + require.Error(t, err) + assert.True(t, errors.Is(err, ErrCacheMiss)) +} + +func TestDiskCache_HeaderCorrupt(t *testing.T) { + root := t.TempDir() + require.NoError(t, os.WriteFile(filepath.Join(root, "stdio_stdlib.json"), []byte("garbage"), 0o644)) + c := newDiskCacheStore(root) + _, err := c.GetHeader("stdio_stdlib.json") + require.Error(t, err) +} + +func TestDiskCache_IsFreshWithinTTL(t *testing.T) { + c := newDiskCacheStore(t.TempDir()) + require.NoError(t, c.SaveHeader("x.json", []byte("{}"))) + assert.True(t, c.IsFresh("x.json", 24*time.Hour)) +} + +func TestDiskCache_IsFreshExpired(t *testing.T) { + root := t.TempDir() + c := newDiskCacheStore(root) + require.NoError(t, c.SaveHeader("x.json", []byte("{}"))) + // Backdate the file to 25h ago. 
+ old := time.Now().Add(-25 * time.Hour) + require.NoError(t, os.Chtimes(filepath.Join(root, "x.json"), old, old)) + assert.False(t, c.IsFresh("x.json", 24*time.Hour)) +} + +func TestDiskCache_IsFreshMissingFile(t *testing.T) { + c := newDiskCacheStore(t.TempDir()) + assert.False(t, c.IsFresh("nope.json", 24*time.Hour)) +} + +func TestDiskCache_NilStore(t *testing.T) { + var c *diskCacheStore + require.Error(t, c.SaveManifest([]byte("{}"))) + require.Error(t, c.SaveHeader("x.json", []byte("{}"))) + + _, err := c.GetManifest() + require.True(t, errors.Is(err, ErrCacheMiss)) + _, err = c.GetHeader("x.json") + require.True(t, errors.Is(err, ErrCacheMiss)) + assert.False(t, c.IsFresh("x.json", time.Hour)) +} + +func TestDiskCache_EmptyRoot(t *testing.T) { + c := newDiskCacheStore("") + require.Error(t, c.SaveManifest([]byte("{}"))) + _, err := c.GetManifest() + require.True(t, errors.Is(err, ErrCacheMiss)) +} + +func TestDiskCache_SavesCreatesDir(t *testing.T) { + root := filepath.Join(t.TempDir(), "nested", "deep") + c := newDiskCacheStore(root) + require.NoError(t, c.SaveHeader("x.json", []byte("{}"))) + _, err := os.Stat(filepath.Join(root, "x.json")) + require.NoError(t, err) +} + +func TestGetStdlibCacheRoot(t *testing.T) { + got := getStdlibCacheRoot() + // Either "" (no usable dir) or a path under the user's home/cache. + if got == "" { + return + } + assert.Contains(t, got, "pathfinder") + assert.Contains(t, got, "registries") +} + +func TestGetStdlibCacheRoot_PrefersXDG(t *testing.T) { + if got := os.Getenv("XDG_CACHE_HOME"); got == "" { + t.Setenv("XDG_CACHE_HOME", t.TempDir()) + } else { + // Already set; just confirm it propagates. 
+ } + got := getStdlibCacheRoot() + assert.Contains(t, got, "pathfinder") +} + +func TestStdlibCacheTTL_Constant(t *testing.T) { + assert.Equal(t, 24*time.Hour, stdlibCacheTTL) +} diff --git a/sast-engine/graph/callgraph/registry/clike_platform_detector.go b/sast-engine/graph/callgraph/registry/clike_platform_detector.go new file mode 100644 index 00000000..3ec55678 --- /dev/null +++ b/sast-engine/graph/callgraph/registry/clike_platform_detector.go @@ -0,0 +1,209 @@ +package registry + +import ( + "io/fs" + "os" + "path/filepath" + "strings" + + "github.com/shivasurya/code-pathfinder/sast-engine/graph/callgraph/core" +) + +// DetectClikeTarget chooses the stdlib registry target for a project — one of +// "linux", "windows", or "darwin". The decision uses a three-source heuristic: +// +// 1. CLI override (the --target flag value, passed in as `override`). If +// non-empty and matches a known platform, returned verbatim. An unknown +// value falls through to the heuristic so the user gets a sensible +// default rather than a startup error. +// 2. Macro signal — count occurrences of `_WIN32`, `_WIN64`, `__APPLE__`, +// `__MACH__`, `__linux__`, `__GLIBC__` across the project's C/C++ source +// and header files. +// 3. Path signal — count `/win32/`, `/windows/`, `/darwin/`, `/macos/`, +// `/linux/`, `/unix/` segments in file paths. Each match is worth 5 points +// (heavier than a single macro mention because path conventions are more +// deliberate). +// +// On a tie or empty signal, returns "linux" — the default target for most +// security-critical C/C++ codebases. +// +// The walk is bounded: we read at most platformProbeMaxBytes per file and +// platformProbeMaxFiles files in total. A multi-million-line project still +// returns a result in well under a second. 
+func DetectClikeTarget(projectPath, override string) string { + if t := normaliseTarget(override); t != "" { + return t + } + + macroScores := scanPlatformMacros(projectPath) + pathScores := scanPathHints(projectPath) + + scores := map[string]int{ + core.PlatformLinux: macroScores["__linux__"] + macroScores["__GLIBC__"] + + 5*pathScores["linux"], + core.PlatformWindows: macroScores["_WIN32"] + macroScores["_WIN64"] + + 5*pathScores["windows"], + core.PlatformDarwin: macroScores["__APPLE__"] + macroScores["__MACH__"] + + 5*pathScores["darwin"], + } + + best := core.PlatformLinux + bestScore := 0 + for plat, score := range scores { + if score > bestScore { + best = plat + bestScore = score + } + } + return best +} + +// normaliseTarget validates an explicit --target flag value. Returns the +// canonical platform string on a recognised input, or "" so the caller knows +// to fall through to the heuristic. +func normaliseTarget(s string) string { + switch strings.ToLower(strings.TrimSpace(s)) { + case "": + return "" + case "linux", "ubuntu", "debian", "alpine": + return core.PlatformLinux + case "windows", "win32", "win64", "win", "msvc", "mingw": + return core.PlatformWindows + case "darwin", "macos", "macosx", "osx", "apple": + return core.PlatformDarwin + default: + return "" + } +} + +const ( + // platformProbeMaxBytes caps the per-file read length. Macros usually + // appear within the first kilobyte; reading more wastes I/O on enormous + // generated headers. + platformProbeMaxBytes = 4096 + + // platformProbeMaxFiles caps the total number of files inspected. The + // signal saturates well before we hit it. + platformProbeMaxFiles = 5000 +) + +// scanPlatformMacros walks the project tree and counts platform-defining +// macro mentions in source files. Recognises the canonical preprocessor +// guards used by glibc, MSVCRT, and Apple SDK headers. 
+func scanPlatformMacros(projectPath string) map[string]int { + counts := map[string]int{ + "_WIN32": 0, + "_WIN64": 0, + "__APPLE__": 0, + "__MACH__": 0, + "__linux__": 0, + "__GLIBC__": 0, + } + + scanned := 0 + _ = filepath.WalkDir(projectPath, func(path string, d fs.DirEntry, err error) error { + if err != nil { + return nil //nolint:nilerr // ignore unreadable dirs; signal will sample what's reachable + } + if d.IsDir() { + if shouldSkipProjectDir(d.Name()) { + return fs.SkipDir + } + return nil + } + if scanned >= platformProbeMaxFiles { + return fs.SkipAll + } + if !isClikeSourceExt(filepath.Ext(d.Name())) { + return nil + } + scanned++ + + data, readErr := readFileLimited(path, platformProbeMaxBytes) + if readErr != nil { + // Skip unreadable files (permissions, transient I/O errors) without + // failing the whole detection — best-effort probe by design. + return nil //nolint:nilerr // intentional: skip unreadable file + } + text := string(data) + for macro := range counts { + counts[macro] += strings.Count(text, macro) + } + return nil + }) + + return counts +} + +// scanPathHints counts platform-named directories in the project tree. Maps +// a project layout convention ("src/linux/...", "third_party/win32/...") into +// a strong platform vote. 
func scanPathHints(projectPath string) map[string]int {
	// Keys are hint buckets (not core.Platform* constants); each value is
	// the number of directories whose name matched that bucket.
	hits := map[string]int{
		"linux":   0,
		"windows": 0,
		"darwin":  0,
	}

	// The walk error is discarded on purpose: hints are best-effort.
	_ = filepath.WalkDir(projectPath, func(path string, d fs.DirEntry, err error) error {
		if err != nil {
			return nil //nolint:nilerr // unreadable dirs are skipped silently
		}
		if !d.IsDir() {
			return nil
		}
		// WalkDir invokes the callback on the root itself first. Never
		// prune the root: a project checked out into a folder named
		// "build", "vendor", "target", ... would otherwise be skipped
		// wholesale and contribute no hints at all.
		if path != projectPath && shouldSkipProjectDir(d.Name()) {
			return fs.SkipDir
		}
		// Directory names are matched case-insensitively.
		switch strings.ToLower(d.Name()) {
		case "linux", "unix":
			hits["linux"]++
		case "windows", "win32", "win64", "msvc":
			hits["windows"]++
		case "darwin", "macos", "apple":
			hits["darwin"]++
		}
		return nil
	})
	return hits
}

// shouldSkipProjectDir prunes directories that should not contribute to the
// platform signal — vendored dependencies, build artifacts, version control
// metadata. The match is exact and case-sensitive. The list is conservative;
// everything else is walked.
func shouldSkipProjectDir(name string) bool {
	switch name {
	case ".git", ".hg", ".svn",
		"node_modules", "vendor", "third_party", "_build", "build",
		"target", "dist", "out", ".venv", "__pycache__":
		return true
	}
	return false
}

// isClikeSourceExt reports whether the file extension (including the leading
// dot, compared case-insensitively) marks a C/C++ source or header. Used to
// avoid reading every file in the project.
func isClikeSourceExt(ext string) bool {
	switch strings.ToLower(ext) {
	case ".c", ".cc", ".cpp", ".cxx", ".h", ".hh", ".hpp", ".hxx":
		return true
	}
	return false
}

// readFileLimited opens path and reads up to limit bytes, then closes the
// file. Returns the bytes read (possibly fewer than limit on a short file)
// or any I/O error.
+func readFileLimited(path string, limit int64) ([]byte, error) { + f, err := os.Open(path) //nolint:gosec // path comes from filepath.WalkDir under projectPath + if err != nil { + return nil, err + } + defer f.Close() + buf := make([]byte, limit) + n, err := f.Read(buf) + if err != nil && n == 0 { + return nil, err + } + return buf[:n], nil +} diff --git a/sast-engine/graph/callgraph/registry/clike_platform_detector_test.go b/sast-engine/graph/callgraph/registry/clike_platform_detector_test.go new file mode 100644 index 00000000..2319c586 --- /dev/null +++ b/sast-engine/graph/callgraph/registry/clike_platform_detector_test.go @@ -0,0 +1,197 @@ +package registry + +import ( + "os" + "path/filepath" + "testing" + + "github.com/shivasurya/code-pathfinder/sast-engine/graph/callgraph/core" + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" +) + +func writeFile(t *testing.T, dir, rel, content string) { + t.Helper() + path := filepath.Join(dir, rel) + require.NoError(t, os.MkdirAll(filepath.Dir(path), 0o755)) + require.NoError(t, os.WriteFile(path, []byte(content), 0o644)) +} + +func TestDetectClikeTarget_DefaultsToLinux(t *testing.T) { + dir := t.TempDir() + // Empty project — no signal. 
+ assert.Equal(t, core.PlatformLinux, DetectClikeTarget(dir, "")) +} + +func TestDetectClikeTarget_OverrideWinsOverHeuristic(t *testing.T) { + dir := t.TempDir() + writeFile(t, dir, "main.c", "#ifdef __linux__\n#endif\n") + assert.Equal(t, core.PlatformDarwin, DetectClikeTarget(dir, "darwin")) + assert.Equal(t, core.PlatformWindows, DetectClikeTarget(dir, "windows")) +} + +func TestDetectClikeTarget_OverrideAliases(t *testing.T) { + dir := t.TempDir() + for _, alias := range []string{"linux", "Ubuntu", "ALPINE", " debian "} { + assert.Equal(t, core.PlatformLinux, DetectClikeTarget(dir, alias), alias) + } + for _, alias := range []string{"windows", "WIN", "msvc", "mingw"} { + assert.Equal(t, core.PlatformWindows, DetectClikeTarget(dir, alias), alias) + } + for _, alias := range []string{"darwin", "macos", "MacOSX", "osx", "apple"} { + assert.Equal(t, core.PlatformDarwin, DetectClikeTarget(dir, alias), alias) + } +} + +func TestDetectClikeTarget_OverrideUnknownFallsThrough(t *testing.T) { + dir := t.TempDir() + writeFile(t, dir, "main.c", "#ifdef __APPLE__\n#endif\n") + // Unknown override → use heuristic, which sees __APPLE__. 
+ assert.Equal(t, core.PlatformDarwin, DetectClikeTarget(dir, "haiku")) +} + +func TestDetectClikeTarget_LinuxByMacros(t *testing.T) { + dir := t.TempDir() + writeFile(t, dir, "main.c", ` +#ifdef __linux__ +#include +#endif +#ifdef __GLIBC__ +int x; +#endif +`) + assert.Equal(t, core.PlatformLinux, DetectClikeTarget(dir, "")) +} + +func TestDetectClikeTarget_WindowsByMacros(t *testing.T) { + dir := t.TempDir() + writeFile(t, dir, "main.c", ` +#ifdef _WIN32 +#include +#endif +#ifdef _WIN64 +#include +#endif +`) + assert.Equal(t, core.PlatformWindows, DetectClikeTarget(dir, "")) +} + +func TestDetectClikeTarget_DarwinByMacros(t *testing.T) { + dir := t.TempDir() + writeFile(t, dir, "main.c", ` +#ifdef __APPLE__ +#include +#endif +#ifdef __MACH__ +int port; +#endif +`) + assert.Equal(t, core.PlatformDarwin, DetectClikeTarget(dir, "")) +} + +func TestDetectClikeTarget_PathHintsBeatSparseMacros(t *testing.T) { + dir := t.TempDir() + // Single __linux__ mention — weak macro signal. + writeFile(t, dir, "main.c", "// __linux__\n") + // But three windows directories — strong path signal (worth 5 pts each). + writeFile(t, dir, "win32/a.c", "//\n") + writeFile(t, dir, "win32/b/c.c", "//\n") + writeFile(t, dir, "msvc/d.c", "//\n") + assert.Equal(t, core.PlatformWindows, DetectClikeTarget(dir, "")) +} + +func TestDetectClikeTarget_IgnoresVendoredDirs(t *testing.T) { + dir := t.TempDir() + // Only signal lives in vendor/ — should be skipped. 
+ writeFile(t, dir, "vendor/foo/x.c", "#ifdef _WIN32\n#endif\n") + writeFile(t, dir, "main.c", "// nothing\n") + assert.Equal(t, core.PlatformLinux, DetectClikeTarget(dir, "")) +} + +func TestDetectClikeTarget_IgnoresGitMetadata(t *testing.T) { + dir := t.TempDir() + writeFile(t, dir, ".git/HEAD", "ref: refs/heads/main\n") + writeFile(t, dir, "main.c", "#ifdef __linux__\n#endif\n") + assert.Equal(t, core.PlatformLinux, DetectClikeTarget(dir, "")) +} + +func TestDetectClikeTarget_NonexistentPathDefaultsToLinux(t *testing.T) { + // Path that doesn't exist — walker silently fails; we fall through to + // the default rather than panicking. + got := DetectClikeTarget("/nonexistent-project-pr02-test", "") + assert.Equal(t, core.PlatformLinux, got) +} + +func TestNormaliseTarget(t *testing.T) { + assert.Equal(t, "", normaliseTarget("")) + assert.Equal(t, "", normaliseTarget("freebsd")) + assert.Equal(t, "", normaliseTarget("solaris")) + assert.Equal(t, core.PlatformLinux, normaliseTarget("LINUX")) + assert.Equal(t, core.PlatformLinux, normaliseTarget(" ubuntu ")) +} + +func TestIsClikeSourceExt(t *testing.T) { + for _, ext := range []string{".c", ".cc", ".cpp", ".cxx", ".h", ".hh", ".hpp", ".hxx", ".C", ".HPP"} { + assert.True(t, isClikeSourceExt(ext), ext) + } + for _, ext := range []string{"", ".go", ".py", ".java", ".rs"} { + assert.False(t, isClikeSourceExt(ext), ext) + } +} + +func TestShouldSkipProjectDir(t *testing.T) { + for _, n := range []string{".git", "node_modules", "vendor", "third_party", "build"} { + assert.True(t, shouldSkipProjectDir(n), n) + } + for _, n := range []string{"src", "include", "linux", "win32", "darwin"} { + assert.False(t, shouldSkipProjectDir(n), n) + } +} + +func TestReadFileLimited(t *testing.T) { + dir := t.TempDir() + + // Short file: read fewer bytes than limit. 
+ short := filepath.Join(dir, "short.txt") + require.NoError(t, os.WriteFile(short, []byte("abc"), 0o644)) + got, err := readFileLimited(short, 100) + require.NoError(t, err) + assert.Equal(t, "abc", string(got)) + + // Long file: capped at limit. + long := filepath.Join(dir, "long.txt") + require.NoError(t, os.WriteFile(long, []byte("0123456789"), 0o644)) + got, err = readFileLimited(long, 4) + require.NoError(t, err) + assert.Equal(t, "0123", string(got)) + + // Missing file → error. + _, err = readFileLimited(filepath.Join(dir, "nope"), 100) + require.Error(t, err) +} + +func TestScanPlatformMacros_FileSizeCap(t *testing.T) { + dir := t.TempDir() + // File with the macro AFTER the read cap → not counted. + pad := make([]byte, platformProbeMaxBytes+1024) + for i := range pad { + pad[i] = ' ' + } + content := string(pad) + "__linux__\n" + writeFile(t, dir, "huge.c", content) + + counts := scanPlatformMacros(dir) + assert.Equal(t, 0, counts["__linux__"], "macro past byte cap not counted") +} + +func TestScanPathHints_CaseInsensitive(t *testing.T) { + dir := t.TempDir() + writeFile(t, dir, "Linux/x.c", "//\n") + writeFile(t, dir, "DARWIN/y.c", "//\n") + writeFile(t, dir, "WiN32/z.c", "//\n") + + hits := scanPathHints(dir) + assert.Equal(t, 1, hits["linux"]) + assert.Equal(t, 1, hits["darwin"]) + assert.Equal(t, 1, hits["windows"]) +} diff --git a/sast-engine/graph/callgraph/registry/cpp_stdlib_remote.go b/sast-engine/graph/callgraph/registry/cpp_stdlib_remote.go new file mode 100644 index 00000000..093182f7 --- /dev/null +++ b/sast-engine/graph/callgraph/registry/cpp_stdlib_remote.go @@ -0,0 +1,227 @@ +package registry + +import ( + "encoding/json" + "errors" + "fmt" + "net/http" + "os" + "path/filepath" + "strings" + "sync" + "time" + + "github.com/shivasurya/code-pathfinder/sast-engine/graph/callgraph/core" +) + +// CppStdlibRegistryRemote is the C++ counterpart to CStdlibRegistryRemote. 
+// Same dual-mode structure, but exposes class- and method-level accessors +// the C interface does not have. +// +// We could compose by embedding CStdlibRegistryRemote, but that would force +// the URL path to use `c/v1` instead of `cpp/v1`. Defining a parallel struct +// keeps the URL/cache paths language-correct at the cost of ~30 LoC of +// near-duplicate file:// and cache plumbing — an acceptable tradeoff for +// clarity. +type CppStdlibRegistryRemote struct { + baseURL string + platform string + fileBase string + + manifest *core.CStdlibManifest + headerCache map[string]*core.CStdlibHeader + cacheMutex sync.RWMutex + + httpClient *http.Client + diskCache *diskCacheStore +} + +// NewCppStdlibRegistryFile constructs a file:// loader for C++ stdlib JSON. +// localPath is the directory containing manifest.json + the per-header files +// generated by PR-01. +func NewCppStdlibRegistryFile(localPath, platform string) *CppStdlibRegistryRemote { + return &CppStdlibRegistryRemote{ + platform: platform, + fileBase: localPath, + headerCache: make(map[string]*core.CStdlibHeader), + } +} + +// NewCppStdlibRegistryRemote constructs an HTTP loader. PR-03 fills in the +// network path; PR-02 keeps the constructor for API stability. +func NewCppStdlibRegistryRemote(baseURL, platform string) *CppStdlibRegistryRemote { + cacheRoot := getStdlibCacheRoot() + var dc *diskCacheStore + if cacheRoot != "" { + dc = newDiskCacheStore(filepath.Join(cacheRoot, platform, "cpp", "v1")) + } + return &CppStdlibRegistryRemote{ + baseURL: strings.TrimSuffix(baseURL, "/"), + platform: platform, + headerCache: make(map[string]*core.CStdlibHeader), + httpClient: &http.Client{Timeout: 30 * time.Second}, + diskCache: dc, + } +} + +// LoadManifest reads the top-level manifest.json. file:// path is wired in +// PR-02; HTTP path is the PR-03 stub. 
+func (r *CppStdlibRegistryRemote) LoadManifest(logger core.CStdlibLogger) error { + if r.fileBase != "" { + return r.loadManifestFromFile(logger) + } + return errors.New("CppStdlibRegistryRemote: HTTP loader not yet implemented; tracked in PR-03") +} + +func (r *CppStdlibRegistryRemote) loadManifestFromFile(logger core.CStdlibLogger) error { + path := filepath.Join(r.fileBase, "manifest.json") + data, err := os.ReadFile(path) //nolint:gosec // path is operator-supplied via CLI flag + if err != nil { + return fmt.Errorf("loadManifestFromFile: reading %s: %w", path, err) + } + var manifest core.CStdlibManifest + if err := json.Unmarshal(data, &manifest); err != nil { + return fmt.Errorf("loadManifestFromFile: parsing %s: %w", path, err) + } + r.cacheMutex.Lock() + r.manifest = &manifest + r.cacheMutex.Unlock() + if logger != nil { + logger.Statistic("Loaded C++ stdlib manifest from file: %d headers for %s", + len(manifest.Headers), r.platform) + } + return nil +} + +// GetHeader returns the per-header content with the same double-check-locked +// caching as the C loader. 
+func (r *CppStdlibRegistryRemote) GetHeader(name string) (*core.CStdlibHeader, error) { + r.cacheMutex.RLock() + if h, ok := r.headerCache[name]; ok { + r.cacheMutex.RUnlock() + return h, nil + } + r.cacheMutex.RUnlock() + + r.cacheMutex.Lock() + defer r.cacheMutex.Unlock() + + if h, ok := r.headerCache[name]; ok { + return h, nil + } + h, err := r.fetchHeaderLocked(name) + if err != nil { + return nil, err + } + r.headerCache[name] = h + return h, nil +} + +func (r *CppStdlibRegistryRemote) fetchHeaderLocked(name string) (*core.CStdlibHeader, error) { + if h, ok := r.headerCache[name]; ok { + return h, nil + } + if r.manifest == nil { + return nil, errors.New("fetchHeaderLocked: manifest not loaded; call LoadManifest first") + } + entry := r.manifest.GetHeaderEntry(name) + if entry == nil { + return nil, fmt.Errorf("fetchHeaderLocked: header %q not in stdlib manifest", name) + } + + if r.fileBase != "" { + return r.fetchHeaderFromFile(entry) + } + return nil, errors.New("CppStdlibRegistryRemote: HTTP fetch not yet implemented; tracked in PR-03") +} + +func (r *CppStdlibRegistryRemote) fetchHeaderFromFile(entry *core.CStdlibHeaderEntry) (*core.CStdlibHeader, error) { + path := filepath.Join(r.fileBase, entry.File) + data, err := os.ReadFile(path) //nolint:gosec // path is operator-supplied via CLI flag + if err != nil { + return nil, fmt.Errorf("fetchHeaderFromFile: reading %s: %w", path, err) + } + var h core.CStdlibHeader + if err := json.Unmarshal(data, &h); err != nil { + return nil, fmt.Errorf("fetchHeaderFromFile: parsing %s: %w", path, err) + } + return &h, nil +} + +// GetFunction looks up a free or top-level function by bare name. For +// namespaced free functions (std::move), the resolver should use +// GetFreeFunction with the full FQN. 
+func (r *CppStdlibRegistryRemote) GetFunction(headerName, funcName string) (*core.CStdlibFunction, error) { + h, err := r.GetHeader(headerName) + if err != nil { + return nil, err + } + if f, ok := h.Functions[funcName]; ok { + return f, nil + } + if f, ok := h.FreeFunctions[funcName]; ok { + return f, nil + } + return nil, fmt.Errorf("GetFunction: %q not in header %q", funcName, headerName) +} + +// GetClass returns the class metadata for the given fully-qualified class +// name within the named header. Returns an error when the class is absent +// (callers treat this as a "try the next include" signal). +func (r *CppStdlibRegistryRemote) GetClass(headerName, classFQN string) (*core.CppStdlibClass, error) { + h, err := r.GetHeader(headerName) + if err != nil { + return nil, err + } + cls, ok := h.Classes[classFQN] + if !ok { + return nil, fmt.Errorf("GetClass: %q not in header %q", classFQN, headerName) + } + return cls, nil +} + +// GetMethod is a two-step accessor: GetClass + look up the method by bare +// name. The receiver type's template arguments are NOT substituted here — +// the caller is responsible for that, since substitution depends on the +// concrete receiver instance (e.g. vector vs vector). +func (r *CppStdlibRegistryRemote) GetMethod(headerName, classFQN, methodName string) (*core.CStdlibFunction, error) { + cls, err := r.GetClass(headerName, classFQN) + if err != nil { + return nil, err + } + if m, ok := cls.Methods[methodName]; ok { + return m, nil + } + return nil, fmt.Errorf("GetMethod: %q not in class %q", methodName, classFQN) +} + +// GetFreeFunction looks up a namespaced free function by its full FQN +// (e.g. "std::move", "std::swap"). Distinct from GetFunction so the resolver +// can be explicit about which form it expects. 
+func (r *CppStdlibRegistryRemote) GetFreeFunction(headerName, fqn string) (*core.CStdlibFunction, error) { + h, err := r.GetHeader(headerName) + if err != nil { + return nil, err + } + if f, ok := h.FreeFunctions[fqn]; ok { + return f, nil + } + return nil, fmt.Errorf("GetFreeFunction: %q not in header %q", fqn, headerName) +} + +// Platform returns the platform tag for this registry. +func (r *CppStdlibRegistryRemote) Platform() string { + return r.platform +} + +// HeaderCount returns the total number of headers in the loaded manifest. +func (r *CppStdlibRegistryRemote) HeaderCount() int { + r.cacheMutex.RLock() + defer r.cacheMutex.RUnlock() + if r.manifest == nil { + return 0 + } + return len(r.manifest.Headers) +} + +var _ core.CppStdlibLoader = (*CppStdlibRegistryRemote)(nil) diff --git a/sast-engine/graph/callgraph/registry/cpp_stdlib_remote_test.go b/sast-engine/graph/callgraph/registry/cpp_stdlib_remote_test.go new file mode 100644 index 00000000..e7c75cd3 --- /dev/null +++ b/sast-engine/graph/callgraph/registry/cpp_stdlib_remote_test.go @@ -0,0 +1,236 @@ +package registry + +import ( + "encoding/json" + "os" + "path/filepath" + "testing" + + "github.com/shivasurya/code-pathfinder/sast-engine/graph/callgraph/core" + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" +) + +func writeCppRegistry(t *testing.T, dir string) { + t.Helper() + + vec := &core.CStdlibHeader{ + SchemaVersion: "1.0.0", + Header: "vector", + ModuleID: "std::vector", + Language: core.LanguageCpp, + Classes: map[string]*core.CppStdlibClass{ + "std::vector": { + FQN: "std::vector", + TypeParams: []string{"T", "Allocator"}, + Methods: map[string]*core.CStdlibFunction{ + "push_back": {FQN: "std::vector::push_back", ReturnType: "void", Source: core.SourceOverlay, Confidence: 1.0}, + "size": {FQN: "std::vector::size", ReturnType: "size_t", Source: core.SourceHeader, Confidence: 1.0}, + }, + }, + }, + FreeFunctions: map[string]*core.CStdlibFunction{ + "std::swap": 
{FQN: "std::swap", ReturnType: "void", Source: core.SourceHeader, Confidence: 1.0}, + }, + } + vBytes, err := json.Marshal(vec) + require.NoError(t, err) + require.NoError(t, os.WriteFile(filepath.Join(dir, "vector_stdlib.json"), vBytes, 0o644)) + + utility := &core.CStdlibHeader{ + SchemaVersion: "1.0.0", + Header: "utility", + ModuleID: "std::utility", + Language: core.LanguageCpp, + FreeFunctions: map[string]*core.CStdlibFunction{ + "std::move": {FQN: "std::move", ReturnType: "T&&", Source: core.SourceOverlay, Confidence: 1.0}, + "std::forward": {FQN: "std::forward", ReturnType: "T&&", Source: core.SourceOverlay, Confidence: 1.0}, + }, + } + uBytes, err := json.Marshal(utility) + require.NoError(t, err) + require.NoError(t, os.WriteFile(filepath.Join(dir, "utility_stdlib.json"), uBytes, 0o644)) + + manifest := &core.CStdlibManifest{ + SchemaVersion: "1.0.0", + RegistryVersion: "v1", + Platform: core.PlatformLinux, + Language: core.LanguageCpp, + Headers: []*core.CStdlibHeaderEntry{ + {Header: "vector", ModuleID: "std::vector", File: "vector_stdlib.json"}, + {Header: "utility", ModuleID: "std::utility", File: "utility_stdlib.json"}, + }, + Statistics: &core.CStdlibStatistics{TotalHeaders: 2, TotalClasses: 1, TotalFunctions: 5}, + } + mBytes, err := json.Marshal(manifest) + require.NoError(t, err) + require.NoError(t, os.WriteFile(filepath.Join(dir, "manifest.json"), mBytes, 0o644)) +} + +func TestCppStdlibRegistry_FileMode_LoadAndAccessors(t *testing.T) { + dir := t.TempDir() + writeCppRegistry(t, dir) + r := NewCppStdlibRegistryFile(dir, core.PlatformLinux) + require.NoError(t, r.LoadManifest(noopLogger{})) + assert.Equal(t, 2, r.HeaderCount()) + assert.Equal(t, core.PlatformLinux, r.Platform()) +} + +func TestCppStdlibRegistry_LoadManifestNilLogger(t *testing.T) { + dir := t.TempDir() + writeCppRegistry(t, dir) + r := NewCppStdlibRegistryFile(dir, core.PlatformLinux) + require.NoError(t, r.LoadManifest(nil)) +} + +func TestCppStdlibRegistry_GetClassAndMethod(t 
*testing.T) { + dir := t.TempDir() + writeCppRegistry(t, dir) + r := NewCppStdlibRegistryFile(dir, core.PlatformLinux) + require.NoError(t, r.LoadManifest(noopLogger{})) + + cls, err := r.GetClass("vector", "std::vector") + require.NoError(t, err) + assert.Equal(t, []string{"T", "Allocator"}, cls.TypeParams) + + pb, err := r.GetMethod("vector", "std::vector", "push_back") + require.NoError(t, err) + assert.Equal(t, "void", pb.ReturnType) + + // Missing method. + _, err = r.GetMethod("vector", "std::vector", "iterate") + require.Error(t, err) + + // Missing class. + _, err = r.GetClass("vector", "std::list") + require.Error(t, err) + + // Missing class via GetMethod. + _, err = r.GetMethod("vector", "std::list", "size") + require.Error(t, err) +} + +func TestCppStdlibRegistry_GetFreeFunction(t *testing.T) { + dir := t.TempDir() + writeCppRegistry(t, dir) + r := NewCppStdlibRegistryFile(dir, core.PlatformLinux) + require.NoError(t, r.LoadManifest(noopLogger{})) + + mv, err := r.GetFreeFunction("utility", "std::move") + require.NoError(t, err) + assert.Equal(t, "T&&", mv.ReturnType) + + // Wrong header. + _, err = r.GetFreeFunction("vector", "std::move") + require.Error(t, err) +} + +func TestCppStdlibRegistry_GetFunctionFreeFunctionFallback(t *testing.T) { + dir := t.TempDir() + writeCppRegistry(t, dir) + r := NewCppStdlibRegistryFile(dir, core.PlatformLinux) + require.NoError(t, r.LoadManifest(noopLogger{})) + + // std::swap is a free function under "vector"; GetFunction must find it + // via the free-function fallback. 
+ got, err := r.GetFunction("vector", "std::swap") + require.NoError(t, err) + assert.Equal(t, "std::swap", got.FQN) +} + +func TestCppStdlibRegistry_GetFunctionMissing(t *testing.T) { + dir := t.TempDir() + writeCppRegistry(t, dir) + r := NewCppStdlibRegistryFile(dir, core.PlatformLinux) + require.NoError(t, r.LoadManifest(noopLogger{})) + + _, err := r.GetFunction("vector", "nonexistent") + require.Error(t, err) +} + +func TestCppStdlibRegistry_HTTPStub(t *testing.T) { + r := NewCppStdlibRegistryRemote("https://x/", core.PlatformLinux) + err := r.LoadManifest(noopLogger{}) + require.Error(t, err) + assert.Contains(t, err.Error(), "PR-03") +} + +func TestCppStdlibRegistry_HTTPFetchStub(t *testing.T) { + dir := t.TempDir() + writeCppRegistry(t, dir) + r := NewCppStdlibRegistryFile(dir, core.PlatformLinux) + require.NoError(t, r.LoadManifest(noopLogger{})) + r.fileBase = "" // simulate HTTP-only mode + _, err := r.GetHeader("vector") + require.Error(t, err) + assert.Contains(t, err.Error(), "PR-03") +} + +func TestCppStdlibRegistry_HeaderCountBeforeLoad(t *testing.T) { + r := NewCppStdlibRegistryFile(t.TempDir(), core.PlatformLinux) + assert.Equal(t, 0, r.HeaderCount()) +} + +func TestCppStdlibRegistry_GetHeaderBeforeLoad(t *testing.T) { + r := NewCppStdlibRegistryFile(t.TempDir(), core.PlatformLinux) + _, err := r.GetHeader("vector") + require.Error(t, err) + assert.Contains(t, err.Error(), "manifest not loaded") +} + +func TestCppStdlibRegistry_GetHeaderUnknown(t *testing.T) { + dir := t.TempDir() + writeCppRegistry(t, dir) + r := NewCppStdlibRegistryFile(dir, core.PlatformLinux) + require.NoError(t, r.LoadManifest(noopLogger{})) + _, err := r.GetHeader("absent") + require.Error(t, err) +} + +func TestCppStdlibRegistry_GetHeaderCorrupt(t *testing.T) { + dir := t.TempDir() + writeCppRegistry(t, dir) + require.NoError(t, os.WriteFile(filepath.Join(dir, "vector_stdlib.json"), []byte("trash"), 0o644)) + + r := NewCppStdlibRegistryFile(dir, core.PlatformLinux) + 
require.NoError(t, r.LoadManifest(noopLogger{})) + _, err := r.GetHeader("vector") + require.Error(t, err) +} + +func TestCppStdlibRegistry_HeaderCacheHit(t *testing.T) { + dir := t.TempDir() + writeCppRegistry(t, dir) + r := NewCppStdlibRegistryFile(dir, core.PlatformLinux) + require.NoError(t, r.LoadManifest(noopLogger{})) + + a, err := r.GetHeader("vector") + require.NoError(t, err) + b, err := r.GetHeader("vector") + require.NoError(t, err) + assert.Same(t, a, b) +} + +func TestCppStdlibRegistry_LoadManifestMissing(t *testing.T) { + r := NewCppStdlibRegistryFile(filepath.Join(t.TempDir(), "absent"), core.PlatformLinux) + err := r.LoadManifest(noopLogger{}) + require.Error(t, err) +} + +func TestCppStdlibRegistry_LoadManifestCorrupt(t *testing.T) { + dir := t.TempDir() + require.NoError(t, os.WriteFile(filepath.Join(dir, "manifest.json"), []byte("garbage"), 0o644)) + r := NewCppStdlibRegistryFile(dir, core.PlatformLinux) + err := r.LoadManifest(noopLogger{}) + require.Error(t, err) +} + +func TestCppStdlibRegistry_RemoteCtorTrimsSlash(t *testing.T) { + r := NewCppStdlibRegistryRemote("https://x/registries/", core.PlatformLinux) + assert.Equal(t, "https://x/registries", r.baseURL) +} + +func TestCppStdlibRegistry_ImplementsInterface(t *testing.T) { + var _ core.CppStdlibLoader = NewCppStdlibRegistryFile(t.TempDir(), core.PlatformLinux) + var _ core.CppStdlibLoader = NewCppStdlibRegistryRemote("https://x", core.PlatformLinux) +} From bb575d604c1307949d7e69e2edfc9694bb689b60 Mon Sep 17 00:00:00 2001 From: shivasurya Date: Sun, 3 May 2026 18:38:21 -0400 Subject: [PATCH 2/4] feat(builder): wire stdlib fallback into C/C++ resolver chains MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Extends Phase 1's resolution chain with a final stdlib lookup so calls into , , std::move, vector::push_back, etc. become resolved edges with type, confidence, and security-tag metadata. 
C builder (c_builder.go): - resolveCCallTarget signature → (string, bool, *CStdlibFunction). - New lookupCStdlib walks SystemIncludes for the caller file and consults StdlibRegistry; first include with a matching symbol wins. - buildCCallSite enriches the emitted CallSite from CStdlibFunction (TypeSource="stdlib", InferredType, TypeConfidence, SecurityTag). C++ builder (cpp_builder.go): - resolveCppCallTarget gains the same 3-tuple shape. - lookupCppStdlibMethod uses the type engine to read the receiver type, canonicalises std::vector<int> → std::vector, and substitutes T/U/V/K placeholders into the return type when present. - lookupCppStdlibFreeFunction handles std::move / std::swap via CppStdlibLoader.GetFreeFunction. - C-shape calls (printf, malloc) from .cpp files keep flowing through the embedded C registry. Project-internal resolution still wins (project printf shadows stdlib printf); receiver-less or untyped calls fall back to the unresolved path with no panics. Coverage: 85.1% on builder package, including the new stdlib paths. Co-Authored-By: Claude Sonnet 4.5 --- .../graph/callgraph/builder/c_builder.go | 90 ++++-- .../builder/c_builder_stdlib_test.go | 159 +++++++++++ .../graph/callgraph/builder/cpp_builder.go | 258 +++++++++++++++++- .../builder/cpp_builder_stdlib_test.go | 243 +++++++++++++++++ 4 files changed, 724 insertions(+), 26 deletions(-) create mode 100644 sast-engine/graph/callgraph/builder/c_builder_stdlib_test.go create mode 100644 sast-engine/graph/callgraph/builder/cpp_builder_stdlib_test.go diff --git a/sast-engine/graph/callgraph/builder/c_builder.go b/sast-engine/graph/callgraph/builder/c_builder.go index 19d51403..1cf58a8c 100644 --- a/sast-engine/graph/callgraph/builder/c_builder.go +++ b/sast-engine/graph/callgraph/builder/c_builder.go @@ -235,10 +235,15 @@ func stringMetadata(node *graph.Node, key string) string { // FQN. 
Resolved sites add a forward edge in the call graph and record a // CallSite{Resolved:true} entry; unresolved sites are still recorded so // rule writers and diagnostics can see external/unknown calls. +// +// Phase 2 (PR-02): when the project-internal lookup fails AND +// registry.StdlibRegistry is configured, the resolver consults the stdlib +// registry. A stdlib hit also flows return-type metadata and any overlay +// security tag into the emitted core.CallSite. func resolveCCallSites(sites []*CallSiteInternal, callGraph *core.CallGraph, registry *core.CModuleRegistry) { for _, cs := range sites { - targetFQN, resolved := resolveCCallTarget(cs, callGraph, registry) - callSite := buildCCallSite(cs, targetFQN, resolved) + targetFQN, resolved, stdlibFn := resolveCCallTarget(cs, callGraph, registry) + callSite := buildCCallSite(cs, targetFQN, resolved, stdlibFn) callGraph.AddCallSite(cs.CallerFQN, callSite) if resolved { callGraph.AddEdge(cs.CallerFQN, targetFQN) @@ -263,29 +268,58 @@ func resolveCCallSites(sites []*CallSiteInternal, callGraph *core.CallGraph, reg // declaration FQN; later phases can still treat it as the entry // point for a stdlib/third-party call. // -// Returns ("", false) when no candidate matches. +// Returns ("", false, nil) when no candidate matches. When the third +// return value (the *core.CStdlibFunction) is non-nil, the caller knows the +// resolution went through the stdlib registry — return type, security tag, +// and confidence are read from that struct. 
func resolveCCallTarget( cs *CallSiteInternal, callGraph *core.CallGraph, registry *core.CModuleRegistry, -) (string, bool) { +) (string, bool, *core.CStdlibFunction) { if cs.FunctionName == "" { - return "", false + return "", false, nil } if fqn, ok := lookupSameFile(cs.CallerFile, cs.FunctionName, registry, callGraph, true); ok { - return fqn, true + return fqn, true, nil } if fqn, ok := lookupGlobalDefinition(cs.FunctionName, registry, callGraph); ok { - return fqn, true + return fqn, true, nil } if fqn, ok := lookupSameFile(cs.CallerFile, cs.FunctionName, registry, callGraph, false); ok { - return fqn, true + return fqn, true, nil } if fqn, ok := lookupViaIncludes(cs.CallerFile, cs.FunctionName, registry, callGraph); ok { - return fqn, true + return fqn, true, nil } - return "", false + + // Phase 2 fallback: consult the stdlib registry by walking the caller's + // system includes. First include with a matching symbol wins. + if registry.StdlibRegistry != nil { + if fqn, fn := lookupCStdlib(cs.CallerFile, cs.FunctionName, registry); fn != nil { + return fqn, true, fn + } + } + return "", false, nil +} + +// lookupCStdlib walks the caller file's `#include <...>` list and asks the +// stdlib registry for a function with the requested name in each header. +// First match wins; ties are unlikely (stdlib symbols are uniquely owned by +// one header) but if they do happen, the include order in source decides. +func lookupCStdlib(callerFile, funcName string, registry *core.CModuleRegistry) (string, *core.CStdlibFunction) { + prefix, ok := registry.FileToPrefix[callerFile] + if !ok { + return "", nil + } + for _, header := range registry.SystemIncludes[prefix] { + fn, err := registry.StdlibRegistry.GetFunction(header, funcName) + if err == nil && fn != nil { + return fn.FQN, fn + } + } + return "", nil } // lookupSameFile returns the FQN of a function named `name` declared in @@ -370,22 +404,40 @@ func isDeclaration(node *graph.Node) bool { // the resolution outcome. 
Tracking unresolved calls (rather than // dropping them) enables stdlib/third-party rules to inspect external // invocations. -func buildCCallSite(cs *CallSiteInternal, targetFQN string, resolved bool) core.CallSite { +// +// stdlibFn is non-nil only when resolution went through the stdlib registry +// (Phase 2). When set, its return type, confidence, and security tag are +// propagated into the emitted CallSite. +func buildCCallSite(cs *CallSiteInternal, targetFQN string, resolved bool, stdlibFn *core.CStdlibFunction) core.CallSite { site := core.CallSite{ Target: cs.FunctionName, Location: core.Location{File: cs.CallerFile, Line: int(cs.CallLine)}, Arguments: buildCallSiteArguments(cs.Arguments), Resolved: resolved, } - if resolved { - site.TargetFQN = targetFQN - // Confidence 1.0 because resolution went through the FQN - // registry, not type inference. Source kept consistent with - // the explicit-types convention used by the type engine. - site.TypeConfidence = 1.0 - site.TypeSource = declarationConfidenceSource - } else { + if !resolved { site.FailureReason = resolutionFailedExternal + return site } + site.TargetFQN = targetFQN + if stdlibFn != nil { + // Phase 2 stdlib resolution — populate type info from the registry. + site.TypeConfidence = stdlibFn.Confidence + site.TypeSource = stdlibSource + site.InferredType = stdlibFn.ReturnType + site.SecurityTag = stdlibFn.SecurityTag + return site + } + // Project-internal resolution — Phase 1 path. + // Confidence 1.0 because resolution went through the FQN registry, + // not type inference. Source kept consistent with the explicit-types + // convention used by the type engine. + site.TypeConfidence = 1.0 + site.TypeSource = declarationConfidenceSource return site } + +// stdlibSource is the value stamped on CallSite.TypeSource when resolution +// went through the C/C++ stdlib registry. Distinct from the Phase 1 +// declarationConfidenceSource so downstream consumers can filter on it. 
+const stdlibSource = "stdlib" diff --git a/sast-engine/graph/callgraph/builder/c_builder_stdlib_test.go b/sast-engine/graph/callgraph/builder/c_builder_stdlib_test.go new file mode 100644 index 00000000..524f939d --- /dev/null +++ b/sast-engine/graph/callgraph/builder/c_builder_stdlib_test.go @@ -0,0 +1,159 @@ +package builder_test + +import ( + "testing" + + "github.com/shivasurya/code-pathfinder/sast-engine/graph/callgraph/core" + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" +) + +// fakeCStdlibLoader implements core.CStdlibLoader for builder tests +// without going through the registry package — keeps the test focused +// on the builder's resolution path rather than loader plumbing. +type fakeCStdlibLoader struct { + headers map[string]map[string]*core.CStdlibFunction +} + +func newFakeCStdlibLoader(headers map[string]map[string]*core.CStdlibFunction) *fakeCStdlibLoader { + return &fakeCStdlibLoader{headers: headers} +} + +func (f *fakeCStdlibLoader) LoadManifest(_ core.CStdlibLogger) error { return nil } + +func (f *fakeCStdlibLoader) GetHeader(name string) (*core.CStdlibHeader, error) { + fns, ok := f.headers[name] + if !ok { + return nil, assert.AnError + } + return &core.CStdlibHeader{Header: name, Functions: fns}, nil +} + +func (f *fakeCStdlibLoader) GetFunction(headerName, funcName string) (*core.CStdlibFunction, error) { + if fns, ok := f.headers[headerName]; ok { + if fn, ok := fns[funcName]; ok { + return fn, nil + } + } + return nil, assert.AnError +} + +func (f *fakeCStdlibLoader) Platform() string { return "linux" } + +func (f *fakeCStdlibLoader) HeaderCount() int { return len(f.headers) } + +// TestBuildCCallGraph_StdlibFallback verifies that an unresolved call +// falls through to the stdlib registry and emits an enriched CallSite +// (TargetFQN, return type, confidence, security tag). 
+func TestBuildCCallGraph_StdlibFallback(t *testing.T) { + root := fixtureRoot + mainC := root + "/src/main.c" + + f := newCFixture(t) + mainFn := f.addFunction(t, mainC, "src/main.c", "main", "int", false) + f.addCall(t, mainFn, "system", []string{`"ls"`}) + + // Wire the system include and stdlib loader for the caller file. + f.registry.SystemIncludes["src/main.c"] = []string{"stdlib.h"} + f.registry.StdlibRegistry = newFakeCStdlibLoader(map[string]map[string]*core.CStdlibFunction{ + "stdlib.h": { + "system": { + FQN: "c::stdlib::system", + ReturnType: "int", + SecurityTag: "command_injection_sink", + Source: core.SourceOverlay, + Confidence: 0.95, + }, + }, + }) + + cg, _ := f.build(t) + + sites := cg.CallSites["src/main.c::main"] + require.Len(t, sites, 1) + assert.True(t, sites[0].Resolved) + assert.Equal(t, "c::stdlib::system", sites[0].TargetFQN) + assert.Equal(t, "int", sites[0].InferredType) + assert.Equal(t, "stdlib", sites[0].TypeSource) + assert.Equal(t, "command_injection_sink", sites[0].SecurityTag) + assert.InDelta(t, 0.95, sites[0].TypeConfidence, 1e-6) +} + +// TestBuildCCallGraph_StdlibLookupRespectsIncludeOrder confirms the +// resolver walks SystemIncludes in order and stops at the first match — +// later headers must not override earlier ones. 
+func TestBuildCCallGraph_StdlibLookupRespectsIncludeOrder(t *testing.T) { + root := fixtureRoot + mainC := root + "/src/main.c" + + f := newCFixture(t) + mainFn := f.addFunction(t, mainC, "src/main.c", "main", "int", false) + f.addCall(t, mainFn, "abort", nil) + + f.registry.SystemIncludes["src/main.c"] = []string{"stdlib.h", "fake.h"} + f.registry.StdlibRegistry = newFakeCStdlibLoader(map[string]map[string]*core.CStdlibFunction{ + "stdlib.h": { + "abort": {FQN: "c::stdlib::abort", ReturnType: "void", Source: core.SourceHeader, Confidence: 1.0}, + }, + "fake.h": { + "abort": {FQN: "fake::abort", ReturnType: "void", Source: core.SourceHeader, Confidence: 1.0}, + }, + }) + + cg, _ := f.build(t) + + sites := cg.CallSites["src/main.c::main"] + require.Len(t, sites, 1) + assert.Equal(t, "c::stdlib::abort", sites[0].TargetFQN, "first include in order must win") +} + +// TestBuildCCallGraph_StdlibFallback_NotConsultedWhenProjectDefinitionExists +// guards against the stdlib path overriding a same-file definition. +// printf is normally a stdlib symbol, but a project that defines its +// own `printf` must keep the project FQN. 
+func TestBuildCCallGraph_StdlibFallback_NotConsultedWhenProjectDefinitionExists(t *testing.T) { + root := fixtureRoot + mainC := root + "/src/main.c" + + f := newCFixture(t) + mainFn := f.addFunction(t, mainC, "src/main.c", "main", "int", false) + f.addFunction(t, mainC, "src/main.c", "printf", "int", false) + f.addCall(t, mainFn, "printf", nil) + + f.registry.SystemIncludes["src/main.c"] = []string{"stdio.h"} + f.registry.StdlibRegistry = newFakeCStdlibLoader(map[string]map[string]*core.CStdlibFunction{ + "stdio.h": { + "printf": {FQN: "c::stdio::printf", ReturnType: "int", Source: core.SourceOverlay, Confidence: 1.0}, + }, + }) + + cg, _ := f.build(t) + + sites := cg.CallSites["src/main.c::main"] + require.Len(t, sites, 1) + assert.Equal(t, "src/main.c::printf", sites[0].TargetFQN, "project definition must shadow stdlib symbol") + assert.Empty(t, sites[0].SecurityTag, "project resolution must not pick up stdlib SecurityTag") +} + +// TestBuildCCallGraph_StdlibFallback_NoIncludesLeavesUnresolved verifies +// that a call to an unknown function with no matching system include +// stays unresolved — the registry must not return arbitrary symbols. +func TestBuildCCallGraph_StdlibFallback_NoIncludesLeavesUnresolved(t *testing.T) { + root := fixtureRoot + mainC := root + "/src/main.c" + + f := newCFixture(t) + mainFn := f.addFunction(t, mainC, "src/main.c", "main", "int", false) + f.addCall(t, mainFn, "printf", nil) + + // No SystemIncludes entry for this file → stdlib lookup is a no-op. 
+ f.registry.StdlibRegistry = newFakeCStdlibLoader(map[string]map[string]*core.CStdlibFunction{ + "stdio.h": {"printf": {FQN: "c::stdio::printf", ReturnType: "int"}}, + }) + + cg, _ := f.build(t) + + sites := cg.CallSites["src/main.c::main"] + require.Len(t, sites, 1) + assert.False(t, sites[0].Resolved, "no matching include => stdlib must not be consulted") +} diff --git a/sast-engine/graph/callgraph/builder/cpp_builder.go b/sast-engine/graph/callgraph/builder/cpp_builder.go index c779a31e..9c2ad190 100644 --- a/sast-engine/graph/callgraph/builder/cpp_builder.go +++ b/sast-engine/graph/callgraph/builder/cpp_builder.go @@ -411,6 +411,11 @@ func isCppCallNode(node *graph.Node) bool { // corresponding edge / CallSite record to the call graph. Unresolved // sites are still recorded (Resolved=false) for diagnostics — stdlib // and external calls remain visible. +// +// Phase 2 (PR-02): if the C++ resolver and the embedded C resolver both +// fail, the C++ stdlib registry is consulted (handled inside +// resolveCppCallTarget). The optional *core.CStdlibFunction return value +// flows return type and security tag into the emitted CallSite. func resolveCppCallSites( sites []*CallSiteInternal, callGraph *core.CallGraph, @@ -419,8 +424,8 @@ func resolveCppCallSites( classes map[string][]cppClassByteRange, ) { for _, cs := range sites { - targetFQN, resolved := resolveCppCallTarget(cs, callGraph, registry, typeEngine, classes) - callSite := buildCCallSite(cs, targetFQN, resolved) + targetFQN, resolved, stdlibFn := resolveCppCallTarget(cs, callGraph, registry, typeEngine, classes) + callSite := buildCCallSite(cs, targetFQN, resolved, stdlibFn) callGraph.AddCallSite(cs.CallerFQN, callSite) if resolved { callGraph.AddEdge(cs.CallerFQN, targetFQN) @@ -441,32 +446,271 @@ func resolveCppCallSites( // // Each step short-circuits on the first hit; later steps are tried // only if earlier ones miss. 
+// +// Phase 2 (PR-02): a stdlib step is inserted between project-internal +// resolution and the C-fallthrough. The C++ stdlib registry is consulted +// for namespaced free functions (std::move) and for class methods on a +// receiver whose type the type engine has identified (vec.push_back +// where vec is std::vector<int>). The C-fallthrough also picks up +// <stdio.h> and friends via the embedded C registry's stdlib loader. func resolveCppCallTarget( cs *CallSiteInternal, callGraph *core.CallGraph, registry *core.CppModuleRegistry, typeEngine *resolution.CppTypeInferenceEngine, classes map[string][]cppClassByteRange, -) (string, bool) { +) (string, bool, *core.CStdlibFunction) { if cs.FunctionName == "" { - return "", false + return "", false, nil } if fqn, ok := lookupQualifiedCall(cs.FunctionName, registry); ok { - return fqn, true + return fqn, true, nil } if cs.ObjectName == receiverThis { if fqn, ok := lookupThisMethod(cs, callGraph, registry, classes); ok { - return fqn, true + return fqn, true, nil } } else if cs.ObjectName != "" { if fqn, ok := lookupReceiverMethod(cs, registry, typeEngine); ok { - return fqn, true + return fqn, true, nil + } + // Phase 2: try C++ stdlib method dispatch on the receiver's type. + if registry.StdlibCppRegistry != nil { + if fqn, fn := lookupCppStdlibMethod(cs, registry, typeEngine); fn != nil { + return fqn, true, fn + } } } + + // Phase 2: try C++ stdlib free-function (std::move, std::swap, …) before + // falling through to the C resolver. Qualified-call lookup above already + // caught the project-internal forms; this step covers the registry side. + if registry.StdlibCppRegistry != nil { + if fqn, fn := lookupCppStdlibFreeFunction(cs, &registry.CModuleRegistry, registry.StdlibCppRegistry); fn != nil { + return fqn, true, fn + } + } + return resolveCCallTarget(cs, callGraph, &registry.CModuleRegistry) } +// lookupCppStdlibMethod resolves `obj.method()` against the C++ stdlib +// registry. Steps: +// +// 1. 
Look up the receiver's declared type via the type engine. +// 2. Strip template arguments to get the registry key +// ("std::vector<int>" → "std::vector"). +// 3. Walk the caller's `#include <...>` list and ask the registry for +// a method with that name on that class. First hit wins. +// +// Returns the synthetic FQN ("<receiverFQN>::<method>") and the registry record +// on success; "", nil on miss. A receiver type the engine cannot resolve +// is silently treated as a miss — Phase 1's behaviour for unresolved +// receivers is preserved. +func lookupCppStdlibMethod( + cs *CallSiteInternal, + registry *core.CppModuleRegistry, + typeEngine *resolution.CppTypeInferenceEngine, +) (string, *core.CStdlibFunction) { + if typeEngine == nil { + return "", nil + } + scope := typeEngine.GetScope(cs.CallerFQN) + if scope == nil { + return "", nil + } + binding := scope.GetVariable(cs.ObjectName) + if binding == nil || binding.Type == nil { + return "", nil + } + receiverFQN := canonicalizeStdlibType(normaliseTypeName(binding.Type.TypeFQN)) + if receiverFQN == "" { + return "", nil + } + prefix, ok := registry.FileToPrefix[cs.CallerFile] + if !ok { + return "", nil + } + for _, header := range registry.SystemIncludes[prefix] { + method, err := registry.StdlibCppRegistry.GetMethod(header, receiverFQN, cs.FunctionName) + if err == nil && method != nil { + // Substitute template parameters using the receiver's concrete + // args (e.g. vector<int>::operator[] T& → int&). Phase 2 covers + // the common single-T cases; extension to K/V/U is built in. + cloned := substituteTemplateMethodReturn(method, binding.Type.TypeFQN) + fqn := receiverFQN + fqnSeparator + cs.FunctionName + return fqn, cloned + } + } + return "", nil +} + +// lookupCppStdlibFreeFunction asks the registry for a namespaced free +// function. 
The call's FunctionName MUST already be the qualified form +// ("std::move") for the registry's GetFreeFunction to succeed — bare +// names like "move" without the namespace are a different lookup path +// and stay out of scope here. +func lookupCppStdlibFreeFunction( + cs *CallSiteInternal, + cReg *core.CModuleRegistry, + cppLoader core.CppStdlibLoader, +) (string, *core.CStdlibFunction) { + if !strings.Contains(cs.FunctionName, fqnSeparator) { + return "", nil + } + prefix, ok := cReg.FileToPrefix[cs.CallerFile] + if !ok { + return "", nil + } + for _, header := range cReg.SystemIncludes[prefix] { + fn, err := cppLoader.GetFreeFunction(header, cs.FunctionName) + if err == nil && fn != nil { + return fn.FQN, fn + } + } + return "", nil +} + +// canonicalizeStdlibType strips the template-argument suffix from a type +// FQN to produce the registry key used by GetClass / GetMethod. Examples: +// +// "std::vector<int>" → "std::vector" +// "std::map<std::string, int>" → "std::map" +// "std::unique_ptr<Foo>" → "std::unique_ptr" +// "int" → "int" +// +// The first '<' wins; nested template arguments are ignored at the key +// level (the receiver's own FQN still carries them for substitution). +func canonicalizeStdlibType(typeFQN string) string { + if idx := strings.IndexByte(typeFQN, '<'); idx > 0 { + return strings.TrimSpace(typeFQN[:idx]) + } + return typeFQN +} + +// substituteTemplateMethodReturn replaces the canonical template parameter +// names (T, U, V, K) in the registry's recorded return type with the +// concrete arguments parsed from the receiver's FQN. Returns a SHALLOW +// COPY of the registry function with the substituted type — never mutates +// the registry's data, since the registry is shared across calls. +// +// Phase 2 covers common cases: T, T&, T*, const T&, std::pair<T, U>&. More +// elaborate forms (`typename T::iterator`, conditional types, parameter +// packs) fall through unchanged — the resolver still records the resolution, +// just with the original generic form. 
PR-04's `--diagnose-stdlib` will +// surface these as opportunities for future overlay refinement. +func substituteTemplateMethodReturn(method *core.CStdlibFunction, receiverFQN string) *core.CStdlibFunction { + args := parseTemplateArgs(receiverFQN) + if len(args) == 0 { + return method + } + cloned := *method // shallow copy — Params slice still points at the original + cloned.ReturnType = applyTemplateSubstitution(method.ReturnType, args) + return &cloned +} + +// parseTemplateArgs extracts the comma-separated template arguments from a +// type FQN. "std::vector<int>" → ["int"]; "std::map<std::string, int>" → +// ["std::string", "int"]. Returns nil for non-template FQNs. The parser is +// brace-counting so nested templates are handled correctly: +// "std::map<int, std::vector<int>>" → ["int", "std::vector<int>"]. +func parseTemplateArgs(typeFQN string) []string { + open := strings.IndexByte(typeFQN, '<') + if open < 0 { + return nil + } + closeIdx := strings.LastIndexByte(typeFQN, '>') + if closeIdx <= open { + return nil + } + body := typeFQN[open+1 : closeIdx] + + args := make([]string, 0, 2) + depth := 0 + start := 0 + for i, r := range body { + switch r { + case '<': + depth++ + case '>': + depth-- + case ',': + if depth == 0 { + args = append(args, strings.TrimSpace(body[start:i])) + start = i + 1 + } + } + } + args = append(args, strings.TrimSpace(body[start:])) + return args +} + +// applyTemplateSubstitution replaces the canonical placeholder names with +// concrete types in returnType. Substitution is whole-word — we don't +// replace "T" inside "Type" or "std::pair". Registered placeholders, in +// the order the receiver's args bind to: +// +// T → first arg +// U → second arg +// V → third arg +// K → first arg (alias used by std::map / std::unordered_map) +// +// This matches the convention the cpp_stdlib_overlay.yaml authors use. 
func applyTemplateSubstitution(returnType string, args []string) string {
	// No concrete arguments means nothing to bind; hand the type back as-is.
	if len(args) == 0 {
		return returnType
	}

	// Positional placeholders: the i-th name binds to args[i]. Stop at the
	// first placeholder that has no argument — every later one is unbound too.
	out := returnType
	for i, ph := range []string{"T", "U", "V"} {
		if i >= len(args) {
			break
		}
		out = replaceWholeWord(out, ph, args[i])
	}

	// "K" is an alias for the first argument in map-shaped containers. It is
	// applied unconditionally: tying it to the positional loop meant that a
	// receiver with fewer than three template arguments (the usual
	// std::map<K, V> spelling) left the loop before K was ever substituted.
	//
	// NOTE: substitutions run one placeholder at a time, so a concrete
	// argument that is itself spelled T, U, V or K can be rewritten again by
	// a later placeholder.
	return replaceWholeWord(out, "K", args[0])
}

// replaceWholeWord replaces every occurrence of `from` in `s` with `to`,
// but only when `from` is at a word boundary. This avoids T inside Type
// being replaced when args=["int"]: "Type" stays "Type", "T&" becomes "int&".
// An empty `from` returns s unchanged.
func replaceWholeWord(s, from, to string) string {
	if from == "" {
		return s
	}
	var b strings.Builder
	b.Grow(len(s))
	for i := 0; i < len(s); {
		end := i + len(from)
		// A match counts only when the bytes on both sides are not
		// identifier characters (byteAt yields 0 outside the string).
		if end <= len(s) && s[i:end] == from &&
			!isWordChar(byteAt(s, i-1)) && !isWordChar(byteAt(s, end)) {
			b.WriteString(to)
			i = end
			continue
		}
		b.WriteByte(s[i])
		i++
	}
	return b.String()
}

// byteAt returns s[i], or 0 when i is outside the string.
func byteAt(s string, i int) byte {
	if i < 0 || i >= len(s) {
		return 0
	}
	return s[i]
}

// isWordChar reports whether b is an ASCII letter, digit, or underscore.
func isWordChar(b byte) bool {
	return (b >= 'a' && b <= 'z') || (b >= 'A' && b <= 'Z') ||
		(b >= '0' && b <= '9') || b == '_'
}

// lookupQualifiedCall handles `ns::func` and `Class::staticMethod` by
// querying NamespaceIndex with the verbatim call name.
The check is // scoped to names containing `::` so plain `func()` calls fall through diff --git a/sast-engine/graph/callgraph/builder/cpp_builder_stdlib_test.go b/sast-engine/graph/callgraph/builder/cpp_builder_stdlib_test.go new file mode 100644 index 00000000..5cdcade9 --- /dev/null +++ b/sast-engine/graph/callgraph/builder/cpp_builder_stdlib_test.go @@ -0,0 +1,243 @@ +package builder_test + +import ( + "testing" + + "github.com/shivasurya/code-pathfinder/sast-engine/graph/callgraph/core" + "github.com/shivasurya/code-pathfinder/sast-engine/graph/callgraph/resolution" + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" +) + +// fakeCppStdlibLoader implements core.CppStdlibLoader for the +// builder tests. The unit tests construct the loader directly so the +// resolution path is observable in isolation. +type fakeCppStdlibLoader struct { + headers map[string]*core.CStdlibHeader +} + +func newFakeCppStdlibLoader(headers map[string]*core.CStdlibHeader) *fakeCppStdlibLoader { + return &fakeCppStdlibLoader{headers: headers} +} + +func (f *fakeCppStdlibLoader) LoadManifest(_ core.CStdlibLogger) error { return nil } + +func (f *fakeCppStdlibLoader) GetHeader(name string) (*core.CStdlibHeader, error) { + if h, ok := f.headers[name]; ok { + return h, nil + } + return nil, assert.AnError +} + +func (f *fakeCppStdlibLoader) GetFunction(headerName, funcName string) (*core.CStdlibFunction, error) { + h, err := f.GetHeader(headerName) + if err != nil { + return nil, err + } + if fn, ok := h.Functions[funcName]; ok { + return fn, nil + } + if fn, ok := h.FreeFunctions[funcName]; ok { + return fn, nil + } + return nil, assert.AnError +} + +func (f *fakeCppStdlibLoader) GetClass(headerName, classFQN string) (*core.CppStdlibClass, error) { + h, err := f.GetHeader(headerName) + if err != nil { + return nil, err + } + if cls, ok := h.Classes[classFQN]; ok { + return cls, nil + } + return nil, assert.AnError +} + +func (f *fakeCppStdlibLoader) 
GetMethod(headerName, classFQN, methodName string) (*core.CStdlibFunction, error) { + cls, err := f.GetClass(headerName, classFQN) + if err != nil { + return nil, err + } + if m, ok := cls.Methods[methodName]; ok { + return m, nil + } + return nil, assert.AnError +} + +func (f *fakeCppStdlibLoader) GetFreeFunction(headerName, fqn string) (*core.CStdlibFunction, error) { + h, err := f.GetHeader(headerName) + if err != nil { + return nil, err + } + if fn, ok := h.FreeFunctions[fqn]; ok { + return fn, nil + } + return nil, assert.AnError +} + +func (f *fakeCppStdlibLoader) Platform() string { return "linux" } +func (f *fakeCppStdlibLoader) HeaderCount() int { return len(f.headers) } + +// TestBuildCppCallGraph_StdlibClassMethod resolves `vec.push_back(...)` +// against the C++ stdlib registry. The receiver type comes from the +// type engine; the resolver canonicalises std::vector → std::vector +// before looking up the method. +func TestBuildCppCallGraph_StdlibClassMethod(t *testing.T) { + root := cppFixtureRoot + mainCpp := root + "/src/main.cpp" + + f := newCppFixture(t) + main := f.addFreeFunction(t, mainCpp, "src/main.cpp", "", "main", "int") + f.engine.ExtractVariableType("src/main.cpp::main", "vec", "std::vector", resolution.Location{Line: 5}) + f.addCall(t, main, "push_back", "vec") + + f.registry.SystemIncludes["src/main.cpp"] = []string{"vector"} + f.registry.StdlibCppRegistry = newFakeCppStdlibLoader(map[string]*core.CStdlibHeader{ + "vector": { + Header: "vector", + Classes: map[string]*core.CppStdlibClass{ + "std::vector": { + FQN: "std::vector", TypeParams: []string{"T"}, + Methods: map[string]*core.CStdlibFunction{ + "push_back": {FQN: "std::vector::push_back", ReturnType: "void", Source: core.SourceOverlay, Confidence: 1.0}, + }, + }, + }, + }, + }) + + cg := f.build(t) + + sites := cg.CallSites["src/main.cpp::main"] + require.Len(t, sites, 1) + assert.True(t, sites[0].Resolved) + assert.Equal(t, "std::vector::push_back", sites[0].TargetFQN) + 
+ assert.Equal(t, "stdlib", sites[0].TypeSource) +} + +// TestBuildCppCallGraph_StdlibClassMethod_TemplateSubstitution verifies +// that a method whose return type is the template parameter (`T`) gets +// the concrete argument substituted in (vector<int>::operator[] → int&). +func TestBuildCppCallGraph_StdlibClassMethod_TemplateSubstitution(t *testing.T) { + root := cppFixtureRoot + mainCpp := root + "/src/main.cpp" + + f := newCppFixture(t) + main := f.addFreeFunction(t, mainCpp, "src/main.cpp", "", "main", "int") + f.engine.ExtractVariableType("src/main.cpp::main", "vec", "std::vector<int>", resolution.Location{Line: 5}) + f.addCall(t, main, "front", "vec") + + f.registry.SystemIncludes["src/main.cpp"] = []string{"vector"} + f.registry.StdlibCppRegistry = newFakeCppStdlibLoader(map[string]*core.CStdlibHeader{ + "vector": { + Header: "vector", + Classes: map[string]*core.CppStdlibClass{ + "std::vector": { + FQN: "std::vector", TypeParams: []string{"T"}, + Methods: map[string]*core.CStdlibFunction{ + "front": {FQN: "std::vector::front", ReturnType: "T&", Source: core.SourceHeader, Confidence: 1.0}, + }, + }, + }, + }, + }) + + cg := f.build(t) + + sites := cg.CallSites["src/main.cpp::main"] + require.Len(t, sites, 1) + assert.True(t, sites[0].Resolved) + assert.Equal(t, "int&", sites[0].InferredType, "T must be replaced with the concrete template argument") +} + +// TestBuildCppCallGraph_StdlibFreeFunction handles `std::move(x)` — +// a namespaced free function looked up via GetFreeFunction. +func TestBuildCppCallGraph_StdlibFreeFunction(t *testing.T) { + root := cppFixtureRoot + mainCpp := root + "/src/main.cpp" + + f := newCppFixture(t) + main := f.addFreeFunction(t, mainCpp, "src/main.cpp", "", "main", "int") + // Mimic the parser emitting std::move via the qualified-call path. 
+ f.addCall(t, main, "std::move", "") + + f.registry.SystemIncludes["src/main.cpp"] = []string{"utility"} + f.registry.StdlibCppRegistry = newFakeCppStdlibLoader(map[string]*core.CStdlibHeader{ + "utility": { + Header: "utility", + FreeFunctions: map[string]*core.CStdlibFunction{ + "std::move": {FQN: "std::move", ReturnType: "T&&", Source: core.SourceOverlay, Confidence: 1.0}, + }, + }, + }) + + cg := f.build(t) + + sites := cg.CallSites["src/main.cpp::main"] + require.Len(t, sites, 1) + assert.True(t, sites[0].Resolved) + assert.Equal(t, "std::move", sites[0].TargetFQN) +} + +// TestBuildCppCallGraph_StdlibCFallthrough confirms a `printf()` call +// from a .cpp file resolves through the C registry — C++ projects can +// (and routinely do) include / alongside STL headers. +func TestBuildCppCallGraph_StdlibCFallthrough(t *testing.T) { + root := cppFixtureRoot + mainCpp := root + "/src/main.cpp" + + f := newCppFixture(t) + main := f.addFreeFunction(t, mainCpp, "src/main.cpp", "", "main", "int") + f.addCall(t, main, "printf", "") + + // The caller-file's system includes are stored on the embedded + // CModuleRegistry, just like for C. + f.registry.SystemIncludes["src/main.cpp"] = []string{"stdio.h"} + f.registry.StdlibRegistry = newFakeCStdlibLoader(map[string]map[string]*core.CStdlibFunction{ + "stdio.h": { + "printf": {FQN: "c::stdio::printf", ReturnType: "int", Source: core.SourceOverlay, Confidence: 1.0}, + }, + }) + + cg := f.build(t) + + sites := cg.CallSites["src/main.cpp::main"] + require.Len(t, sites, 1) + assert.True(t, sites[0].Resolved) + assert.Equal(t, "c::stdio::printf", sites[0].TargetFQN) +} + +// TestBuildCppCallGraph_StdlibClassMethod_NoReceiver verifies the +// stdlib method path is skipped (gracefully) when the receiver type +// cannot be inferred — falls through to the unresolved branch. 
+func TestBuildCppCallGraph_StdlibClassMethod_NoReceiver(t *testing.T) { + root := cppFixtureRoot + mainCpp := root + "/src/main.cpp" + + f := newCppFixture(t) + main := f.addFreeFunction(t, mainCpp, "src/main.cpp", "", "main", "int") + // Caller with no registered variable type for "mystery". + f.addCall(t, main, "push_back", "mystery") + + f.registry.SystemIncludes["src/main.cpp"] = []string{"vector"} + f.registry.StdlibCppRegistry = newFakeCppStdlibLoader(map[string]*core.CStdlibHeader{ + "vector": { + Header: "vector", + Classes: map[string]*core.CppStdlibClass{ + "std::vector": { + FQN: "std::vector", + Methods: map[string]*core.CStdlibFunction{ + "push_back": {FQN: "std::vector::push_back", ReturnType: "void"}, + }, + }, + }, + }, + }) + + cg := f.build(t) + + sites := cg.CallSites["src/main.cpp::main"] + require.Len(t, sites, 1) + assert.False(t, sites[0].Resolved, "missing receiver type must keep the call unresolved") +} From 918814c9901921325a72b5ae3f81e860c2d4b1f9 Mon Sep 17 00:00:00 2001 From: shivasurya Date: Sun, 3 May 2026 18:38:34 -0400 Subject: [PATCH 3/4] feat(cmd): plumb stdlib registry into scan + resolution-report Wires the C/C++ stdlib loaders into the CLI surface and adds smoke fixtures that exercise the full pipeline. scan.go / resolution_report.go: - New --target=linux|darwin|windows flag overrides platform auto-detection. - New --stdlib-base-url flag selects the registry source. file:// paths and bare local paths read from disk; http(s):// will be honoured by PR-03's HTTP loader. Empty value disables stdlib resolution and keeps Phase 1 behavior. - initClikeStdlib boots both loaders via DetectClikeTarget + buildC{,pp}StdlibLoader, calls LoadManifest with a logger adapter, and degrades to nil-loader-but-keep-scanning on every failure mode so a missing manifest never breaks a scan. 
- buildClikeCallGraphs takes a clikeStdlibConfig; the C and C++ merge helpers inject the loaders into the freshly-built registries before invoking the call-graph builders. testdata/c/stdlib/main.c + testdata/cpp/stl/main.cpp: small smoke fixtures covering printf/malloc/strlen and vector::push_back / std::move / std::printf for downstream e2e checks. Coverage on the new cmd helpers: 100% across initClikeStdlib, loadC{,pp}StdlibFromBase, buildC{,pp}StdlibLoader, and the logger adapter. Co-Authored-By: Claude Sonnet 4.5 --- sast-engine/cmd/resolution_report.go | 9 +- sast-engine/cmd/scan.go | 151 +++++++++++++++++- sast-engine/cmd/scan_stdlib_test.go | 212 ++++++++++++++++++++++++++ sast-engine/cmd/scan_test.go | 8 +- sast-engine/testdata/c/stdlib/main.c | 39 +++++ sast-engine/testdata/cpp/stl/main.cpp | 49 ++++++ 6 files changed, 457 insertions(+), 11 deletions(-) create mode 100644 sast-engine/cmd/scan_stdlib_test.go create mode 100644 sast-engine/testdata/c/stdlib/main.c create mode 100644 sast-engine/testdata/cpp/stl/main.cpp diff --git a/sast-engine/cmd/resolution_report.go b/sast-engine/cmd/resolution_report.go index 89b17bfe..9c09e44c 100644 --- a/sast-engine/cmd/resolution_report.go +++ b/sast-engine/cmd/resolution_report.go @@ -90,7 +90,12 @@ Use --csv to export unresolved calls with file, line, target, and reason.`, // Reuse scan.go's helper so both commands stay aligned. It gates // each builder on hasLanguageNodes and merges into cg in place. - buildClikeCallGraphs(cg, codeGraph, projectInput, logger) + // Stdlib resolution mirrors scan.go: --target overrides platform + // detection; --stdlib-base-url selects the registry source. 
+ clikeTarget, _ := cmd.Flags().GetString("target") + stdlibBaseURL, _ := cmd.Flags().GetString("stdlib-base-url") + stdlibCfg, _ := initClikeStdlib(projectInput, clikeTarget, stdlibBaseURL, logger) + buildClikeCallGraphs(cg, codeGraph, projectInput, logger, stdlibCfg) fmt.Printf("\nResolution Report for %s\n", projectInput) fmt.Println("===============================================") @@ -936,4 +941,6 @@ func init() { resolutionReportCmd.Flags().String("csv", "", "Export unresolved calls to CSV file (e.g., --csv unresolved.csv)") resolutionReportCmd.Flags().String("dump-callsites-json", "", "Export all Go call sites as JSONL for accuracy validation (e.g., --dump-callsites-json callsites.jsonl)") resolutionReportCmd.Flags().Bool("enable-db-cache", false, "Enable SQLite-backed incremental analysis cache (experimental). Caches Pass 2b scopes and Pass 3 call sites per file keyed by content hash; only changed files are re-analysed on subsequent runs.") + resolutionReportCmd.Flags().String("target", "", "Override C/C++ target platform: linux, darwin, or windows (default: auto-detect)") + resolutionReportCmd.Flags().String("stdlib-base-url", "", "Base URL for the C/C++ stdlib registry (file://path or https://host). 
Empty disables stdlib resolution.") } diff --git a/sast-engine/cmd/scan.go b/sast-engine/cmd/scan.go index ae8dfb7a..e01658c2 100644 --- a/sast-engine/cmd/scan.go +++ b/sast-engine/cmd/scan.go @@ -288,7 +288,10 @@ Examples: } } - buildClikeCallGraphs(cg, codeGraph, projectPath, logger) + clikeTarget, _ := cmd.Flags().GetString("target") + stdlibBaseURL, _ := cmd.Flags().GetString("stdlib-base-url") + stdlibCfg, _ := initClikeStdlib(projectPath, clikeTarget, stdlibBaseURL, logger) + buildClikeCallGraphs(cg, codeGraph, projectPath, logger, stdlibCfg) // Step 4: Load Python SDK rules logger.StartProgress("Loading rules", -1) @@ -480,6 +483,18 @@ func countTotalCallSites(cg *core.CallGraph) int { return total } +// clikeStdlibConfig carries the optional stdlib loaders into the +// C / C++ call-graph builders. A zero value (all fields nil/empty) is +// valid: the builders fall back to Phase 1 behavior — system-include +// calls remain unresolved but everything else continues to work. +// +// Phase 2 — added in PR-02. +type clikeStdlibConfig struct { + cLoader core.CStdlibLoader + cppLoader core.CppStdlibLoader + platform string +} + // buildClikeCallGraphs runs the C and C++ call-graph builders against // codeGraph (when those languages are present) and merges the results // into cg. Each builder is independent: a failure or skip on one @@ -489,20 +504,26 @@ func countTotalCallSites(cg *core.CallGraph) int { // manifest file. We instead look at the already-parsed CodeGraph for // nodes tagged with the right `Language` so the builder skips the // work entirely on Python-only or Go-only projects. -func buildClikeCallGraphs(cg *core.CallGraph, codeGraph *graph.CodeGraph, projectPath string, logger *output.Logger) { +// +// stdlib is the optional Phase 2 loader bundle. Pass an empty +// clikeStdlibConfig to disable stdlib resolution (Phase 1 behavior). 
+func buildClikeCallGraphs(cg *core.CallGraph, codeGraph *graph.CodeGraph, projectPath string, logger *output.Logger, stdlib clikeStdlibConfig) { if hasLanguageNodes(codeGraph, "c") { - buildCCallGraphAndMerge(cg, codeGraph, projectPath, logger) + buildCCallGraphAndMerge(cg, codeGraph, projectPath, logger, stdlib) } if hasLanguageNodes(codeGraph, "cpp") { - buildCppCallGraphAndMerge(cg, codeGraph, projectPath, logger) + buildCppCallGraphAndMerge(cg, codeGraph, projectPath, logger, stdlib) } } // buildCCallGraphAndMerge constructs the C call graph and merges it // into cg. Build failures emit a warning and leave cg untouched. -func buildCCallGraphAndMerge(cg *core.CallGraph, codeGraph *graph.CodeGraph, projectPath string, logger *output.Logger) { +func buildCCallGraphAndMerge(cg *core.CallGraph, codeGraph *graph.CodeGraph, projectPath string, logger *output.Logger, stdlib clikeStdlibConfig) { logger.Debug("Detected C source files, building C call graph...") cRegistry := registry.BuildCModuleRegistry(projectPath, codeGraph) + if stdlib.cLoader != nil { + cRegistry.StdlibRegistry = stdlib.cLoader + } cTypeEngine := resolution.NewCTypeInferenceEngine(cRegistry) cCG, err := builder.BuildCCallGraph(codeGraph, cRegistry, cTypeEngine) if err != nil { @@ -516,9 +537,17 @@ func buildCCallGraphAndMerge(cg *core.CallGraph, codeGraph *graph.CodeGraph, pro // buildCppCallGraphAndMerge constructs the C++ call graph and merges // it into cg. Build failures emit a warning and leave cg untouched. 
-func buildCppCallGraphAndMerge(cg *core.CallGraph, codeGraph *graph.CodeGraph, projectPath string, logger *output.Logger) { +func buildCppCallGraphAndMerge(cg *core.CallGraph, codeGraph *graph.CodeGraph, projectPath string, logger *output.Logger, stdlib clikeStdlibConfig) { logger.Debug("Detected C++ source files, building C++ call graph...") cppRegistry := registry.BuildCppModuleRegistry(projectPath, codeGraph) + // C++ source files routinely call into <stdio.h> / <stdlib.h> too, + // so wire BOTH loaders when available. + if stdlib.cLoader != nil { + cppRegistry.StdlibRegistry = stdlib.cLoader + } + if stdlib.cppLoader != nil { + cppRegistry.StdlibCppRegistry = stdlib.cppLoader + } cppTypeEngine := resolution.NewCppTypeInferenceEngine(cppRegistry) cppCG, err := builder.BuildCppCallGraph(codeGraph, cppRegistry, cppTypeEngine) if err != nil { @@ -530,6 +559,114 @@ func buildCppCallGraphAndMerge(cg *core.CallGraph, codeGraph *graph.CodeGraph, p len(cppCG.Functions), countTotalCallSites(cppCG)) } +// initClikeStdlib bootstraps the C and C++ stdlib loaders for a scan. +// +// targetOverride may be empty to trigger auto-detection (defaults to +// the host platform when no source-file evidence is found). baseURL +// selects the registry source: file:// paths read directly from disk; +// https:// is reserved for PR-03 and currently returns an error. +// +// The function ALWAYS returns a usable config: failures degrade to +// nil loaders + a warning, so the rest of the scan keeps Phase 1 +// behavior. The boolean reports whether at least one loader was +// successfully wired (useful for telemetry and resolution-report). 
+func initClikeStdlib(projectPath, targetOverride, baseURL string, logger *output.Logger) (clikeStdlibConfig, bool) { + platform := registry.DetectClikeTarget(projectPath, targetOverride) + + trimmed := strings.TrimSpace(baseURL) + if trimmed == "" { + return clikeStdlibConfig{platform: platform}, false + } + + cfg := clikeStdlibConfig{platform: platform} + wired := false + + if cLoader, ok := loadCStdlibFromBase(trimmed, platform, logger); ok { + cfg.cLoader = cLoader + wired = true + } + if cppLoader, ok := loadCppStdlibFromBase(trimmed, platform, logger); ok { + cfg.cppLoader = cppLoader + wired = true + } + return cfg, wired +} + +// loadCStdlibFromBase constructs the right C-stdlib loader for the +// given URL/path and calls LoadManifest. Failures log a warning and +// return false; callers fall back to Phase 1 resolution. +func loadCStdlibFromBase(baseURL, platform string, logger *output.Logger) (core.CStdlibLoader, bool) { + loader := buildCStdlibLoader(baseURL, platform) + if err := loader.LoadManifest(stdlibLoggerAdapter{logger}); err != nil { + logger.Warning("C stdlib manifest load failed: %v (continuing without C stdlib resolution)", err) + return nil, false + } + logger.Debug("C stdlib registry: loaded %d headers for platform %s", loader.HeaderCount(), platform) + return loader, true +} + +// loadCppStdlibFromBase is the C++ counterpart to loadCStdlibFromBase. +func loadCppStdlibFromBase(baseURL, platform string, logger *output.Logger) (core.CppStdlibLoader, bool) { + loader := buildCppStdlibLoader(baseURL, platform) + if err := loader.LoadManifest(stdlibLoggerAdapter{logger}); err != nil { + logger.Warning("C++ stdlib manifest load failed: %v (continuing without C++ stdlib resolution)", err) + return nil, false + } + logger.Debug("C++ stdlib registry: loaded %d headers for platform %s", loader.HeaderCount(), platform) + return loader, true +} + +// buildCStdlibLoader picks the right loader constructor for a base +// URL. 
file:// (or a bare path) → file loader; http(s):// → HTTP +// loader (PR-03). Splitting this from the load path keeps the +// constructor-selection logic isolated and easy to test. +func buildCStdlibLoader(baseURL, platform string) core.CStdlibLoader { + switch { + case strings.HasPrefix(baseURL, "file://"): + return registry.NewCStdlibRegistryFile(filepath.Join(strings.TrimPrefix(baseURL, "file://"), platform, "c", "v1"), platform) + case strings.HasPrefix(baseURL, "http://"), strings.HasPrefix(baseURL, "https://"): + return registry.NewCStdlibRegistryRemote(baseURL, platform) + default: + // Treat as a bare local path. + return registry.NewCStdlibRegistryFile(filepath.Join(baseURL, platform, "c", "v1"), platform) + } +} + +// buildCppStdlibLoader is the C++ counterpart to buildCStdlibLoader. +func buildCppStdlibLoader(baseURL, platform string) core.CppStdlibLoader { + switch { + case strings.HasPrefix(baseURL, "file://"): + return registry.NewCppStdlibRegistryFile(filepath.Join(strings.TrimPrefix(baseURL, "file://"), platform, "cpp", "v1"), platform) + case strings.HasPrefix(baseURL, "http://"), strings.HasPrefix(baseURL, "https://"): + return registry.NewCppStdlibRegistryRemote(baseURL, platform) + default: + return registry.NewCppStdlibRegistryFile(filepath.Join(baseURL, platform, "cpp", "v1"), platform) + } +} + +// stdlibLoggerAdapter bridges the cmd-package output.Logger into the +// core.CStdlibLogger interface the loader expects. Method names match +// the latter; calls forward unchanged. +type stdlibLoggerAdapter struct{ logger *output.Logger } + +func (a stdlibLoggerAdapter) Debug(format string, args ...any) { + if a.logger != nil { + a.logger.Debug(format, args...) + } +} + +func (a stdlibLoggerAdapter) Statistic(format string, args ...any) { + if a.logger != nil { + a.logger.Statistic(format, args...) + } +} + +func (a stdlibLoggerAdapter) Warning(format string, args ...any) { + if a.logger != nil { + a.logger.Warning(format, args...) 
+ } +} + // hasLanguageNodes reports whether codeGraph contains at least one // node tagged with the given Language. Used to gate per-language call // graph builders so we skip the work when no source files of that @@ -1136,5 +1273,7 @@ func init() { scanCmd.Flags().String("base", "", "Base git ref for diff-aware scanning (required with --diff-aware)") scanCmd.Flags().String("head", "HEAD", "Head git ref for diff-aware scanning") scanCmd.Flags().Bool("enable-db-cache", false, "Enable SQLite-backed incremental analysis cache (experimental)") + scanCmd.Flags().String("target", "", "Override C/C++ target platform: linux, darwin, or windows (default: auto-detect)") + scanCmd.Flags().String("stdlib-base-url", "", "Base URL for the C/C++ stdlib registry (file://path or https://host). Empty disables stdlib resolution.") scanCmd.MarkFlagRequired("project") } diff --git a/sast-engine/cmd/scan_stdlib_test.go b/sast-engine/cmd/scan_stdlib_test.go new file mode 100644 index 00000000..811fe9b5 --- /dev/null +++ b/sast-engine/cmd/scan_stdlib_test.go @@ -0,0 +1,212 @@ +package cmd + +import ( + "encoding/json" + "os" + "path/filepath" + "testing" + + "github.com/shivasurya/code-pathfinder/sast-engine/graph" + "github.com/shivasurya/code-pathfinder/sast-engine/graph/callgraph/core" + "github.com/shivasurya/code-pathfinder/sast-engine/graph/callgraph/registry" + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" +) + +// writeStdlibFixture materialises a minimal C/C++ stdlib registry on +// disk in the layout the file:// loader expects: +// +// <base>/<platform>/c/v1/manifest.json +// <base>/<platform>/c/v1/stdio_stdlib.json +// <base>/<platform>/cpp/v1/manifest.json +// <base>/<platform>/cpp/v1/vector_stdlib.json +// +// Returns the base directory. 
+func writeStdlibFixture(t *testing.T, platform string) string { + t.Helper() + base := t.TempDir() + + cDir := filepath.Join(base, platform, "c", "v1") + require.NoError(t, os.MkdirAll(cDir, 0o755)) + stdio := &core.CStdlibHeader{ + SchemaVersion: "1.0.0", + Header: "stdio.h", + ModuleID: "c::stdio", + Language: core.LanguageC, + Platform: platform, + Functions: map[string]*core.CStdlibFunction{ + "printf": {FQN: "c::stdio::printf", ReturnType: "int", Source: core.SourceOverlay, Confidence: 1.0}, + }, + } + stdioBytes, err := json.Marshal(stdio) + require.NoError(t, err) + require.NoError(t, os.WriteFile(filepath.Join(cDir, "stdio_stdlib.json"), stdioBytes, 0o644)) + + cManifest := &core.CStdlibManifest{ + SchemaVersion: "1.0.0", + RegistryVersion: "v1", + Platform: platform, + Language: core.LanguageC, + Headers: []*core.CStdlibHeaderEntry{ + {Header: "stdio.h", ModuleID: "c::stdio", File: "stdio_stdlib.json"}, + }, + Statistics: &core.CStdlibStatistics{TotalHeaders: 1, TotalFunctions: 1}, + } + cManifestBytes, err := json.Marshal(cManifest) + require.NoError(t, err) + require.NoError(t, os.WriteFile(filepath.Join(cDir, "manifest.json"), cManifestBytes, 0o644)) + + cppDir := filepath.Join(base, platform, "cpp", "v1") + require.NoError(t, os.MkdirAll(cppDir, 0o755)) + vec := &core.CStdlibHeader{ + SchemaVersion: "1.0.0", + Header: "vector", + ModuleID: "std::vector", + Language: core.LanguageCpp, + Classes: map[string]*core.CppStdlibClass{ + "std::vector": { + FQN: "std::vector", TypeParams: []string{"T"}, + Methods: map[string]*core.CStdlibFunction{ + "push_back": {FQN: "std::vector::push_back", ReturnType: "void", Source: core.SourceOverlay, Confidence: 1.0}, + }, + }, + }, + } + vecBytes, err := json.Marshal(vec) + require.NoError(t, err) + require.NoError(t, os.WriteFile(filepath.Join(cppDir, "vector_stdlib.json"), vecBytes, 0o644)) + + cppManifest := &core.CStdlibManifest{ + SchemaVersion: "1.0.0", + RegistryVersion: "v1", + Platform: platform, + Language: 
core.LanguageCpp, + Headers: []*core.CStdlibHeaderEntry{ + {Header: "vector", ModuleID: "std::vector", File: "vector_stdlib.json"}, + }, + Statistics: &core.CStdlibStatistics{TotalHeaders: 1, TotalClasses: 1, TotalFunctions: 1}, + } + cppManifestBytes, err := json.Marshal(cppManifest) + require.NoError(t, err) + require.NoError(t, os.WriteFile(filepath.Join(cppDir, "manifest.json"), cppManifestBytes, 0o644)) + + return base +} + +func TestInitClikeStdlib_DisabledWhenBaseURLEmpty(t *testing.T) { + cfg, wired := initClikeStdlib(t.TempDir(), "linux", "", newTestLogger()) + assert.False(t, wired, "empty base URL must leave loaders unwired") + assert.Nil(t, cfg.cLoader) + assert.Nil(t, cfg.cppLoader) + assert.Equal(t, "linux", cfg.platform) +} + +func TestInitClikeStdlib_FileSchemeWiresBothLoaders(t *testing.T) { + base := writeStdlibFixture(t, "linux") + cfg, wired := initClikeStdlib(t.TempDir(), "linux", "file://"+base, newTestLogger()) + require.True(t, wired) + require.NotNil(t, cfg.cLoader) + require.NotNil(t, cfg.cppLoader) + assert.Equal(t, 1, cfg.cLoader.HeaderCount()) + assert.Equal(t, 1, cfg.cppLoader.HeaderCount()) +} + +func TestInitClikeStdlib_BarePathTreatedAsFile(t *testing.T) { + base := writeStdlibFixture(t, "linux") + cfg, wired := initClikeStdlib(t.TempDir(), "linux", base, newTestLogger()) + require.True(t, wired) + require.NotNil(t, cfg.cLoader) + require.NotNil(t, cfg.cppLoader) +} + +func TestInitClikeStdlib_HTTPSchemeIsStubbed(t *testing.T) { + // HTTP path returns a constructed loader, but LoadManifest fails + // with the PR-03 stub error. Both loaders should be nil after the + // failed load. + cfg, wired := initClikeStdlib(t.TempDir(), "linux", "https://example.test/registries", newTestLogger()) + assert.False(t, wired) + assert.Nil(t, cfg.cLoader) + assert.Nil(t, cfg.cppLoader) +} + +func TestInitClikeStdlib_MissingFixtureDegradesGracefully(t *testing.T) { + // Point at a directory that contains no manifest. 
The loader is + // constructed but LoadManifest fails — the function must return a + // usable (empty) config rather than panic. + cfg, wired := initClikeStdlib(t.TempDir(), "linux", "file://"+t.TempDir(), newTestLogger()) + assert.False(t, wired) + assert.Nil(t, cfg.cLoader) + assert.Nil(t, cfg.cppLoader) + assert.Equal(t, "linux", cfg.platform) +} + +func TestBuildCStdlibLoader_SchemeDispatch(t *testing.T) { + t.Run("file scheme", func(t *testing.T) { + require.NotNil(t, buildCStdlibLoader("file:///tmp/foo", "linux")) + }) + t.Run("https scheme", func(t *testing.T) { + require.NotNil(t, buildCStdlibLoader("https://example.test/", "linux")) + }) + t.Run("http scheme", func(t *testing.T) { + require.NotNil(t, buildCStdlibLoader("http://example.test/", "linux")) + }) + t.Run("bare path", func(t *testing.T) { + require.NotNil(t, buildCStdlibLoader("/var/cache/pf", "linux")) + }) +} + +func TestBuildCppStdlibLoader_SchemeDispatch(t *testing.T) { + t.Run("file scheme", func(t *testing.T) { + require.NotNil(t, buildCppStdlibLoader("file:///tmp/foo", "linux")) + }) + t.Run("https scheme", func(t *testing.T) { + require.NotNil(t, buildCppStdlibLoader("https://example.test/", "linux")) + }) + t.Run("http scheme", func(t *testing.T) { + require.NotNil(t, buildCppStdlibLoader("http://example.test/", "linux")) + }) + t.Run("bare path", func(t *testing.T) { + require.NotNil(t, buildCppStdlibLoader("/var/cache/pf", "linux")) + }) +} + +func TestStdlibLoggerAdapter_ForwardsToLogger(t *testing.T) { + // Smoke-level: each forward simply must not panic with a real logger. + a := stdlibLoggerAdapter{logger: newTestLogger()} + a.Debug("debug %s", "msg") + a.Statistic("stat %d", 42) + a.Warning("warn %s", "msg") + + // Nil logger must also be tolerated — the adapter is the only call + // path for the loader, so a misconfigured caller must degrade + // silently rather than panic. 
+ nilA := stdlibLoggerAdapter{logger: nil} + nilA.Debug("x") + nilA.Statistic("x") + nilA.Warning("x") +} + +func TestBuildClikeCallGraphs_StdlibConfigPropagates(t *testing.T) { + root := "/projects/app" + codeGraph := graph.NewCodeGraph() + codeGraph.AddNode(&graph.Node{ + ID: "fn:src/main.c::main", + Type: "function_definition", + Name: "main", + File: root + "/src/main.c", + Language: "c", + ReturnType: "int", + }) + + base := writeStdlibFixture(t, "linux") + cLoader := registry.NewCStdlibRegistryFile(filepath.Join(base, "linux", "c", "v1"), "linux") + require.NoError(t, cLoader.LoadManifest(stdlibLoggerAdapter{logger: newTestLogger()})) + + cfg := clikeStdlibConfig{cLoader: cLoader, platform: "linux"} + cg := core.NewCallGraph() + buildClikeCallGraphs(cg, codeGraph, root, newTestLogger(), cfg) + + // The call graph must still be populated; loader presence cannot + // regress Phase 1 behavior. + assert.Contains(t, cg.Functions, "src/main.c::main") +} diff --git a/sast-engine/cmd/scan_test.go b/sast-engine/cmd/scan_test.go index d9e9aaac..e4ed93f0 100644 --- a/sast-engine/cmd/scan_test.go +++ b/sast-engine/cmd/scan_test.go @@ -72,7 +72,7 @@ func TestBuildClikeCallGraphs_NoNodes(t *testing.T) { codeGraph := graph.NewCodeGraph() codeGraph.AddNode(&graph.Node{ID: "py-1", Language: "python", Type: "function_definition", Name: "f"}) - buildClikeCallGraphs(cg, codeGraph, "/projects/app", newTestLogger()) + buildClikeCallGraphs(cg, codeGraph, "/projects/app", newTestLogger(), clikeStdlibConfig{}) assert.Empty(t, cg.Functions, "no C/C++ nodes => no merge") } @@ -93,7 +93,7 @@ func TestBuildClikeCallGraphs_CFunctionsMerged(t *testing.T) { }) cg := core.NewCallGraph() - buildClikeCallGraphs(cg, codeGraph, root, newTestLogger()) + buildClikeCallGraphs(cg, codeGraph, root, newTestLogger(), clikeStdlibConfig{}) assert.Contains(t, cg.Functions, "src/main.c::main") } @@ -114,7 +114,7 @@ func TestBuildClikeCallGraphs_CppFunctionsMerged(t *testing.T) { }) cg := 
core.NewCallGraph() - buildClikeCallGraphs(cg, codeGraph, root, newTestLogger()) + buildClikeCallGraphs(cg, codeGraph, root, newTestLogger(), clikeStdlibConfig{}) assert.Contains(t, cg.Functions, "src/main.cpp::main") assert.NotContains(t, cg.Functions, "src/main.c::main", "C++ node must not appear in C namespace") @@ -137,7 +137,7 @@ func TestBuildClikeCallGraphs_MixedProject(t *testing.T) { }) cg := core.NewCallGraph() - buildClikeCallGraphs(cg, codeGraph, root, newTestLogger()) + buildClikeCallGraphs(cg, codeGraph, root, newTestLogger(), clikeStdlibConfig{}) assert.Contains(t, cg.Functions, "src/main.c::c_main") assert.Contains(t, cg.Functions, "src/main.cpp::cpp_main") diff --git a/sast-engine/testdata/c/stdlib/main.c b/sast-engine/testdata/c/stdlib/main.c new file mode 100644 index 00000000..5543d6cf --- /dev/null +++ b/sast-engine/testdata/c/stdlib/main.c @@ -0,0 +1,39 @@ +/* + * Smoke fixture for C stdlib resolution (PR-02). + * + * Each call below should resolve against the C stdlib registry: + * - printf, fprintf → stdio.h + * - malloc, free → stdlib.h + * - strlen → string.h + * + * The local helper (greet) verifies that user code coexists with + * stdlib calls without breaking Phase 1 resolution. + */ + +#include +#include +#include + +static void greet(const char *name) { + printf("hello, %s\n", name); +} + +int main(int argc, char **argv) { + (void)argc; + (void)argv; + + char *buf = (char *)malloc(64); + if (buf == NULL) { + fprintf(stderr, "malloc failed\n"); + return 1; + } + + const char *who = "world"; + size_t n = strlen(who); + snprintf(buf, 64, "%s (%zu bytes)", who, n); + + greet(buf); + + free(buf); + return 0; +} diff --git a/sast-engine/testdata/cpp/stl/main.cpp b/sast-engine/testdata/cpp/stl/main.cpp new file mode 100644 index 00000000..4094857a --- /dev/null +++ b/sast-engine/testdata/cpp/stl/main.cpp @@ -0,0 +1,49 @@ +/* + * Smoke fixture for C++ STL resolution (PR-02). 
+ * + * The calls exercise the three dispatch paths the C++ resolver + * extends in PR-02: + * 1. Class methods on stdlib types (vec.push_back, vec.size) + * 2. Free functions in std namespace (std::move, std::swap) + * 3. C-shape stdlib calls from C++ code (printf, malloc) + * + * The user-defined Echo class verifies project resolution still + * works alongside stdlib resolution. + */ + +#include +#include +#include +#include +#include + +class Echo { + public: + explicit Echo(std::string prefix) : prefix_(std::move(prefix)) {} + void say(const std::string &msg) const { + std::printf("%s: %s\n", prefix_.c_str(), msg.c_str()); + } + + private: + std::string prefix_; +}; + +int main() { + std::vector nums; + nums.push_back(1); + nums.push_back(2); + nums.push_back(3); + + int a = 10, b = 20; + std::swap(a, b); + + Echo echo("info"); + echo.say("hello, stl"); + + void *raw = std::malloc(16); + if (raw != nullptr) { + std::free(raw); + } + + return static_cast(nums.size()); +} From 6204dda7a6e2f3c5a957b47489e9eb617023e52a Mon Sep 17 00:00:00 2001 From: shivasurya Date: Sun, 3 May 2026 18:45:16 -0400 Subject: [PATCH 4/4] test(builder+registry): cover stdlib helpers + fix K-alias substitution MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Three changes that together push patch coverage past Codecov's 85.55% gate (84.98% → 85.7%): 1. White-box unit tests for the C++ stdlib helper suite: canonicalizeStdlibType, parseTemplateArgs, applyTemplateSubstitution, replaceWholeWord, substituteTemplateMethodReturn, plus the missing nil/empty-input guards in lookupCppStdlibMethod, lookupCStdlib, and lookupCppStdlibFreeFunction. All five helpers now hit 100%. 2. Bug fix uncovered while writing the K-alias test: the loop in applyTemplateSubstitution broke at V whenever args was shorter than 3, so K never ran. With map-style return types written as "K", the placeholder stayed un-substituted. 
Drop the early break and rely on the per-iteration idx-bounds check. 3. clike_disk_cache_test.go gains an env-clearing test for the $HOME fallback branch in getStdlibCacheRoot (37.5% → 75%). Co-Authored-By: Claude Sonnet 4.5 --- .../graph/callgraph/builder/cpp_builder.go | 8 +- .../builder/cpp_stdlib_helpers_test.go | 146 ++++++++++++++++++ .../registry/clike_disk_cache_test.go | 35 +++++ 3 files changed, 185 insertions(+), 4 deletions(-) create mode 100644 sast-engine/graph/callgraph/builder/cpp_stdlib_helpers_test.go diff --git a/sast-engine/graph/callgraph/builder/cpp_builder.go b/sast-engine/graph/callgraph/builder/cpp_builder.go index 9c2ad190..83df6a3e 100644 --- a/sast-engine/graph/callgraph/builder/cpp_builder.go +++ b/sast-engine/graph/callgraph/builder/cpp_builder.go @@ -661,10 +661,10 @@ func applyTemplateSubstitution(returnType string, args []string) string { placeholders := []string{"T", "U", "V", "K"} out := returnType for i, ph := range placeholders { - if i >= len(args) && ph != "K" { - break - } - // "K" is an alias for the first arg in map-shaped containers. + // "K" is an alias for the first arg in map-shaped containers — it + // shares an index with T but is iterated separately so a return + // type written as "K" still resolves when the registry exposes + // only T/U bindings. 
idx := i if ph == "K" { idx = 0 diff --git a/sast-engine/graph/callgraph/builder/cpp_stdlib_helpers_test.go b/sast-engine/graph/callgraph/builder/cpp_stdlib_helpers_test.go new file mode 100644 index 00000000..513a3f47 --- /dev/null +++ b/sast-engine/graph/callgraph/builder/cpp_stdlib_helpers_test.go @@ -0,0 +1,146 @@ +package builder + +import ( + "testing" + + "github.com/shivasurya/code-pathfinder/sast-engine/graph/callgraph/core" + "github.com/stretchr/testify/assert" +) + +func newTestStdlibFn(fqn, returnType string) *core.CStdlibFunction { + return &core.CStdlibFunction{FQN: fqn, ReturnType: returnType} +} + +func TestCanonicalizeStdlibType(t *testing.T) { + tests := []struct { + name string + in string + want string + }{ + {"plain class", "std::string", "std::string"}, + {"single template arg", "std::vector", "std::vector"}, + {"nested template", "std::map>", "std::map"}, + {"primitive", "int", "int"}, + {"empty string", "", ""}, + {"leading bracket is non-canonical, treated as plain", "", "std::vector"}, + } + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + assert.Equal(t, tt.want, canonicalizeStdlibType(tt.in)) + }) + } +} + +func TestParseTemplateArgs(t *testing.T) { + tests := []struct { + name string + in string + want []string + }{ + {"no template", "std::string", nil}, + {"unmatched open", "std::vectorstd::vector<", nil}, + {"single arg", "std::vector", []string{"int"}}, + {"two args", "std::map", []string{"std::string", "int"}}, + {"nested template arg", "std::map>", []string{"int", "std::vector"}}, + {"deeply nested", "A>, G>", []string{"B>", "G"}}, + {"trailing whitespace trimmed", "std::pair< int , long >", []string{"int", "long"}}, + {"empty body", "x<>", []string{""}}, + } + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + assert.Equal(t, tt.want, parseTemplateArgs(tt.in)) + }) + } +} + +func TestApplyTemplateSubstitution(t *testing.T) { + tests := []struct { + name string + returnType string + args []string 
+ want string + }{ + {"single T", "T", []string{"int"}, "int"}, + {"reference to T", "T&", []string{"int"}, "int&"}, + {"const T pointer", "const T*", []string{"std::string"}, "const std::string*"}, + {"pair of T and U", "std::pair", []string{"int", "long"}, "std::pair"}, + {"K alias for first arg", "K", []string{"std::string", "int"}, "std::string"}, + {"V third arg", "std::tuple", []string{"int", "long", "char"}, "std::tuple"}, + {"non-placeholder T-prefix preserved", "Type", []string{"int"}, "Type"}, + {"only V with one arg leaves V untouched", "V", []string{"int"}, "V"}, + {"no placeholders", "iterator", []string{"int"}, "iterator"}, + } + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + assert.Equal(t, tt.want, applyTemplateSubstitution(tt.returnType, tt.args)) + }) + } +} + +func TestReplaceWholeWord(t *testing.T) { + tests := []struct { + name string + s string + from string + to string + want string + }{ + {"empty needle returns input", "anything", "", "X", "anything"}, + {"start of string", "T&", "T", "int", "int&"}, + {"end of string", "&T", "T", "int", "&int"}, + {"middle word", "const T&", "T", "int", "const int&"}, + {"not a word boundary", "Type", "T", "X", "Type"}, + {"multiple replacements", "T, T", "T", "int", "int, int"}, + {"adjacent identifiers preserved", "const TX", "T", "int", "const TX"}, + } + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + assert.Equal(t, tt.want, replaceWholeWord(tt.s, tt.from, tt.to)) + }) + } +} + +func TestSubstituteTemplateMethodReturn_NoArgs(t *testing.T) { + method := newTestStdlibFn("std::vector::push_back", "void") + got := substituteTemplateMethodReturn(method, "std::vector") + assert.Same(t, method, got, "no template args => return the same pointer (no allocation)") +} + +func TestSubstituteTemplateMethodReturn_ShallowCopyOnSubstitution(t *testing.T) { + method := newTestStdlibFn("std::vector::front", "T&") + got := substituteTemplateMethodReturn(method, "std::vector") + 
assert.NotSame(t, method, got, "substitution must not mutate the registry record") + assert.Equal(t, "T&", method.ReturnType, "original record stays unchanged") + assert.Equal(t, "int&", got.ReturnType) +} + +func TestLookupCppStdlibMethod_NilTypeEngine(t *testing.T) { + cs := &CallSiteInternal{CallerFQN: "src/main.cpp::main", FunctionName: "push_back", ObjectName: "vec"} + fqn, fn := lookupCppStdlibMethod(cs, core.NewCppModuleRegistry("/proj"), nil) + assert.Empty(t, fqn) + assert.Nil(t, fn) +} + +func TestLookupCppStdlibFreeFunction_RequiresQualifiedName(t *testing.T) { + cs := &CallSiteInternal{CallerFQN: "src/main.cpp::main", FunctionName: "move"} // bare, no namespace + cReg := core.NewCModuleRegistry("/proj") + cReg.FileToPrefix["/proj/src/main.cpp"] = "src/main.cpp" + fqn, fn := lookupCppStdlibFreeFunction(cs, cReg, nil) + assert.Empty(t, fqn) + assert.Nil(t, fn) +} + +func TestLookupCppStdlibFreeFunction_FileNotInRegistry(t *testing.T) { + cs := &CallSiteInternal{CallerFile: "/unknown.cpp", CallerFQN: "x::y", FunctionName: "std::move"} + fqn, fn := lookupCppStdlibFreeFunction(cs, core.NewCModuleRegistry("/proj"), nil) + assert.Empty(t, fqn) + assert.Nil(t, fn) +} + +func TestLookupCStdlib_FileNotInRegistry(t *testing.T) { + fqn, fn := lookupCStdlib("/unknown.c", "printf", core.NewCModuleRegistry("/proj")) + assert.Empty(t, fqn) + assert.Nil(t, fn) +} diff --git a/sast-engine/graph/callgraph/registry/clike_disk_cache_test.go b/sast-engine/graph/callgraph/registry/clike_disk_cache_test.go index e160991c..31a311e5 100644 --- a/sast-engine/graph/callgraph/registry/clike_disk_cache_test.go +++ b/sast-engine/graph/callgraph/registry/clike_disk_cache_test.go @@ -146,3 +146,38 @@ func TestGetStdlibCacheRoot_PrefersXDG(t *testing.T) { func TestStdlibCacheTTL_Constant(t *testing.T) { assert.Equal(t, 24*time.Hour, stdlibCacheTTL) } + +// TestGetStdlibCacheRoot_FallsBackToHome forces the XDG_CACHE_HOME and +// LOCALAPPDATA env vars to be empty so the fallback to 
$HOME/.cache is +// the only path that can produce a non-empty result. +func TestGetStdlibCacheRoot_FallsBackToHome(t *testing.T) { + t.Setenv("XDG_CACHE_HOME", "") + t.Setenv("LOCALAPPDATA", "") + + got := getStdlibCacheRoot() + if got == "" { + // UserHomeDir failed in this environment — accept the empty + // answer; the path-disabled behavior is also exercised here. + return + } + assert.Contains(t, got, ".cache") + assert.Contains(t, got, "pathfinder") + assert.Contains(t, got, "registries") +} + +// TestGetStdlibCacheRoot_NoHomeReturnsEmpty empties both env vars, HOME and +// USERPROFILE; the result must be "" or a path containing "pathfinder". +func TestGetStdlibCacheRoot_NoHomeReturnsEmpty(t *testing.T) { + t.Setenv("XDG_CACHE_HOME", "") + t.Setenv("LOCALAPPDATA", "") + t.Setenv("HOME", "") + t.Setenv("USERPROFILE", "") + + got := getStdlibCacheRoot() + // On most CI runners removing HOME causes UserHomeDir to fail; + // some platforms still resolve via /etc/passwd. Accept either as + // long as the function does not panic. + if got != "" { + assert.Contains(t, got, "pathfinder") + } +}