Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 8 additions & 0 deletions sast-engine/graph/callgraph/builder/c_builder_stdlib_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,14 @@ func (f *fakeCStdlibLoader) Platform() string { return "linux" }

// HeaderCount reports the number of headers registered in the fake loader.
func (f *fakeCStdlibLoader) HeaderCount() int {
	return len(f.headers)
}

// ListHeaders returns the name of every header held by the fake loader as
// a freshly allocated slice. The names are collected by ranging over
// f.headers, so the order of the result is not stable between calls.
func (f *fakeCStdlibLoader) ListHeaders() []string {
	names := make([]string, len(f.headers))
	i := 0
	for name := range f.headers {
		names[i] = name
		i++
	}
	return names
}

// TestBuildCCallGraph_StdlibFallback verifies that an unresolved call
// falls through to the stdlib registry and emits an enriched CallSite
// (TargetFQN, return type, confidence, security tag).
Expand Down
41 changes: 34 additions & 7 deletions sast-engine/graph/callgraph/builder/cpp_builder.go
Original file line number Diff line number Diff line change
Expand Up @@ -551,21 +551,48 @@ func lookupCppStdlibMethod(
// ("std::move") for the registry's GetFreeFunction to succeed — bare
// names like "move" without the namespace are a different lookup path
// and stay out of scope here.
//
// The lookup happens in two stages:
//
// 1. Walk the caller file's direct system includes. This is the fast
// path and catches the common case where the calling file
// `#include <utility>` itself.
// 2. If the direct walk doesn't yield a hit, fall back to scanning
// every header in the manifest. Real-world C++ code routinely
// relies on transitive includes (a header pulling in another that
// pulls in <utility>), so without this fallback std::move /
// std::forward / std::swap fail to resolve in the majority of
// calling files.
//
// The fallback is bounded to fully-qualified names (containing "::") so
// unqualified project-internal symbols can never accidentally bind to a
// stdlib entry. Performance is fine: the scan is O(headers) per
// unresolved namespaced call, ~120 headers on libstdc++; first hit wins.
func lookupCppStdlibFreeFunction(
cs *CallSiteInternal,
cReg *core.CModuleRegistry,
cppLoader core.CppStdlibLoader,
) (string, *core.CStdlibFunction) {
if !strings.Contains(cs.FunctionName, fqnSeparator) {
if !strings.Contains(cs.FunctionName, fqnSeparator) || cppLoader == nil {
return "", nil
}
prefix, ok := cReg.FileToPrefix[cs.CallerFile]
if !ok {
return "", nil

// Stage 1: direct system includes from the caller file.
if prefix, ok := cReg.FileToPrefix[cs.CallerFile]; ok {
for _, header := range cReg.SystemIncludes[prefix] {
if fn, err := cppLoader.GetFreeFunction(header, cs.FunctionName); err == nil && fn != nil {
return fn.FQN, fn
}
}
}
for _, header := range cReg.SystemIncludes[prefix] {
fn, err := cppLoader.GetFreeFunction(header, cs.FunctionName)
if err == nil && fn != nil {

// Stage 2: transitive-include fallback. Walk every manifest header.
// We're trading a linear O(headers) scan per unresolved namespaced
// call for the ability to resolve symbols pulled in transitively. First
// hit wins; stdlib FQNs are unique across headers so order doesn't
// affect correctness (only speed-of-first-hit).
for _, header := range cppLoader.ListHeaders() {
if fn, err := cppLoader.GetFreeFunction(header, cs.FunctionName); err == nil && fn != nil {
return fn.FQN, fn
}
}
Expand Down
41 changes: 41 additions & 0 deletions sast-engine/graph/callgraph/builder/cpp_builder_stdlib_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -79,6 +79,14 @@ func (f *fakeCppStdlibLoader) GetFreeFunction(headerName, fqn string) (*core.CSt
// Platform always reports "linux" for the fake C++ loader.
func (f *fakeCppStdlibLoader) Platform() string {
	return "linux"
}
// HeaderCount reports the number of headers registered in the fake loader.
func (f *fakeCppStdlibLoader) HeaderCount() int {
	return len(f.headers)
}

// ListHeaders returns the name of every header held by the fake loader as
// a freshly allocated slice. The names are collected by ranging over
// f.headers, so the order of the result is not stable between calls.
func (f *fakeCppStdlibLoader) ListHeaders() []string {
	names := make([]string, len(f.headers))
	i := 0
	for name := range f.headers {
		names[i] = name
		i++
	}
	return names
}

// TestBuildCppCallGraph_StdlibClassMethod resolves `vec.push_back(...)`
// against the C++ stdlib registry. The receiver type comes from the
// type engine; the resolver canonicalises std::vector<int> → std::vector
Expand Down Expand Up @@ -151,6 +159,39 @@ func TestBuildCppCallGraph_StdlibClassMethod_TemplateSubstitution(t *testing.T)
assert.Equal(t, "int&", sites[0].InferredType, "T must be replaced with the concrete template argument")
}

// TestBuildCppCallGraph_StdlibFreeFunction_TransitiveFallback exercises the
// PR-04 transitive-include fallback. The calling file invokes `std::move`
// but has no SystemIncludes entry at all; the call must still resolve
// because the resolver scans every manifest header once the direct
// include list fails to produce a hit.
func TestBuildCppCallGraph_StdlibFreeFunction_TransitiveFallback(t *testing.T) {
	mainCpp := cppFixtureRoot + "/src/main.cpp"

	f := newCppFixture(t)
	caller := f.addFreeFunction(t, mainCpp, "src/main.cpp", "", "main", "int")
	f.addCall(t, caller, "std::move", "")

	// Deliberately no SystemIncludes entry for src/main.cpp: the file
	// receives std::move through a transitive include that is invisible
	// at the CodeGraph level. Before PR-04 this call stayed unresolved.
	stdMove := &core.CStdlibFunction{
		FQN:        "std::move",
		ReturnType: "T&&",
		Source:     core.SourceOverlay,
		Confidence: 1.0,
	}
	utility := &core.CStdlibHeader{
		Header:        "utility",
		FreeFunctions: map[string]*core.CStdlibFunction{"std::move": stdMove},
	}
	f.registry.StdlibCppRegistry = newFakeCppStdlibLoader(map[string]*core.CStdlibHeader{
		"utility": utility,
	})

	cg := f.build(t)

	sites := cg.CallSites["src/main.cpp::main"]
	require.Len(t, sites, 1)
	assert.True(t, sites[0].Resolved, "transitive-include fallback must resolve std::move")
	assert.Equal(t, "std::move", sites[0].TargetFQN)
}

// TestBuildCppCallGraph_StdlibFreeFunction handles `std::move(x)` —
// a namespaced free function looked up via GetFreeFunction.
func TestBuildCppCallGraph_StdlibFreeFunction(t *testing.T) {
Expand Down
8 changes: 8 additions & 0 deletions sast-engine/graph/callgraph/core/clike_stdlib_types.go
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,14 @@ type CStdlibLoader interface {
GetFunction(headerName, funcName string) (*CStdlibFunction, error)
Platform() string
HeaderCount() int
// ListHeaders returns every header name in the loaded manifest in
// deterministic order. The resolver uses this to fall back to a
// global manifest scan when a call site doesn't directly #include
// the header that owns the symbol — common with C++ codebases that
// rely on transitive includes (vector pulling in utility, etc).
//
// Returns nil when LoadManifest has not been called.
ListHeaders() []string
}

// CppStdlibLoader extends CStdlibLoader with C++-specific accessors. Free
Expand Down
21 changes: 21 additions & 0 deletions sast-engine/graph/callgraph/registry/c_stdlib_remote.go
Original file line number Diff line number Diff line change
Expand Up @@ -309,6 +309,27 @@ func (r *CStdlibRegistryRemote) HeaderCount() int {
return len(r.manifest.Headers)
}

// ListHeaders reports the name of every header in the loaded manifest, in
// the order the manifest lists them. The resolver's transitive-include
// fallback calls this to scan all stdlib headers when the caller's direct
// #include list produces no hit.
//
// The manifest is read under the cacheMutex read lock. The result is nil
// when no manifest has been loaded; otherwise it is a newly allocated
// slice, so callers may sort or filter it without touching loader state.
func (r *CStdlibRegistryRemote) ListHeaders() []string {
	r.cacheMutex.RLock()
	defer r.cacheMutex.RUnlock()

	manifest := r.manifest
	if manifest == nil {
		return nil
	}

	names := make([]string, 0, len(manifest.Headers))
	for _, entry := range manifest.Headers {
		names = append(names, entry.Header)
	}
	return names
}

// Compile-time interface checks — fail at build time if the struct ever
// drifts from the contract.
var _ core.CStdlibLoader = (*CStdlibRegistryRemote)(nil)
24 changes: 24 additions & 0 deletions sast-engine/graph/callgraph/registry/c_stdlib_remote_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -254,3 +254,27 @@ func TestCStdlibRegistry_ImplementsInterface(t *testing.T) {
var _ core.CStdlibLoader = NewCStdlibRegistryFile(t.TempDir(), core.PlatformLinux)
var _ core.CStdlibLoader = NewCStdlibRegistryRemote("https://x", core.PlatformLinux)
}

// TestCStdlibRegistry_ListHeaders verifies the manifest-wide enumerator
// the resolver uses for transitive-include fallback. It checks that every
// manifest header is reported (membership only — ordering is not asserted
// here) and that each call hands back a fresh slice, so callers may
// sort/filter without mutating loader state.
func TestCStdlibRegistry_ListHeaders(t *testing.T) {
	dir := t.TempDir()
	writeCRegistry(t, dir)
	r := NewCStdlibRegistryFile(dir, core.PlatformLinux)
	require.NoError(t, r.LoadManifest(noopLogger{}))

	want := []string{"stdio.h", "stdlib.h"}

	headers := r.ListHeaders()
	// require, not assert: the write to headers[0] below would panic on an
	// empty result and mask the real failure behind an index-out-of-range.
	require.ElementsMatch(t, want, headers)

	// Mutating the returned slice must not affect a subsequent call.
	// Comparing the full contents (rather than only index 0) catches
	// aliasing regardless of where the tampered element would surface.
	headers[0] = "tampered"
	again := r.ListHeaders()
	assert.ElementsMatch(t, want, again, "ListHeaders must return an independent slice")
}

// TestCStdlibRegistry_ListHeaders_BeforeLoad checks that ListHeaders yields
// nil on a registry whose manifest has never been loaded.
func TestCStdlibRegistry_ListHeaders_BeforeLoad(t *testing.T) {
	unloaded := NewCStdlibRegistryFile(t.TempDir(), core.PlatformLinux)
	headers := unloaded.ListHeaders()
	assert.Nil(t, headers)
}
29 changes: 29 additions & 0 deletions sast-engine/graph/callgraph/registry/cpp_stdlib_remote.go
Original file line number Diff line number Diff line change
Expand Up @@ -286,6 +286,15 @@ func (r *CppStdlibRegistryRemote) GetMethod(headerName, classFQN, methodName str
// GetFreeFunction looks up a namespaced free function by its full FQN
// (e.g. "std::move", "std::swap"). Distinct from GetFunction so the resolver
// can be explicit about which form it expects.
//
// Looks in both `free_functions` and `functions` because the PR-01
// generator currently emits some overlay-only namespaced symbols
// (notably std::move, std::forward, std::swap on certain headers) into
// `functions` rather than `free_functions`. A resolver that consulted
// only one map would miss those entries even though they're present in
// the manifest. A future cleanup pass on the generator could canonicalise
// the placement, but the resolver-side fallback is a strict superset and
// stays correct either way.
func (r *CppStdlibRegistryRemote) GetFreeFunction(headerName, fqn string) (*core.CStdlibFunction, error) {
h, err := r.GetHeader(headerName)
if err != nil {
Expand All @@ -294,6 +303,9 @@ func (r *CppStdlibRegistryRemote) GetFreeFunction(headerName, fqn string) (*core
if f, ok := h.FreeFunctions[fqn]; ok {
return f, nil
}
if f, ok := h.Functions[fqn]; ok {
return f, nil
}
return nil, fmt.Errorf("GetFreeFunction: %q not in header %q", fqn, headerName)
}

Expand All @@ -312,4 +324,21 @@ func (r *CppStdlibRegistryRemote) HeaderCount() int {
return len(r.manifest.Headers)
}

// ListHeaders reports the name of every header in the loaded manifest. The
// resolver relies on it for transitive-include fallback: when a namespaced
// std::* call appears in a file that does not directly #include the owning
// header, every listed header is scanned instead.
//
// The manifest is read under the cacheMutex read lock. The result is nil
// when no manifest has been loaded; otherwise it is a newly allocated slice.
func (r *CppStdlibRegistryRemote) ListHeaders() []string {
	r.cacheMutex.RLock()
	defer r.cacheMutex.RUnlock()

	manifest := r.manifest
	if manifest == nil {
		return nil
	}

	names := make([]string, 0, len(manifest.Headers))
	for _, entry := range manifest.Headers {
		names = append(names, entry.Header)
	}
	return names
}

var _ core.CppStdlibLoader = (*CppStdlibRegistryRemote)(nil)
54 changes: 54 additions & 0 deletions sast-engine/graph/callgraph/registry/cpp_stdlib_remote_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -138,6 +138,38 @@ func TestCppStdlibRegistry_GetFunctionFreeFunctionFallback(t *testing.T) {
assert.Equal(t, "std::swap", got.FQN)
}

// TestCppStdlibRegistry_GetFreeFunction_FunctionsMapFallback pins the PR-04
// fix: GetFreeFunction must still find a namespaced symbol that the
// generator stored under `functions` instead of `free_functions` (the
// current PR-01 behaviour for std::move and std::forward).
//
// The generator output is simulated by editing the cached header in place
// rather than by changing writeCppRegistry, so the shared fixture stays
// stable for the other tests.
func TestCppStdlibRegistry_GetFreeFunction_FunctionsMapFallback(t *testing.T) {
	dir := t.TempDir()
	writeCppRegistry(t, dir)
	reg := NewCppStdlibRegistryFile(dir, core.PlatformLinux)
	require.NoError(t, reg.LoadManifest(noopLogger{}))

	// Fetch the header once so it lands in the cache, then relocate
	// std::move from FreeFunctions to Functions — the placement the
	// generator currently produces in the wild.
	utility, err := reg.GetHeader("utility")
	require.NoError(t, err)

	const fqn = "std::move"
	entry := utility.FreeFunctions[fqn]
	require.NotNil(t, entry)
	delete(utility.FreeFunctions, fqn)
	if utility.Functions == nil {
		utility.Functions = make(map[string]*core.CStdlibFunction)
	}
	utility.Functions[fqn] = entry

	resolved, err := reg.GetFreeFunction("utility", fqn)
	require.NoError(t, err)
	assert.Equal(t, "T&&", resolved.ReturnType)
}

func TestCppStdlibRegistry_GetFunctionMissing(t *testing.T) {
dir := t.TempDir()
writeCppRegistry(t, dir)
Expand Down Expand Up @@ -223,6 +255,28 @@ func TestCppStdlibRegistry_RemoteCtorTrimsSlash(t *testing.T) {
assert.Equal(t, "https://x/registries", r.baseURL)
}

// TestCppStdlibRegistry_ListHeaders confirms the C++ loader exposes the
// manifest's header list for transitive-include fallback. It checks that
// every manifest header is reported (membership only — ordering is not
// asserted here) and that each call hands back a fresh slice.
func TestCppStdlibRegistry_ListHeaders(t *testing.T) {
	dir := t.TempDir()
	writeCppRegistry(t, dir)
	r := NewCppStdlibRegistryFile(dir, core.PlatformLinux)
	require.NoError(t, r.LoadManifest(noopLogger{}))

	want := []string{"vector", "utility"}

	headers := r.ListHeaders()
	// require, not assert: the write to headers[0] below would panic on an
	// empty result and mask the real failure behind an index-out-of-range.
	require.ElementsMatch(t, want, headers)

	// Mutating the returned slice must not leak into a later call. The
	// whole contents are compared so aliasing is caught at any index.
	headers[0] = "tampered"
	again := r.ListHeaders()
	assert.ElementsMatch(t, want, again, "ListHeaders must return an independent slice")
}

// TestCppStdlibRegistry_ListHeaders_BeforeLoad checks that ListHeaders
// yields nil on a registry whose manifest has never been loaded.
func TestCppStdlibRegistry_ListHeaders_BeforeLoad(t *testing.T) {
	unloaded := NewCppStdlibRegistryFile(t.TempDir(), core.PlatformLinux)
	headers := unloaded.ListHeaders()
	assert.Nil(t, headers)
}

func TestCppStdlibRegistry_ImplementsInterface(t *testing.T) {
var _ core.CppStdlibLoader = NewCppStdlibRegistryFile(t.TempDir(), core.PlatformLinux)
var _ core.CppStdlibLoader = NewCppStdlibRegistryRemote("https://x", core.PlatformLinux)
Expand Down
9 changes: 9 additions & 0 deletions sast-engine/tools/internal/clikeextract/c_extractor.go
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,15 @@ func extractCHeader(file HeaderFile, src HeaderSource) (*core.CStdlibHeader, err
return nil, fmt.Errorf("extractCHeader: reading %q: %w", file.Path, err)
}

// glibc declarations carry trailing GCC attribute macros (__THROW,
// __attribute_pure__, __nonnull((1)), __attr_access((__read_only__,1,2)))
// that tree-sitter's C grammar can't digest — the parser collapses the
// entire surrounding declaration into an ERROR node, and the walker
// below skips ERROR nodes by design. Strip those macros to whitespace
// up front so the parser sees clean C and produces normal `declaration`
// nodes for strlen / strcmp / snprintf and similar.
source = preprocessGlibcAttributes(source)

parser := sitter.NewParser()
parser.SetLanguage(clang.GetLanguage())
tree, err := parser.ParseCtx(context.Background(), nil, source)
Expand Down
33 changes: 33 additions & 0 deletions sast-engine/tools/internal/clikeextract/c_extractor_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -157,6 +157,39 @@ func TestExtractCHeader_FileNotFound(t *testing.T) {
assert.Contains(t, err.Error(), "reading")
}

// TestExtractCHeader_GlibcAttributesRecovered is the regression test for
// PR-04 Gap 1. Declarations decorated with __THROW / __attribute_pure__ /
// __nonnull((...)) / __attr_access((...)) used to collapse into an ERROR
// node and never reach the manifest; the preprocess step now blanks those
// macros to whitespace so tree-sitter sees clean C.
func TestExtractCHeader_GlibcAttributesRecovered(t *testing.T) {
	src := cTestSource()
	headerFile := HeaderFile{
		Name: "glibc_string.h",
		Path: filepath.Join(cFixtureDir, "glibc_string.h"),
	}

	h, err := extractCHeader(headerFile, src)
	require.NoError(t, err)

	// Every name listed here was confirmed absent from the manifest before
	// the preprocess step landed (verified against /usr/include/string.h
	// on Ubuntu during PR-04 validation).
	recovered := []string{
		"strlen", "strcmp", "strncmp", "strcasecmp", "memcmp", "memmem", "strerror_r",
		"snprintf", "vsnprintf",
	}
	for _, name := range recovered {
		fn := h.Functions[name]
		require.NotNilf(t, fn, "expected glibc-decorated function %q to be recovered", name)
		assert.NotContainsf(t, fn.FQN, "__", "FQN %q must not carry leftover macro tokens", fn.FQN)
		assert.NotEmptyf(t, fn.ReturnType, "%q must have a return type", name)
	}

	// strlen takes a single const-char-pointer parameter; make sure the
	// parameter table has not collapsed to zero entries under the
	// preprocessed source.
	strlenFn := h.Functions["strlen"]
	require.Len(t, strlenFn.Params, 1)
	assert.Equal(t, "size_t", strlenFn.ReturnType)
}

func TestStripTrailingComment(t *testing.T) {
tests := []struct {
in, want string
Expand Down
Loading
Loading