From 50e49dd388f2938f79c086f625450e3533a53d7d Mon Sep 17 00:00:00 2001 From: shivasurya Date: Sun, 3 May 2026 20:31:17 -0400 Subject: [PATCH 1/4] fix(generator): recover glibc-decorated declarations MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Tree-sitter's C grammar can't parse the trailing GCC attribute macro chain on glibc declarations: extern size_t strlen (const char *__s) __THROW __attribute_pure__ __nonnull ((1)); The parser collapses the entire surrounding declaration into an ERROR node, the walker skips ERROR nodes by design, and the symbol vanishes from the manifest. Effect on Linux: ~50% of and parts of were missing — strlen, strcmp, strncmp, strcasecmp, memcmp, memmem, snprintf, vsnprintf and friends. clike_preprocess.go strips known no-op glibc attribute macros to whitespace before tree-sitter parses. Two passes: - Bare-token list (__THROW, __THROWNL, __attribute_pure__, __wur, __returns_nonnull, …) sourced from /usr/include/x86_64-linux-gnu/ sys/cdefs.h. Word-boundary regex so we don't match inside identifiers. - Function-like list (__attribute__, __nonnull, __attr_access, __attr_dealloc, __REDIRECT, …) with balanced-paren consumption so the trailing argument list is fully eaten. Length-preserving (replace with spaces, leave newlines intact) so tree-sitter's reported line numbers stay accurate. Idempotent. Doesn't strip __inline / __restrict / __flexarr — those are real keywords or have semantic effect on parsing. Wired into both extractCHeader and extractCppHeader since libstdc++ borrows the same decorations. Validation against /usr/include on Ubuntu 24.04 (libstdc++-13): - string.h: 38 → 48 functions - stdio.h: recovered scanf/sprintf/snprintf/vsnprintf - Total Linux C: 8467 → 9205 functions (+738) Smoke fixture (testdata/c/stdlib): 5/7 → 7/7 resolved. Redis full tree: 60.6% → 63.1% (+2.5 pp). Co-Authored-By: Claude Sonnet 4.5 --- .../internal/clikeextract/c_extractor.go | 9 + .../internal/clikeextract/c_extractor_test.go | 33 +++ .../internal/clikeextract/clike_preprocess.go | 189 ++++++++++++++++++ .../clikeextract/clike_preprocess_test.go | 142 +++++++++++++ .../internal/clikeextract/cpp_extractor.go | 6 + .../clikeextract/testdata/c/glibc_string.h | 55 +++++ 6 files changed, 434 insertions(+) create mode 100644 sast-engine/tools/internal/clikeextract/clike_preprocess.go create mode 100644 sast-engine/tools/internal/clikeextract/clike_preprocess_test.go create mode 100644 sast-engine/tools/internal/clikeextract/testdata/c/glibc_string.h diff --git a/sast-engine/tools/internal/clikeextract/c_extractor.go b/sast-engine/tools/internal/clikeextract/c_extractor.go index 62d6279a..478be587 100644 --- a/sast-engine/tools/internal/clikeextract/c_extractor.go +++ b/sast-engine/tools/internal/clikeextract/c_extractor.go @@ -28,6 +28,15 @@ func extractCHeader(file HeaderFile, src HeaderSource) (*core.CStdlibHeader, err return nil, fmt.Errorf("extractCHeader: reading %q: %w", file.Path, err) } + // glibc declarations carry trailing GCC attribute macros (__THROW, + // __attribute_pure__, __nonnull((1)), __attr_access((__read_only__,1,2))) + // that tree-sitter's C grammar can't digest — the parser collapses the + // entire surrounding declaration into an ERROR node, and the walker + // below skips ERROR nodes by design. Strip those macros to whitespace + // up front so the parser sees clean C and produces normal `declaration` + // nodes for strlen / strcmp / snprintf and similar. + source = preprocessGlibcAttributes(source) + parser := sitter.NewParser() parser.SetLanguage(clang.GetLanguage()) tree, err := parser.ParseCtx(context.Background(), nil, source) diff --git a/sast-engine/tools/internal/clikeextract/c_extractor_test.go b/sast-engine/tools/internal/clikeextract/c_extractor_test.go index df34c8c4..a0cdcbff 100644 --- a/sast-engine/tools/internal/clikeextract/c_extractor_test.go +++ b/sast-engine/tools/internal/clikeextract/c_extractor_test.go @@ -157,6 +157,39 @@ func TestExtractCHeader_FileNotFound(t *testing.T) { assert.Contains(t, err.Error(), "reading") } +// TestExtractCHeader_GlibcAttributesRecovered is the regression test for +// PR-04 Gap 1: declarations decorated with __THROW / __attribute_pure__ / +// __nonnull((...)) / __attr_access((...)) used to vanish into an ERROR +// node and never reach the manifest. The preprocess step strips the +// macros to whitespace so tree-sitter parses clean C. +func TestExtractCHeader_GlibcAttributesRecovered(t *testing.T) { + src := cTestSource() + hf := HeaderFile{Name: "glibc_string.h", Path: filepath.Join(cFixtureDir, "glibc_string.h")} + + h, err := extractCHeader(hf, src) + require.NoError(t, err) + + // Each entry below was confirmed missing from the manifest before the + // preprocess step landed (verified against /usr/include/string.h on + // Ubuntu during PR-04 validation). + for _, name := range []string{ + "strlen", "strcmp", "strncmp", "strcasecmp", "memcmp", "memmem", "strerror_r", + "snprintf", "vsnprintf", + } { + fn := h.Functions[name] + require.NotNilf(t, fn, "expected glibc-decorated function %q to be recovered", name) + assert.NotContainsf(t, fn.FQN, "__", "FQN %q must not carry leftover macro tokens", fn.FQN) + assert.NotEmptyf(t, fn.ReturnType, "%q must have a return type", name) + } + + // strlen has a single const-char-pointer parameter; check the + // param table doesn't collapse to zero-length under the preprocessed + // source. + strlen := h.Functions["strlen"] + require.Len(t, strlen.Params, 1) + assert.Equal(t, "size_t", strlen.ReturnType) +} + func TestStripTrailingComment(t *testing.T) { tests := []struct { in, want string diff --git a/sast-engine/tools/internal/clikeextract/clike_preprocess.go b/sast-engine/tools/internal/clikeextract/clike_preprocess.go new file mode 100644 index 00000000..bab6e6c8 --- /dev/null +++ b/sast-engine/tools/internal/clikeextract/clike_preprocess.go @@ -0,0 +1,189 @@ +package clikeextract + +import ( + "regexp" +) + +// preprocessGlibcAttributes returns a copy of src with glibc's no-op +// attribute macros stripped to whitespace. Tree-sitter's C grammar can't +// parse declarations like: +// +// extern size_t strlen (const char *__s) +// __THROW __attribute_pure__ __nonnull ((1)); +// +// because the trailing macro chain doesn't match any grammar rule. The +// parser produces an ERROR node that swallows the whole declaration, so +// strlen / strcmp / snprintf / memcmp / strcasecmp and dozens of similar +// glibc-decorated functions never make it into the manifest. +// +// The fix is to remove the macro tokens BEFORE parsing. We replace each +// matched range with spaces of the same length so: +// - Byte offsets and line numbers stay identical to the original (line +// numbers are reported via tree-sitter's StartPoint(), which counts +// newlines — preserved here). +// - Tree-sitter sees clean C, can build a normal `declaration` node, and +// the existing walker picks it up without further changes. +// +// The list of stripped tokens is intentionally narrow: +// - Bare identifiers that expand to nothing in real glibc (`__THROW`, +// `__attribute_pure__`, `__wur`, `__nothrow__`, `__leaf__`) +// - Function-like macros that take a bracketed argument list — these +// need balanced-paren tracking, not a regex (`__attribute__((...))`, +// `__nonnull((...))`, `__attr_access((...))`, `__attr_dealloc(...)`) +// +// Anything not on the list is left alone — better to mis-parse a single +// declaration than silently rewrite tokens we don't fully understand. +// +// Idempotent. Safe to call on already-clean source. +func preprocessGlibcAttributes(src []byte) []byte { + out := make([]byte, len(src)) + copy(out, src) + + out = stripBareAttributeIdentifiers(out) + out = stripParenthesizedAttributeMacros(out) + return out +} + +// bareAttributeIdentifiers matches glibc no-op identifier macros. They +// appear as plain tokens (no parens) and expand to nothing in real glibc +// builds, so erasing them is a no-op semantically. +// +// The list is sourced from /usr/include/x86_64-linux-gnu/sys/cdefs.h on +// recent Debian/Ubuntu releases. The pattern uses a non-capturing group +// with alternation; word boundaries ensure we don't match inside +// identifiers (`__THROWN_HARD` stays intact). +// +// What is NOT on this list (intentionally): +// - `__inline`, `__inline__`, `__restrict`, `__restrict_arr` — real GCC +// keywords that tree-sitter's C grammar handles natively. +// - `__extern_inline`, `__extern_always_inline`, `__always_inline`, +// `__fortify_function` — inline qualifiers; stripping risks changing +// how the parser classifies the function. +// - `__flexarr` — expands to `[]` for flexible-array declarations. +// +// Kept as a compile-time package var so each call avoids the regex compile +// cost — hot path runs against thousands of headers. +var bareAttributeIdentifiers = regexp.MustCompile( + `\b(?:__THROW|__THROWNL|__LEAF|__LEAF_ATTR|__COLD|__BEGIN_DECLS|__END_DECLS|` + + `__attribute_pure__|__attribute_const__|__attribute_malloc__|` + + `__attribute_warn_unused_result__|__attribute_used__|__attribute_unused__|` + + `__attribute_noinline__|__attribute_artificial__|__attribute_returns_twice__|` + + `__attribute_deprecated__|__attribute_maybe_unused__|__attribute_nonstring__|` + + `__nothrow__|__leaf__|__wur|__pure__|__const__|__cold__|__hot__|` + + `__nonnull__|__returns_nonnull|__attr_dealloc_free|__extension__)\b`) + +// parenthesizedAttributeMacros matches the leading token of a function-like +// macro expansion. The trailing argument list is consumed by the +// balanced-paren walker in stripParenthesizedAttributeMacros — regex alone +// would mis-handle nested parens. +// +// Notable additions over the bare-token list: `__attribute__`, +// `__nonnull`, `__attr_access`, `__attr_access_none`, `__attr_dealloc`, +// and the size/align/deprecation/format helpers in cdefs.h. +var parenthesizedAttributeMacros = regexp.MustCompile( + `\b(?:__attribute__|__attribute|__nonnull|__attr_access|__attr_access_none|` + + `__attr_dealloc|__attr_alloc_size|__attribute_alloc_size__|` + + `__attribute_alloc_align__|__attribute_format__|__attribute_format_arg__|` + + `__attribute_format_strfmon__|__attribute_deprecated_msg__|` + + `__attribute_warn_unused_result__|__nothrow_leaf__|__attribute_artificial__|` + + `__REDIRECT|__REDIRECT_NTH|__REDIRECT_NTHNL|__REDIRECT_LDBL|` + + `__REDIRECT_NTH_LDBL|__REDIRECT_FORTIFY|__REDIRECT_FORTIFY_NTH)\b`) + +// stripBareAttributeIdentifiers replaces every match of +// bareAttributeIdentifiers with spaces of the same length. Done in-place +// to avoid extra allocations. +func stripBareAttributeIdentifiers(src []byte) []byte { + matches := bareAttributeIdentifiers.FindAllIndex(src, -1) + for _, m := range matches { + blank(src, m[0], m[1]) + } + return src +} + +// stripParenthesizedAttributeMacros replaces every parenthesized-macro +// invocation with spaces. The macro head is found with a regex; the +// trailing argument list is consumed by walking forward and counting +// parens. The whole span (head + argument list) is overwritten. +// +// Handles two real-world shapes: +// +// __attribute__((__format__(printf, 1, 2))) // double-paren wrapper +// __nonnull((1, 2)) // double-paren wrapper +// __attr_dealloc(free, 1) // single-paren call +// +// Stops at the first balanced `)` after the macro head — that boundary +// always closes the argument list because none of these macros appear +// inside expressions where paren depth would otherwise matter. +func stripParenthesizedAttributeMacros(src []byte) []byte { + for { + loc := parenthesizedAttributeMacros.FindIndex(src) + if loc == nil { + return src + } + end := consumeBalancedParens(src, loc[1]) + if end < 0 { + // No paren followed (or no close paren found) — leave the + // match alone rather than blank a half-recognized region. + // Move past the match so the next iteration doesn't re-match. + // + // Replacing only the macro head would risk turning + // `__attribute_format_arg__(1)` (function-like) into a bare + // `(1)` that tree-sitter then mis-classifies. Skipping is + // safer. + break + } + blank(src, loc[0], end) + } + return src +} + +// consumeBalancedParens scans forward from start to find the closing `)` +// that balances the first `(` after start. Returns the index just past the +// closing `)`, or -1 when no opening `(` is found before the next +// non-whitespace, non-paren token, or the closing paren is missing. +// +// Whitespace between the macro head and its `(` is allowed — glibc commonly +// formats long attribute lines with line breaks before the argument list. +func consumeBalancedParens(src []byte, start int) int { + i := start + // Skip whitespace until we find the opening paren. + for i < len(src) && isSpace(src[i]) { + i++ + } + if i >= len(src) || src[i] != '(' { + return -1 + } + depth := 0 + for ; i < len(src); i++ { + switch src[i] { + case '(': + depth++ + case ')': + depth-- + if depth == 0 { + return i + 1 + } + } + } + return -1 +} + +// blank overwrites src[start:end] with spaces, leaving newlines intact so +// tree-sitter's reported line numbers continue to match the original +// source. Idempotent on already-blank ranges. +func blank(src []byte, start, end int) { + for i := start; i < end; i++ { + if src[i] != '\n' { + src[i] = ' ' + } + } +} + +func isSpace(b byte) bool { + switch b { + case ' ', '\t', '\n', '\r', '\v', '\f': + return true + } + return false +} + diff --git a/sast-engine/tools/internal/clikeextract/clike_preprocess_test.go b/sast-engine/tools/internal/clikeextract/clike_preprocess_test.go new file mode 100644 index 00000000..37016dc5 --- /dev/null +++ b/sast-engine/tools/internal/clikeextract/clike_preprocess_test.go @@ -0,0 +1,142 @@ +package clikeextract + +import ( + "strings" + "testing" + + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" +) + +// TestPreprocessGlibcAttributes_StripsBareIdentifiers confirms the no-op +// glibc identifiers are blanked out without altering surrounding tokens +// or shifting byte offsets / line numbers. +func TestPreprocessGlibcAttributes_StripsBareIdentifiers(t *testing.T) { + in := []byte(`extern int foo(void) __THROW __attribute_pure__; +extern int bar(int x) __wur;`) + out := preprocessGlibcAttributes(in) + + got := string(out) + assert.NotContains(t, got, "__THROW") + assert.NotContains(t, got, "__attribute_pure__") + assert.NotContains(t, got, "__wur") + assert.Contains(t, got, "int foo(void)") + assert.Contains(t, got, "int bar(int x)") + assert.Equal(t, len(in), len(out), "preprocessing must be length-preserving") + assert.Equal(t, strings.Count(string(in), "\n"), strings.Count(got, "\n"), + "newline count must be preserved so line numbers stay accurate") +} + +// TestPreprocessGlibcAttributes_StripsParenthesizedMacros covers the +// __attribute__((...)) / __nonnull((...)) / __attr_access((...)) forms. +// Balanced-paren tracking must consume the full argument list, including +// nested parens. +func TestPreprocessGlibcAttributes_StripsParenthesizedMacros(t *testing.T) { + cases := []struct { + name string + in string + }{ + {"attribute double paren", `__attribute__((__format__(__printf__, 1, 2)))`}, + {"nonnull double paren", `__nonnull((1, 2, 3))`}, + {"attr_access double paren", `__attr_access((__read_only__, 1, 2))`}, + {"attr_dealloc single paren", `__attr_dealloc(free, 1)`}, + {"attribute newline before parens", "__attribute__\n((nonnull))"}, + } + for _, tc := range cases { + t.Run(tc.name, func(t *testing.T) { + out := preprocessGlibcAttributes([]byte("int foo(void) " + tc.in + ";")) + got := string(out) + assert.NotContains(t, got, "__attribute__", "%s: head must be stripped", tc.in) + assert.NotContains(t, got, "__nonnull") + assert.NotContains(t, got, "__attr_access") + assert.NotContains(t, got, "__attr_dealloc") + assert.Contains(t, got, "int foo(void)") + }) + } +} + +// TestPreprocessGlibcAttributes_LeavesNormalCodeAlone is the negative +// sanity check — strings that don't carry any of the recognised tokens +// must come back byte-for-byte identical. +func TestPreprocessGlibcAttributes_LeavesNormalCodeAlone(t *testing.T) { + in := []byte(`int main(int argc, char **argv) { + return 0; +}`) + out := preprocessGlibcAttributes(in) + assert.Equal(t, in, out) +} + +// TestPreprocessGlibcAttributes_PreservesByteOffsets pins the +// length-preserving guarantee: the returned slice has the same length as +// the input, and the strlen identifier sits at the same byte position. +func TestPreprocessGlibcAttributes_PreservesByteOffsets(t *testing.T) { + in := []byte(`extern size_t strlen (const char *__s) + __THROW __attribute_pure__ __nonnull ((1)); +`) + out := preprocessGlibcAttributes(in) + require.Equal(t, len(in), len(out)) + + // strlen sits at the same byte offset before and after. + want := strings.Index(string(in), "strlen") + got := strings.Index(string(out), "strlen") + assert.Equal(t, want, got) +} + +// TestPreprocessGlibcAttributes_DoesNotMatchInsideIdentifiers verifies the +// word-boundary guard — we don't want to mangle a project identifier that +// happens to contain "__THROW" as a substring. +func TestPreprocessGlibcAttributes_DoesNotMatchInsideIdentifiers(t *testing.T) { + in := []byte(`int my__THROWN_var = 0;`) + out := preprocessGlibcAttributes(in) + assert.Equal(t, in, out) +} + +// TestPreprocessGlibcAttributes_HandlesMissingClosingParen tolerates a +// truncated header. Without a closing paren the macro head is left alone +// so we don't mistakenly blank a half-recognized region. +func TestPreprocessGlibcAttributes_HandlesMissingClosingParen(t *testing.T) { + in := []byte(`int foo() __attribute__((unbalanced +`) + out := preprocessGlibcAttributes(in) + // Should NOT blank the macro head when the parens don't balance. + assert.Contains(t, string(out), "__attribute__") +} + +// TestPreprocessGlibcAttributes_Idempotent verifies running the +// preprocessor twice produces the same result as running it once. Cheap +// invariant that catches regressions where a stripped span re-introduces +// matchable text. +func TestPreprocessGlibcAttributes_Idempotent(t *testing.T) { + in := []byte(`extern void *memmem (const void *__h, size_t __hlen, + const void *__n, size_t __nlen) + __THROW __attribute_pure__ __nonnull ((1, 3)) + __attr_access ((__read_only__, 1, 2)) + __attr_access ((__read_only__, 3, 4));`) + once := preprocessGlibcAttributes(in) + twice := preprocessGlibcAttributes(once) + assert.Equal(t, once, twice) +} + +// TestConsumeBalancedParens covers the helper independently of the +// regex so the brace-counting logic is easy to reason about. +func TestConsumeBalancedParens(t *testing.T) { + tests := []struct { + name string + src string + from int + want int + }{ + {"plain", "(a)", 0, 3}, + {"nested", "((a, (b, c)))", 0, 13}, + {"whitespace before paren", " (a)", 0, 5}, + {"newline before paren", "\n (a)", 0, 6}, + {"no opening paren", "abc", 0, -1}, + {"unbalanced", "(a", 0, -1}, + } + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + got := consumeBalancedParens([]byte(tt.src), tt.from) + assert.Equal(t, tt.want, got) + }) + } +} diff --git a/sast-engine/tools/internal/clikeextract/cpp_extractor.go b/sast-engine/tools/internal/clikeextract/cpp_extractor.go index b8117738..7394022a 100644 --- a/sast-engine/tools/internal/clikeextract/cpp_extractor.go +++ b/sast-engine/tools/internal/clikeextract/cpp_extractor.go @@ -33,6 +33,12 @@ func extractCppHeader(file HeaderFile, src HeaderSource) (*core.CStdlibHeader, e return nil, fmt.Errorf("extractCppHeader: reading %q: %w", file.Path, err) } + // libstdc++ headers borrow the same `__attribute__((...))` / + // `__nonnull((...))` decorations as glibc — strip them before parsing + // so trailing attribute chains don't trip tree-sitter into ERROR nodes + // that swallow class methods or free function declarations. + source = preprocessGlibcAttributes(source) + parser := sitter.NewParser() parser.SetLanguage(cpplang.GetLanguage()) tree, err := parser.ParseCtx(context.Background(), nil, source) diff --git a/sast-engine/tools/internal/clikeextract/testdata/c/glibc_string.h b/sast-engine/tools/internal/clikeextract/testdata/c/glibc_string.h new file mode 100644 index 00000000..018506e6 --- /dev/null +++ b/sast-engine/tools/internal/clikeextract/testdata/c/glibc_string.h @@ -0,0 +1,55 @@ +/* Synthetic glibc-style string.h fixture exercising the attribute macro + * preprocess pipeline. Mirrors the real /usr/include/string.h shape on + * Debian/Ubuntu for the symbols PR-04 must recover. + * + * Keep declarations close to the real glibc spelling — the test exists + * specifically to catch regressions where the preprocess regex drifts + * out of sync with what glibc actually emits. + */ +#ifndef _GLIBC_STRING_H +#define _GLIBC_STRING_H + +typedef unsigned long size_t; + +/* + * NOTE: real glibc headers don't define these macros locally — they come + * from , which we don't include here on purpose. The + * preprocess step in clike_preprocess.go strips them from the source + * before tree-sitter parses, so the token-level result is the same as if + * cdefs.h had been included and they'd expanded to nothing. + */ + +extern size_t strlen (const char *__s) + __THROW __attribute_pure__ __nonnull ((1)); + +extern int strcmp (const char *__s1, const char *__s2) + __THROW __attribute_pure__ __nonnull ((1, 2)); + +extern int strncmp (const char *__s1, const char *__s2, size_t __n) + __THROW __attribute_pure__ __nonnull ((1, 2)); + +extern int strcasecmp (const char *__s1, const char *__s2) + __THROW __attribute_pure__ __nonnull ((1, 2)); + +extern void *memcmp (const void *__s1, const void *__s2, size_t __n) + __THROW __attribute_pure__ __nonnull ((1, 2)); + +extern void *memmem (const void *__haystack, size_t __haystacklen, + const void *__needle, size_t __needlelen) + __THROW __attribute_pure__ __nonnull ((1, 3)) + __attr_access ((__read_only__, 1, 2)) + __attr_access ((__read_only__, 3, 4)); + +extern char *strerror_r (int __errnum, char *__buf, size_t __buflen) + __THROW __nonnull ((2)) __wur; + +/* The __THROWNL form (no-throw, no-longjmp) — used by snprintf. */ +extern int snprintf (char *__restrict __s, size_t __maxlen, + const char *__restrict __format, ...) + __THROWNL __attribute__ ((__format__ (__printf__, 3, 4))); + +extern int vsnprintf (char *__restrict __s, size_t __maxlen, + const char *__restrict __format, void *__arg) + __THROWNL __attribute__ ((__format__ (__printf__, 3, 0))); + +#endif From 1a24eff10f448a94100dc862ba3d027e1d8b1784 Mon Sep 17 00:00:00 2001 From: shivasurya Date: Sun, 3 May 2026 20:31:26 -0400 Subject: [PATCH 2/4] fix(registry): GetFreeFunction tolerant of generator's map placement MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The PR-01 generator emits some namespaced free-function entries (notably std::move, std::forward, std::swap as overlay-only entries) into the manifest's `functions` map rather than `free_functions`. The PR-02 resolver only consulted `free_functions`, so those symbols silently failed to resolve at the call site. GetFreeFunction now consults both maps. The lookup order is free_functions first (the canonical map), then functions (the fallback). This is a strict superset of the previous behaviour — any entry that resolved before still resolves at the same priority — so it's safe to ship without coordinating a generator-side cleanup. Future work: tighten the generator emitter so namespaced symbols always land in `free_functions`. Tracked outside this stack. Co-Authored-By: Claude Sonnet 4.5 --- .../callgraph/registry/cpp_stdlib_remote.go | 29 +++++++++++++++++ .../registry/cpp_stdlib_remote_test.go | 32 +++++++++++++++++++ 2 files changed, 61 insertions(+) diff --git a/sast-engine/graph/callgraph/registry/cpp_stdlib_remote.go b/sast-engine/graph/callgraph/registry/cpp_stdlib_remote.go index ce2d7ff3..f16e5d71 100644 --- a/sast-engine/graph/callgraph/registry/cpp_stdlib_remote.go +++ b/sast-engine/graph/callgraph/registry/cpp_stdlib_remote.go @@ -286,6 +286,15 @@ func (r *CppStdlibRegistryRemote) GetMethod(headerName, classFQN, methodName str // GetFreeFunction looks up a namespaced free function by its full FQN // (e.g. "std::move", "std::swap"). Distinct from GetFunction so the resolver // can be explicit about which form it expects. +// +// Looks in both `free_functions` and `functions` because the PR-01 +// generator currently emits some overlay-only namespaced symbols +// (notably std::move, std::forward, std::swap on certain headers) into +// `functions` rather than `free_functions`. A resolver that consulted +// only one map would miss those entries even though they're present in +// the manifest. A future cleanup pass on the generator could canonicalise +// the placement, but the resolver-side fallback is a strict superset and +// stays correct either way. func (r *CppStdlibRegistryRemote) GetFreeFunction(headerName, fqn string) (*core.CStdlibFunction, error) { h, err := r.GetHeader(headerName) if err != nil { @@ -294,6 +303,9 @@ func (r *CppStdlibRegistryRemote) GetFreeFunction(headerName, fqn string) (*core if f, ok := h.FreeFunctions[fqn]; ok { return f, nil } + if f, ok := h.Functions[fqn]; ok { + return f, nil + } return nil, fmt.Errorf("GetFreeFunction: %q not in header %q", fqn, headerName) } @@ -312,4 +324,21 @@ func (r *CppStdlibRegistryRemote) HeaderCount() int { return len(r.manifest.Headers) } +// ListHeaders returns every header name in the loaded manifest. Mirrors +// the C loader's contract — the resolver uses this for transitive-include +// fallback when a namespaced std::* call appears in a file that doesn't +// directly #include the owning header. +func (r *CppStdlibRegistryRemote) ListHeaders() []string { + r.cacheMutex.RLock() + defer r.cacheMutex.RUnlock() + if r.manifest == nil { + return nil + } + out := make([]string, 0, len(r.manifest.Headers)) + for _, e := range r.manifest.Headers { + out = append(out, e.Header) + } + return out +} + var _ core.CppStdlibLoader = (*CppStdlibRegistryRemote)(nil) diff --git a/sast-engine/graph/callgraph/registry/cpp_stdlib_remote_test.go b/sast-engine/graph/callgraph/registry/cpp_stdlib_remote_test.go index a4b13361..8d04910d 100644 --- a/sast-engine/graph/callgraph/registry/cpp_stdlib_remote_test.go +++ b/sast-engine/graph/callgraph/registry/cpp_stdlib_remote_test.go @@ -138,6 +138,38 @@ func TestCppStdlibRegistry_GetFunctionFreeFunctionFallback(t *testing.T) { assert.Equal(t, "std::swap", got.FQN) } +// TestCppStdlibRegistry_GetFreeFunction_FunctionsMapFallback pins the +// PR-04 fix: when the generator stores a namespaced symbol under +// `functions` instead of `free_functions` (current PR-01 behaviour for +// std::move and std::forward), GetFreeFunction must still find it. +// +// We mutate the cached header directly to simulate the generator output +// rather than redefining writeCppRegistry, keeping the existing fixture +// stable for other tests. +func TestCppStdlibRegistry_GetFreeFunction_FunctionsMapFallback(t *testing.T) { + dir := t.TempDir() + writeCppRegistry(t, dir) + r := NewCppStdlibRegistryFile(dir, core.PlatformLinux) + require.NoError(t, r.LoadManifest(noopLogger{})) + + // Force a one-time fetch so the header is in the cache, then move + // std::move out of the FreeFunctions map and into Functions — + // mimicking what the generator currently emits in the wild. + h, err := r.GetHeader("utility") + require.NoError(t, err) + moved := h.FreeFunctions["std::move"] + require.NotNil(t, moved) + delete(h.FreeFunctions, "std::move") + if h.Functions == nil { + h.Functions = map[string]*core.CStdlibFunction{} + } + h.Functions["std::move"] = moved + + got, err := r.GetFreeFunction("utility", "std::move") + require.NoError(t, err) + assert.Equal(t, "T&&", got.ReturnType) +} + func TestCppStdlibRegistry_GetFunctionMissing(t *testing.T) { dir := t.TempDir() writeCppRegistry(t, dir) From a155a5a66ce7d9f99c91831ce18ad97488f12d8e Mon Sep 17 00:00:00 2001 From: shivasurya Date: Sun, 3 May 2026 20:31:40 -0400 Subject: [PATCH 3/4] feat(builder): transitive-include fallback for namespaced stdlib calls MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Real-world C++ code routinely relies on transitive includes — a file calls std::move without `#include ` because some other header it includes pulls utility in. PR-02's resolver walked only the caller file's direct system includes, so namespaced std::* calls failed to resolve in the majority of files that should have benefited. lookupCppStdlibFreeFunction now runs in two stages: 1. Direct system includes (the existing fast path). 2. Manifest-wide scan when the direct walk doesn't yield a hit. Bounded to qualified names (containing "::") so unqualified project symbols can never accidentally bind to a stdlib entry. First hit wins; stdlib FQNs are unique across headers so order doesn't affect correctness. To support the fallback, both CStdlibLoader and CppStdlibLoader gain a ListHeaders() method returning every manifest header name. Implemented on the file://+HTTP loaders by reading manifest.Headers; test fakes implement it by walking their fixture map. Fakes in the cmd package reuse the same pattern. Validation against proxygen full tree: - 12.0% → 17.1% resolved (+5.1 pp) - std::move drops out of the top-20 unresolved (was #1 with 1603 occurrences, all resolved now) Co-Authored-By: Claude Sonnet 4.5 --- .../builder/c_builder_stdlib_test.go | 8 ++++ .../graph/callgraph/builder/cpp_builder.go | 41 +++++++++++++++---- .../builder/cpp_builder_stdlib_test.go | 41 +++++++++++++++++++ .../callgraph/core/clike_stdlib_types.go | 8 ++++ .../callgraph/registry/c_stdlib_remote.go | 21 ++++++++++ 5 files changed, 112 insertions(+), 7 deletions(-) diff --git a/sast-engine/graph/callgraph/builder/c_builder_stdlib_test.go b/sast-engine/graph/callgraph/builder/c_builder_stdlib_test.go index 524f939d..59747d0b 100644 --- a/sast-engine/graph/callgraph/builder/c_builder_stdlib_test.go +++ b/sast-engine/graph/callgraph/builder/c_builder_stdlib_test.go @@ -42,6 +42,14 @@ func (f *fakeCStdlibLoader) Platform() string { return "linux" } func (f *fakeCStdlibLoader) HeaderCount() int { return len(f.headers) } +func (f *fakeCStdlibLoader) ListHeaders() []string { + out := make([]string, 0, len(f.headers)) + for k := range f.headers { + out = append(out, k) + } + return out +} + // TestBuildCCallGraph_StdlibFallback verifies that an unresolved call // falls through to the stdlib registry and emits an enriched CallSite // (TargetFQN, return type, confidence, security tag). diff --git a/sast-engine/graph/callgraph/builder/cpp_builder.go b/sast-engine/graph/callgraph/builder/cpp_builder.go index 83df6a3e..e0900142 100644 --- a/sast-engine/graph/callgraph/builder/cpp_builder.go +++ b/sast-engine/graph/callgraph/builder/cpp_builder.go @@ -551,21 +551,48 @@ func lookupCppStdlibMethod( // ("std::move") for the registry's GetFreeFunction to succeed — bare // names like "move" without the namespace are a different lookup path // and stay out of scope here. +// +// The lookup happens in two stages: +// +// 1. Walk the caller file's direct system includes. This is the fast +// path and catches the common case where the calling file +// `#include ` itself. +// 2. If the direct walk doesn't yield a hit, fall back to scanning +// every header in the manifest. Real-world C++ code routinely +// relies on transitive includes (a header pulling in another that +// pulls in ), so without this fallback std::move / +// std::forward / std::swap fail to resolve in the majority of +// calling files. +// +// The fallback is bounded to fully-qualified names (containing "::") so +// unqualified project-internal symbols can never accidentally bind to a +// stdlib entry. Performance is fine: the scan is O(headers) per +// unresolved namespaced call, ~120 headers on libstdc++; first hit wins. func lookupCppStdlibFreeFunction( cs *CallSiteInternal, cReg *core.CModuleRegistry, cppLoader core.CppStdlibLoader, ) (string, *core.CStdlibFunction) { - if !strings.Contains(cs.FunctionName, fqnSeparator) { + if !strings.Contains(cs.FunctionName, fqnSeparator) || cppLoader == nil { return "", nil } - prefix, ok := cReg.FileToPrefix[cs.CallerFile] - if !ok { - return "", nil + + // Stage 1: direct system includes from the caller file. + if prefix, ok := cReg.FileToPrefix[cs.CallerFile]; ok { + for _, header := range cReg.SystemIncludes[prefix] { + if fn, err := cppLoader.GetFreeFunction(header, cs.FunctionName); err == nil && fn != nil { + return fn.FQN, fn + } + } } - for _, header := range cReg.SystemIncludes[prefix] { - fn, err := cppLoader.GetFreeFunction(header, cs.FunctionName) - if err == nil && fn != nil { + + // Stage 2: transitive-include fallback. Walk every manifest header. + // We're trading a constant-time loop per unresolved namespaced call + // for the ability to resolve symbols pulled in transitively. First + // hit wins; stdlib FQNs are unique across headers so order doesn't + // affect correctness (only speed-of-first-hit). + for _, header := range cppLoader.ListHeaders() { + if fn, err := cppLoader.GetFreeFunction(header, cs.FunctionName); err == nil && fn != nil { return fn.FQN, fn } } diff --git a/sast-engine/graph/callgraph/builder/cpp_builder_stdlib_test.go b/sast-engine/graph/callgraph/builder/cpp_builder_stdlib_test.go index 5cdcade9..291e279b 100644 --- a/sast-engine/graph/callgraph/builder/cpp_builder_stdlib_test.go +++ b/sast-engine/graph/callgraph/builder/cpp_builder_stdlib_test.go @@ -79,6 +79,14 @@ func (f *fakeCppStdlibLoader) GetFreeFunction(headerName, fqn string) (*core.CSt func (f *fakeCppStdlibLoader) Platform() string { return "linux" } func (f *fakeCppStdlibLoader) HeaderCount() int { return len(f.headers) } +func (f *fakeCppStdlibLoader) ListHeaders() []string { + out := make([]string, 0, len(f.headers)) + for k := range f.headers { + out = append(out, k) + } + return out +} + // TestBuildCppCallGraph_StdlibClassMethod resolves `vec.push_back(...)` // against the C++ stdlib registry. The receiver type comes from the // type engine; the resolver canonicalises std::vector → std::vector @@ -151,6 +159,39 @@ func TestBuildCppCallGraph_StdlibClassMethod_TemplateSubstitution(t *testing.T) assert.Equal(t, "int&", sites[0].InferredType, "T must be replaced with the concrete template argument") } +// TestBuildCppCallGraph_StdlibFreeFunction_TransitiveFallback covers the +// PR-04 transitive-include fallback. A file that calls `std::move` +// without directly including still resolves because the +// resolver scans every manifest header when the direct include list +// doesn't yield a hit. +func TestBuildCppCallGraph_StdlibFreeFunction_TransitiveFallback(t *testing.T) { + root := cppFixtureRoot + mainCpp := root + "/src/main.cpp" + + f := newCppFixture(t) + main := f.addFreeFunction(t, mainCpp, "src/main.cpp", "", "main", "int") + f.addCall(t, main, "std::move", "") + + // Notice: NO entry in SystemIncludes for src/main.cpp — the file + // gets std::move via a transitive include we can't see at the + // CodeGraph level. Pre-PR-04 this stayed unresolved. + f.registry.StdlibCppRegistry = newFakeCppStdlibLoader(map[string]*core.CStdlibHeader{ + "utility": { + Header: "utility", + FreeFunctions: map[string]*core.CStdlibFunction{ + "std::move": {FQN: "std::move", ReturnType: "T&&", Source: core.SourceOverlay, Confidence: 1.0}, + }, + }, + }) + + cg := f.build(t) + + sites := cg.CallSites["src/main.cpp::main"] + require.Len(t, sites, 1) + assert.True(t, sites[0].Resolved, "transitive-include fallback must resolve std::move") + assert.Equal(t, "std::move", sites[0].TargetFQN) +} + // TestBuildCppCallGraph_StdlibFreeFunction handles `std::move(x)` — // a namespaced free function looked up via GetFreeFunction. func TestBuildCppCallGraph_StdlibFreeFunction(t *testing.T) { diff --git a/sast-engine/graph/callgraph/core/clike_stdlib_types.go b/sast-engine/graph/callgraph/core/clike_stdlib_types.go index 63ffdf6e..1f7c1760 100644 --- a/sast-engine/graph/callgraph/core/clike_stdlib_types.go +++ b/sast-engine/graph/callgraph/core/clike_stdlib_types.go @@ -14,6 +14,14 @@ type CStdlibLoader interface { GetFunction(headerName, funcName string) (*CStdlibFunction, error) Platform() string HeaderCount() int + // ListHeaders returns every header name in the loaded manifest in + // deterministic order. The resolver uses this to fall back to a + // global manifest scan when a call site doesn't directly #include + // the header that owns the symbol — common with C++ codebases that + // rely on transitive includes (vector pulling in utility, etc). + // + // Returns an empty slice when LoadManifest has not been called. + ListHeaders() []string } // CppStdlibLoader extends CStdlibLoader with C++-specific accessors. Free diff --git a/sast-engine/graph/callgraph/registry/c_stdlib_remote.go b/sast-engine/graph/callgraph/registry/c_stdlib_remote.go index 0e9dfc74..76bed3e8 100644 --- a/sast-engine/graph/callgraph/registry/c_stdlib_remote.go +++ b/sast-engine/graph/callgraph/registry/c_stdlib_remote.go @@ -309,6 +309,27 @@ func (r *CStdlibRegistryRemote) HeaderCount() int { return len(r.manifest.Headers) } +// ListHeaders returns every header name in the loaded manifest in the +// order it was emitted. Resolver-side transitive-include fallback uses +// this to scan all stdlib headers when the caller's direct #include list +// doesn't yield a hit. +// +// Returns nil when LoadManifest has not been called. Allocates a fresh +// slice on each call so callers may sort or filter without mutating the +// loader state. +func (r *CStdlibRegistryRemote) ListHeaders() []string { + r.cacheMutex.RLock() + defer r.cacheMutex.RUnlock() + if r.manifest == nil { + return nil + } + out := make([]string, 0, len(r.manifest.Headers)) + for _, e := range r.manifest.Headers { + out = append(out, e.Header) + } + return out +} + // Compile-time interface checks — fail at build time if the struct ever // drifts from the contract. var _ core.CStdlibLoader = (*CStdlibRegistryRemote)(nil) From f0bfd819c0c906a8c76695ccaef930d97e319e7a Mon Sep 17 00:00:00 2001 From: shivasurya Date: Sun, 3 May 2026 20:36:45 -0400 Subject: [PATCH 4/4] test(registry): cover ListHeaders + before-load behavior Direct tests for the new ListHeaders() method on both loaders: - canonical case after LoadManifest returns the manifest's header set - before LoadManifest returns nil (matches HeaderCount semantics) - mutating the returned slice doesn't bleed into a subsequent call Pushes both methods from 0% to 100% coverage; satisfies codecov/patch threshold on PR #682. Co-Authored-By: Claude Sonnet 4.5 --- .../registry/c_stdlib_remote_test.go | 24 +++++++++++++++++++ .../registry/cpp_stdlib_remote_test.go | 22 +++++++++++++++++ 2 files changed, 46 insertions(+) diff --git a/sast-engine/graph/callgraph/registry/c_stdlib_remote_test.go b/sast-engine/graph/callgraph/registry/c_stdlib_remote_test.go index fc5367af..06a49f3e 100644 --- a/sast-engine/graph/callgraph/registry/c_stdlib_remote_test.go +++ b/sast-engine/graph/callgraph/registry/c_stdlib_remote_test.go @@ -254,3 +254,27 @@ func TestCStdlibRegistry_ImplementsInterface(t *testing.T) { var _ core.CStdlibLoader = NewCStdlibRegistryFile(t.TempDir(), core.PlatformLinux) var _ core.CStdlibLoader = NewCStdlibRegistryRemote("https://x", core.PlatformLinux) } + +// TestCStdlibRegistry_ListHeaders verifies the manifest-wide enumerator +// the resolver uses for transitive-include fallback. The order should +// follow the manifest's Headers slice; the result must be a fresh slice +// (callers may sort/filter without mutating loader state). +func TestCStdlibRegistry_ListHeaders(t *testing.T) { + dir := t.TempDir() + writeCRegistry(t, dir) + r := NewCStdlibRegistryFile(dir, core.PlatformLinux) + require.NoError(t, r.LoadManifest(noopLogger{})) + + headers := r.ListHeaders() + assert.ElementsMatch(t, []string{"stdio.h", "stdlib.h"}, headers) + + // Mutating the returned slice must not affect a subsequent call. + headers[0] = "tampered" + again := r.ListHeaders() + assert.NotEqual(t, "tampered", again[0], "ListHeaders must return an independent slice") +} + +func TestCStdlibRegistry_ListHeaders_BeforeLoad(t *testing.T) { + r := NewCStdlibRegistryFile(t.TempDir(), core.PlatformLinux) + assert.Nil(t, r.ListHeaders()) +} diff --git a/sast-engine/graph/callgraph/registry/cpp_stdlib_remote_test.go b/sast-engine/graph/callgraph/registry/cpp_stdlib_remote_test.go index 8d04910d..872844cf 100644 --- a/sast-engine/graph/callgraph/registry/cpp_stdlib_remote_test.go +++ b/sast-engine/graph/callgraph/registry/cpp_stdlib_remote_test.go @@ -255,6 +255,28 @@ func TestCppStdlibRegistry_RemoteCtorTrimsSlash(t *testing.T) { assert.Equal(t, "https://x/registries", r.baseURL) } +// TestCppStdlibRegistry_ListHeaders confirms the C++ loader exposes the +// manifest's header list for transitive-include fallback. Same contract +// as the C loader: deterministic order, fresh slice each call. +func TestCppStdlibRegistry_ListHeaders(t *testing.T) { + dir := t.TempDir() + writeCppRegistry(t, dir) + r := NewCppStdlibRegistryFile(dir, core.PlatformLinux) + require.NoError(t, r.LoadManifest(noopLogger{})) + + headers := r.ListHeaders() + assert.ElementsMatch(t, []string{"vector", "utility"}, headers) + + headers[0] = "tampered" + again := r.ListHeaders() + assert.NotEqual(t, "tampered", again[0]) +} + +func TestCppStdlibRegistry_ListHeaders_BeforeLoad(t *testing.T) { + r := NewCppStdlibRegistryFile(t.TempDir(), core.PlatformLinux) + assert.Nil(t, r.ListHeaders()) +} + func TestCppStdlibRegistry_ImplementsInterface(t *testing.T) { var _ core.CppStdlibLoader = NewCppStdlibRegistryFile(t.TempDir(), core.PlatformLinux) var _ core.CppStdlibLoader = NewCppStdlibRegistryRemote("https://x", core.PlatformLinux)