From 69b4a35b73c6f4da9c20d7abb3d517ff65248f5f Mon Sep 17 00:00:00 2001 From: shivasurya Date: Sun, 3 May 2026 17:45:41 -0400 Subject: [PATCH 1/3] feat(core): C/C++ stdlib registry public types MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Adds the schema contract consumed by both the PR-01 generator (this stack) and the loader landing in PR-02: - CStdlibRegistry / NewCStdlibRegistry — root in-memory container per (platform, language) axis, with accessors HasHeader, GetHeader, GetFunction, GetClass, GetMethod that mirror the existing GoStdlibRegistry surface. - CStdlibManifest + CStdlibHeaderEntry + CStdlibStatistics — the top-level manifest.json shape; HasHeader / GetHeaderEntry helpers for the loader's lazy-fetch path. - CStdlibHeader — per-header content; one type works for both C and C++ (C++-only fields are tagged omitempty so C output stays clean). - CStdlibFunction / CStdlibParam / CStdlibTypedef / CStdlibConstant / CppStdlibClass / CppStdlibConstructor — leaf entries. - Source / language / platform string constants so consumers don't hard-code "header" / "overlay" / "merged" / "linux" / "c" / "cpp". JSON tags are snake_case to match the Python and Go stdlib registry files already on disk; nolint:tagliatelle directives match the pattern in go_stdlib_types.go. 100% test coverage on the new file via 12 round-trip + accessor tests. 
Co-Authored-By: Claude Opus 4.7 (1M context) --- .../callgraph/core/clike_stdlib_types.go | 306 ++++++++++++++++++ .../callgraph/core/clike_stdlib_types_test.go | 288 +++++++++++++++++ 2 files changed, 594 insertions(+) create mode 100644 sast-engine/graph/callgraph/core/clike_stdlib_types.go create mode 100644 sast-engine/graph/callgraph/core/clike_stdlib_types_test.go diff --git a/sast-engine/graph/callgraph/core/clike_stdlib_types.go b/sast-engine/graph/callgraph/core/clike_stdlib_types.go new file mode 100644 index 00000000..78a04714 --- /dev/null +++ b/sast-engine/graph/callgraph/core/clike_stdlib_types.go @@ -0,0 +1,306 @@ +package core + +// CStdlibRegistry is the root in-memory container for C/C++ stdlib data on a single +// (platform, language) axis (e.g. linux/c, linux/cpp, windows/c). It is populated by +// the loader (PR-02) from registry JSON hosted on the CDN and consulted by the call +// graph builder when a call site cannot be resolved against project-internal symbols. +// +// One CStdlibRegistry instance is created per (platform, language) pair the analyzer +// touches; the engine typically holds two — one for C, one for C++ — for the +// auto-detected target platform. +type CStdlibRegistry struct { + // Manifest lists the per-header files available on the CDN and the + // aggregate statistics. Always non-nil after a successful Load. + Manifest *CStdlibManifest + // Headers maps header name (e.g. "stdio.h", "vector") to the parsed + // per-header content. Populated lazily by the loader: an entry is + // present only after the corresponding header file has been fetched. + Headers map[string]*CStdlibHeader +} + +// NewCStdlibRegistry creates an initialized registry with a pre-allocated headers map. +func NewCStdlibRegistry() *CStdlibRegistry { + return &CStdlibRegistry{ + Headers: make(map[string]*CStdlibHeader), + } +} + +// GetHeader returns the parsed per-header content for the given header, or nil +// if the loader has not (yet) fetched it. 
Callers expecting on-demand fetch +// should go through the loader's lazy-fetch path rather than this accessor. +func (r *CStdlibRegistry) GetHeader(name string) *CStdlibHeader { + return r.Headers[name] +} + +// HasHeader reports whether the given header has been loaded into the registry. +func (r *CStdlibRegistry) HasHeader(name string) bool { + _, ok := r.Headers[name] + return ok +} + +// HeaderCount returns the number of headers currently materialised in memory. +// This is at most len(Manifest.Headers); equal only after every header has been +// fetched (eager load) which is unusual — most projects include 10–30 stdlib +// headers out of the 80–110 available. +func (r *CStdlibRegistry) HeaderCount() int { + return len(r.Headers) +} + +// GetFunction is a convenience accessor: looks up the function by name within the +// given header. Returns nil if either the header is not loaded or the function is +// absent. Used by both C registries (top-level functions, in Functions) and C++ +// registries (namespace-level free functions, in the separate FreeFunctions map under +// their fully-qualified name e.g. "std::move"); Functions is consulted first. +func (r *CStdlibRegistry) GetFunction(headerName, funcName string) *CStdlibFunction { + h := r.GetHeader(headerName) + if h == nil { + return nil + } + if f, ok := h.Functions[funcName]; ok { + return f + } + return h.FreeFunctions[funcName] +} + +// GetClass returns the C++ class metadata for the given header + class FQN, +// or nil if missing. Always returns nil for C-language registries. +func (r *CStdlibRegistry) GetClass(headerName, classFQN string) *CppStdlibClass { + h := r.GetHeader(headerName) + if h == nil { + return nil + } + return h.Classes[classFQN] +} + +// GetMethod is a two-step accessor: header → class → method. Returns nil if any +// step misses. Used by the C++ resolver when dispatching `obj.method()` against +// a stdlib type whose receiver was inferred earlier in the pipeline.
+func (r *CStdlibRegistry) GetMethod(headerName, classFQN, methodName string) *CStdlibFunction { + cls := r.GetClass(headerName, classFQN) + if cls == nil { + return nil + } + return cls.Methods[methodName] +} + +// CStdlibManifest is the top-level manifest.json for a (platform, language) pair. +// It is the smallest file in the registry tree (typically <10 KB) and serves as the +// directory the loader uses to discover, validate, and lazily fetch per-header files. +// +//nolint:tagliatelle // JSON tags match the registry format on disk (snake_case). +type CStdlibManifest struct { + SchemaVersion string `json:"schema_version"` + RegistryVersion string `json:"registry_version"` + Platform string `json:"platform"` + Language string `json:"language"` + SystemTag string `json:"system_tag"` + GeneratedAt string `json:"generated_at"` + GeneratorVersion string `json:"generator_version"` + BaseURL string `json:"base_url"` + Headers []*CStdlibHeaderEntry `json:"headers"` + Statistics *CStdlibStatistics `json:"statistics"` +} + +// NewCStdlibManifest creates an initialized manifest with a pre-allocated headers slice. +func NewCStdlibManifest() *CStdlibManifest { + return &CStdlibManifest{ + Headers: make([]*CStdlibHeaderEntry, 0), + Statistics: &CStdlibStatistics{}, + } +} + +// HasHeader reports whether the manifest has an entry for the given header name. +// Comparison is exact (case-sensitive) against the Header field; the C++ headers +// "vector"/"string"/"map" appear as-is, the C headers as "stdio.h"/"string.h", etc. +func (m *CStdlibManifest) HasHeader(name string) bool { + return m.GetHeaderEntry(name) != nil +} + +// GetHeaderEntry returns the manifest entry for a header, or nil if absent. +func (m *CStdlibManifest) GetHeaderEntry(name string) *CStdlibHeaderEntry { + for _, e := range m.Headers { + if e.Header == name { + return e + } + } + return nil +} + +// CStdlibHeaderEntry is one row in the manifest's header list. 
It carries enough +// information for the loader to download, validate, and cache the per-header file. +// +//nolint:tagliatelle // JSON tags match the registry format on disk (snake_case). +type CStdlibHeaderEntry struct { + Header string `json:"header"` + ModuleID string `json:"module_id"` + File string `json:"file"` + URL string `json:"url"` + Size int64 `json:"size_bytes"` + Checksum string `json:"checksum"` +} + +// CStdlibStatistics carries aggregate counts across all headers in the manifest. +// Useful for `pathfinder resolution-report` summary blocks and for end-of-generation +// sanity-checking ("are we anywhere near the budget?"). +// +//nolint:tagliatelle // JSON tags match the registry format on disk (snake_case). +type CStdlibStatistics struct { + TotalHeaders int `json:"total_headers"` + TotalFunctions int `json:"total_functions"` + TotalClasses int `json:"total_classes,omitempty"` + TotalTypedefs int `json:"total_typedefs"` + TotalConstants int `json:"total_constants"` + OverlayOverrides int `json:"overlay_overrides"` +} + +// CStdlibHeader is the per-header registry file content. One file per stdlib header +// (e.g. stdio_stdlib.json, vector_stdlib.json) — chosen over per-module aggregation +// because it maps directly to the `#include <header>` directives the engine already tracks +// (see Phase 1 c_module.go BuildCIncludeMap). +// +// One type serves both C and C++. The C++-specific fields (Classes, FreeFunctions, +// Namespaces) are tagged omitempty so the C variants emit clean output without them. +// +//nolint:tagliatelle // JSON tags match the registry format on disk (snake_case).
+type CStdlibHeader struct { + SchemaVersion string `json:"schema_version"` + Header string `json:"header"` + ModuleID string `json:"module_id"` + Language string `json:"language"` + Platform string `json:"platform"` + SystemTag string `json:"system_tag"` + GeneratedAt string `json:"generated_at"` + Functions map[string]*CStdlibFunction `json:"functions,omitempty"` + Typedefs map[string]*CStdlibTypedef `json:"typedefs,omitempty"` + Constants map[string]*CStdlibConstant `json:"constants,omitempty"` + Classes map[string]*CppStdlibClass `json:"classes,omitempty"` + Namespaces []string `json:"namespaces,omitempty"` + FreeFunctions map[string]*CStdlibFunction `json:"free_functions,omitempty"` +} + +// NewCStdlibHeader creates an initialized header with all maps pre-allocated. The +// caller is expected to set SchemaVersion, Header, ModuleID, Language, Platform, +// SystemTag, and GeneratedAt before populating the symbol maps. +func NewCStdlibHeader() *CStdlibHeader { + return &CStdlibHeader{ + Functions: make(map[string]*CStdlibFunction), + Typedefs: make(map[string]*CStdlibTypedef), + Constants: make(map[string]*CStdlibConstant), + Classes: make(map[string]*CppStdlibClass), + FreeFunctions: make(map[string]*CStdlibFunction), + } +} + +// CStdlibFunction is the in-memory record for a C function, a C++ free function, or +// a C++ class method. The shape is identical for all three because every consumer +// (resolver, security rule, type-info propagator) wants the same fields. The FQN +// distinguishes the call form: bare ("printf"), namespaced ("std::move"), or +// class-qualified ("std::vector::push_back"). +// +//nolint:tagliatelle // JSON tags match the registry format on disk (snake_case). 
+type CStdlibFunction struct { + FQN string `json:"fqn"` + ReturnType string `json:"return_type"` + Params []*CStdlibParam `json:"params"` + Confidence float32 `json:"confidence"` + Source string `json:"source"` + SecurityTag string `json:"security_tag,omitempty"` + Attribute string `json:"attribute,omitempty"` + Throws string `json:"throws,omitempty"` +} + +// CStdlibParam is a single parameter in a function signature. Variadic positions +// are encoded as a synthetic param with Name="..." and Type="variadic" — the engine +// recognises this convention and skips type checking against variadic positions. +// +//nolint:tagliatelle // JSON tags match the registry format on disk (snake_case). +type CStdlibParam struct { + Name string `json:"name"` + Type string `json:"type"` + Required bool `json:"required"` + Attribute string `json:"attribute,omitempty"` +} + +// CStdlibTypedef is a single typedef entry. PlatformSpecific is set when the +// underlying type changes across platforms (e.g. size_t differs between glibc and +// MSVCRT) — the loader uses this to decide whether to surface platform-specific +// values when the analyzer is run against a single-platform build. +// +//nolint:tagliatelle // JSON tags match the registry format on disk (snake_case). +type CStdlibTypedef struct { + Type string `json:"type"` + PlatformSpecific bool `json:"platform_specific"` + Source string `json:"source"` +} + +// CStdlibConstant is a #define or const-expression value extracted from a header. +// Value is the literal text from the header (e.g. "-1" for EOF, "8192" for BUFSIZ); +// it may be empty for symbolic constants whose body is not a single literal. +// +//nolint:tagliatelle // JSON tags match the registry format on disk (snake_case). +type CStdlibConstant struct { + Type string `json:"type"` + Value string `json:"value,omitempty"` + Source string `json:"source"` +} + +// CppStdlibClass is the in-memory record for a C++ class or class template +// (e.g. 
std::vector, std::basic_string). TypeParams carries the template parameter +// names declared on the class — the resolver uses them to substitute concrete types +// into method return types ("T&" with receiver `std::vector<int>` → "int&"). +// +//nolint:tagliatelle // JSON tags match the registry format on disk (snake_case). +type CppStdlibClass struct { + FQN string `json:"fqn"` + TypeParams []string `json:"type_params,omitempty"` + DefaultTemplateArgs map[string]string `json:"default_template_args,omitempty"` + Methods map[string]*CStdlibFunction `json:"methods"` + Constructors []*CppStdlibConstructor `json:"constructors,omitempty"` +} + +// NewCppStdlibClass creates a class with the given FQN and pre-allocated Methods and DefaultTemplateArgs maps. +func NewCppStdlibClass(fqn string) *CppStdlibClass { + return &CppStdlibClass{ + FQN: fqn, + DefaultTemplateArgs: make(map[string]string), + Methods: make(map[string]*CStdlibFunction), + } +} + +// CppStdlibConstructor is one constructor overload. Constructors share Params and +// Source semantics with CStdlibFunction but have no return type, no throws annotation, +// and no FQN of their own (the FQN is the class's, by convention). +// +//nolint:tagliatelle // JSON tags match the registry format on disk (snake_case). +type CppStdlibConstructor struct { + Params []*CStdlibParam `json:"params"` + Source string `json:"source"` +} + +// Source-field constants. The Source field on any extracted entry must be exactly +// one of these three values. The generator (PR-01) sets it during extract+merge; +// the resolver (PR-02) uses it as a confidence signal alongside the Confidence +// float (header < merged < overlay, in terms of curator-vetted reliability). +const ( + // SourceHeader marks an entry that came purely from tree-sitter parsing. + SourceHeader = "header" + // SourceOverlay marks an entry that exists only in the YAML overlay — no + // matching header-extracted entry was present.
+ SourceOverlay = "overlay" + // SourceMerged marks an entry that was present in both the header parse and + // the overlay; the overlay's values won on every conflicting field. + SourceMerged = "merged" +) + +// Language constants, used in CStdlibManifest.Language and CStdlibHeader.Language. +const ( + LanguageC = "c" + LanguageCpp = "cpp" +) + +// Platform constants, used in CStdlibManifest.Platform and CStdlibHeader.Platform. +const ( + PlatformLinux = "linux" + PlatformWindows = "windows" + PlatformDarwin = "darwin" +) diff --git a/sast-engine/graph/callgraph/core/clike_stdlib_types_test.go b/sast-engine/graph/callgraph/core/clike_stdlib_types_test.go new file mode 100644 index 00000000..83516008 --- /dev/null +++ b/sast-engine/graph/callgraph/core/clike_stdlib_types_test.go @@ -0,0 +1,288 @@ +package core + +import ( + "encoding/json" + "testing" + + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" +) + +func TestNewCStdlibRegistryPrealloc(t *testing.T) { + r := NewCStdlibRegistry() + require.NotNil(t, r) + require.NotNil(t, r.Headers) + assert.Equal(t, 0, r.HeaderCount()) +} + +func TestCStdlibRegistry_HeaderAccessors(t *testing.T) { + r := NewCStdlibRegistry() + assert.False(t, r.HasHeader("stdio.h")) + assert.Nil(t, r.GetHeader("stdio.h")) + + h := NewCStdlibHeader() + h.Header = "stdio.h" + r.Headers["stdio.h"] = h + + assert.True(t, r.HasHeader("stdio.h")) + assert.Equal(t, h, r.GetHeader("stdio.h")) + assert.Equal(t, 1, r.HeaderCount()) +} + +func TestCStdlibRegistry_GetFunction(t *testing.T) { + r := NewCStdlibRegistry() + // Missing header + assert.Nil(t, r.GetFunction("stdio.h", "printf")) + + h := NewCStdlibHeader() + h.Functions["printf"] = &CStdlibFunction{FQN: "c::stdio::printf"} + h.FreeFunctions["std::move"] = &CStdlibFunction{FQN: "std::move"} + r.Headers["stdio.h"] = h + + // Hits Functions map + got := r.GetFunction("stdio.h", "printf") + require.NotNil(t, got) + assert.Equal(t, "c::stdio::printf", got.FQN) + + // 
Hits FreeFunctions fallback + got = r.GetFunction("stdio.h", "std::move") + require.NotNil(t, got) + assert.Equal(t, "std::move", got.FQN) + + // Missing function in present header + assert.Nil(t, r.GetFunction("stdio.h", "unknown_func")) +} + +func TestCStdlibRegistry_GetClassAndMethod(t *testing.T) { + r := NewCStdlibRegistry() + // Missing header + assert.Nil(t, r.GetClass("vector", "std::vector")) + assert.Nil(t, r.GetMethod("vector", "std::vector", "push_back")) + + h := NewCStdlibHeader() + cls := NewCppStdlibClass("std::vector") + cls.Methods["push_back"] = &CStdlibFunction{FQN: "std::vector::push_back"} + h.Classes["std::vector"] = cls + r.Headers["vector"] = h + + // Class hit + got := r.GetClass("vector", "std::vector") + require.NotNil(t, got) + assert.Equal(t, "std::vector", got.FQN) + + // Method hit + method := r.GetMethod("vector", "std::vector", "push_back") + require.NotNil(t, method) + assert.Equal(t, "std::vector::push_back", method.FQN) + + // Missing class in present header + assert.Nil(t, r.GetClass("vector", "std::string")) + assert.Nil(t, r.GetMethod("vector", "std::string", "c_str")) + + // Missing method in present class + assert.Nil(t, r.GetMethod("vector", "std::vector", "unknown_method")) +} + +func TestCStdlibManifest_HasAndGetHeaderEntry(t *testing.T) { + m := NewCStdlibManifest() + require.NotNil(t, m) + require.NotNil(t, m.Headers) + require.NotNil(t, m.Statistics) + + assert.False(t, m.HasHeader("stdio.h")) + assert.Nil(t, m.GetHeaderEntry("stdio.h")) + + entry := &CStdlibHeaderEntry{ + Header: "stdio.h", + ModuleID: "c::stdio", + File: "stdio_stdlib.json", + URL: "https://assets.codepathfinder.dev/registries/linux/c/v1/stdio_stdlib.json", + Size: 1024, + Checksum: "sha256:abc", + } + m.Headers = append(m.Headers, entry) + + assert.True(t, m.HasHeader("stdio.h")) + assert.Equal(t, entry, m.GetHeaderEntry("stdio.h")) + + // Non-matching name + assert.False(t, m.HasHeader("string.h")) + assert.Nil(t, m.GetHeaderEntry("string.h")) +} 
+ +func TestNewCStdlibHeader_PreallocatesAllMaps(t *testing.T) { + h := NewCStdlibHeader() + require.NotNil(t, h.Functions) + require.NotNil(t, h.Typedefs) + require.NotNil(t, h.Constants) + require.NotNil(t, h.Classes) + require.NotNil(t, h.FreeFunctions) +} + +func TestNewCppStdlibClass_PreallocatesMethodsAndArgs(t *testing.T) { + cls := NewCppStdlibClass("std::vector") + assert.Equal(t, "std::vector", cls.FQN) + require.NotNil(t, cls.Methods) + require.NotNil(t, cls.DefaultTemplateArgs) +} + +// Round-trip a fully populated manifest through JSON. Asserts every field survives. +func TestCStdlibManifest_JSONRoundTrip(t *testing.T) { + m := &CStdlibManifest{ + SchemaVersion: "1.0.0", + RegistryVersion: "v1", + Platform: PlatformLinux, + Language: LanguageC, + SystemTag: "glibc-2.39", + GeneratedAt: "2026-05-04T10:30:00Z", + GeneratorVersion: "1.0.0", + BaseURL: "https://assets.codepathfinder.dev/registries/linux/c/v1", + Headers: []*CStdlibHeaderEntry{ + { + Header: "stdio.h", + ModuleID: "c::stdio", + File: "stdio_stdlib.json", + URL: "https://example/stdio_stdlib.json", + Size: 2048, + Checksum: "sha256:abc123", + }, + }, + Statistics: &CStdlibStatistics{ + TotalHeaders: 1, + TotalFunctions: 42, + TotalClasses: 0, + TotalTypedefs: 7, + TotalConstants: 12, + OverlayOverrides: 3, + }, + } + + data, err := json.Marshal(m) + require.NoError(t, err) + + var got CStdlibManifest + require.NoError(t, json.Unmarshal(data, &got)) + assert.Equal(t, *m, got) +} + +func TestCStdlibHeader_JSONRoundTrip_C(t *testing.T) { + h := &CStdlibHeader{ + SchemaVersion: "1.0.0", + Header: "stdio.h", + ModuleID: "c::stdio", + Language: LanguageC, + Platform: PlatformLinux, + SystemTag: "glibc-2.39", + GeneratedAt: "2026-05-04T10:30:00Z", + Functions: map[string]*CStdlibFunction{ + "printf": { + FQN: "c::stdio::printf", + ReturnType: "int", + Params: []*CStdlibParam{ + {Name: "format", Type: "const char*", Required: true, Attribute: "format(printf, 1, 2)"}, + {Name: "...", Type: 
"variadic", Required: false}, + }, + Confidence: 1.0, + Source: SourceOverlay, + SecurityTag: "format_string_sink", + }, + }, + Typedefs: map[string]*CStdlibTypedef{ + "FILE": {Type: "struct __FILE", PlatformSpecific: false, Source: SourceHeader}, + }, + Constants: map[string]*CStdlibConstant{ + "EOF": {Type: "int", Value: "-1", Source: SourceHeader}, + }, + } + + data, err := json.Marshal(h) + require.NoError(t, err) + + // Confirm omitempty drops Classes / FreeFunctions / Namespaces for C-only headers. + assert.NotContains(t, string(data), `"classes":`) + assert.NotContains(t, string(data), `"free_functions":`) + assert.NotContains(t, string(data), `"namespaces":`) + + var got CStdlibHeader + require.NoError(t, json.Unmarshal(data, &got)) + // Compare the populated symbol maps and the header name field by field (nothing is copied across). + assert.Equal(t, h.Functions, got.Functions) + assert.Equal(t, h.Typedefs, got.Typedefs) + assert.Equal(t, h.Constants, got.Constants) + assert.Equal(t, h.Header, got.Header) +} + +func TestCStdlibHeader_JSONRoundTrip_Cpp(t *testing.T) { + h := &CStdlibHeader{ + SchemaVersion: "1.0.0", + Header: "vector", + ModuleID: "std::vector", + Language: LanguageCpp, + Platform: PlatformLinux, + SystemTag: "libstdc++-13", + GeneratedAt: "2026-05-04T10:30:00Z", + Namespaces: []string{"std"}, + Classes: map[string]*CppStdlibClass{ + "std::vector": { + FQN: "std::vector", + TypeParams: []string{"T", "Allocator"}, + DefaultTemplateArgs: map[string]string{ + "Allocator": "std::allocator", + }, + Methods: map[string]*CStdlibFunction{ + "push_back": { + FQN: "std::vector::push_back", + ReturnType: "void", + Params: []*CStdlibParam{{Name: "value", Type: "const T&", Required: true}}, + Confidence: 1.0, + Source: SourceOverlay, + }, + "at": { + FQN: "std::vector::at", + ReturnType: "T&", + Params: []*CStdlibParam{{Name: "pos", Type: "size_t", Required: true}}, + Confidence: 1.0, + Source: SourceOverlay, + Throws: "std::out_of_range", + }, + }, + Constructors: []*CppStdlibConstructor{
+ {Params: []*CStdlibParam{}, Source: SourceHeader}, + }, + }, + }, + FreeFunctions: map[string]*CStdlibFunction{ + "std::swap": { + FQN: "std::swap", + ReturnType: "void", + Source: SourceHeader, + }, + }, + } + + data, err := json.Marshal(h) + require.NoError(t, err) + + var got CStdlibHeader + require.NoError(t, json.Unmarshal(data, &got)) + assert.Equal(t, h.Classes, got.Classes) + assert.Equal(t, h.FreeFunctions, got.FreeFunctions) + assert.Equal(t, h.Namespaces, got.Namespaces) +} + +func TestSourceConstants_AreDistinct(t *testing.T) { + assert.NotEqual(t, SourceHeader, SourceOverlay) + assert.NotEqual(t, SourceOverlay, SourceMerged) + assert.NotEqual(t, SourceHeader, SourceMerged) + assert.Equal(t, "header", SourceHeader) + assert.Equal(t, "overlay", SourceOverlay) + assert.Equal(t, "merged", SourceMerged) +} + +func TestLanguageAndPlatformConstants(t *testing.T) { + assert.Equal(t, "c", LanguageC) + assert.Equal(t, "cpp", LanguageCpp) + assert.Equal(t, "linux", PlatformLinux) + assert.Equal(t, "windows", PlatformWindows) + assert.Equal(t, "darwin", PlatformDarwin) +} From 9029c1f26940ed2d5d4f8a489b8437405097beaa Mon Sep 17 00:00:00 2001 From: shivasurya Date: Sun, 3 May 2026 17:46:05 -0400 Subject: [PATCH 2/3] feat(tools): clikeextract generator package MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Adds tools/internal/clikeextract — the Go package that walks installed C/C++ system headers and emits per-header JSON registry files. Mirrors the existing tools/internal/goextract layout (thin entry-point binary plus a fully-tested internal package), one concern per file: - doc.go — package docs (pipeline overview, reuse rules) - config.go — Config + GeneratorVersion / SchemaVersion / RegistryVersion / DefaultBaseURL constants. Validate() rejects unsupported targets and languages early. 
- normalize.go — strip __attribute__((...)), _GLIBCXX_*, _LIBCPP_*, __THROW, _Nonnull etc.; canonicalize std::__cxx11:: -> std::; private- symbol detection (single-underscore lowercase, double-underscore); SanitizeHeaderName for output filenames. - walker.go — DiscoverHeaderSources for linux/c (glibc) and linux/cpp (libstdc++), system-tag detection, deterministic header walking with bits/ / internal/ skip rules. Windows/Darwin paths return an explicit PR-03-deferred error so the surface is forward-compatible. - overlay.go — yaml.v3-based loader for c_stdlib_overlay.yaml / cpp_stdlib_overlay.yaml. Validates language match, exactly-one-of function/method/typedef/constant, and skip-rule shape. MergeOverlay applies overrides in place and returns the count for statistics. - c_extractor.go — C function / typedef / preproc-def extraction over the tree-sitter AST. Reuses graph/clike helpers (ExtractFunctionInfo, ExtractTypeString, ExtractParameters). Conservative #define handling: emit constants only when the body parses as a literal. - cpp_extractor.go — C++ classes (with template_parameter_list capture), methods, namespace-qualified free functions, constructors. Adds a local findFunctionDeclarator that handles C++ reference_declarator wrappers (Phase 1's clike helper does not — this is reachable for T&-returning methods like vector::operator[] / vector::at). - emitter.go — per-header JSON write with sha256 checksums, statistics tally, top-level manifest.json. Output is deterministic across runs (sorted by header name) and idempotent. - extractor.go — Run() orchestrator stitching discover -> walk -> extract -> merge -> emit. Continue-on-parse-failure pattern matches goextract; fatal errors only on missing search dirs, invalid overlay, unwritable output dir. testdata/c/{stdio.h,string.h,unistd.h,inline.h} and testdata/cpp/{vector,string,utility} provide synthetic fixture headers for unit + integration tests. End-to-end TestRunFixtureLinux{C,Cpp} exercise the full pipeline. 
Coverage: 91.5% on the new package across 99 test cases. Remaining gaps are defensive nil paths and tree-sitter shapes the synthetic fixtures don't reach (operator_name, destructor_name fallbacks). Co-Authored-By: Claude Opus 4.7 (1M context) --- .../internal/clikeextract/c_extractor.go | 397 +++++++++++++ .../internal/clikeextract/c_extractor_test.go | 273 +++++++++ .../tools/internal/clikeextract/config.go | 95 +++ .../internal/clikeextract/config_test.go | 88 +++ .../internal/clikeextract/cpp_extractor.go | 552 ++++++++++++++++++ .../clikeextract/cpp_extractor_test.go | 226 +++++++ .../tools/internal/clikeextract/doc.go | 43 ++ .../tools/internal/clikeextract/emitter.go | 155 +++++ .../internal/clikeextract/emitter_test.go | 218 +++++++ .../tools/internal/clikeextract/extractor.go | 101 ++++ .../internal/clikeextract/extractor_test.go | 234 ++++++++ .../tools/internal/clikeextract/normalize.go | 184 ++++++ .../internal/clikeextract/normalize_test.go | 133 +++++ .../tools/internal/clikeextract/overlay.go | 475 +++++++++++++++ .../internal/clikeextract/overlay_test.go | 535 +++++++++++++++++ .../internal/clikeextract/testdata/c/inline.h | 34 ++ .../internal/clikeextract/testdata/c/stdio.h | 34 ++ .../internal/clikeextract/testdata/c/string.h | 23 + .../internal/clikeextract/testdata/c/unistd.h | 19 + .../internal/clikeextract/testdata/cpp/string | 28 + .../clikeextract/testdata/cpp/utility | 30 + .../internal/clikeextract/testdata/cpp/vector | 34 ++ .../tools/internal/clikeextract/walker.go | 301 ++++++++++ .../internal/clikeextract/walker_test.go | 318 ++++++++++ 24 files changed, 4530 insertions(+) create mode 100644 sast-engine/tools/internal/clikeextract/c_extractor.go create mode 100644 sast-engine/tools/internal/clikeextract/c_extractor_test.go create mode 100644 sast-engine/tools/internal/clikeextract/config.go create mode 100644 sast-engine/tools/internal/clikeextract/config_test.go create mode 100644 
sast-engine/tools/internal/clikeextract/cpp_extractor.go create mode 100644 sast-engine/tools/internal/clikeextract/cpp_extractor_test.go create mode 100644 sast-engine/tools/internal/clikeextract/doc.go create mode 100644 sast-engine/tools/internal/clikeextract/emitter.go create mode 100644 sast-engine/tools/internal/clikeextract/emitter_test.go create mode 100644 sast-engine/tools/internal/clikeextract/extractor.go create mode 100644 sast-engine/tools/internal/clikeextract/extractor_test.go create mode 100644 sast-engine/tools/internal/clikeextract/normalize.go create mode 100644 sast-engine/tools/internal/clikeextract/normalize_test.go create mode 100644 sast-engine/tools/internal/clikeextract/overlay.go create mode 100644 sast-engine/tools/internal/clikeextract/overlay_test.go create mode 100644 sast-engine/tools/internal/clikeextract/testdata/c/inline.h create mode 100644 sast-engine/tools/internal/clikeextract/testdata/c/stdio.h create mode 100644 sast-engine/tools/internal/clikeextract/testdata/c/string.h create mode 100644 sast-engine/tools/internal/clikeextract/testdata/c/unistd.h create mode 100644 sast-engine/tools/internal/clikeextract/testdata/cpp/string create mode 100644 sast-engine/tools/internal/clikeextract/testdata/cpp/utility create mode 100644 sast-engine/tools/internal/clikeextract/testdata/cpp/vector create mode 100644 sast-engine/tools/internal/clikeextract/walker.go create mode 100644 sast-engine/tools/internal/clikeextract/walker_test.go diff --git a/sast-engine/tools/internal/clikeextract/c_extractor.go b/sast-engine/tools/internal/clikeextract/c_extractor.go new file mode 100644 index 00000000..62d6279a --- /dev/null +++ b/sast-engine/tools/internal/clikeextract/c_extractor.go @@ -0,0 +1,397 @@ +package clikeextract + +import ( + "context" + "fmt" + "os" + "strings" + + "github.com/shivasurya/code-pathfinder/sast-engine/graph/callgraph/core" + "github.com/shivasurya/code-pathfinder/sast-engine/graph/clike" + clang 
"github.com/smacker/go-tree-sitter/c" + sitter "github.com/smacker/go-tree-sitter" +) + +// extractCHeader parses a single C header file and returns its symbol table +// as a *core.CStdlibHeader. Every entry is stamped with Source="header" — +// the overlay merge step (overlay.go) is responsible for promoting entries to +// "merged" or "overlay" status afterwards. +// +// Parse errors are returned as errors, not silently swallowed: callers (the +// orchestrator in extractor.go) decide whether to log-and-continue or abort +// the run. tree-sitter is forgiving and returns a (partial) parse tree even +// on syntax errors, so the most common return path is "(*CStdlibHeader, nil)" +// even for headers with macro magic the grammar can't fully digest. +func extractCHeader(file HeaderFile, src HeaderSource) (*core.CStdlibHeader, error) { + source, err := os.ReadFile(file.Path) //nolint:gosec // file path is from filepath.WalkDir, not user input + if err != nil { + return nil, fmt.Errorf("extractCHeader: reading %q: %w", file.Path, err) + } + + parser := sitter.NewParser() + parser.SetLanguage(clang.GetLanguage()) + tree, err := parser.ParseCtx(context.Background(), nil, source) + if err != nil { + return nil, fmt.Errorf("extractCHeader: parsing %q: %w", file.Path, err) + } + defer tree.Close() + + h := newCHeader(file, src) + walkCRoot(tree.RootNode(), source, h) + return h, nil +} + +// newCHeader allocates a fresh CStdlibHeader with the metadata fields filled +// in — SchemaVersion, Header, ModuleID, Language, Platform, SystemTag do not +// depend on which symbols the file contains, so they are pre-populated before AST +// walking starts. GeneratedAt is not set here (NOTE(review): presumably stamped later — confirm).
+func newCHeader(file HeaderFile, src HeaderSource) *core.CStdlibHeader { + h := core.NewCStdlibHeader() + h.SchemaVersion = SchemaVersion + h.Header = file.Name + h.ModuleID = "c::" + SanitizeHeaderName(file.Name) + h.Language = src.Language + h.Platform = src.Platform + h.SystemTag = src.SystemTag + return h +} + +// walkCRoot visits every direct child of a translation_unit node and dispatches +// to the appropriate extractor. The C grammar nests most declarations directly +// under the root, so a one-level walk is sufficient — only `linkage_specification` +// (for `extern "C"`) and `preproc_if` (conditional sections) need recursive +// descent, and we handle them inline. +func walkCRoot(root *sitter.Node, source []byte, h *core.CStdlibHeader) { + if root == nil { + return + } + for i := 0; i < int(root.NamedChildCount()); i++ { + child := root.NamedChild(i) + if child == nil { + continue + } + walkCNode(child, source, h) + } +} + +// walkCNode dispatches one AST node to the matching extractor. Shape types +// not relevant to the public registry (pragmas, line directives, comments) +// are silently ignored. +func walkCNode(node *sitter.Node, source []byte, h *core.CStdlibHeader) { + switch node.Type() { + case "function_definition": + extractCFunction(node, source, h) + case "declaration": + extractCDeclaration(node, source, h) + case "type_definition": + extractCTypedef(node, source, h) + case "preproc_def": + extractCPreprocDef(node, source, h) + case "preproc_if", "preproc_ifdef", "preproc_else", "preproc_elif": + // Conditional blocks: walk their bodies as if they were the outer + // scope. Tree-sitter's grammar nests the body as named children + // directly under the conditional node. + for i := 0; i < int(node.NamedChildCount()); i++ { + if c := node.NamedChild(i); c != nil { + walkCNode(c, source, h) + } + } + case "linkage_specification": + // `extern "C" { ... 
}` blocks are unusual in pure C headers but + // occasionally appear when a header is shared with C++ users. + // Walk the inner body the same way. + body := node.ChildByFieldName("body") + if body == nil { + return + } + for i := 0; i < int(body.NamedChildCount()); i++ { + if c := body.NamedChild(i); c != nil { + walkCNode(c, source, h) + } + } + } +} + +// extractCFunction handles a function_definition node — uncommon in headers +// (most just declare prototypes) but legal for inline definitions in glibc +// and libc++. Reuses clike.ExtractFunctionInfo which returns a normalised +// FunctionInfo regardless of whether the body is present. +func extractCFunction(node *sitter.Node, source []byte, h *core.CStdlibHeader) { + info := clike.ExtractFunctionInfo(node, source) + if info == nil || info.Name == "" { + return + } + if IsPrivateSymbol(info.Name) { + return + } + h.Functions[info.Name] = makeFunctionFromInfo(h, info) +} + +// extractCDeclaration handles a `declaration` node, which is the typical shape +// for header function prototypes (`int printf(const char* fmt, ...);`). The +// declaration's declarator chain may be: +// +// - function_declarator → bare prototype +// - pointer_declarator(function_declarator)→ function returning a pointer +// - identifier → variable declaration (skipped here) +// +// Only the function-prototype form is registered as a CStdlibFunction. +// Plain variable declarations at the top of headers (rare; e.g. `extern int +// errno;`) are skipped — they belong to a future "globals" extension. 
+func extractCDeclaration(node *sitter.Node, source []byte, h *core.CStdlibHeader) { + declarator := node.ChildByFieldName("declarator") + funcDecl := unwrapToFunctionDeclaratorPublic(declarator) + if funcDecl == nil { + return + } + info := clike.ExtractFunctionInfo(node, source) + if info == nil || info.Name == "" { + return + } + if IsPrivateSymbol(info.Name) { + return + } + h.Functions[info.Name] = makeFunctionFromInfo(h, info) +} + +// extractCTypedef converts a `type_definition` node into a CStdlibTypedef +// entry. The C grammar exposes the underlying type as the `type` field and +// the new name as the `declarator` field's identifier, with optional +// pointer/array wrappers we must walk through. +func extractCTypedef(node *sitter.Node, source []byte, h *core.CStdlibHeader) { + declarator := node.ChildByFieldName("declarator") + name := unwrapDeclaratorIdentifier(declarator, source) + if name == "" || IsPrivateSymbol(name) { + return + } + typeNode := node.ChildByFieldName("type") + underlying := clike.ExtractTypeString(typeNode, declarator, source) + h.Typedefs[name] = &core.CStdlibTypedef{ + Type: NormalizeType(underlying), + Source: core.SourceHeader, + } +} + +// extractCPreprocDef handles `#define NAME value` lines. Only literals (integer, +// negative integer, string, character, hex/octal) are emitted; expression bodies +// and function-like macros are skipped. The conservative cut keeps the output +// trustworthy: registry consumers can rely on `Value` being a literal that round- +// trips through `strconv` or pure string comparison. 
+func extractCPreprocDef(node *sitter.Node, source []byte, h *core.CStdlibHeader) { + nameNode := node.ChildByFieldName("name") + valueNode := node.ChildByFieldName("value") + if nameNode == nil { + return + } + name := nameNode.Content(source) + if IsPrivateSymbol(name) { + return + } + + value := "" + typ := "" + if valueNode != nil { + value = strings.TrimSpace(valueNode.Content(source)) + // Drop comments and trailing semicolons that sometimes sneak into + // the right-hand side via tree-sitter's lossy macro tokenization. + value = stripTrailingComment(value) + typ = inferConstantType(value) + if typ == "" { + // Non-literal body: skip rather than emit something we can't + // give a type to. Future overlay extension can backfill. + return + } + } + + h.Constants[name] = &core.CStdlibConstant{ + Type: typ, + Value: value, + Source: core.SourceHeader, + } +} + +// makeFunctionFromInfo converts a clike.FunctionInfo into a core.CStdlibFunction +// with the canonical FQN and Source="header". Variadic positions in the +// FunctionInfo (`ParamNames[i] == "..."`) are translated into the registry's +// variadic convention (CStdlibParam{Name: "...", Type: "variadic", Required: +// false}). +func makeFunctionFromInfo(h *core.CStdlibHeader, info *clike.FunctionInfo) *core.CStdlibFunction { + params := make([]*core.CStdlibParam, 0, len(info.ParamNames)) + for i, name := range info.ParamNames { + typ := "" + if i < len(info.ParamTypes) { + typ = info.ParamTypes[i] + } + if name == "..." || typ == "..." 
{ + params = append(params, &core.CStdlibParam{Name: "...", Type: "variadic", Required: false}) + continue + } + params = append(params, &core.CStdlibParam{ + Name: name, + Type: NormalizeType(typ), + Required: true, + }) + } + + return &core.CStdlibFunction{ + FQN: h.ModuleID + "::" + info.Name, + ReturnType: NormalizeType(info.ReturnType), + Params: params, + Confidence: 1.0, + Source: core.SourceHeader, + } +} + +// unwrapToFunctionDeclaratorPublic walks past pointer wrappers and returns the +// inner function_declarator, or nil if there isn't one. Mirrors the unexported +// helper in graph/clike/declarations.go but reachable here. +func unwrapToFunctionDeclaratorPublic(declarator *sitter.Node) *sitter.Node { + for cur := declarator; cur != nil; { + if cur.Type() == "function_declarator" { + return cur + } + next := cur.ChildByFieldName("declarator") + if next == nil { + return nil + } + cur = next + } + return nil +} + +// unwrapDeclaratorIdentifier walks down a declarator chain and returns the +// bare identifier name, stripping pointer / array / reference wrappers. +// Returns "" if no identifier is reachable. +// +// `primitive_type` is included in the leaf set because tree-sitter's C grammar +// reclassifies recognised typedef names (size_t, ssize_t, pid_t, …) as +// primitive types via a built-in lookup table. The token spelling is still +// a valid identifier; we just can't rely on the AST shape alone to tell us +// "this is the new name". 
// stripTrailingComment removes a `// ...` or `/* ... */` tail that some
// preprocessor definitions carry alongside the value, then trims trailing
// spaces and tabs. The string is cut at the first comment opener that
// appears outside a string or character literal; everything from that point
// on is discarded.
//
// Comment openers inside literals are left alone, so values such as
// "http://example.com" or '/' survive intact. Backslash escapes inside a
// literal are honoured, which keeps an escaped quote from ending the literal
// early. If a literal is never closed, no cut is made.
func stripTrailingComment(s string) string {
	var quote byte // 0 outside a literal, otherwise the opening quote byte
	for i := 0; i < len(s); i++ {
		c := s[i]
		switch {
		case quote != 0:
			if c == '\\' {
				i++ // skip the escaped byte, whatever it is
			} else if c == quote {
				quote = 0
			}
		case c == '"' || c == '\'':
			quote = c
		case c == '/' && i+1 < len(s) && (s[i+1] == '/' || s[i+1] == '*'):
			return strings.TrimRight(s[:i], " \t")
		}
	}
	return strings.TrimRight(s, " \t")
}
// stripBalancedOuterParens removes redundant `(…)` wrapping: while the first
// character is a `(` whose matching `)` is the very last character, that
// pair is dropped and the remainder is whitespace-trimmed and re-examined.
//
// Examples:
//
//	(-1)        → -1
//	((42))      → 42
//	((void*)0)  → (void*)0   (only the outer pair wraps the whole string)
//	(a)+(b)     → (a)+(b)    (the first `(` closes before the end)
//
// Parentheses inside string or character literals are not treated as
// grouping, and input whose leading `(` is never closed is returned
// unchanged apart from whitespace trimming.
func stripBalancedOuterParens(s string) string {
	for {
		s = strings.TrimSpace(s)
		if len(s) < 2 || s[0] != '(' || s[len(s)-1] != ')' {
			return s
		}
		if !outerParensWrapAll(s) {
			return s
		}
		s = s[1 : len(s)-1]
	}
}

// outerParensWrapAll reports whether the `(` at s[0] is closed by the `)` at
// the last byte of s, ignoring parentheses inside string/char literals.
func outerParensWrapAll(s string) bool {
	depth := 0
	var quote byte // 0 outside a literal, otherwise the opening quote byte
	for i := 0; i < len(s); i++ {
		c := s[i]
		if quote != 0 {
			if c == '\\' {
				i++ // skip the escaped byte
			} else if c == quote {
				quote = 0
			}
			continue
		}
		switch c {
		case '"', '\'':
			quote = c
		case '(':
			depth++
		case ')':
			depth--
			if depth == 0 {
				// The opening paren has just been closed; it wraps the
				// whole string only if this is the final byte.
				return i == len(s)-1
			}
		}
	}
	return false
}
+ for _, r := range s[i:] { + if r < '0' || r > '9' { + return false + } + } + return true +} diff --git a/sast-engine/tools/internal/clikeextract/c_extractor_test.go b/sast-engine/tools/internal/clikeextract/c_extractor_test.go new file mode 100644 index 00000000..df34c8c4 --- /dev/null +++ b/sast-engine/tools/internal/clikeextract/c_extractor_test.go @@ -0,0 +1,273 @@ +package clikeextract + +import ( + "path/filepath" + "testing" + + "github.com/shivasurya/code-pathfinder/sast-engine/graph/callgraph/core" + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" +) + +const cFixtureDir = "testdata/c" + +func cTestSource() HeaderSource { + return HeaderSource{ + Platform: core.PlatformLinux, + Language: core.LanguageC, + SearchDirs: []string{cFixtureDir}, + HeaderExts: []string{".h"}, + SystemTag: "glibc-test", + } +} + +func TestExtractCHeader_Stdio(t *testing.T) { + src := cTestSource() + hf := HeaderFile{Name: "stdio.h", Path: filepath.Join(cFixtureDir, "stdio.h")} + + h, err := extractCHeader(hf, src) + require.NoError(t, err) + require.NotNil(t, h) + + // Header metadata + assert.Equal(t, "stdio.h", h.Header) + assert.Equal(t, "c::stdio", h.ModuleID) + assert.Equal(t, core.LanguageC, h.Language) + assert.Equal(t, core.PlatformLinux, h.Platform) + assert.Equal(t, "glibc-test", h.SystemTag) + assert.Equal(t, SchemaVersion, h.SchemaVersion) + + // Spot-check a function with variadic params. + printf := h.Functions["printf"] + require.NotNil(t, printf, "printf must extract") + assert.Equal(t, "c::stdio::printf", printf.FQN) + assert.Equal(t, "int", printf.ReturnType) + assert.Equal(t, core.SourceHeader, printf.Source) + require.Len(t, printf.Params, 2) + assert.Equal(t, "format", printf.Params[0].Name) + assert.Equal(t, "const char*", printf.Params[0].Type) + assert.Equal(t, "...", printf.Params[1].Name) + assert.Equal(t, "variadic", printf.Params[1].Type) + assert.False(t, printf.Params[1].Required) + + // Function returning a pointer. 
+ fopen := h.Functions["fopen"] + require.NotNil(t, fopen) + assert.Equal(t, "FILE*", fopen.ReturnType) + + // Functions with multiple plain params. + fread := h.Functions["fread"] + require.NotNil(t, fread) + require.Len(t, fread.Params, 4) + assert.Equal(t, "void*", fread.Params[0].Type) + assert.Equal(t, "FILE*", fread.Params[3].Type) + + // Private symbols (leading _ / __) must be skipped. + assert.Nil(t, h.Functions["_internal_buffer_flush"]) + assert.Nil(t, h.Functions["__builtin_printf_check"]) + + // Typedefs. + require.Contains(t, h.Typedefs, "FILE") + assert.Equal(t, "struct __FILE", h.Typedefs["FILE"].Type) + require.Contains(t, h.Typedefs, "fpos_t") + + // Constants. + eof := h.Constants["EOF"] + require.NotNil(t, eof) + assert.Equal(t, "int", eof.Type) + assert.Equal(t, "(-1)", eof.Value) + assert.Equal(t, core.SourceHeader, eof.Source) + + bufsiz := h.Constants["BUFSIZ"] + require.NotNil(t, bufsiz) + assert.Equal(t, "int", bufsiz.Type) + assert.Equal(t, "8192", bufsiz.Value) + + defaultPath := h.Constants["DEFAULT_PATH"] + require.NotNil(t, defaultPath) + assert.Equal(t, "const char*", defaultPath.Type) +} + +func TestExtractCHeader_String(t *testing.T) { + src := cTestSource() + hf := HeaderFile{Name: "string.h", Path: filepath.Join(cFixtureDir, "string.h")} + + h, err := extractCHeader(hf, src) + require.NoError(t, err) + + for _, name := range []string{"memcpy", "strcpy", "strlen", "strchr", "memcmp"} { + assert.NotNil(t, h.Functions[name], "expected function %q", name) + } + require.Contains(t, h.Typedefs, "size_t") + // `((void*)0)` is not a literal — NULL constant gets dropped because + // inferConstantType returns "" for it. This is the conservative-cut + // behaviour we documented in the package: we'd rather be silent than + // emit untyped constants. 
+ assert.Nil(t, h.Constants["NULL"]) +} + +func TestExtractCHeader_Unistd_PointerArrayParam(t *testing.T) { + src := cTestSource() + hf := HeaderFile{Name: "unistd.h", Path: filepath.Join(cFixtureDir, "unistd.h")} + + h, err := extractCHeader(hf, src) + require.NoError(t, err) + + // Function with `void` parameter list (fork(void)). + fork := h.Functions["fork"] + require.NotNil(t, fork) + assert.Equal(t, "pid_t", fork.ReturnType) + // `void` is a single explicit-empty param in tree-sitter; we accept the + // flat representation and don't try to special-case it. + assert.LessOrEqual(t, len(fork.Params), 1) + + // Function whose parameter is `char* const argv[]` — type tracking should + // at least capture the array part. Exact form is grammar-dependent so + // we just check it's non-empty. + exec := h.Functions["execvp"] + require.NotNil(t, exec) + assert.GreaterOrEqual(t, len(exec.Params), 2) +} + +func TestExtractCHeader_Inline(t *testing.T) { + src := cTestSource() + hf := HeaderFile{Name: "inline.h", Path: filepath.Join(cFixtureDir, "inline.h")} + + h, err := extractCHeader(hf, src) + require.NoError(t, err) + + // Inline function (function_definition) — exercises extractCFunction. + abs := h.Functions["abs_diff"] + require.NotNil(t, abs, "extractCFunction must capture inline definitions") + assert.Equal(t, "int", abs.ReturnType) + + // preproc_ifdef + preproc_else branches both walked. + assert.NotNil(t, h.Functions["foo_a"]) + assert.NotNil(t, h.Functions["foo_b"]) + + // Pointer typedef. 
+ require.Contains(t, h.Typedefs, "int_ptr") + assert.Equal(t, "int*", h.Typedefs["int_ptr"].Type) +} + +func TestExtractCHeader_FileNotFound(t *testing.T) { + src := cTestSource() + _, err := extractCHeader(HeaderFile{Name: "absent.h", Path: "/nope/absent.h"}, src) + require.Error(t, err) + assert.Contains(t, err.Error(), "reading") +} + +func TestStripTrailingComment(t *testing.T) { + tests := []struct { + in, want string + }{ + {"42", "42"}, + {"42 // line comment", "42"}, + {"42 /* block */", "42"}, + {"42 // c1\n", "42"}, + {"42 ", "42"}, + {"", ""}, + } + for _, tt := range tests { + t.Run(tt.in, func(t *testing.T) { + assert.Equal(t, tt.want, stripTrailingComment(tt.in)) + }) + } +} + +func TestInferConstantType(t *testing.T) { + tests := []struct { + in, want string + }{ + {"", ""}, + {"42", "int"}, + {"-1", "int"}, + {"(-1)", "int"}, + {"((42))", "int"}, + {"0x1F", "int"}, + {"077", "int"}, + {`"hello"`, "const char*"}, + {`'a'`, "char"}, + {`((void*)0)`, ""}, + {`x + 1`, ""}, + {"FOO", ""}, + } + for _, tt := range tests { + t.Run(tt.in, func(t *testing.T) { + assert.Equal(t, tt.want, inferConstantType(tt.in)) + }) + } +} + +func TestIsIntegerLiteral(t *testing.T) { + tests := []struct { + in string + want bool + }{ + {"", false}, + {"42", true}, + {"-1", true}, + {"+5", true}, + {"-", false}, + {"0x1F", true}, + {"0X1f", true}, + {"0x", false}, + {"0xZ", false}, + {"abc", false}, + {"1a", false}, + } + for _, tt := range tests { + t.Run(tt.in, func(t *testing.T) { + assert.Equal(t, tt.want, isIntegerLiteral(tt.in)) + }) + } +} + +func TestStripBalancedOuterParens(t *testing.T) { + tests := []struct { + in, want string + }{ + {"", ""}, + {"42", "42"}, + {"(42)", "42"}, + {"((42))", "42"}, + {"(-1)", "-1"}, + {"((void*)0)", "(void*)0"}, + {"(a)+(b)", "(a)+(b)"}, + {" (5) ", "5"}, + } + for _, tt := range tests { + t.Run(tt.in, func(t *testing.T) { + assert.Equal(t, tt.want, stripBalancedOuterParens(tt.in)) + }) + } +} + +func 
TestUnwrapToFunctionDeclaratorPublic_NilInput(t *testing.T) { + assert.Nil(t, unwrapToFunctionDeclaratorPublic(nil)) +} + +func TestUnwrapDeclaratorIdentifier_NilInput(t *testing.T) { + assert.Equal(t, "", unwrapDeclaratorIdentifier(nil, nil)) +} + +// extractCHeader invariant: returns a fully-allocated header with +// pre-populated maps, even if no symbols are extracted from a body that +// only has comments. +func TestExtractCHeader_EmptyHeader(t *testing.T) { + dir := t.TempDir() + emptyPath := filepath.Join(dir, "empty.h") + mustWriteFile(t, emptyPath, "/* empty */\n") + + src := HeaderSource{ + Platform: core.PlatformLinux, Language: core.LanguageC, + SystemTag: "glibc-test", SearchDirs: []string{dir}, HeaderExts: []string{".h"}, + } + h, err := extractCHeader(HeaderFile{Name: "empty.h", Path: emptyPath}, src) + require.NoError(t, err) + require.NotNil(t, h) + assert.Equal(t, "empty.h", h.Header) + assert.Empty(t, h.Functions) + assert.Empty(t, h.Typedefs) + assert.Empty(t, h.Constants) +} diff --git a/sast-engine/tools/internal/clikeextract/config.go b/sast-engine/tools/internal/clikeextract/config.go new file mode 100644 index 00000000..bace396b --- /dev/null +++ b/sast-engine/tools/internal/clikeextract/config.go @@ -0,0 +1,95 @@ +package clikeextract + +import ( + "fmt" + + "github.com/shivasurya/code-pathfinder/sast-engine/graph/callgraph/core" +) + +// GeneratorVersion is the version of the clikeextract pipeline. Embedded in the +// generated manifest for downstream debugging — when a registry file is misbehaving, +// the consumer can correlate it with the generator that produced it. +const GeneratorVersion = "1.0.0" + +// SchemaVersion is the schema_version stamped on every emitted JSON file. Loaders +// (PR-02) read this and reject anything outside the supported range. Bump only on +// breaking schema changes (renamed/removed fields); additive changes (new fields, +// new headers) stay on the same SchemaVersion. 
+const SchemaVersion = "1.0.0" + +// RegistryVersion is the URL-path segment between language and per-header file +// (e.g. .../linux/c/v1/stdio_stdlib.json). Distinct from SchemaVersion: schema can +// move forward without rev'ing the URL path, and vice versa. +const RegistryVersion = "v1" + +// DefaultBaseURL is the registry root on the production CDN. Used to construct +// per-header URLs in manifest entries. The URL does not have to resolve at +// generation time — the loader fetches at scan time. +const DefaultBaseURL = "https://assets.codepathfinder.dev/registries" + +// Config carries the parameters for one Extractor.Run invocation. One run produces +// one (target, language) pair of registry files; producing the full Linux+Windows+ +// Darwin matrix takes six runs (handled by the GitHub Actions workflow in PR-03). +type Config struct { + // Target is the platform the generated registry is for ("linux", "windows", + // "darwin"). Used to drive header discovery (which dirs to walk) and stamped + // into every emitted file's Platform field. + Target string + + // Language is "c" or "cpp". Drives extractor dispatch and the URL-path + // segment under which the registry is published. + Language string + + // OutputDir is the directory where manifest.json and per-header JSONs are + // written. Created if it does not exist; existing files are overwritten. + OutputDir string + + // OverlayPath is the path to the YAML overlay file that augments tree-sitter + // extraction with hand-curated entries. If empty, the extractor runs without + // an overlay (every entry stays Source="header"). Mismatch between overlay's + // declared language and Config.Language is a hard error at load time. + OverlayPath string + + // BaseURL overrides DefaultBaseURL when stamping per-header URL fields. Tests + // set this to a local file:// path so generated manifests can be replayed + // without hitting the network. 
+ BaseURL string +} + +// Validate reports the first inconsistency in cfg, or nil if cfg is usable. +// Validation is intentionally minimal — most invariants (search dirs exist, +// overlay parses) are enforced later in Run() where the error has more context. +func (c Config) Validate() error { + if c.Target == "" { + return fmt.Errorf("config: Target is required (one of %q, %q, %q)", + core.PlatformLinux, core.PlatformWindows, core.PlatformDarwin) + } + switch c.Target { + case core.PlatformLinux, core.PlatformWindows, core.PlatformDarwin: + default: + return fmt.Errorf("config: unsupported Target %q (allowed: %q, %q, %q)", + c.Target, core.PlatformLinux, core.PlatformWindows, core.PlatformDarwin) + } + + switch c.Language { + case core.LanguageC, core.LanguageCpp: + case "": + return fmt.Errorf("config: Language is required (%q or %q)", core.LanguageC, core.LanguageCpp) + default: + return fmt.Errorf("config: unsupported Language %q (allowed: %q, %q)", + c.Language, core.LanguageC, core.LanguageCpp) + } + + if c.OutputDir == "" { + return fmt.Errorf("config: OutputDir is required") + } + return nil +} + +// effectiveBaseURL returns cfg.BaseURL if set, otherwise DefaultBaseURL. 
+func (c Config) effectiveBaseURL() string { + if c.BaseURL != "" { + return c.BaseURL + } + return DefaultBaseURL +} diff --git a/sast-engine/tools/internal/clikeextract/config_test.go b/sast-engine/tools/internal/clikeextract/config_test.go new file mode 100644 index 00000000..d35acec0 --- /dev/null +++ b/sast-engine/tools/internal/clikeextract/config_test.go @@ -0,0 +1,88 @@ +package clikeextract + +import ( + "testing" + + "github.com/shivasurya/code-pathfinder/sast-engine/graph/callgraph/core" + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" +) + +func TestConfig_Validate(t *testing.T) { + tests := []struct { + name string + cfg Config + wantErr string // empty = expect no error + }{ + { + name: "missing target", + cfg: Config{Language: core.LanguageC, OutputDir: "/tmp/out"}, + wantErr: "Target is required", + }, + { + name: "unsupported target", + cfg: Config{Target: "freebsd", Language: core.LanguageC, OutputDir: "/tmp/out"}, + wantErr: "unsupported Target", + }, + { + name: "missing language", + cfg: Config{Target: core.PlatformLinux, OutputDir: "/tmp/out"}, + wantErr: "Language is required", + }, + { + name: "unsupported language", + cfg: Config{Target: core.PlatformLinux, Language: "rust", OutputDir: "/tmp/out"}, + wantErr: "unsupported Language", + }, + { + name: "missing output dir", + cfg: Config{Target: core.PlatformLinux, Language: core.LanguageC}, + wantErr: "OutputDir is required", + }, + { + name: "valid linux c", + cfg: Config{Target: core.PlatformLinux, Language: core.LanguageC, OutputDir: "/tmp/out"}, + }, + { + name: "valid linux cpp", + cfg: Config{Target: core.PlatformLinux, Language: core.LanguageCpp, OutputDir: "/tmp/out"}, + }, + { + name: "valid windows c", + cfg: Config{Target: core.PlatformWindows, Language: core.LanguageC, OutputDir: "/tmp/out"}, + }, + { + name: "valid darwin cpp", + cfg: Config{Target: core.PlatformDarwin, Language: core.LanguageCpp, OutputDir: "/tmp/out"}, + }, + } + + for _, tt := 
range tests { + t.Run(tt.name, func(t *testing.T) { + err := tt.cfg.Validate() + if tt.wantErr == "" { + require.NoError(t, err) + return + } + require.Error(t, err) + assert.Contains(t, err.Error(), tt.wantErr) + }) + } +} + +func TestConfig_EffectiveBaseURL(t *testing.T) { + cfg := Config{} + assert.Equal(t, DefaultBaseURL, cfg.effectiveBaseURL()) + + cfg.BaseURL = "file:///tmp/registries" + assert.Equal(t, "file:///tmp/registries", cfg.effectiveBaseURL()) +} + +func TestVersionConstants(t *testing.T) { + // Lock the visible version surface — bumping these is a deliberate change, + // the test is the trip-wire that forces the bump to be intentional. + assert.Equal(t, "1.0.0", GeneratorVersion) + assert.Equal(t, "1.0.0", SchemaVersion) + assert.Equal(t, "v1", RegistryVersion) + assert.Equal(t, "https://assets.codepathfinder.dev/registries", DefaultBaseURL) +} diff --git a/sast-engine/tools/internal/clikeextract/cpp_extractor.go b/sast-engine/tools/internal/clikeextract/cpp_extractor.go new file mode 100644 index 00000000..b8117738 --- /dev/null +++ b/sast-engine/tools/internal/clikeextract/cpp_extractor.go @@ -0,0 +1,552 @@ +package clikeextract + +import ( + "context" + "fmt" + "os" + "strings" + + "github.com/shivasurya/code-pathfinder/sast-engine/graph/callgraph/core" + "github.com/shivasurya/code-pathfinder/sast-engine/graph/clike" + cpplang "github.com/smacker/go-tree-sitter/cpp" + sitter "github.com/smacker/go-tree-sitter" +) + +// extractCppHeader parses a single C++ header file and returns its symbol table +// as a *core.CStdlibHeader. Compared to extractCHeader, this walker tracks an +// additional piece of state — the current namespace stack — so methods, classes, +// and free functions are emitted with their fully-qualified name. +// +// Stamping rules: +// +// - Classes go into h.Classes keyed by FQN ("std::vector"). +// - Class methods go into Class.Methods keyed by bare name ("push_back"). 
+// - Free functions inside namespaces go into h.FreeFunctions keyed by FQN +// ("std::move"). +// - Free functions at file scope (rare in headers) go into h.Functions keyed +// by bare name. This is the same map C uses, so a header that mixes C +// and C++ shapes still emits a coherent registry. +// - Top-level namespaces (`std`, `boost`, …) are recorded in h.Namespaces. +func extractCppHeader(file HeaderFile, src HeaderSource) (*core.CStdlibHeader, error) { + source, err := os.ReadFile(file.Path) //nolint:gosec // path is from filepath.WalkDir + if err != nil { + return nil, fmt.Errorf("extractCppHeader: reading %q: %w", file.Path, err) + } + + parser := sitter.NewParser() + parser.SetLanguage(cpplang.GetLanguage()) + tree, err := parser.ParseCtx(context.Background(), nil, source) + if err != nil { + return nil, fmt.Errorf("extractCppHeader: parsing %q: %w", file.Path, err) + } + defer tree.Close() + + h := newCppHeader(file, src) + w := &cppWalker{header: h, source: source} + w.walk(tree.RootNode()) + finaliseNamespaces(h) + return h, nil +} + +func newCppHeader(file HeaderFile, src HeaderSource) *core.CStdlibHeader { + h := core.NewCStdlibHeader() + h.SchemaVersion = SchemaVersion + h.Header = file.Name + h.ModuleID = "std::" + SanitizeHeaderName(file.Name) + h.Language = src.Language + h.Platform = src.Platform + h.SystemTag = src.SystemTag + return h +} + +// cppWalker carries the mutable walk state. namespaceStack is updated on entry +// to namespace_definition and reverted on exit. templateStack carries the most +// recent template_parameter_list — when the next class_specifier or function is +// extracted, it consumes (and clears) the entry. +type cppWalker struct { + header *core.CStdlibHeader + source []byte + + // namespaceStack is the chain of currently-open namespaces, deepest last. + namespaceStack []string + // pendingTemplate is a template_parameter_list node whose enclosing + // declaration has not been seen yet. 
The walker sets this on entering a + // template_declaration and clears it after dispatching the child. + pendingTemplate *sitter.Node +} + +func (w *cppWalker) walk(node *sitter.Node) { + if node == nil { + return + } + switch node.Type() { + case "namespace_definition": + w.walkNamespace(node) + case "template_declaration": + w.walkTemplate(node) + case "class_specifier", "struct_specifier": + w.extractClass(node, w.pendingTemplate) + w.pendingTemplate = nil + case "function_definition": + w.extractFreeFunctionFromDefinition(node) + case "declaration": + w.extractFreeDeclaration(node) + case "type_definition": + w.extractCppTypedef(node) + case "preproc_def": + extractCPreprocDef(node, w.source, w.header) + case "preproc_if", "preproc_ifdef", "preproc_else", "preproc_elif", + "declaration_list", "translation_unit": + w.walkChildren(node) + case "linkage_specification": + body := node.ChildByFieldName("body") + w.walkChildren(body) + } +} + +func (w *cppWalker) walkChildren(node *sitter.Node) { + if node == nil { + return + } + for i := 0; i < int(node.NamedChildCount()); i++ { + w.walk(node.NamedChild(i)) + } +} + +// walkNamespace pushes the namespace name onto the stack, walks the body, then +// pops. Anonymous namespaces (no namespace_identifier child) are skipped — by +// definition they don't contribute symbols visible to other translation units, +// and the registry only describes the public surface. 
+func (w *cppWalker) walkNamespace(node *sitter.Node) { + nameNode := node.ChildByFieldName("name") + if nameNode == nil { + return + } + name := nameNode.Content(w.source) + if IsPrivateNamespace(name) { + return + } + w.namespaceStack = append(w.namespaceStack, name) + defer func() { w.namespaceStack = w.namespaceStack[:len(w.namespaceStack)-1] }() + + body := node.ChildByFieldName("body") + w.walkChildren(body) +} + +// walkTemplate captures the template_parameter_list and dispatches to the next +// child (which is the entity being templated — a class or function). After +// extraction the pendingTemplate is cleared. +func (w *cppWalker) walkTemplate(node *sitter.Node) { + w.pendingTemplate = nil + for i := 0; i < int(node.NamedChildCount()); i++ { + child := node.NamedChild(i) + if child == nil { + continue + } + if child.Type() == "template_parameter_list" { + w.pendingTemplate = child + continue + } + w.walk(child) + } + w.pendingTemplate = nil +} + +// extractClass converts a class_specifier or struct_specifier into a +// CppStdlibClass entry. Methods are pulled from the field_declaration_list; +// constructors are recognised as declarations whose name matches the class. +// Template parameters from the enclosing template_declaration (if any) are +// recorded on the class. 
+func (w *cppWalker) extractClass(node *sitter.Node, tmpl *sitter.Node) { + nameNode := node.ChildByFieldName("name") + if nameNode == nil { + return // anonymous class — no public surface + } + bareName := nameNode.Content(w.source) + if IsPrivateSymbol(bareName) { + return + } + fqn := w.qualified(bareName) + + cls, ok := w.header.Classes[fqn] + if !ok { + cls = core.NewCppStdlibClass(fqn) + w.header.Classes[fqn] = cls + } + + if tmpl != nil { + cls.TypeParams = extractTemplateParamNames(tmpl, w.source) + } + + body := node.ChildByFieldName("body") + if body == nil { + return + } + w.walkClassBody(body, cls, bareName) +} + +// walkClassBody iterates a field_declaration_list and routes each child to the +// right extractor: field_declaration → method or data field; declaration → +// constructor (when its name matches the class). +func (w *cppWalker) walkClassBody(body *sitter.Node, cls *core.CppStdlibClass, className string) { + for i := 0; i < int(body.NamedChildCount()); i++ { + child := body.NamedChild(i) + if child == nil { + continue + } + switch child.Type() { + case "field_declaration": + w.extractMaybeMethod(child, cls) + case "declaration": + w.extractMaybeConstructor(child, cls, className) + case "preproc_if", "preproc_ifdef", "preproc_else", "preproc_elif": + // Conditional inside a class body: walk recursively, treating + // each matching child node as part of the class body. + w.walkClassBody(child, cls, className) + } + } +} + +// extractMaybeMethod inspects a field_declaration. If its declarator is a +// function_declarator (possibly through a pointer / reference wrapper), we +// treat it as a method; otherwise it's a data field and we ignore it. +// +// Implementation note: Phase 1's clike.ExtractFunctionInfo does not see through +// reference_declarator wrappers (it only follows the "declarator" field, which +// reference_declarator does not expose). 
So for member functions returning T& +// (`T& at(...)`, `vector& operator+=(...)`) we extract the name manually via +// findFunctionDeclarator below, then build the FunctionInfo ourselves rather +// than going through clike. +func (w *cppWalker) extractMaybeMethod(node *sitter.Node, cls *core.CppStdlibClass) { + declarator := node.ChildByFieldName("declarator") + funcDecl := findFunctionDeclarator(declarator) + if funcDecl == nil { + return + } + name := functionDeclaratorName(funcDecl, w.source) + if name == "" || IsPrivateSymbol(name) { + return + } + + typeNode := node.ChildByFieldName("type") + returnType := clike.ExtractTypeString(typeNode, declarator, w.source) + paramList := funcDecl.ChildByFieldName("parameters") + pNames, pTypes := clike.ExtractParameters(paramList, w.source) + + info := &clike.FunctionInfo{ + Name: name, + ReturnType: returnType, + ParamNames: pNames, + ParamTypes: pTypes, + } + cls.Methods[name] = w.makeMethod(cls, info) +} + +// extractMaybeConstructor recognises declarations inside a class body whose +// declarator name matches the class name — those are constructors. Anything +// else inside a `declaration` node at class scope is ignored (rare; e.g. +// nested using-declarations). 
+func (w *cppWalker) extractMaybeConstructor(node *sitter.Node, cls *core.CppStdlibClass, className string) { + declarator := node.ChildByFieldName("declarator") + if declarator == nil || declarator.Type() != "function_declarator" { + return + } + innerDecl := declarator.ChildByFieldName("declarator") + if innerDecl == nil || innerDecl.Content(w.source) != className { + return + } + paramList := declarator.ChildByFieldName("parameters") + names, types := clike.ExtractParameters(paramList, w.source) + + params := make([]*core.CStdlibParam, 0, len(names)) + for i, n := range names { + typ := "" + if i < len(types) { + typ = types[i] + } + params = append(params, &core.CStdlibParam{ + Name: n, + Type: NormalizeType(typ), + Required: true, + }) + } + cls.Constructors = append(cls.Constructors, &core.CppStdlibConstructor{ + Params: params, + Source: core.SourceHeader, + }) +} + +// extractCppTypedef is the C++ analog of extractCTypedef but emits typedef +// entries with the namespace prefix when one is active. The registry does not +// currently key typedefs by FQN — they live in a flat map — so namespaced +// typedefs land under their bare name; future schema work may move them to +// FQN-keyed maps. +func (w *cppWalker) extractCppTypedef(node *sitter.Node) { + declarator := node.ChildByFieldName("declarator") + name := unwrapDeclaratorIdentifier(declarator, w.source) + if name == "" || IsPrivateSymbol(name) { + return + } + typeNode := node.ChildByFieldName("type") + underlying := clike.ExtractTypeString(typeNode, declarator, w.source) + w.header.Typedefs[name] = &core.CStdlibTypedef{ + Type: NormalizeType(underlying), + Source: core.SourceHeader, + } +} + +// extractFreeDeclaration handles a top-level (or namespace-scoped) `declaration` +// node — typically a free-function prototype. The result is keyed in either +// h.Functions (file-scope) or h.FreeFunctions (namespaced) depending on whether +// the namespace stack is currently non-empty. 
+func (w *cppWalker) extractFreeDeclaration(node *sitter.Node) { + declarator := node.ChildByFieldName("declarator") + funcDecl := findFunctionDeclarator(declarator) + if funcDecl == nil { + return + } + name := functionDeclaratorName(funcDecl, w.source) + if name == "" || IsPrivateSymbol(name) { + return + } + typeNode := node.ChildByFieldName("type") + returnType := clike.ExtractTypeString(typeNode, declarator, w.source) + paramList := funcDecl.ChildByFieldName("parameters") + pNames, pTypes := clike.ExtractParameters(paramList, w.source) + + info := &clike.FunctionInfo{ + Name: name, + ReturnType: returnType, + ParamNames: pNames, + ParamTypes: pTypes, + } + fn := &core.CStdlibFunction{ + FQN: w.qualified(name), + ReturnType: NormalizeType(info.ReturnType), + Params: paramListFromInfo(info), + Confidence: 1.0, + Source: core.SourceHeader, + } + if len(w.namespaceStack) == 0 { + w.header.Functions[name] = fn + return + } + w.header.FreeFunctions[fn.FQN] = fn +} + +// extractFreeFunctionFromDefinition handles the inline-definition case (rare +// in stdlib headers but used by libstdc++ for trivial wrappers). +func (w *cppWalker) extractFreeFunctionFromDefinition(node *sitter.Node) { + info := clike.ExtractFunctionInfo(node, w.source) + if info == nil || info.Name == "" { + return + } + if IsPrivateSymbol(info.Name) { + return + } + fn := &core.CStdlibFunction{ + FQN: w.qualified(info.Name), + ReturnType: NormalizeType(info.ReturnType), + Params: paramListFromInfo(info), + Confidence: 1.0, + Source: core.SourceHeader, + } + if len(w.namespaceStack) == 0 { + w.header.Functions[info.Name] = fn + return + } + w.header.FreeFunctions[fn.FQN] = fn +} + +// makeMethod converts a clike.FunctionInfo into a CStdlibFunction whose FQN is +// `class::method`. Variadic params follow the same convention as the C path. 
+func (w *cppWalker) makeMethod(cls *core.CppStdlibClass, info *clike.FunctionInfo) *core.CStdlibFunction { + return &core.CStdlibFunction{ + FQN: cls.FQN + "::" + info.Name, + ReturnType: NormalizeType(info.ReturnType), + Params: paramListFromInfo(info), + Confidence: 1.0, + Source: core.SourceHeader, + } +} + +// qualified returns name prefixed by the current namespace stack +// (`std::sub::name`), or just name if the stack is empty. +func (w *cppWalker) qualified(name string) string { + if len(w.namespaceStack) == 0 { + return name + } + return strings.Join(w.namespaceStack, "::") + "::" + name +} + +// extractTemplateParamNames pulls the bare type-parameter names out of a +// template_parameter_list. We only capture names; constraints and default +// arguments are ignored at this layer (the overlay is the right place for +// concrete defaults like `Allocator = std::allocator`). +func extractTemplateParamNames(list *sitter.Node, source []byte) []string { + if list == nil { + return nil + } + var names []string + for i := 0; i < int(list.NamedChildCount()); i++ { + child := list.NamedChild(i) + if child == nil { + continue + } + switch child.Type() { + case "type_parameter_declaration", "optional_type_parameter_declaration", + "variadic_type_parameter_declaration": + if id := findFirstIdentifier(child, source); id != "" { + names = append(names, id) + } + case "parameter_declaration": + // Non-type template parameters (e.g., `int N`). Capture the bare + // identifier from the declarator field; tree-sitter exposes it as + // `declarator -> identifier`. + decl := child.ChildByFieldName("declarator") + if decl != nil && decl.Type() == "identifier" { + names = append(names, decl.Content(source)) + } + } + } + return names +} + +// findFunctionDeclarator walks past pointer_declarator / reference_declarator / +// abstract_*_declarator wrappers and returns the inner function_declarator, +// or nil if none. 
Unlike clike's helper, this also follows the first-named- +// child fallback used when a wrapper does not expose its body via the +// "declarator" field — necessary for C++ reference_declarator nodes. +func findFunctionDeclarator(node *sitter.Node) *sitter.Node { + for cur := node; cur != nil; { + if cur.Type() == "function_declarator" { + return cur + } + cur = innerDeclaratorChild(cur) + } + return nil +} + +// functionDeclaratorName extracts the name from a function_declarator. The +// name lives under the "declarator" field; that may itself be wrapped (an +// `operator[]` or destructor `~T()` form), so we walk it recursively until +// we reach an identifier-like leaf or run out of declarator field links. +func functionDeclaratorName(funcDecl *sitter.Node, source []byte) string { + if funcDecl == nil { + return "" + } + name := funcDecl.ChildByFieldName("declarator") + if name == nil { + return "" + } + // Common leaf shapes: identifier, field_identifier, operator_name, + // destructor_name, qualified_identifier, template_function (for + // `function()` declarations — rare in stdlib but legal). + switch name.Type() { + case "identifier", "field_identifier", "operator_name", "destructor_name": + return name.Content(source) + case "qualified_identifier": + return name.Content(source) + case "template_function": + // Template function call form — emit the bare name. + if id := name.ChildByFieldName("name"); id != nil { + return id.Content(source) + } + return name.Content(source) + default: + // Fallback: dump the raw text. This may include ref/ptr decoration + // for unusual shapes; downstream consumers can normalise. + return strings.TrimSpace(name.Content(source)) + } +} + +// innerDeclaratorChild returns the next declarator inside a wrapper, falling +// back to the first non-qualifier named child when the "declarator" field is +// not exposed by the grammar (the case for C++ reference_declarator). 
+func innerDeclaratorChild(wrapper *sitter.Node) *sitter.Node { + if wrapper == nil { + return nil + } + if c := wrapper.ChildByFieldName("declarator"); c != nil { + return c + } + for i := 0; i < int(wrapper.NamedChildCount()); i++ { + c := wrapper.NamedChild(i) + if c == nil { + continue + } + if c.Type() == "type_qualifier" { + continue + } + return c + } + return nil +} + +// findFirstIdentifier returns the content of the first identifier or +// type_identifier descendant of node. Used to pull a parameter name out of a +// template_parameter_declaration that may have a default argument or +// constraint preceding/following the name. +func findFirstIdentifier(node *sitter.Node, source []byte) string { + if node == nil { + return "" + } + for i := 0; i < int(node.NamedChildCount()); i++ { + c := node.NamedChild(i) + if c == nil { + continue + } + if c.Type() == "type_identifier" || c.Type() == "identifier" { + return c.Content(source) + } + } + return "" +} + +// paramListFromInfo is the C++ analog of makeFunctionFromInfo's param block — +// extracted so methods, constructors, and free functions all share identical +// param-shape conversion logic. +func paramListFromInfo(info *clike.FunctionInfo) []*core.CStdlibParam { + params := make([]*core.CStdlibParam, 0, len(info.ParamNames)) + for i, name := range info.ParamNames { + typ := "" + if i < len(info.ParamTypes) { + typ = info.ParamTypes[i] + } + if name == "..." || typ == "..." { + params = append(params, &core.CStdlibParam{Name: "...", Type: "variadic", Required: false}) + continue + } + params = append(params, &core.CStdlibParam{ + Name: name, + Type: NormalizeType(typ), + Required: true, + }) + } + return params +} + +// finaliseNamespaces deduplicates and stamps the top-level namespace list onto +// the header. It is computed by inspecting all FreeFunctions and Classes for +// their FQN prefix. 
+func finaliseNamespaces(h *core.CStdlibHeader) { + seen := map[string]struct{}{} + add := func(fqn string) { + idx := strings.Index(fqn, "::") + if idx <= 0 { + return + } + ns := fqn[:idx] + if _, ok := seen[ns]; ok { + return + } + seen[ns] = struct{}{} + h.Namespaces = append(h.Namespaces, ns) + } + for fqn := range h.Classes { + add(fqn) + } + for fqn := range h.FreeFunctions { + add(fqn) + } +} diff --git a/sast-engine/tools/internal/clikeextract/cpp_extractor_test.go b/sast-engine/tools/internal/clikeextract/cpp_extractor_test.go new file mode 100644 index 00000000..2af2e35a --- /dev/null +++ b/sast-engine/tools/internal/clikeextract/cpp_extractor_test.go @@ -0,0 +1,226 @@ +package clikeextract + +import ( + "path/filepath" + "sort" + "testing" + + "github.com/shivasurya/code-pathfinder/sast-engine/graph/callgraph/core" + "github.com/shivasurya/code-pathfinder/sast-engine/graph/clike" + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" +) + +const cppFixtureDir = "testdata/cpp" + +func cppTestSource() HeaderSource { + return HeaderSource{ + Platform: core.PlatformLinux, + Language: core.LanguageCpp, + SystemTag: "libstdc++-test", + SearchDirs: []string{cppFixtureDir}, + HeaderExts: []string{".h", ".hpp", ""}, + } +} + +func TestExtractCppHeader_VectorClassWithTemplateAndMethods(t *testing.T) { + src := cppTestSource() + hf := HeaderFile{Name: "vector", Path: filepath.Join(cppFixtureDir, "vector")} + + h, err := extractCppHeader(hf, src) + require.NoError(t, err) + require.NotNil(t, h) + + assert.Equal(t, "vector", h.Header) + assert.Equal(t, "std::vector", h.ModuleID) + assert.Equal(t, core.LanguageCpp, h.Language) + + // Class extracted with FQN. + cls := h.Classes["std::vector"] + require.NotNil(t, cls, "std::vector class must be extracted") + assert.Equal(t, "std::vector", cls.FQN) + + // Template parameter names captured. + assert.Equal(t, []string{"T", "Allocator"}, cls.TypeParams) + + // Methods present. 
+ for _, name := range []string{"push_back", "at", "operator[]", "size", "empty", "data", "clear"} { + assert.Contains(t, cls.Methods, name, "method %q expected", name) + } + pushBack := cls.Methods["push_back"] + require.NotNil(t, pushBack) + assert.Equal(t, "void", pushBack.ReturnType) + assert.Equal(t, "std::vector::push_back", pushBack.FQN) + assert.Equal(t, core.SourceHeader, pushBack.Source) + + // Constructors recognised. + assert.GreaterOrEqual(t, len(cls.Constructors), 3) + + // Free function in std namespace lands in FreeFunctions. + swap := h.FreeFunctions["std::swap"] + require.NotNil(t, swap) + assert.Equal(t, "std::swap", swap.FQN) + + // Top-level namespace recorded. + assert.Contains(t, h.Namespaces, "std") +} + +func TestExtractCppHeader_StringClassAndTypedef(t *testing.T) { + src := cppTestSource() + hf := HeaderFile{Name: "string", Path: filepath.Join(cppFixtureDir, "string")} + + h, err := extractCppHeader(hf, src) + require.NoError(t, err) + + cls := h.Classes["std::basic_string"] + require.NotNil(t, cls) + assert.Equal(t, []string{"CharT"}, cls.TypeParams) + + cstr := cls.Methods["c_str"] + require.NotNil(t, cstr) + + // operator+= captured (operator overloads share the function_declarator shape). + assert.NotNil(t, cls.Methods["operator+="]) + + // Typedefs present (size_t at namespace scope, string as the alias). + require.Contains(t, h.Typedefs, "size_t") + require.Contains(t, h.Typedefs, "string") +} + +func TestExtractCppHeader_PrivateNamespaceSkipped(t *testing.T) { + src := cppTestSource() + hf := HeaderFile{Name: "utility", Path: filepath.Join(cppFixtureDir, "utility")} + + h, err := extractCppHeader(hf, src) + require.NoError(t, err) + + // Templated free functions land in FreeFunctions (under namespace std). + assert.NotNil(t, h.FreeFunctions["std::move"]) + assert.NotNil(t, h.FreeFunctions["std::forward"]) + + // Private/library-internal names dropped. 
+ assert.Nil(t, h.FreeFunctions["std::_internal_helper"]) + assert.Nil(t, h.FreeFunctions["std::__detail_helper"]) + + // File-scope free function lands in Functions, not FreeFunctions. + assert.NotNil(t, h.Functions["file_scope_func"]) + assert.Nil(t, h.FreeFunctions["file_scope_func"]) + + // Compiler-internal namespace `__detail` was skipped wholesale. + for fqn := range h.FreeFunctions { + assert.NotContains(t, fqn, "__detail") + } + for fqn := range h.Classes { + assert.NotContains(t, fqn, "__detail") + } +} + +func TestExtractCppHeader_FileNotFound(t *testing.T) { + _, err := extractCppHeader(HeaderFile{Name: "absent", Path: "/no-such-cpp"}, cppTestSource()) + require.Error(t, err) + assert.Contains(t, err.Error(), "reading") +} + +func TestExtractTemplateParamNames(t *testing.T) { + t.Run("nil list yields nil", func(t *testing.T) { + got := extractTemplateParamNames(nil, nil) + assert.Nil(t, got) + }) + // Other branches are exercised end-to-end via the vector and string fixtures + // (TestExtractCppHeader_VectorClassWithTemplateAndMethods, + // TestExtractCppHeader_StringClassAndTypedef). 
+} + +func TestFinaliseNamespaces_DedupsAndOrders(t *testing.T) { + h := core.NewCStdlibHeader() + h.FreeFunctions["std::move"] = &core.CStdlibFunction{} + h.FreeFunctions["std::forward"] = &core.CStdlibFunction{} + h.FreeFunctions["boost::lexical_cast"] = &core.CStdlibFunction{} + h.Classes["std::vector"] = &core.CppStdlibClass{} + h.Classes["std::map"] = &core.CppStdlibClass{} + + finaliseNamespaces(h) + + sort.Strings(h.Namespaces) + assert.Equal(t, []string{"boost", "std"}, h.Namespaces) +} + +func TestFinaliseNamespaces_BareNamesIgnored(t *testing.T) { + h := core.NewCStdlibHeader() + h.FreeFunctions["bare_func"] = &core.CStdlibFunction{} + finaliseNamespaces(h) + assert.Empty(t, h.Namespaces) +} + +func TestQualified(t *testing.T) { + w := &cppWalker{} + assert.Equal(t, "foo", w.qualified("foo")) + + w.namespaceStack = []string{"std"} + assert.Equal(t, "std::foo", w.qualified("foo")) + + w.namespaceStack = []string{"std", "detail"} + assert.Equal(t, "std::detail::foo", w.qualified("foo")) +} + +func TestFindFirstIdentifier_Nil(t *testing.T) { + assert.Equal(t, "", findFirstIdentifier(nil, nil)) +} + +// TestExtractCppHeader_InlineFunctionDefinition exercises +// extractFreeFunctionFromDefinition (the function_definition branch in walk). +func TestExtractCppHeader_InlineFunctionDefinition(t *testing.T) { + dir := t.TempDir() + hp := filepath.Join(dir, "inline_funcs") + mustWriteFile(t, hp, ` +namespace std { +inline int square(int x) { return x * x; } +} +inline int file_scope_inline(int x) { return x; } +`) + + src := HeaderSource{ + Platform: core.PlatformLinux, Language: core.LanguageCpp, + SystemTag: "test", SearchDirs: []string{dir}, HeaderExts: []string{""}, + } + h, err := extractCppHeader(HeaderFile{Name: "inline_funcs", Path: hp}, src) + require.NoError(t, err) + + // Inline function inside namespace lands in FreeFunctions under the FQN. + require.NotNil(t, h.FreeFunctions["std::square"]) + + // File-scope inline lands in Functions. 
+ require.NotNil(t, h.Functions["file_scope_inline"]) +} + +func TestParamListFromInfo_Variadic(t *testing.T) { + got := paramListFromInfo(&clike.FunctionInfo{ + ParamNames: []string{"x", "..."}, + ParamTypes: []string{"int", "..."}, + }) + require.Len(t, got, 2) + assert.False(t, got[1].Required) + assert.Equal(t, "variadic", got[1].Type) +} + +func TestParamListFromInfo_NormalParams(t *testing.T) { + got := paramListFromInfo(&clike.FunctionInfo{ + ParamNames: []string{"a", "b"}, + ParamTypes: []string{"int", "const char*"}, + }) + require.Len(t, got, 2) + assert.True(t, got[0].Required) + assert.Equal(t, "int", got[0].Type) + assert.Equal(t, "const char*", got[1].Type) +} + +func TestParamListFromInfo_NameTypeMismatch(t *testing.T) { + // Defensive: when ParamTypes is shorter than ParamNames, we emit an empty + // type rather than panicking. + got := paramListFromInfo(&clike.FunctionInfo{ + ParamNames: []string{"a", "b"}, + ParamTypes: []string{"int"}, + }) + require.Len(t, got, 2) + assert.Equal(t, "", got[1].Type) +} diff --git a/sast-engine/tools/internal/clikeextract/doc.go b/sast-engine/tools/internal/clikeextract/doc.go new file mode 100644 index 00000000..9d2261b1 --- /dev/null +++ b/sast-engine/tools/internal/clikeextract/doc.go @@ -0,0 +1,43 @@ +// Package clikeextract walks installed C and C++ system headers and emits per-header +// JSON registry files describing functions, classes, methods, typedefs, and #define +// constants — the input the loader (PR-02) consumes when resolving stdlib calls +// during analysis. +// +// The package is the heavy lifter behind tools/generate_clike_stdlib_registry.go. +// The entry-point binary is a thin //go:build cpf_generate_stdlib_registry wrapper +// that flag-parses and calls Extractor.Run; everything else lives here so it stays +// testable under regular `go test ./...`. 
//
// # Pipeline
//
//	discoverHeaderSources(target, lang) → []HeaderSource
//	walkHeaders(src) → []HeaderFile
//	extractHeader(file, lang) → *core.CStdlibHeader (Source: "header")
//	mergeOverlay(extracted, overlay) → *core.CStdlibHeader (Source: "merged" or unchanged)
//	emitter.WritePerHeader(...) + WriteManifest(...)
//
// # Why a separate package (not flat in tools/)
//
// The flat layout in the original PR-01 spec piles 1,500+ LoC into tools/. Sibling
// generator tools/internal/goextract demonstrates the alternative: thin entry-point
// + internal package with one file per concern. This package follows that pattern
// for consistency and so each concern (walker, extractor, overlay, emitter) is
// individually unit-testable.
//
// # Reuse of Phase 1 helpers
//
// AST extraction reuses graph/clike helpers wherever possible
// (ExtractFunctionInfo, ExtractStructFields, ExtractTypeString, ExtractParameters)
// — this package does not re-implement tree-sitter walking that already exists
// upstream. C++-specific concerns not yet covered by graph/clike (preproc_def
// macro extraction, template_parameter_list capture on classes, namespace tracking
// for free functions) are added here.
//
// # Output layout
//
//	<output-dir>/
//	  manifest.json                    CStdlibManifest — top-level index
//	  <sanitized-header>_stdlib.json   CStdlibHeader — one per header
//
// where <sanitized-header> is the header name with the .h extension stripped
// and every / replaced by _.
+package clikeextract diff --git a/sast-engine/tools/internal/clikeextract/emitter.go b/sast-engine/tools/internal/clikeextract/emitter.go new file mode 100644 index 00000000..111709eb --- /dev/null +++ b/sast-engine/tools/internal/clikeextract/emitter.go @@ -0,0 +1,155 @@ +package clikeextract + +import ( + "crypto/sha256" + "encoding/hex" + "encoding/json" + "fmt" + "os" + "path/filepath" + "sort" + "time" + + "github.com/shivasurya/code-pathfinder/sast-engine/graph/callgraph/core" +) + +// EmitOutput writes the per-header JSON files and the top-level manifest.json +// to outDir. The output layout matches what the loader (PR-02) expects: +// +// /manifest.json +// /_stdlib.json (one per header) +// +// The manifest is computed deterministically — entries are sorted by header +// name, statistics are tallied from the actual symbol counts, and checksums +// are sha256 hashes of the per-header JSON bytes that ship to the CDN. Two +// runs over the same input produce byte-identical output. +// +// overlayApplied is the count returned by MergeOverlay across all headers, +// used to populate Statistics.OverlayOverrides for visibility in +// resolution-report. +func EmitOutput(headers []*core.CStdlibHeader, cfg Config, overlayApplied int) error { + if err := os.MkdirAll(cfg.OutputDir, 0o755); err != nil { + return fmt.Errorf("EmitOutput: creating output dir %q: %w", cfg.OutputDir, err) + } + + // Sort headers by name so the manifest's Headers slice is stable. + sort.SliceStable(headers, func(i, j int) bool { + return headers[i].Header < headers[j].Header + }) + + entries := make([]*core.CStdlibHeaderEntry, 0, len(headers)) + for _, h := range headers { + entry, err := writePerHeader(h, cfg) + if err != nil { + return err + } + entries = append(entries, entry) + } + + manifest := buildManifest(headers, entries, cfg, overlayApplied) + return writeManifest(manifest, cfg) +} + +// writePerHeader serialises one header to disk and returns the manifest entry +// describing it. 
The serialised bytes are also hashed (sha256) and the hash is +// embedded in the entry so loaders can verify integrity at fetch time. +func writePerHeader(h *core.CStdlibHeader, cfg Config) (*core.CStdlibHeaderEntry, error) { + if h.GeneratedAt == "" { + h.GeneratedAt = time.Now().UTC().Format(time.RFC3339) + } + data, err := json.MarshalIndent(h, "", " ") + if err != nil { + return nil, fmt.Errorf("writePerHeader: marshalling %q: %w", h.Header, err) + } + + filename := SanitizeHeaderName(h.Header) + "_stdlib.json" + path := filepath.Join(cfg.OutputDir, filename) + if err := os.WriteFile(path, data, 0o644); err != nil { + return nil, fmt.Errorf("writePerHeader: writing %q: %w", path, err) + } + + sum := sha256.Sum256(data) + return &core.CStdlibHeaderEntry{ + Header: h.Header, + ModuleID: h.ModuleID, + File: filename, + URL: buildHeaderURL(cfg, filename), + Size: int64(len(data)), + Checksum: "sha256:" + hex.EncodeToString(sum[:]), + }, nil +} + +// buildHeaderURL returns the absolute URL the loader will GET for a given +// header file. Format: ////. +func buildHeaderURL(cfg Config, filename string) string { + return fmt.Sprintf("%s/%s/%s/%s/%s", + cfg.effectiveBaseURL(), cfg.Target, cfg.Language, RegistryVersion, filename) +} + +// buildManifest assembles a CStdlibManifest with deterministic statistics and +// the per-header entries already on hand. Only this function knows how to +// compute aggregate counts; emitter callers must not roll their own. 
+func buildManifest(headers []*core.CStdlibHeader, entries []*core.CStdlibHeaderEntry, + cfg Config, overlayApplied int) *core.CStdlibManifest { + stats := computeStatistics(headers, overlayApplied) + + return &core.CStdlibManifest{ + SchemaVersion: SchemaVersion, + RegistryVersion: RegistryVersion, + Platform: cfg.Target, + Language: cfg.Language, + SystemTag: firstSystemTag(headers), + GeneratedAt: time.Now().UTC().Format(time.RFC3339), + GeneratorVersion: GeneratorVersion, + BaseURL: fmt.Sprintf("%s/%s/%s/%s", + cfg.effectiveBaseURL(), cfg.Target, cfg.Language, RegistryVersion), + Headers: entries, + Statistics: stats, + } +} + +// computeStatistics tallies aggregate counts across all extracted headers. +// Symbol totals reflect the in-memory state AFTER overlay merge, so they +// include both extracted and overlay-only entries. +func computeStatistics(headers []*core.CStdlibHeader, overlayApplied int) *core.CStdlibStatistics { + stats := &core.CStdlibStatistics{ + TotalHeaders: len(headers), + OverlayOverrides: overlayApplied, + } + for _, h := range headers { + stats.TotalFunctions += len(h.Functions) + len(h.FreeFunctions) + stats.TotalClasses += len(h.Classes) + stats.TotalTypedefs += len(h.Typedefs) + stats.TotalConstants += len(h.Constants) + for _, cls := range h.Classes { + stats.TotalFunctions += len(cls.Methods) + } + } + return stats +} + +// firstSystemTag returns the SystemTag of the first header in the slice, or +// the empty string if the slice is empty. All headers in a single run share +// the same SystemTag (we walk one HeaderSource at a time), so this picks the +// authoritative value without scanning all of them. +func firstSystemTag(headers []*core.CStdlibHeader) string { + if len(headers) == 0 { + return "" + } + return headers[0].SystemTag +} + +// writeManifest serialises the top-level manifest.json next to the per-header +// JSONs. Like the per-header writer, the output is indented for human review +// in the local-validation gate. 
+func writeManifest(m *core.CStdlibManifest, cfg Config) error { + data, err := json.MarshalIndent(m, "", " ") + if err != nil { + return fmt.Errorf("writeManifest: marshalling: %w", err) + } + path := filepath.Join(cfg.OutputDir, "manifest.json") + if err := os.WriteFile(path, data, 0o644); err != nil { + return fmt.Errorf("writeManifest: writing %q: %w", path, err) + } + return nil +} diff --git a/sast-engine/tools/internal/clikeextract/emitter_test.go b/sast-engine/tools/internal/clikeextract/emitter_test.go new file mode 100644 index 00000000..82fc6b11 --- /dev/null +++ b/sast-engine/tools/internal/clikeextract/emitter_test.go @@ -0,0 +1,218 @@ +package clikeextract + +import ( + "crypto/sha256" + "encoding/hex" + "encoding/json" + "os" + "path/filepath" + "testing" + + "github.com/shivasurya/code-pathfinder/sast-engine/graph/callgraph/core" + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" +) + +func makeTestHeader(name string) *core.CStdlibHeader { + h := core.NewCStdlibHeader() + h.SchemaVersion = SchemaVersion + h.Header = name + h.ModuleID = "c::" + SanitizeHeaderName(name) + h.Language = core.LanguageC + h.Platform = core.PlatformLinux + h.SystemTag = "glibc-test" + h.Functions["printf"] = &core.CStdlibFunction{ + FQN: "c::stdio::printf", + ReturnType: "int", + Source: core.SourceHeader, + } + h.Typedefs["FILE"] = &core.CStdlibTypedef{Type: "struct __FILE", Source: core.SourceHeader} + h.Constants["EOF"] = &core.CStdlibConstant{Type: "int", Value: "-1", Source: core.SourceHeader} + return h +} + +func TestEmitOutput_ProducesManifestAndPerHeaderFiles(t *testing.T) { + dir := t.TempDir() + cfg := Config{ + Target: core.PlatformLinux, + Language: core.LanguageC, + OutputDir: dir, + } + headers := []*core.CStdlibHeader{ + makeTestHeader("stdio.h"), + makeTestHeader("string.h"), + } + + require.NoError(t, EmitOutput(headers, cfg, 7)) + + // Files exist. 
+ assert.FileExists(t, filepath.Join(dir, "manifest.json")) + assert.FileExists(t, filepath.Join(dir, "stdio_stdlib.json")) + assert.FileExists(t, filepath.Join(dir, "string_stdlib.json")) + + // Manifest parses and has expected fields. + data, err := os.ReadFile(filepath.Join(dir, "manifest.json")) + require.NoError(t, err) + var got core.CStdlibManifest + require.NoError(t, json.Unmarshal(data, &got)) + + assert.Equal(t, SchemaVersion, got.SchemaVersion) + assert.Equal(t, RegistryVersion, got.RegistryVersion) + assert.Equal(t, core.PlatformLinux, got.Platform) + assert.Equal(t, core.LanguageC, got.Language) + assert.Equal(t, "glibc-test", got.SystemTag) + assert.Equal(t, GeneratorVersion, got.GeneratorVersion) + + require.Len(t, got.Headers, 2) + // Sorted alphabetically. + assert.Equal(t, "stdio.h", got.Headers[0].Header) + assert.Equal(t, "string.h", got.Headers[1].Header) + assert.Equal(t, "stdio_stdlib.json", got.Headers[0].File) + + // Statistics correct. + require.NotNil(t, got.Statistics) + assert.Equal(t, 2, got.Statistics.TotalHeaders) + assert.Equal(t, 2, got.Statistics.TotalFunctions) + assert.Equal(t, 2, got.Statistics.TotalTypedefs) + assert.Equal(t, 2, got.Statistics.TotalConstants) + assert.Equal(t, 7, got.Statistics.OverlayOverrides) + + // Checksum is well-formed and matches the file we wrote. 
+ for _, e := range got.Headers { + assert.True(t, len(e.Checksum) > len("sha256:")) + fileBytes, err := os.ReadFile(filepath.Join(dir, e.File)) + require.NoError(t, err) + want := "sha256:" + hex.EncodeToString(sha256SliceToHexed(sha256.Sum256(fileBytes))) + assert.Equal(t, want, e.Checksum) + assert.Equal(t, int64(len(fileBytes)), e.Size) + } +} + +func sha256SliceToHexed(sum [32]byte) []byte { + return sum[:] +} + +func TestEmitOutput_DefaultBaseURL(t *testing.T) { + dir := t.TempDir() + cfg := Config{ + Target: core.PlatformLinux, + Language: core.LanguageC, + OutputDir: dir, + } + require.NoError(t, EmitOutput([]*core.CStdlibHeader{makeTestHeader("stdio.h")}, cfg, 0)) + + data, err := os.ReadFile(filepath.Join(dir, "manifest.json")) + require.NoError(t, err) + var m core.CStdlibManifest + require.NoError(t, json.Unmarshal(data, &m)) + assert.Equal(t, "https://assets.codepathfinder.dev/registries/linux/c/v1", m.BaseURL) + require.Len(t, m.Headers, 1) + assert.Equal(t, "https://assets.codepathfinder.dev/registries/linux/c/v1/stdio_stdlib.json", + m.Headers[0].URL) +} + +func TestEmitOutput_OverrideBaseURL(t *testing.T) { + dir := t.TempDir() + cfg := Config{ + Target: core.PlatformLinux, + Language: core.LanguageC, + OutputDir: dir, + BaseURL: "file:///tmp/registries", + } + require.NoError(t, EmitOutput([]*core.CStdlibHeader{makeTestHeader("stdio.h")}, cfg, 0)) + + data, err := os.ReadFile(filepath.Join(dir, "manifest.json")) + require.NoError(t, err) + var m core.CStdlibManifest + require.NoError(t, json.Unmarshal(data, &m)) + assert.Equal(t, "file:///tmp/registries/linux/c/v1", m.BaseURL) + assert.Equal(t, "file:///tmp/registries/linux/c/v1/stdio_stdlib.json", m.Headers[0].URL) +} + +func TestEmitOutput_DeterministicAcrossRuns(t *testing.T) { + headers := []*core.CStdlibHeader{ + makeTestHeader("string.h"), + makeTestHeader("stdio.h"), + } + // Set GeneratedAt explicitly to avoid the time-based fields differing. 
+ for _, h := range headers { + h.GeneratedAt = "2026-05-04T00:00:00Z" + } + + dir1 := t.TempDir() + dir2 := t.TempDir() + cfg1 := Config{Target: core.PlatformLinux, Language: core.LanguageC, OutputDir: dir1} + cfg2 := Config{Target: core.PlatformLinux, Language: core.LanguageC, OutputDir: dir2} + + require.NoError(t, EmitOutput(headers, cfg1, 0)) + // Re-create the headers to avoid in-place mutations from the first run. + for _, h := range headers { + h.GeneratedAt = "2026-05-04T00:00:00Z" + } + require.NoError(t, EmitOutput(headers, cfg2, 0)) + + for _, name := range []string{"stdio_stdlib.json", "string_stdlib.json"} { + a, _ := os.ReadFile(filepath.Join(dir1, name)) + b, _ := os.ReadFile(filepath.Join(dir2, name)) + assert.Equal(t, a, b, "per-header file %q should be byte-identical across runs", name) + } +} + +func TestEmitOutput_OutputDirCreated(t *testing.T) { + dir := filepath.Join(t.TempDir(), "nested", "subdir") + cfg := Config{Target: core.PlatformLinux, Language: core.LanguageC, OutputDir: dir} + require.NoError(t, EmitOutput([]*core.CStdlibHeader{makeTestHeader("stdio.h")}, cfg, 0)) + assert.FileExists(t, filepath.Join(dir, "manifest.json")) +} + +func TestEmitOutput_WriteFails_OutputUnderFile(t *testing.T) { + if os.Geteuid() == 0 { + t.Skip("running as root: file masquerading as dir does not block writes") + } + tmp := t.TempDir() + clash := filepath.Join(tmp, "clash") + require.NoError(t, os.WriteFile(clash, []byte("plain file"), 0o644)) + + cfg := Config{ + Target: core.PlatformLinux, + Language: core.LanguageC, + OutputDir: clash, // a file, not a dir → MkdirAll fails + } + err := EmitOutput([]*core.CStdlibHeader{makeTestHeader("stdio.h")}, cfg, 0) + require.Error(t, err) + assert.Contains(t, err.Error(), "creating output dir") +} + +func TestComputeStatistics_WithCppContent(t *testing.T) { + cppHeader := core.NewCStdlibHeader() + cppHeader.Header = "vector" + cls := core.NewCppStdlibClass("std::vector") + cls.Methods["push_back"] = 
&core.CStdlibFunction{} + cls.Methods["size"] = &core.CStdlibFunction{} + cppHeader.Classes["std::vector"] = cls + cppHeader.FreeFunctions["std::swap"] = &core.CStdlibFunction{} + + stats := computeStatistics([]*core.CStdlibHeader{cppHeader}, 0) + assert.Equal(t, 1, stats.TotalHeaders) + assert.Equal(t, 1, stats.TotalClasses) + // 0 free + 1 namespaced free + 2 methods = 3. + assert.Equal(t, 3, stats.TotalFunctions) +} + +func TestFirstSystemTag(t *testing.T) { + assert.Equal(t, "", firstSystemTag(nil)) + assert.Equal(t, "", firstSystemTag([]*core.CStdlibHeader{})) + + h := makeTestHeader("stdio.h") + assert.Equal(t, "glibc-test", firstSystemTag([]*core.CStdlibHeader{h})) +} + +func TestBuildHeaderURL(t *testing.T) { + cfg := Config{ + Target: core.PlatformLinux, + Language: core.LanguageC, + BaseURL: "https://example.com/registries", + } + got := buildHeaderURL(cfg, "stdio_stdlib.json") + assert.Equal(t, "https://example.com/registries/linux/c/v1/stdio_stdlib.json", got) +} diff --git a/sast-engine/tools/internal/clikeextract/extractor.go b/sast-engine/tools/internal/clikeextract/extractor.go new file mode 100644 index 00000000..900d02e5 --- /dev/null +++ b/sast-engine/tools/internal/clikeextract/extractor.go @@ -0,0 +1,101 @@ +package clikeextract + +import ( + "fmt" + "os" + + "github.com/shivasurya/code-pathfinder/sast-engine/graph/callgraph/core" +) + +// Extractor stitches the four pipeline stages — discovery, walk, extract, merge, +// emit — into a single Run() entrypoint. It mirrors the API of the sibling +// goextract.Extractor: construct with NewExtractor(cfg), call Run(), get an +// error or success. +type Extractor struct { + cfg Config + overlay *Overlay + logf func(format string, args ...any) +} + +// NewExtractor constructs an Extractor with the given configuration. The +// caller MUST have validated cfg with cfg.Validate() before this point — +// invalid configuration is a programming error. 
+func NewExtractor(cfg Config) *Extractor { + return &Extractor{ + cfg: cfg, + // Default logger writes to stderr; the entry-point binary uses this, + // while tests can swap it out via SetLogger to capture output. + logf: defaultLogf, + } +} + +// SetLogger overrides the warning-emit destination. Used by tests to capture +// continue-on-failure messages without writing to stderr. +func (e *Extractor) SetLogger(logf func(format string, args ...any)) { + e.logf = logf +} + +// Run executes the full pipeline. Errors abort the run only when they cannot +// be recovered from (missing search dirs, invalid overlay, unwritable output); +// per-header parse failures log a warning and continue, so a single bad header +// does not poison the whole registry. +func (e *Extractor) Run() error { + if err := e.cfg.Validate(); err != nil { + return fmt.Errorf("Run: invalid config: %w", err) + } + + overlay, err := LoadOverlay(e.cfg.OverlayPath, e.cfg.Language) + if err != nil { + return err + } + e.overlay = overlay + + sources, err := discoverHeaderSourcesFn(e.cfg.Target, e.cfg.Language) + if err != nil { + return err + } + + allHeaders := make([]*core.CStdlibHeader, 0, 256) + overlayApplied := 0 + + for _, src := range sources { + files, err := src.WalkHeaders() + if err != nil { + return err + } + for _, f := range files { + h, perHeaderErr := e.extractOne(f, src) + if perHeaderErr != nil { + e.logf("warning: skipping header %q: %v", f.Name, perHeaderErr) + continue + } + overlayApplied += MergeOverlay(h, overlay) + allHeaders = append(allHeaders, h) + } + } + + return EmitOutput(allHeaders, e.cfg, overlayApplied) +} + +// extractOne dispatches to the language-specific extractor. Kept on the +// receiver (rather than a free function) so future fields on Extractor — +// caching, parallelism — have somewhere natural to land. 
+func (e *Extractor) extractOne(f HeaderFile, src HeaderSource) (*core.CStdlibHeader, error) { + switch e.cfg.Language { + case core.LanguageC: + return extractCHeader(f, src) + case core.LanguageCpp: + return extractCppHeader(f, src) + default: + // Validate() should have caught this; defensive fallback. + return nil, fmt.Errorf("extractOne: unsupported language %q", e.cfg.Language) + } +} + +func defaultLogf(format string, args ...any) { + fmt.Fprintf(os.Stderr, format+"\n", args...) +} + +// discoverHeaderSourcesFn is the package-level indirection that lets tests +// substitute a synthetic source list. Production code never re-binds this. +var discoverHeaderSourcesFn = DiscoverHeaderSources diff --git a/sast-engine/tools/internal/clikeextract/extractor_test.go b/sast-engine/tools/internal/clikeextract/extractor_test.go new file mode 100644 index 00000000..7e19221c --- /dev/null +++ b/sast-engine/tools/internal/clikeextract/extractor_test.go @@ -0,0 +1,234 @@ +package clikeextract + +import ( + "encoding/json" + "os" + "path/filepath" + "strings" + "testing" + + "github.com/shivasurya/code-pathfinder/sast-engine/graph/callgraph/core" + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" +) + +// TestRunFixtureLinuxC pipes the C testdata fixtures through the full extractor +// pipeline (discover → walk → extract → merge → emit) and asserts on the +// generated manifest + a sampled per-header file. This is the integration +// test that catches breakage in the wiring between stages. +func TestRunFixtureLinuxC(t *testing.T) { + out := t.TempDir() + cfg := Config{ + Target: core.PlatformLinux, + Language: core.LanguageC, + OutputDir: out, + } + + // Override the C search dir to our fixture tree. + src := cTestSource() + useFixedSources(t, []HeaderSource{src}) + + ext := NewExtractor(cfg) + require.NoError(t, ext.Run()) + + // Manifest written and parses. 
+ mData, err := os.ReadFile(filepath.Join(out, "manifest.json")) + require.NoError(t, err) + var m core.CStdlibManifest + require.NoError(t, json.Unmarshal(mData, &m)) + assert.Equal(t, core.PlatformLinux, m.Platform) + assert.Equal(t, core.LanguageC, m.Language) + require.GreaterOrEqual(t, len(m.Headers), 4, "fixture has at least stdio.h, string.h, unistd.h, inline.h") + + // Per-header file is reachable via the manifest's File field. + stdioEntry := m.GetHeaderEntry("stdio.h") + require.NotNil(t, stdioEntry) + stdioPath := filepath.Join(out, stdioEntry.File) + stdioData, err := os.ReadFile(stdioPath) + require.NoError(t, err) + + var stdioHeader core.CStdlibHeader + require.NoError(t, json.Unmarshal(stdioData, &stdioHeader)) + assert.NotNil(t, stdioHeader.Functions["printf"]) + assert.Equal(t, core.SourceHeader, stdioHeader.Functions["printf"].Source) + + // Statistics are populated and reasonable. + require.NotNil(t, m.Statistics) + assert.Greater(t, m.Statistics.TotalFunctions, 0) + assert.Greater(t, m.Statistics.TotalConstants, 0) +} + +// TestRunFixtureLinuxCpp does the same for the C++ testdata. 
+func TestRunFixtureLinuxCpp(t *testing.T) { + out := t.TempDir() + cfg := Config{ + Target: core.PlatformLinux, + Language: core.LanguageCpp, + OutputDir: out, + } + + useFixedSources(t, []HeaderSource{cppTestSource()}) + + require.NoError(t, NewExtractor(cfg).Run()) + + mData, err := os.ReadFile(filepath.Join(out, "manifest.json")) + require.NoError(t, err) + var m core.CStdlibManifest + require.NoError(t, json.Unmarshal(mData, &m)) + assert.Equal(t, core.LanguageCpp, m.Language) + + vectorEntry := m.GetHeaderEntry("vector") + require.NotNil(t, vectorEntry) + vData, err := os.ReadFile(filepath.Join(out, vectorEntry.File)) + require.NoError(t, err) + + var vec core.CStdlibHeader + require.NoError(t, json.Unmarshal(vData, &vec)) + assert.Contains(t, vec.Classes, "std::vector") +} + +func TestRun_OverlayMerged(t *testing.T) { + out := t.TempDir() + overlayPath := filepath.Join(out, "overlay.yaml") + require.NoError(t, os.WriteFile(overlayPath, []byte(`schema_version: "1.0.0" +language: c +overrides: + - header: stdio.h + function: printf + security_tag: format_string_sink +`), 0o644)) + + cfg := Config{ + Target: core.PlatformLinux, + Language: core.LanguageC, + OutputDir: out, + OverlayPath: overlayPath, + } + useFixedSources(t, []HeaderSource{cTestSource()}) + + require.NoError(t, NewExtractor(cfg).Run()) + + stdioPath := filepath.Join(out, "stdio_stdlib.json") + data, err := os.ReadFile(stdioPath) + require.NoError(t, err) + + var h core.CStdlibHeader + require.NoError(t, json.Unmarshal(data, &h)) + got := h.Functions["printf"] + require.NotNil(t, got) + assert.Equal(t, "format_string_sink", got.SecurityTag) + assert.Equal(t, core.SourceMerged, got.Source) +} + +func TestRun_InvalidConfig(t *testing.T) { + cfg := Config{} // missing everything + err := NewExtractor(cfg).Run() + require.Error(t, err) + assert.Contains(t, err.Error(), "invalid config") +} + +func TestRun_OverlayLoadError(t *testing.T) { + cfg := Config{ + Target: core.PlatformLinux, + Language: 
core.LanguageC, + OutputDir: t.TempDir(), + OverlayPath: "/nonexistent-overlay-pr01-extractor-test.yaml", + } + err := NewExtractor(cfg).Run() + require.Error(t, err) + assert.Contains(t, err.Error(), "reading") +} + +func TestRun_DiscoveryError(t *testing.T) { + cfg := Config{ + Target: core.PlatformWindows, // PR-01 doesn't ship windows + Language: core.LanguageC, + OutputDir: t.TempDir(), + } + err := NewExtractor(cfg).Run() + require.Error(t, err) + assert.Contains(t, err.Error(), "PR-03") +} + +func TestRun_WalkError(t *testing.T) { + cfg := Config{ + Target: core.PlatformLinux, + Language: core.LanguageC, + OutputDir: t.TempDir(), + } + useFixedSources(t, []HeaderSource{{ + Platform: core.PlatformLinux, + Language: core.LanguageC, + SearchDirs: []string{"/this-path-must-not-exist-pr01-walk"}, + HeaderExts: []string{".h"}, + }}) + + err := NewExtractor(cfg).Run() + require.Error(t, err) + assert.Contains(t, err.Error(), "does not exist") +} + +func TestRun_PerHeaderParseFailure_Continues(t *testing.T) { + // A directory with a real header + a binary file that parses fine but + // produces no symbols. The pipeline should NOT abort; the warning channel + // should record nothing fatal. + dir := t.TempDir() + require.NoError(t, os.WriteFile(filepath.Join(dir, "real.h"), + []byte(`int real_func(int x);`), 0o644)) + + cfg := Config{ + Target: core.PlatformLinux, + Language: core.LanguageC, + OutputDir: t.TempDir(), + } + useFixedSources(t, []HeaderSource{{ + Platform: core.PlatformLinux, + Language: core.LanguageC, + SystemTag: "test", + SearchDirs: []string{dir}, + HeaderExts: []string{".h"}, + }}) + + var captured strings.Builder + ext := NewExtractor(cfg) + ext.SetLogger(func(format string, args ...any) { + captured.WriteString(format + "\n") + }) + require.NoError(t, ext.Run()) + // The real header is fine — no warnings expected. 
+ assert.NotContains(t, captured.String(), "warning") +} + +func TestDefaultLogf_DoesNotPanic(t *testing.T) { + // defaultLogf writes to os.Stderr; we can't easily capture it here, but + // the call must not panic and must accept format args. + defaultLogf("test %d %s", 1, "x") +} + +func TestExtractOne_UnsupportedLanguage(t *testing.T) { + // Bypass Validate by constructing the extractor with a hand-tweaked Config + // that survives Validate (we can't — Validate rejects "rust"). Instead + // call extractOne directly to exercise the defensive fallback branch. + ext := NewExtractor(Config{ + Target: core.PlatformLinux, + Language: core.LanguageC, + OutputDir: t.TempDir(), + }) + ext.cfg.Language = "rust" // post-construction tweak for the test + _, err := ext.extractOne(HeaderFile{Name: "x.h", Path: "/dev/null"}, + HeaderSource{Language: "rust"}) + require.Error(t, err) + assert.Contains(t, err.Error(), "unsupported language") +} + +// useFixedSources monkey-patches DiscoverHeaderSources to return a fixed list +// for the duration of the test. The package-level pointer indirection is the +// same trick walker_test.go uses for linuxLibcRoots / linuxCppRoot. +func useFixedSources(t *testing.T, sources []HeaderSource) { + t.Helper() + original := discoverHeaderSourcesFn + discoverHeaderSourcesFn = func(target, language string) ([]HeaderSource, error) { + return sources, nil + } + t.Cleanup(func() { discoverHeaderSourcesFn = original }) +} diff --git a/sast-engine/tools/internal/clikeextract/normalize.go b/sast-engine/tools/internal/clikeextract/normalize.go new file mode 100644 index 00000000..99e64163 --- /dev/null +++ b/sast-engine/tools/internal/clikeextract/normalize.go @@ -0,0 +1,184 @@ +package clikeextract + +import ( + "regexp" + "strings" +) + +// Normalisation helpers shared by the C and C++ extractors. 
System headers carry +// a lot of compiler-implementation noise — gcc attributes, libc++/libstdc++ +// internal namespace prefixes, _GLIBCXX_NOEXCEPT macros — that makes mechanical +// type comparison fail. These helpers strip the noise so emitted entries match +// the canonical forms found in cppreference and developer-facing source. + +// attributeRegex matches `__attribute__((...))` clauses, including nested parens +// (one level deep, which is sufficient for every libc/libstdc++ form we care +// about: format(printf,1,2), nonnull(1), warn_unused_result, deprecated("msg")). +var attributeRegex = regexp.MustCompile(`__attribute__\s*\(\([^)]*(\)[^)]*)*\)\)`) + +// pragmaRegex matches `__cdecl` / `__stdcall` / `__fastcall` calling-convention +// markers used in mingw/MSVC headers. +var pragmaRegex = regexp.MustCompile(`__(cdecl|stdcall|fastcall|thiscall|vectorcall)\b`) + +// libcppMacroRegex matches macros that expand to nothing in real builds but show +// up in the AST verbatim (we don't run a preprocessor). Each macro is treated as +// removable whitespace. +var libcppMacroRegex = regexp.MustCompile(strings.Join([]string{ + `_GLIBCXX_NOEXCEPT`, + `_GLIBCXX_USE_NOEXCEPT`, + `_GLIBCXX_CONSTEXPR`, + `_GLIBCXX17_CONSTEXPR`, + `_GLIBCXX20_CONSTEXPR`, + `_LIBCPP_INLINE_VISIBILITY`, + `_LIBCPP_HIDE_FROM_ABI`, + `_LIBCPP_CONSTEXPR_SINCE_CXX\d+`, + `_LIBCPP_NODISCARD`, + `_NOEXCEPT`, + `_Nonnull`, + `_Nullable`, + `_Null_unspecified`, + `__THROW`, + `__nonnull\s*\(\s*\(\s*[^)]*\)\s*\)`, + `__wur`, +}, "|")) + +// cxx11InlineNamespaceRegex matches the libstdc++ inline namespace `__cxx11` +// that wraps C++11-ABI-stable types (basic_string, list). Stripping it gives the +// canonical type used in user code: `std::__cxx11::basic_string` → +// `std::basic_string`. 
The same shape catches `std::__1::` (libc++) and +// `std::__detail::` for completeness — the latter should normally be filtered +// out higher up the stack as a private symbol, but the regex catches the cases +// that slip through (e.g. inside a typedef value). +var cxx11InlineNamespaceRegex = regexp.MustCompile(`std::__(cxx11|1|detail)::`) + +// NormalizeType strips compiler decorations and canonicalises stdlib internal +// namespaces. Input may be empty; output is whitespace-collapsed but otherwise +// preserves the original form. +// +// Examples: +// +// "int __attribute__((nonnull))" → "int" +// "const char* _Nonnull" → "const char*" +// "std::__cxx11::basic_string" → "std::basic_string" +// "void __cdecl printf(const char*)" → "void printf(const char*)" +// "_GLIBCXX_CONSTEXPR size_t size() const" → "size_t size() const" +// +// The function is intentionally conservative: anything it doesn't recognise is +// left alone. The overlay (overlay.go) is the escape hatch for anything the +// regex set doesn't cover. +func NormalizeType(s string) string { + if s == "" { + return s + } + s = attributeRegex.ReplaceAllString(s, "") + s = pragmaRegex.ReplaceAllString(s, "") + s = libcppMacroRegex.ReplaceAllString(s, "") + s = cxx11InlineNamespaceRegex.ReplaceAllString(s, "std::") + return collapseWhitespace(s) +} + +// collapseWhitespace replaces runs of whitespace with a single space and trims +// leading/trailing space — the noise left over after the regex strips run. 
func collapseWhitespace(s string) string {
	var b strings.Builder
	b.Grow(len(s))

	prevSpace := true // start as if preceded by whitespace, so leading is trimmed
	for _, r := range s {
		switch r {
		case ' ', '\t', '\n', '\r':
			// Emit at most one space per run of whitespace.
			if !prevSpace {
				b.WriteByte(' ')
				prevSpace = true
			}
		default:
			b.WriteRune(r)
			prevSpace = false
		}
	}

	// A trailing whitespace run leaves exactly one space behind; drop it.
	return strings.TrimRight(b.String(), " ")
}

// IsPrivateSymbol reports whether name is a compiler/library implementation
// detail that should not appear in the public registry. The rule is conservative:
//
//   - Names with a double-underscore prefix (`__builtin_*`, `__cxxabiv1::*`)
//     are private by every C and C++ convention.
//   - Names with a single-underscore prefix followed by a lowercase letter are
//     library private (`_exit` is the GNU underscore alias for `exit`; we want
//     the public spelling, so we skip the alias).
//   - Names starting with `_` followed by an uppercase letter or a digit are
//     standard-library KEEP forms (`_Bool`, `_Static_assert`, `_Generic`, the
//     `_LIBCPP_*` macros that get filtered separately) — these we keep.
//   - The empty string and a lone `_` match none of the private patterns and
//     are reported as not private.
//
// Edge case the keep-list rule above is designed for: `_Bool` is a C language
// keyword (since C99); skipping it would lose a real type. `_LIBCPP_*` is
// macro/internal but the macro is consumed before extraction reaches the name,
// so we should never see it here — but the keep rule means even if we do, we
// don't accidentally drop something that's just an oddly-named keyword.
func IsPrivateSymbol(name string) bool {
	if !strings.HasPrefix(name, "_") {
		return false
	}
	if strings.HasPrefix(name, "__") {
		return true
	}
	// A lone "_" has no second byte to classify; indexing name[1] on it would
	// panic with an out-of-range error.
	if len(name) < 2 {
		return false
	}
	// Single underscore + lowercase ascii letter → library-private (e.g. _exit).
	c := name[1]
	return c >= 'a' && c <= 'z'
}

// IsPrivateNamespace reports whether the given C++ namespace name is a
// compiler-implementation detail that should be skipped during extraction.
// Namespaces with `__` prefix or matching well-known internal patterns are
// excluded; user-facing namespaces (`std`, `boost`, `Qt`) pass through.
func IsPrivateNamespace(name string) bool {
	if name == "" {
		return false
	}
	// The `_GLIBCXX_` prefix also covers the exact name `_GLIBCXX_DEBUG`.
	for _, prefix := range []string{"__", "_GLIBCXX_", "_LIBCPP_"} {
		if strings.HasPrefix(name, prefix) {
			return true
		}
	}
	return false
}

// SanitizeHeaderName converts a `#include`-form header name into a stable
// filesystem-safe identifier suitable for use as a JSON filename stem. The
// transformation is:
//
//   - Strip the leading `<` and trailing `>` if present.
//   - Strip the trailing `.h`, `.hh`, `.hpp`, or `.hxx` extension.
//   - Replace `/` with `_` so subdirectories (`sys/socket.h`, `bits/types.h`)
//     yield a flat filename.
//
// Examples:
//
//	"vector"       → "vector"
//	"<string>"     → "string"
//	"stdio.h"      → "stdio"
//	"sys/socket.h" → "sys_socket"
//	"bits/types.h" → "bits_types"
//
// Idempotent — running it twice on the same input yields the same output.
+func SanitizeHeaderName(h string) string { + h = strings.TrimPrefix(h, "<") + h = strings.TrimSuffix(h, ">") + for _, ext := range []string{".hpp", ".hxx", ".hh", ".h"} { + if cut, ok := strings.CutSuffix(h, ext); ok { + h = cut + break + } + } + h = strings.ReplaceAll(h, "/", "_") + return h +} diff --git a/sast-engine/tools/internal/clikeextract/normalize_test.go b/sast-engine/tools/internal/clikeextract/normalize_test.go new file mode 100644 index 00000000..a8f84ba2 --- /dev/null +++ b/sast-engine/tools/internal/clikeextract/normalize_test.go @@ -0,0 +1,133 @@ +package clikeextract + +import ( + "testing" + + "github.com/stretchr/testify/assert" +) + +func TestNormalizeType(t *testing.T) { + tests := []struct { + name string + in string + want string + }{ + {"empty", "", ""}, + {"plain int", "int", "int"}, + {"strip nonnull attribute", "int __attribute__((nonnull))", "int"}, + {"strip format attribute", "int __attribute__((format(printf, 1, 2)))", "int"}, + {"strip Nonnull tag", "const char* _Nonnull", "const char*"}, + {"strip cdecl", "void __cdecl printf(const char*)", "void printf(const char*)"}, + {"canonicalize cxx11 namespace", "std::__cxx11::basic_string", "std::basic_string"}, + {"canonicalize libc++ inline", "std::__1::vector", "std::vector"}, + {"strip GLIBCXX_CONSTEXPR", "_GLIBCXX_CONSTEXPR size_t size() const", "size_t size() const"}, + {"strip libcpp inline visibility", "_LIBCPP_INLINE_VISIBILITY const T& at(size_t)", "const T& at(size_t)"}, + {"strip THROW marker", "void* __THROW malloc(size_t)", "void* malloc(size_t)"}, + {"collapse whitespace", "int *foo ()", "int *foo ()"}, + {"trim trailing space", "int ", "int"}, + {"unrecognised left alone", "MyCustomType", "MyCustomType"}, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + got := NormalizeType(tt.in) + assert.Equal(t, tt.want, got) + }) + } +} + +func TestCollapseWhitespace(t *testing.T) { + tests := []struct { + in, want string + }{ + {"", ""}, + {" ", ""}, + 
{"abc", "abc"}, + {"a b", "a b"}, + {"a b", "a b"}, + {"a\tb\nc", "a b c"}, + {" leading", "leading"}, + {"trailing ", "trailing"}, + {" both ", "both"}, + } + for _, tt := range tests { + t.Run(tt.in, func(t *testing.T) { + assert.Equal(t, tt.want, collapseWhitespace(tt.in)) + }) + } +} + +func TestIsPrivateSymbol(t *testing.T) { + tests := []struct { + name string + want bool + }{ + {"", false}, + {"printf", false}, + {"_Bool", false}, // C keyword — keep + {"_Static_assert", false}, // C keyword — keep + {"_Generic", false}, // C keyword — keep + {"_exit", true}, // library-private alias + {"_setjmp", true}, + {"__builtin_strlen", true}, + {"__GLIBC_INTERNAL", true}, + {"__cxxabiv1", true}, + } + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + assert.Equal(t, tt.want, IsPrivateSymbol(tt.name)) + }) + } +} + +func TestIsPrivateNamespace(t *testing.T) { + tests := []struct { + name string + want bool + }{ + {"", false}, + {"std", false}, + {"boost", false}, + {"__detail", true}, + {"__cxxabiv1", true}, + {"_GLIBCXX_DEBUG", true}, + {"_GLIBCXX_VERSION_NAMESPACE", true}, + {"_LIBCPP_INTERNAL", true}, + } + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + assert.Equal(t, tt.want, IsPrivateNamespace(tt.name)) + }) + } +} + +func TestSanitizeHeaderName(t *testing.T) { + tests := []struct { + in, want string + }{ + {"vector", "vector"}, + {"", "string"}, + {"", "vector"}, + {"stdio.h", "stdio"}, + {"string.h", "string"}, + {"", "stdio"}, + {"sys/socket.h", "sys_socket"}, + {"bits/types.h", "bits_types"}, + {"foo.hpp", "foo"}, + {"foo.hxx", "foo"}, + {"foo.hh", "foo"}, + } + for _, tt := range tests { + t.Run(tt.in, func(t *testing.T) { + assert.Equal(t, tt.want, SanitizeHeaderName(tt.in)) + }) + } +} + +func TestSanitizeHeaderName_Idempotent(t *testing.T) { + for _, in := range []string{"vector", "", "stdio.h", "sys/socket.h"} { + once := SanitizeHeaderName(in) + twice := SanitizeHeaderName(once) + assert.Equal(t, once, twice, 
"input=%q", in) + } +} diff --git a/sast-engine/tools/internal/clikeextract/overlay.go b/sast-engine/tools/internal/clikeextract/overlay.go new file mode 100644 index 00000000..c2e7e960 --- /dev/null +++ b/sast-engine/tools/internal/clikeextract/overlay.go @@ -0,0 +1,475 @@ +package clikeextract + +import ( + "errors" + "fmt" + "os" + "strings" + + "github.com/shivasurya/code-pathfinder/sast-engine/graph/callgraph/core" + "gopkg.in/yaml.v3" +) + +// Overlay is the parsed shape of c_stdlib_overlay.yaml / cpp_stdlib_overlay.yaml. +// It is a small hand-curated patch applied on top of tree-sitter extraction to +// fill in cases the parser cannot capture: template specialisations, missing +// __attribute__((format)) annotations, security tags, and cross-platform aliases. +// +// Schema (yaml.v3 tag conventions, lowercase field names matching the YAML keys): +// +// schema_version: "1.0.0" // string +// language: "c" | "cpp" +// overrides: // each entry MUST specify exactly one of +// // {function} OR {class+method} OR {typedef} +// // OR {constant} +// - header: "" +// function: "" +// class: "" +// method: "" +// typedef: "" +// constant: "" +// return_type: "" +// params: [ {name, type, attribute?, required?}, ... ] +// confidence: 1.0 // optional, default 1.0 +// security_tag: "" // optional +// attribute: "" // optional +// throws: ""// optional +// type: "" // for typedef/constant entries +// value: "" // for constant entries +// note: "" // optional, ignored at runtime +// cross_platform_aliases: +// - alias: "
" +// canonical: "
" +// skip: +// - prefix: "<...>" +// - exact: "<...>" +//nolint:tagliatelle // YAML keys are snake_case to match hand-edited overlay files. +type Overlay struct { + SchemaVersion string `yaml:"schema_version"` + Language string `yaml:"language"` + + Overrides []OverlayOverride `yaml:"overrides"` + CrossPlatformAliases []OverlayAlias `yaml:"cross_platform_aliases"` + Skip []OverlaySkip `yaml:"skip"` + + // path is the file the overlay was loaded from; used for richer error + // messages on validation failure. + path string `yaml:"-"` //nolint:unused // populated by Load, surfaced in errors +} + +// OverlayOverride describes one hand-curated override or insertion. The kind +// of entry is determined by which mutually-exclusive identifier fields are +// populated (Function / Method / Typedef / Constant). Validation enforces +// exactly-one. +// +//nolint:tagliatelle // YAML keys are snake_case to match hand-edited overlay files. +type OverlayOverride struct { + Header string `yaml:"header"` + Function string `yaml:"function,omitempty"` + Class string `yaml:"class,omitempty"` + Method string `yaml:"method,omitempty"` + Typedef string `yaml:"typedef,omitempty"` + Constant string `yaml:"constant,omitempty"` + ReturnType string `yaml:"return_type,omitempty"` + Params []OverlayParam `yaml:"params,omitempty"` + + // Confidence has no yaml omitempty marker because zero is a meaningful + // signal — the merger replaces zero with 1.0 since hand-curated entries + // are by definition high-confidence. + Confidence float32 `yaml:"confidence"` + SecurityTag string `yaml:"security_tag,omitempty"` + Attribute string `yaml:"attribute,omitempty"` + Throws string `yaml:"throws,omitempty"` + Type string `yaml:"type,omitempty"` // for typedef / constant entries + Value string `yaml:"value,omitempty"` // for constant entries + Note string `yaml:"note,omitempty"` // ignored at runtime; for human readers +} + +// OverlayParam is a parameter override entry. 
Required defaults to true (most +// overlay entries are well-formed signatures) — set explicitly to false for +// optional-after-default-args cases. +type OverlayParam struct { + Name string `yaml:"name"` + Type string `yaml:"type"` + Attribute string `yaml:"attribute,omitempty"` + Required *bool `yaml:"required,omitempty"` +} + +// OverlayAlias maps a non-canonical header name to its canonical form. Phase 2 +// stores these for later use; aliasing across platforms is a Phase 3 concern. +type OverlayAlias struct { + Alias string `yaml:"alias"` + Canonical string `yaml:"canonical"` + Platforms []string `yaml:"platforms,omitempty"` +} + +// OverlaySkip names a single skip rule. Exactly one of Prefix or Exact must be +// set; both being empty is a validation error. +type OverlaySkip struct { + Prefix string `yaml:"prefix,omitempty"` + Exact string `yaml:"exact,omitempty"` +} + +// LoadOverlay reads, parses, and validates the YAML overlay at path. Returns nil +// (no overlay) if path is empty — callers may use this no-overlay shape to opt +// out, in which case every extracted entry's Source stays "header". +// +// Mismatch between the overlay's declared language and the wantLanguage argument +// is a hard error: a cpp overlay applied to a C extraction would silently inject +// invalid entries (e.g. class-method overrides into a C registry). 
+func LoadOverlay(path, wantLanguage string) (*Overlay, error) { + if path == "" { + return nil, nil //nolint:nilnil // empty path is intentional opt-out, not an error + } + data, err := os.ReadFile(path) //nolint:gosec // path comes from operator CLI flag, not user input + if err != nil { + return nil, fmt.Errorf("LoadOverlay: reading %q: %w", path, err) + } + + var o Overlay + if err := yaml.Unmarshal(data, &o); err != nil { + return nil, fmt.Errorf("LoadOverlay: parsing YAML at %q: %w", path, err) + } + o.path = path + + if err := o.validate(wantLanguage); err != nil { + return nil, fmt.Errorf("LoadOverlay: validating %q: %w", path, err) + } + return &o, nil +} + +// validate enforces the overlay invariants declared in the package docs. The +// errors returned identify the offending entry by index so a hand-editor can +// jump straight to the line. +func (o *Overlay) validate(wantLanguage string) error { + if o.SchemaVersion == "" { + return errors.New("schema_version is required") + } + if o.Language == "" { + return errors.New("language is required") + } + if o.Language != wantLanguage { + return fmt.Errorf("overlay declares language=%q but extractor wants %q", o.Language, wantLanguage) + } + if o.Language != core.LanguageC && o.Language != core.LanguageCpp { + return fmt.Errorf("language must be %q or %q (got %q)", core.LanguageC, core.LanguageCpp, o.Language) + } + + for i, ov := range o.Overrides { + if ov.Header == "" { + return fmt.Errorf("overrides[%d]: header is required", i) + } + k, kerr := overrideKind(ov) + if kerr != nil { + return fmt.Errorf("overrides[%d]: %w", i, kerr) + } + if k == overrideMethod && ov.Class == "" { + return fmt.Errorf("overrides[%d]: method override requires class", i) + } + if k == overrideMethod && o.Language == core.LanguageC { + return fmt.Errorf("overrides[%d]: class+method only valid in cpp overlay", i) + } + } + for i, sk := range o.Skip { + if (sk.Prefix == "") == (sk.Exact == "") { + return fmt.Errorf("skip[%d]: 
exactly one of prefix or exact must be set", i) + } + } + for i, al := range o.CrossPlatformAliases { + if al.Alias == "" || al.Canonical == "" { + return fmt.Errorf("cross_platform_aliases[%d]: alias and canonical are both required", i) + } + } + return nil +} + +// overrideKind classifies a single override entry by which identifier fields +// are set. Returns an error if zero or more than one identifier is present. +func overrideKind(ov OverlayOverride) (overrideEntryKind, error) { + count := 0 + var k overrideEntryKind + if ov.Function != "" { + count++ + k = overrideFunction + } + if ov.Method != "" { + count++ + k = overrideMethod + } + if ov.Typedef != "" { + count++ + k = overrideTypedef + } + if ov.Constant != "" { + count++ + k = overrideConstant + } + if count == 0 { + return overrideUnknown, errors.New("must specify exactly one of function, method, typedef, constant") + } + if count > 1 { + return overrideUnknown, errors.New("must specify exactly one of function, method, typedef, constant (got multiple)") + } + return k, nil +} + +type overrideEntryKind int + +const ( + overrideUnknown overrideEntryKind = iota + overrideFunction + overrideMethod + overrideTypedef + overrideConstant +) + +// MergeOverlay applies the overlay to one extracted CStdlibHeader, returning a +// (possibly modified) header. The returned pointer is the same as the input — +// merge happens in place. Counts the overlay entries actually applied via the +// returned int (used by the emitter to populate Statistics.OverlayOverrides). +// +// Merge rules: +// +// - For each override whose Header matches the input, locate the target +// symbol map (Functions / FreeFunctions / Classes.Methods / Typedefs / +// Constants) and apply. +// - If the target entry already exists from extraction, the overlay's values +// replace the matching fields in-place; Source becomes "merged". +// - If the target entry does not exist, a new one is inserted with +// Source="overlay". 
+// - Skip rules run last, AFTER overrides, so an override on a name that is +// subsequently skipped is silently dropped (matches the user expectation +// of "skip applies regardless of source"). +// +// Returns the count of overlay entries that produced or refined a header entry +// (i.e. entries whose Header field matched the input header). Cross-platform +// aliases and skip rules do not count toward this number. +func MergeOverlay(h *core.CStdlibHeader, overlay *Overlay) int { + if overlay == nil || h == nil { + return 0 + } + + applied := 0 + for _, ov := range overlay.Overrides { + if ov.Header != h.Header { + continue + } + k, _ := overrideKind(ov) // pre-validated by Load + switch k { + case overrideFunction: + applyFunctionOverride(h, ov) + case overrideMethod: + applyMethodOverride(h, ov) + case overrideTypedef: + applyTypedefOverride(h, ov) + case overrideConstant: + applyConstantOverride(h, ov) + case overrideUnknown: + continue + } + applied++ + } + + applySkipRules(h, overlay.Skip) + return applied +} + +func applyFunctionOverride(h *core.CStdlibHeader, ov OverlayOverride) { + conf := ov.Confidence + if conf == 0 { + conf = 1.0 + } + target := h.Functions + if target == nil { + target = make(map[string]*core.CStdlibFunction) + h.Functions = target + } + + existing, found := target[ov.Function] + if !found { + target[ov.Function] = &core.CStdlibFunction{ + FQN: buildFunctionFQN(h, ov.Function), + ReturnType: ov.ReturnType, + Params: paramListFromOverlay(ov.Params), + Confidence: conf, + Source: core.SourceOverlay, + SecurityTag: ov.SecurityTag, + Attribute: ov.Attribute, + Throws: ov.Throws, + } + return + } + if ov.ReturnType != "" { + existing.ReturnType = ov.ReturnType + } + if len(ov.Params) > 0 { + existing.Params = paramListFromOverlay(ov.Params) + } + if ov.SecurityTag != "" { + existing.SecurityTag = ov.SecurityTag + } + if ov.Attribute != "" { + existing.Attribute = ov.Attribute + } + if ov.Throws != "" { + existing.Throws = ov.Throws + } + 
existing.Confidence = conf + existing.Source = core.SourceMerged +} + +func applyMethodOverride(h *core.CStdlibHeader, ov OverlayOverride) { + conf := ov.Confidence + if conf == 0 { + conf = 1.0 + } + if h.Classes == nil { + h.Classes = make(map[string]*core.CppStdlibClass) + } + cls, ok := h.Classes[ov.Class] + if !ok { + cls = core.NewCppStdlibClass(ov.Class) + h.Classes[ov.Class] = cls + } + + existing, found := cls.Methods[ov.Method] + if !found { + cls.Methods[ov.Method] = &core.CStdlibFunction{ + FQN: ov.Class + "::" + ov.Method, + ReturnType: ov.ReturnType, + Params: paramListFromOverlay(ov.Params), + Confidence: conf, + Source: core.SourceOverlay, + Attribute: ov.Attribute, + Throws: ov.Throws, + } + return + } + if ov.ReturnType != "" { + existing.ReturnType = ov.ReturnType + } + if len(ov.Params) > 0 { + existing.Params = paramListFromOverlay(ov.Params) + } + if ov.Attribute != "" { + existing.Attribute = ov.Attribute + } + if ov.Throws != "" { + existing.Throws = ov.Throws + } + existing.Confidence = conf + existing.Source = core.SourceMerged +} + +func applyTypedefOverride(h *core.CStdlibHeader, ov OverlayOverride) { + if h.Typedefs == nil { + h.Typedefs = make(map[string]*core.CStdlibTypedef) + } + if existing, ok := h.Typedefs[ov.Typedef]; ok { + if ov.Type != "" { + existing.Type = ov.Type + } + existing.Source = core.SourceMerged + return + } + h.Typedefs[ov.Typedef] = &core.CStdlibTypedef{ + Type: ov.Type, + Source: core.SourceOverlay, + } +} + +func applyConstantOverride(h *core.CStdlibHeader, ov OverlayOverride) { + if h.Constants == nil { + h.Constants = make(map[string]*core.CStdlibConstant) + } + if existing, ok := h.Constants[ov.Constant]; ok { + if ov.Type != "" { + existing.Type = ov.Type + } + if ov.Value != "" { + existing.Value = ov.Value + } + existing.Source = core.SourceMerged + return + } + h.Constants[ov.Constant] = &core.CStdlibConstant{ + Type: ov.Type, + Value: ov.Value, + Source: core.SourceOverlay, + } +} + +// applySkipRules 
removes any function / typedef / constant whose name matches +// a skip rule. C++ class methods are not currently subject to skip rules +// (they are scoped under a class, so name collisions are unlikely); add as a +// follow-up if a real-world overlay needs it. +func applySkipRules(h *core.CStdlibHeader, skips []OverlaySkip) { + if len(skips) == 0 { + return + } + for name := range h.Functions { + if matchesAnySkip(name, skips) { + delete(h.Functions, name) + } + } + for name := range h.FreeFunctions { + if matchesAnySkip(name, skips) { + delete(h.FreeFunctions, name) + } + } + for name := range h.Typedefs { + if matchesAnySkip(name, skips) { + delete(h.Typedefs, name) + } + } + for name := range h.Constants { + if matchesAnySkip(name, skips) { + delete(h.Constants, name) + } + } +} + +func matchesAnySkip(name string, skips []OverlaySkip) bool { + for _, sk := range skips { + if sk.Prefix != "" && strings.HasPrefix(name, sk.Prefix) { + return true + } + if sk.Exact != "" && name == sk.Exact { + return true + } + } + return false +} + +// paramListFromOverlay converts overlay-shaped params into the public schema +// params, defaulting Required to true when the overlay author left it blank. +func paramListFromOverlay(ps []OverlayParam) []*core.CStdlibParam { + if len(ps) == 0 { + return []*core.CStdlibParam{} + } + out := make([]*core.CStdlibParam, 0, len(ps)) + for _, p := range ps { + req := true + if p.Required != nil { + req = *p.Required + } + out = append(out, &core.CStdlibParam{ + Name: p.Name, + Type: p.Type, + Required: req, + Attribute: p.Attribute, + }) + } + return out +} + +// buildFunctionFQN derives a fully-qualified name for a header-only entry, using +// the header's ModuleID if present (e.g. "c::stdio") or falling back to the +// header name if not. This keeps overlay-only entries consistent with extracted +// entries even when the overlay omits the FQN. 
+func buildFunctionFQN(h *core.CStdlibHeader, name string) string { + if h.ModuleID != "" { + return h.ModuleID + "::" + name + } + return name +} diff --git a/sast-engine/tools/internal/clikeextract/overlay_test.go b/sast-engine/tools/internal/clikeextract/overlay_test.go new file mode 100644 index 00000000..4adcffac --- /dev/null +++ b/sast-engine/tools/internal/clikeextract/overlay_test.go @@ -0,0 +1,535 @@ +package clikeextract + +import ( + "os" + "path/filepath" + "testing" + + "github.com/shivasurya/code-pathfinder/sast-engine/graph/callgraph/core" + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" +) + +// writeOverlay is a test helper that materialises a YAML overlay in a temp dir +// and returns its absolute path. +func writeOverlay(t *testing.T, contents string) string { + t.Helper() + dir := t.TempDir() + p := filepath.Join(dir, "overlay.yaml") + require.NoError(t, os.WriteFile(p, []byte(contents), 0o644)) + return p +} + +func TestLoadOverlay_EmptyPathReturnsNil(t *testing.T) { + o, err := LoadOverlay("", core.LanguageC) + require.NoError(t, err) + assert.Nil(t, o) +} + +func TestLoadOverlay_MissingFile(t *testing.T) { + _, err := LoadOverlay("/nonexistent-overlay-pr01.yaml", core.LanguageC) + require.Error(t, err) + assert.Contains(t, err.Error(), "reading") +} + +func TestLoadOverlay_InvalidYAML(t *testing.T) { + p := writeOverlay(t, "::: not yaml :::") + _, err := LoadOverlay(p, core.LanguageC) + require.Error(t, err) + assert.Contains(t, err.Error(), "parsing YAML") +} + +func TestLoadOverlay_MissingSchemaVersion(t *testing.T) { + p := writeOverlay(t, `language: c +overrides: [] +`) + _, err := LoadOverlay(p, core.LanguageC) + require.Error(t, err) + assert.Contains(t, err.Error(), "schema_version") +} + +func TestLoadOverlay_MissingLanguage(t *testing.T) { + p := writeOverlay(t, `schema_version: "1.0.0" +overrides: [] +`) + _, err := LoadOverlay(p, core.LanguageC) + require.Error(t, err) + assert.Contains(t, 
err.Error(), "language is required") +} + +func TestLoadOverlay_LanguageMismatch(t *testing.T) { + p := writeOverlay(t, `schema_version: "1.0.0" +language: cpp +overrides: [] +`) + _, err := LoadOverlay(p, core.LanguageC) + require.Error(t, err) + assert.Contains(t, err.Error(), `declares language="cpp" but extractor wants "c"`) +} + +func TestLoadOverlay_UnsupportedLanguage(t *testing.T) { + p := writeOverlay(t, `schema_version: "1.0.0" +language: rust +overrides: [] +`) + _, err := LoadOverlay(p, "rust") + require.Error(t, err) + assert.Contains(t, err.Error(), "language must be") +} + +func TestLoadOverlay_ValidEmpty(t *testing.T) { + p := writeOverlay(t, `schema_version: "1.0.0" +language: c +overrides: [] +`) + o, err := LoadOverlay(p, core.LanguageC) + require.NoError(t, err) + require.NotNil(t, o) + assert.Equal(t, "1.0.0", o.SchemaVersion) + assert.Equal(t, core.LanguageC, o.Language) +} + +func TestLoadOverlay_FunctionOverride(t *testing.T) { + p := writeOverlay(t, `schema_version: "1.0.0" +language: c +overrides: + - header: stdio.h + function: printf + return_type: int + params: + - { name: format, type: const char* } + confidence: 0.9 + security_tag: format_string_sink +`) + o, err := LoadOverlay(p, core.LanguageC) + require.NoError(t, err) + require.Len(t, o.Overrides, 1) + assert.Equal(t, "printf", o.Overrides[0].Function) + assert.InDelta(t, 0.9, o.Overrides[0].Confidence, 0.001) +} + +func TestLoadOverlay_OverrideMissingHeader(t *testing.T) { + p := writeOverlay(t, `schema_version: "1.0.0" +language: c +overrides: + - function: printf +`) + _, err := LoadOverlay(p, core.LanguageC) + require.Error(t, err) + assert.Contains(t, err.Error(), "header is required") +} + +func TestLoadOverlay_OverrideMissingIdentifier(t *testing.T) { + p := writeOverlay(t, `schema_version: "1.0.0" +language: c +overrides: + - header: stdio.h + return_type: int +`) + _, err := LoadOverlay(p, core.LanguageC) + require.Error(t, err) + assert.Contains(t, err.Error(), "exactly 
one of function, method, typedef, constant") +} + +func TestLoadOverlay_OverrideMultipleIdentifiers(t *testing.T) { + p := writeOverlay(t, `schema_version: "1.0.0" +language: c +overrides: + - header: stdio.h + function: printf + typedef: FILE +`) + _, err := LoadOverlay(p, core.LanguageC) + require.Error(t, err) + assert.Contains(t, err.Error(), "got multiple") +} + +func TestLoadOverlay_MethodOverrideWithoutClass(t *testing.T) { + p := writeOverlay(t, `schema_version: "1.0.0" +language: cpp +overrides: + - header: vector + method: push_back +`) + _, err := LoadOverlay(p, core.LanguageCpp) + require.Error(t, err) + assert.Contains(t, err.Error(), "method override requires class") +} + +func TestLoadOverlay_MethodOverrideInCOverlayRejected(t *testing.T) { + p := writeOverlay(t, `schema_version: "1.0.0" +language: c +overrides: + - header: stdio.h + class: FILE + method: read +`) + _, err := LoadOverlay(p, core.LanguageC) + require.Error(t, err) + assert.Contains(t, err.Error(), "class+method only valid in cpp") +} + +func TestLoadOverlay_SkipNeedsExactlyOne(t *testing.T) { + p := writeOverlay(t, `schema_version: "1.0.0" +language: c +overrides: [] +skip: + - prefix: __builtin_ + exact: malloc +`) + _, err := LoadOverlay(p, core.LanguageC) + require.Error(t, err) + assert.Contains(t, err.Error(), "exactly one of prefix or exact") +} + +func TestLoadOverlay_AliasNeedsBothFields(t *testing.T) { + p := writeOverlay(t, `schema_version: "1.0.0" +language: c +overrides: [] +cross_platform_aliases: + - alias: _stdio.h +`) + _, err := LoadOverlay(p, core.LanguageC) + require.Error(t, err) + assert.Contains(t, err.Error(), "alias and canonical") +} + +// Merge tests. Each builds a CStdlibHeader, applies an overlay, asserts the +// final state. + +//nolint:unparam // name is parameterised for future test cases on other headers. 
+func newTestHeader(name string) *core.CStdlibHeader { + h := core.NewCStdlibHeader() + h.Header = name + h.ModuleID = "c::" + SanitizeHeaderName(name) + h.Language = core.LanguageC + return h +} + +func TestMergeOverlay_NilOverlayIsNoop(t *testing.T) { + h := newTestHeader("stdio.h") + h.Functions["printf"] = &core.CStdlibFunction{FQN: "c::stdio::printf", Source: core.SourceHeader} + + count := MergeOverlay(h, nil) + assert.Equal(t, 0, count) + assert.Equal(t, core.SourceHeader, h.Functions["printf"].Source) +} + +func TestMergeOverlay_NilHeader(t *testing.T) { + o := &Overlay{Language: core.LanguageC, SchemaVersion: "1.0.0"} + count := MergeOverlay(nil, o) + assert.Equal(t, 0, count) +} + +func TestMergeOverlay_FunctionOverride_HeaderOnlyBecomesMerged(t *testing.T) { + h := newTestHeader("stdio.h") + h.Functions["printf"] = &core.CStdlibFunction{ + FQN: "c::stdio::printf", ReturnType: "int", Source: core.SourceHeader, + } + o := &Overlay{ + SchemaVersion: "1.0.0", + Language: core.LanguageC, + Overrides: []OverlayOverride{ + {Header: "stdio.h", Function: "printf", SecurityTag: "format_string_sink"}, + }, + } + + applied := MergeOverlay(h, o) + assert.Equal(t, 1, applied) + got := h.Functions["printf"] + assert.Equal(t, core.SourceMerged, got.Source) + assert.Equal(t, "format_string_sink", got.SecurityTag) + assert.Equal(t, "int", got.ReturnType, "untouched fields preserved") +} + +func TestMergeOverlay_FunctionOverride_OverlayOnlyBecomesOverlay(t *testing.T) { + h := newTestHeader("stdio.h") + o := &Overlay{ + SchemaVersion: "1.0.0", + Language: core.LanguageC, + Overrides: []OverlayOverride{ + {Header: "stdio.h", Function: "system", ReturnType: "int", SecurityTag: "command_injection_sink"}, + }, + } + + applied := MergeOverlay(h, o) + assert.Equal(t, 1, applied) + got := h.Functions["system"] + require.NotNil(t, got) + assert.Equal(t, core.SourceOverlay, got.Source) + assert.Equal(t, "command_injection_sink", got.SecurityTag) + assert.Equal(t, 
"c::stdio::system", got.FQN) + assert.InDelta(t, float32(1.0), got.Confidence, 0.001, "default confidence") +} + +func TestMergeOverlay_FunctionOverride_HeaderMismatchIsSkipped(t *testing.T) { + h := newTestHeader("stdio.h") + o := &Overlay{ + SchemaVersion: "1.0.0", + Language: core.LanguageC, + Overrides: []OverlayOverride{ + {Header: "string.h", Function: "strcpy"}, + }, + } + applied := MergeOverlay(h, o) + assert.Equal(t, 0, applied) + assert.Empty(t, h.Functions) +} + +func TestMergeOverlay_MethodOverride(t *testing.T) { + h := core.NewCStdlibHeader() + h.Header = "vector" + h.ModuleID = "std::vector" + h.Language = core.LanguageCpp + + o := &Overlay{ + SchemaVersion: "1.0.0", + Language: core.LanguageCpp, + Overrides: []OverlayOverride{ + {Header: "vector", Class: "std::vector", Method: "push_back", ReturnType: "void"}, + {Header: "vector", Class: "std::vector", Method: "at", ReturnType: "T&", Throws: "std::out_of_range"}, + }, + } + applied := MergeOverlay(h, o) + assert.Equal(t, 2, applied) + + cls := h.Classes["std::vector"] + require.NotNil(t, cls) + pb := cls.Methods["push_back"] + require.NotNil(t, pb) + assert.Equal(t, "void", pb.ReturnType) + assert.Equal(t, core.SourceOverlay, pb.Source) + assert.Equal(t, "std::vector::push_back", pb.FQN) + + at := cls.Methods["at"] + require.NotNil(t, at) + assert.Equal(t, "std::out_of_range", at.Throws) +} + +func TestMergeOverlay_MethodOverride_ExistingClassAndMethodMerged(t *testing.T) { + h := core.NewCStdlibHeader() + h.Header = "vector" + cls := core.NewCppStdlibClass("std::vector") + cls.Methods["size"] = &core.CStdlibFunction{ + FQN: "std::vector::size", ReturnType: "size_t", Source: core.SourceHeader, + } + h.Classes["std::vector"] = cls + + o := &Overlay{ + SchemaVersion: "1.0.0", + Language: core.LanguageCpp, + Overrides: []OverlayOverride{ + {Header: "vector", Class: "std::vector", Method: "size", Throws: ""}, + { + Header: "vector", Class: "std::vector", Method: "size", + ReturnType: "std::size_t", 
Attribute: "nodiscard", + Params: []OverlayParam{}, + }, + }, + } + applied := MergeOverlay(h, o) + assert.Equal(t, 2, applied) + got := h.Classes["std::vector"].Methods["size"] + assert.Equal(t, "std::size_t", got.ReturnType) + assert.Equal(t, "nodiscard", got.Attribute) + assert.Equal(t, core.SourceMerged, got.Source) +} + +func TestMergeOverlay_TypedefOverride(t *testing.T) { + h := newTestHeader("stdio.h") + h.Typedefs["FILE"] = &core.CStdlibTypedef{Type: "struct __FILE", Source: core.SourceHeader} + + o := &Overlay{ + SchemaVersion: "1.0.0", Language: core.LanguageC, + Overrides: []OverlayOverride{ + {Header: "stdio.h", Typedef: "FILE", Type: "struct _IO_FILE"}, + {Header: "stdio.h", Typedef: "fpos_t", Type: "long"}, + }, + } + applied := MergeOverlay(h, o) + assert.Equal(t, 2, applied) + assert.Equal(t, "struct _IO_FILE", h.Typedefs["FILE"].Type) + assert.Equal(t, core.SourceMerged, h.Typedefs["FILE"].Source) + assert.Equal(t, "long", h.Typedefs["fpos_t"].Type) + assert.Equal(t, core.SourceOverlay, h.Typedefs["fpos_t"].Source) +} + +func TestMergeOverlay_ConstantOverride(t *testing.T) { + h := newTestHeader("stdio.h") + h.Constants["EOF"] = &core.CStdlibConstant{Type: "int", Value: "-1", Source: core.SourceHeader} + + o := &Overlay{ + SchemaVersion: "1.0.0", Language: core.LanguageC, + Overrides: []OverlayOverride{ + {Header: "stdio.h", Constant: "EOF", Value: "(-1)"}, + {Header: "stdio.h", Constant: "BUFSIZ", Type: "int", Value: "8192"}, + }, + } + applied := MergeOverlay(h, o) + assert.Equal(t, 2, applied) + assert.Equal(t, "(-1)", h.Constants["EOF"].Value) + assert.Equal(t, core.SourceMerged, h.Constants["EOF"].Source) + assert.Equal(t, "8192", h.Constants["BUFSIZ"].Value) + assert.Equal(t, core.SourceOverlay, h.Constants["BUFSIZ"].Source) +} + +func TestMergeOverlay_SkipPrefixDropsFunctions(t *testing.T) { + h := newTestHeader("stdio.h") + h.Functions["printf"] = &core.CStdlibFunction{FQN: "c::stdio::printf", Source: core.SourceHeader} + 
h.Functions["__builtin_printf"] = &core.CStdlibFunction{FQN: "c::stdio::__builtin_printf", Source: core.SourceHeader} + h.FreeFunctions["__internal_free"] = &core.CStdlibFunction{Source: core.SourceHeader} + h.Typedefs["__attr_t"] = &core.CStdlibTypedef{Type: "int", Source: core.SourceHeader} + h.Constants["__BUFSIZ"] = &core.CStdlibConstant{Source: core.SourceHeader} + + o := &Overlay{ + SchemaVersion: "1.0.0", Language: core.LanguageC, + Skip: []OverlaySkip{{Prefix: "__"}}, + } + applied := MergeOverlay(h, o) + assert.Equal(t, 0, applied) + assert.NotNil(t, h.Functions["printf"]) + assert.Nil(t, h.Functions["__builtin_printf"]) + assert.Nil(t, h.FreeFunctions["__internal_free"]) + assert.Nil(t, h.Typedefs["__attr_t"]) + assert.Nil(t, h.Constants["__BUFSIZ"]) +} + +func TestMergeOverlay_SkipExactDropsExact(t *testing.T) { + h := newTestHeader("stdio.h") + h.Functions["printf"] = &core.CStdlibFunction{Source: core.SourceHeader} + h.Functions["fopen"] = &core.CStdlibFunction{Source: core.SourceHeader} + + o := &Overlay{ + SchemaVersion: "1.0.0", Language: core.LanguageC, + Skip: []OverlaySkip{{Exact: "fopen"}}, + } + MergeOverlay(h, o) + assert.NotNil(t, h.Functions["printf"]) + assert.Nil(t, h.Functions["fopen"]) +} + +func TestMatchesAnySkip(t *testing.T) { + skips := []OverlaySkip{{Prefix: "__"}, {Exact: "deprecated_thing"}} + assert.True(t, matchesAnySkip("__internal", skips)) + assert.True(t, matchesAnySkip("deprecated_thing", skips)) + assert.False(t, matchesAnySkip("public_func", skips)) +} + +func TestApplySkipRules_Empty(t *testing.T) { + h := newTestHeader("stdio.h") + h.Functions["printf"] = &core.CStdlibFunction{Source: core.SourceHeader} + applySkipRules(h, nil) + assert.NotNil(t, h.Functions["printf"]) +} + +func TestParamListFromOverlay(t *testing.T) { + t.Run("empty input", func(t *testing.T) { + assert.Empty(t, paramListFromOverlay(nil)) + }) + + t.Run("default required true", func(t *testing.T) { + got := paramListFromOverlay([]OverlayParam{{Name: 
"x", Type: "int"}}) + require.Len(t, got, 1) + assert.True(t, got[0].Required) + }) + + t.Run("explicit required false", func(t *testing.T) { + f := false + got := paramListFromOverlay([]OverlayParam{{Name: "x", Type: "int", Required: &f}}) + require.Len(t, got, 1) + assert.False(t, got[0].Required) + }) +} + +func TestBuildFunctionFQN(t *testing.T) { + h := &core.CStdlibHeader{ModuleID: "c::stdio"} + assert.Equal(t, "c::stdio::printf", buildFunctionFQN(h, "printf")) + + h = &core.CStdlibHeader{} + assert.Equal(t, "printf", buildFunctionFQN(h, "printf")) +} + +func TestApplyFunctionOverride_NilFunctionsMap(t *testing.T) { + h := &core.CStdlibHeader{Header: "stdio.h", ModuleID: "c::stdio"} + applyFunctionOverride(h, OverlayOverride{Header: "stdio.h", Function: "printf", ReturnType: "int"}) + require.NotNil(t, h.Functions) + assert.NotNil(t, h.Functions["printf"]) +} + +func TestApplyTypedefOverride_NilTypedefsMap(t *testing.T) { + h := &core.CStdlibHeader{Header: "stdio.h"} + applyTypedefOverride(h, OverlayOverride{Header: "stdio.h", Typedef: "FILE", Type: "struct __FILE"}) + require.NotNil(t, h.Typedefs) + assert.NotNil(t, h.Typedefs["FILE"]) +} + +func TestApplyConstantOverride_NilConstantsMap(t *testing.T) { + h := &core.CStdlibHeader{Header: "stdio.h"} + applyConstantOverride(h, OverlayOverride{Header: "stdio.h", Constant: "EOF", Value: "-1"}) + require.NotNil(t, h.Constants) + assert.NotNil(t, h.Constants["EOF"]) +} + +func TestApplyMethodOverride_NilClassesMap(t *testing.T) { + h := &core.CStdlibHeader{Header: "vector"} + applyMethodOverride(h, OverlayOverride{ + Header: "vector", Class: "std::vector", Method: "push_back", ReturnType: "void", + }) + require.NotNil(t, h.Classes) + assert.NotNil(t, h.Classes["std::vector"]) +} + +func TestOverrideKind_AllKinds(t *testing.T) { + tests := []struct { + name string + ov OverlayOverride + want overrideEntryKind + }{ + {"function", OverlayOverride{Function: "printf"}, overrideFunction}, + {"method", 
OverlayOverride{Class: "C", Method: "f"}, overrideMethod}, + {"typedef", OverlayOverride{Typedef: "T"}, overrideTypedef}, + {"constant", OverlayOverride{Constant: "K"}, overrideConstant}, + } + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + got, err := overrideKind(tt.ov) + require.NoError(t, err) + assert.Equal(t, tt.want, got) + }) + } +} + +func TestApplyFunctionOverride_PreservesParamsWhenOverlayHasNone(t *testing.T) { + h := &core.CStdlibHeader{Header: "stdio.h"} + h.Functions = map[string]*core.CStdlibFunction{ + "printf": { + FQN: "c::stdio::printf", + Params: []*core.CStdlibParam{{Name: "fmt", Type: "const char*"}}, + Source: core.SourceHeader, + }, + } + applyFunctionOverride(h, OverlayOverride{Header: "stdio.h", Function: "printf", SecurityTag: "x"}) + got := h.Functions["printf"] + assert.Len(t, got.Params, 1, "extracted params preserved when overlay supplies none") + assert.Equal(t, "x", got.SecurityTag) +} + +func TestApplyMethodOverride_PreservesParamsWhenOverlayHasNone(t *testing.T) { + h := core.NewCStdlibHeader() + h.Header = "vector" + cls := core.NewCppStdlibClass("std::vector") + cls.Methods["at"] = &core.CStdlibFunction{ + FQN: "std::vector::at", + Params: []*core.CStdlibParam{{Name: "pos", Type: "size_t"}}, + Source: core.SourceHeader, + } + h.Classes["std::vector"] = cls + + applyMethodOverride(h, OverlayOverride{ + Header: "vector", Class: "std::vector", Method: "at", + Throws: "std::out_of_range", + }) + got := h.Classes["std::vector"].Methods["at"] + assert.Len(t, got.Params, 1) + assert.Equal(t, "std::out_of_range", got.Throws) +} diff --git a/sast-engine/tools/internal/clikeextract/testdata/c/inline.h b/sast-engine/tools/internal/clikeextract/testdata/c/inline.h new file mode 100644 index 00000000..11fd7d7c --- /dev/null +++ b/sast-engine/tools/internal/clikeextract/testdata/c/inline.h @@ -0,0 +1,34 @@ +/* Synthetic header exercising inline function definitions, extern "C" blocks, + * preproc_ifdef/preproc_else 
branches, struct field declarators, and + * pointer-returning typedefs. Used to exercise the full set of walkCNode + * dispatch branches in tests. + */ +#ifndef _INLINE_H +#define _INLINE_H + +/* Inline function definition (function_definition node, not declaration). */ +static inline int abs_diff(int a, int b) { + return a > b ? a - b : b - a; +} + +/* Function returning a pointer-to-pointer. */ +char** make_argv(int argc); + +#ifdef __cplusplus +extern "C" { +int legacy_c_only(int x); +} +#endif + +/* preproc_else branch: only one branch is taken at compile time, but the + * AST still includes both. Walker should descend into both arms. */ +#ifdef HAVE_FOO +int foo_a(void); +#else +int foo_b(void); +#endif + +/* Pointer typedef. */ +typedef int* int_ptr; + +#endif diff --git a/sast-engine/tools/internal/clikeextract/testdata/c/stdio.h b/sast-engine/tools/internal/clikeextract/testdata/c/stdio.h new file mode 100644 index 00000000..b0ce3b83 --- /dev/null +++ b/sast-engine/tools/internal/clikeextract/testdata/c/stdio.h @@ -0,0 +1,34 @@ +/* Synthetic stdio.h fixture for clikeextract tests. + * Mirrors the canonical glibc surface area without the macro magic. + */ +#ifndef _STDIO_H +#define _STDIO_H + +typedef struct __FILE FILE; +typedef long fpos_t; + +#define EOF (-1) +#define BUFSIZ 8192 +#define FILENAME_MAX 4096 +#define DEFAULT_PATH "/tmp" + +extern FILE* stdin; +extern FILE* stdout; +extern FILE* stderr; + +int printf(const char* format, ...); +int fprintf(FILE* stream, const char* format, ...); +int sprintf(char* str, const char* format, ...); +int fclose(FILE* stream); +FILE* fopen(const char* pathname, const char* mode); +size_t fread(void* ptr, size_t size, size_t nmemb, FILE* stream); +size_t fwrite(const void* ptr, size_t size, size_t nmemb, FILE* stream); +int fseek(FILE* stream, long offset, int whence); +long ftell(FILE* stream); +void rewind(FILE* stream); + +/* Private symbols — should be filtered out by IsPrivateSymbol. 
*/ +int _internal_buffer_flush(FILE*); +int __builtin_printf_check(const char*, ...); + +#endif diff --git a/sast-engine/tools/internal/clikeextract/testdata/c/string.h b/sast-engine/tools/internal/clikeextract/testdata/c/string.h new file mode 100644 index 00000000..12de2bc5 --- /dev/null +++ b/sast-engine/tools/internal/clikeextract/testdata/c/string.h @@ -0,0 +1,23 @@ +/* Synthetic string.h fixture for clikeextract tests. */ +#ifndef _STRING_H +#define _STRING_H + +typedef unsigned long size_t; + +#define NULL ((void*)0) + +void* memcpy(void* dest, const void* src, size_t n); +void* memmove(void* dest, const void* src, size_t n); +int memcmp(const void* s1, const void* s2, size_t n); +void* memset(void* s, int c, size_t n); +char* strcpy(char* dest, const char* src); +char* strncpy(char* dest, const char* src, size_t n); +char* strcat(char* dest, const char* src); +char* strncat(char* dest, const char* src, size_t n); +int strcmp(const char* s1, const char* s2); +int strncmp(const char* s1, const char* s2, size_t n); +size_t strlen(const char* s); +char* strchr(const char* s, int c); +char* strrchr(const char* s, int c); + +#endif diff --git a/sast-engine/tools/internal/clikeextract/testdata/c/unistd.h b/sast-engine/tools/internal/clikeextract/testdata/c/unistd.h new file mode 100644 index 00000000..3398d07a --- /dev/null +++ b/sast-engine/tools/internal/clikeextract/testdata/c/unistd.h @@ -0,0 +1,19 @@ +/* Synthetic unistd.h fixture: a few POSIX functions and a #define. 
*/ +#ifndef _UNISTD_H +#define _UNISTD_H + +typedef long ssize_t; +typedef int pid_t; + +#define STDIN_FILENO 0 +#define STDOUT_FILENO 1 +#define STDERR_FILENO 2 + +ssize_t read(int fd, void* buf, size_t count); +ssize_t write(int fd, const void* buf, size_t count); +int close(int fd); +int open(const char* pathname, int flags, int mode); +pid_t fork(void); +int execvp(const char* file, char* const argv[]); + +#endif diff --git a/sast-engine/tools/internal/clikeextract/testdata/cpp/string b/sast-engine/tools/internal/clikeextract/testdata/cpp/string new file mode 100644 index 00000000..e6c3d66c --- /dev/null +++ b/sast-engine/tools/internal/clikeextract/testdata/cpp/string @@ -0,0 +1,28 @@ +// Synthetic fixture. + +#ifndef _GLIBCXX_STRING +#define _GLIBCXX_STRING + +namespace std { + + typedef unsigned long size_t; + + template + class basic_string { + public: + basic_string(); + basic_string(const CharT* s); + + const CharT* c_str() const; + const CharT* data() const; + size_t size() const; + size_t length() const; + bool empty() const; + basic_string& operator+=(const basic_string& rhs); + }; + + typedef basic_string string; + +} + +#endif diff --git a/sast-engine/tools/internal/clikeextract/testdata/cpp/utility b/sast-engine/tools/internal/clikeextract/testdata/cpp/utility new file mode 100644 index 00000000..99af1dea --- /dev/null +++ b/sast-engine/tools/internal/clikeextract/testdata/cpp/utility @@ -0,0 +1,30 @@ +// Synthetic fixture: free functions and a non-template helper. + +#ifndef _GLIBCXX_UTILITY +#define _GLIBCXX_UTILITY + +namespace std { + + template + T&& move(T& x); + + template + T&& forward(T& x); + + // Skipped — leading underscore + lowercase = library-private. + void _internal_helper(); + + // Skipped — double underscore = compiler-implementation. + void __detail_helper(); + +} + +// File-scope free function (no namespace). Should land in h.Functions. +int file_scope_func(int x); + +// Compiler-internal namespace — entire body skipped. 
+namespace __detail { + void should_not_appear(); +} + +#endif diff --git a/sast-engine/tools/internal/clikeextract/testdata/cpp/vector b/sast-engine/tools/internal/clikeextract/testdata/cpp/vector new file mode 100644 index 00000000..6a47bb36 --- /dev/null +++ b/sast-engine/tools/internal/clikeextract/testdata/cpp/vector @@ -0,0 +1,34 @@ +// Synthetic fixture for clikeextract C++ tests. +// Mirrors libstdc++'s public surface without the macro magic. + +#ifndef _GLIBCXX_VECTOR +#define _GLIBCXX_VECTOR + +namespace std { + + template + class vector { + public: + vector(); + vector(size_t n); + vector(const vector& other); + + void push_back(const T& value); + T& at(size_t pos); + T& operator[](size_t pos); + size_t size() const; + bool empty() const; + T* data(); + void clear(); + + private: + T* data_; + size_t size_; + }; + + template + void swap(vector& a, vector& b); + +} // namespace std + +#endif diff --git a/sast-engine/tools/internal/clikeextract/walker.go b/sast-engine/tools/internal/clikeextract/walker.go new file mode 100644 index 00000000..6e144a36 --- /dev/null +++ b/sast-engine/tools/internal/clikeextract/walker.go @@ -0,0 +1,301 @@ +package clikeextract + +import ( + "errors" + "fmt" + "io/fs" + "os" + "path/filepath" + "runtime" + "slices" + "sort" + "strings" + + "github.com/shivasurya/code-pathfinder/sast-engine/graph/callgraph/core" +) + +// HeaderSource describes one (platform, language) header tree to walk. Each +// search dir is rooted at a stdlib or POSIX install location; one extractor +// run typically yields 1–3 sources (e.g. for linux/cpp: the libstdc++ tree). +type HeaderSource struct { + // Platform / Language echo Config.Target / Config.Language for stamping + // emitted JSON files. Stored on the source rather than re-derived later + // because cross-platform combinations may want platform-specific tags + // (e.g. mingw vs MSVC headers both target windows). 
+ Platform string + Language string + + // SearchDirs is the absolute path(s) to walk. The walker enumerates the + // transitive contents and yields HeaderFile entries. Order is preserved: + // earlier dirs win on duplicate header names (rare, but happens with + // /usr/include vs /usr/local/include layouts). + SearchDirs []string + + // HeaderExts is the set of recognised header extensions. C uses + // `.h` only; C++ also accepts the extensionless form () and + // `.hpp`/`.hxx`. The empty string in this slice opts in to extensionless + // files — used for the C++ STL where files like /usr/include/c++/13/vector + // have no suffix. + HeaderExts []string + + // SystemTag identifies the source library + version, e.g. "glibc-2.39", + // "libstdc++-13", "mingw-w64-libstdc++-13". Stamped into manifest.SystemTag + // for downstream debugging. + SystemTag string +} + +// HeaderFile is one discovered header on disk: the header name as you'd #include +// it, plus the absolute path to read. +type HeaderFile struct { + // Name is the canonical #include form (e.g. "stdio.h", "vector", + // "sys/socket.h"). Computed by stripping the search-dir prefix from Path. + Name string + + // Path is the absolute filesystem path. Used to read the source bytes. + Path string +} + +// DiscoverHeaderSources returns the list of HeaderSource entries to walk for +// the given (target, language). PR-01 supports linux/c and linux/cpp; windows +// and darwin paths are scaffolded with a clear "not yet implemented" error so +// PR-03 can fill them in without re-shaping the surface. +// +// Detection of glibc/libstdc++ versions is best-effort: the function probes +// the filesystem for canonical install paths and falls back to a generic +// "unknown" tag on failure rather than refusing to run. This keeps the +// generator usable in containerised CI environments where the version-detection +// commands might not be available. 
+func DiscoverHeaderSources(target, language string) ([]HeaderSource, error) { + switch target + "/" + language { + case core.PlatformLinux + "/" + core.LanguageC: + return []HeaderSource{linuxCSource()}, nil + + case core.PlatformLinux + "/" + core.LanguageCpp: + src, err := linuxCppSource() + if err != nil { + return nil, err + } + return []HeaderSource{src}, nil + + case core.PlatformWindows + "/" + core.LanguageC, + core.PlatformWindows + "/" + core.LanguageCpp, + core.PlatformDarwin + "/" + core.LanguageC, + core.PlatformDarwin + "/" + core.LanguageCpp: + return nil, fmt.Errorf("DiscoverHeaderSources: target %q language %q is scheduled for PR-03; "+ + "PR-01 only ships %s/%s and %s/%s", target, language, + core.PlatformLinux, core.LanguageC, core.PlatformLinux, core.LanguageCpp) + + default: + return nil, fmt.Errorf("DiscoverHeaderSources: unknown target+language combination %q+%q", target, language) + } +} + +// linuxLibcRoots is the ordered list of canonical glibc lib directories probed +// by detectGlibcTag. Exposed as a package var (rather than a literal inside the +// function) so tests can override the search list to exercise both the hit and +// miss branches without depending on the host's filesystem layout. +var linuxLibcRoots = []string{ + "/lib/x86_64-linux-gnu", + "/lib/aarch64-linux-gnu", + "/lib64", + "/usr/lib/x86_64-linux-gnu", + "/usr/lib/aarch64-linux-gnu", + "/usr/lib64", +} + +// linuxCppRoot is the libstdc++ include directory probed by findLibstdcppRoot. +// Test override knob, same rationale as linuxLibcRoots. +var linuxCppRoot = "/usr/include/c++" + +// linuxCSource returns the C header source on Linux. Always succeeds because +// the search dir (/usr/include) is universally present on glibc systems and +// the walker tolerates absent dirs at WalkHeaders time anyway. 
+func linuxCSource() HeaderSource { + return HeaderSource{ + Platform: core.PlatformLinux, + Language: core.LanguageC, + SearchDirs: []string{"/usr/include"}, + HeaderExts: []string{".h"}, + SystemTag: detectGlibcTag(), + } +} + +// linuxCppSource returns the C++ header source on Linux. Probes for the libstdc++ +// install via linuxCppRoot/; returns an explicit error if no version is +// found because, unlike the C path, the C++ tree is unrecoverable without one +// (the directory name itself encodes the version). +func linuxCppSource() (HeaderSource, error) { + dir, version := findLibstdcppRoot() + if dir == "" { + return HeaderSource{}, errors.New("linuxCppSource: no libstdc++ headers found under " + linuxCppRoot + "/* " + + "(install libstdc++-dev / libstdc++-13-dev or run inside a container with C++ stdlib headers)") + } + return HeaderSource{ + Platform: core.PlatformLinux, + Language: core.LanguageCpp, + SearchDirs: []string{dir}, + HeaderExts: []string{".h", ".hpp", ".hxx", ""}, + SystemTag: "libstdc++-" + version, + }, nil +} + +// detectGlibcTag returns "glibc-" of the first existing dir from +// linuxLibcRoots, falling back to "glibc-unknown" so the manifest always has +// *something* identifiable. Generated registries don't have to match +// `ldd --version` exactly — the tag is for spotting registry-vs-runtime +// mismatches, not for downstream feature detection. +func detectGlibcTag() string { + for _, c := range linuxLibcRoots { + if dirExists(c) { + return "glibc-" + filepath.Base(c) + } + } + return "glibc-unknown" +} + +// findLibstdcppRoot inspects linuxCppRoot/ and returns the lexically- +// largest version directory (which, for purely-numeric names, is the freshest +// install) plus its version string. Returns ("","") if no install is found. 
+func findLibstdcppRoot() (dir, version string) { + entries, err := os.ReadDir(linuxCppRoot) + if err != nil { + return "", "" + } + + var versions []string + for _, e := range entries { + if !e.IsDir() { + continue + } + // Accept any directory name that contains at least one digit — covers + // the canonical `13`, `13.2.0`, and similar; skips bare `experimental`. + name := e.Name() + if !containsDigit(name) { + continue + } + versions = append(versions, name) + } + if len(versions) == 0 { + return "", "" + } + + sort.Strings(versions) + v := versions[len(versions)-1] + return filepath.Join(linuxCppRoot, v), v +} + +func containsDigit(s string) bool { + for _, r := range s { + if r >= '0' && r <= '9' { + return true + } + } + return false +} + +func dirExists(p string) bool { + info, err := os.Stat(p) + return err == nil && info.IsDir() +} + +// WalkHeaders enumerates header files under src.SearchDirs that match the +// configured extensions. Internal/private subdirectories (`bits/`, `internal/`, +// `__support/`, etc.) are skipped because their contents are compiler-internal +// and the public namespace is sufficient. +// +// The result is deterministic — entries are sorted by header Name — so manifest +// output is stable across runs (independent of filesystem walk order). 
+func (s HeaderSource) WalkHeaders() ([]HeaderFile, error) { + if len(s.SearchDirs) == 0 { + return nil, errors.New("WalkHeaders: HeaderSource has no SearchDirs") + } + + var found []HeaderFile + seen := make(map[string]struct{}) // dedupe by canonical name + + for _, dir := range s.SearchDirs { + if !dirExists(dir) { + return nil, fmt.Errorf("WalkHeaders: search dir %q does not exist (install missing system headers)", dir) + } + + walkErr := filepath.WalkDir(dir, func(path string, d fs.DirEntry, err error) error { + if err != nil { + return err + } + if d.IsDir() { + if shouldSkipDir(d.Name()) { + return fs.SkipDir + } + return nil + } + if !s.matchesExt(d.Name()) { + return nil + } + rel, relErr := filepath.Rel(dir, path) + if relErr != nil { + return fmt.Errorf("relative path of %q: %w", path, relErr) + } + name := filepath.ToSlash(rel) + if _, dup := seen[name]; dup { + return nil + } + seen[name] = struct{}{} + found = append(found, HeaderFile{Name: name, Path: path}) + return nil + }) + if walkErr != nil { + return nil, fmt.Errorf("walking %q: %w", dir, walkErr) + } + } + + sort.Slice(found, func(i, j int) bool { + return found[i].Name < found[j].Name + }) + return found, nil +} + +// shouldSkipDir reports whether the directory at the given name should be pruned +// from the walk. The pruned set is intentionally small and conservative: only +// directories that the C/C++ standards explicitly designate as implementation +// detail. +func shouldSkipDir(name string) bool { + switch name { + case "bits", "internal", "__support", "ext", "experimental", "tr1", "tr2", + "parallel", "pstl", "debug", "profile", "decimal": + return true + } + return false +} + +// matchesExt reports whether filename is one of this source's recognised header +// extensions. The empty extension (extensionless) is also matched: tree-sitter +// can parse `` from libstdc++ even though it has no suffix. 
+func (s HeaderSource) matchesExt(filename string) bool { + ext := filepath.Ext(filename) + return slices.Contains(s.HeaderExts, ext) +} + +// IsHostLinux is a runtime helper used by tests that need to short-circuit +// when the host isn't running glibc (e.g. CI macOS runners, Windows). Kept here +// rather than in a test helper so the generator binary itself can use it for +// graceful warnings if a developer runs it on a non-Linux machine before PR-03 +// adds darwin/windows support. +func IsHostLinux() bool { + return runtime.GOOS == core.PlatformLinux +} + +// HeaderName converts an absolute path under one of src's search dirs back into +// its #include form — the inverse of WalkHeaders' Name → Path mapping. Useful +// when a caller already has a path and wants the registry-key form. +// +// Returns "" if path is not under any of the source's search dirs. +func (s HeaderSource) HeaderName(path string) string { + for _, dir := range s.SearchDirs { + rel, err := filepath.Rel(dir, path) + if err != nil || strings.HasPrefix(rel, "..") { + continue + } + return filepath.ToSlash(rel) + } + return "" +} diff --git a/sast-engine/tools/internal/clikeextract/walker_test.go b/sast-engine/tools/internal/clikeextract/walker_test.go new file mode 100644 index 00000000..21dfdd60 --- /dev/null +++ b/sast-engine/tools/internal/clikeextract/walker_test.go @@ -0,0 +1,318 @@ +package clikeextract + +import ( + "os" + "path/filepath" + "sort" + "testing" + + "github.com/shivasurya/code-pathfinder/sast-engine/graph/callgraph/core" + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" +) + +func TestDiscoverHeaderSources_LinuxC(t *testing.T) { + got, err := DiscoverHeaderSources(core.PlatformLinux, core.LanguageC) + require.NoError(t, err) + require.Len(t, got, 1) + assert.Equal(t, core.PlatformLinux, got[0].Platform) + assert.Equal(t, core.LanguageC, got[0].Language) + assert.Equal(t, []string{".h"}, got[0].HeaderExts) + assert.Contains(t, got[0].SystemTag, 
"glibc-") +} + +func TestDiscoverHeaderSources_LinuxCpp_Found(t *testing.T) { + root := t.TempDir() + require.NoError(t, os.MkdirAll(filepath.Join(root, "13"), 0o755)) + withCppRoot(t, root) + + got, err := DiscoverHeaderSources(core.PlatformLinux, core.LanguageCpp) + require.NoError(t, err) + require.Len(t, got, 1) + assert.Equal(t, core.LanguageCpp, got[0].Language) + assert.Contains(t, got[0].HeaderExts, "") +} + +func TestDiscoverHeaderSources_LinuxCpp_NotInstalled(t *testing.T) { + withCppRoot(t, filepath.Join(t.TempDir(), "no-libstdcpp")) + got, err := DiscoverHeaderSources(core.PlatformLinux, core.LanguageCpp) + require.Error(t, err) + assert.Nil(t, got) + assert.Contains(t, err.Error(), "libstdc++") +} + +func TestDiscoverHeaderSources_NotImplementedTargets(t *testing.T) { + deferred := []struct { + platform, language string + }{ + {core.PlatformWindows, core.LanguageC}, + {core.PlatformWindows, core.LanguageCpp}, + {core.PlatformDarwin, core.LanguageC}, + {core.PlatformDarwin, core.LanguageCpp}, + } + for _, tt := range deferred { + _, err := DiscoverHeaderSources(tt.platform, tt.language) + require.Error(t, err) + assert.Contains(t, err.Error(), "PR-03") + } +} + +func TestDiscoverHeaderSources_UnknownCombination(t *testing.T) { + _, err := DiscoverHeaderSources("freebsd", "rust") + require.Error(t, err) + assert.Contains(t, err.Error(), "unknown target+language") +} + +func TestWalkHeaders_FlatDir(t *testing.T) { + dir := t.TempDir() + mustWriteFile(t, filepath.Join(dir, "stdio.h"), "// stdio") + mustWriteFile(t, filepath.Join(dir, "string.h"), "// string") + mustWriteFile(t, filepath.Join(dir, "ignore.txt"), "should be filtered out") + + src := HeaderSource{SearchDirs: []string{dir}, HeaderExts: []string{".h"}} + got, err := src.WalkHeaders() + require.NoError(t, err) + require.Len(t, got, 2) + + names := []string{got[0].Name, got[1].Name} + sort.Strings(names) + assert.Equal(t, []string{"stdio.h", "string.h"}, names) +} + +func 
TestWalkHeaders_SubdirNamePreserved(t *testing.T) { + dir := t.TempDir() + require.NoError(t, os.MkdirAll(filepath.Join(dir, "sys"), 0o755)) + mustWriteFile(t, filepath.Join(dir, "sys", "socket.h"), "// socket") + + src := HeaderSource{SearchDirs: []string{dir}, HeaderExts: []string{".h"}} + got, err := src.WalkHeaders() + require.NoError(t, err) + require.Len(t, got, 1) + assert.Equal(t, "sys/socket.h", got[0].Name) +} + +func TestWalkHeaders_SkipsPrivateSubdirs(t *testing.T) { + dir := t.TempDir() + require.NoError(t, os.MkdirAll(filepath.Join(dir, "bits"), 0o755)) + require.NoError(t, os.MkdirAll(filepath.Join(dir, "internal"), 0o755)) + mustWriteFile(t, filepath.Join(dir, "stdio.h"), "// public") + mustWriteFile(t, filepath.Join(dir, "bits", "types.h"), "// private") + mustWriteFile(t, filepath.Join(dir, "internal", "x.h"), "// private") + + src := HeaderSource{SearchDirs: []string{dir}, HeaderExts: []string{".h"}} + got, err := src.WalkHeaders() + require.NoError(t, err) + require.Len(t, got, 1) + assert.Equal(t, "stdio.h", got[0].Name) +} + +func TestWalkHeaders_ExtensionlessCpp(t *testing.T) { + dir := t.TempDir() + mustWriteFile(t, filepath.Join(dir, "vector"), "// vector") + mustWriteFile(t, filepath.Join(dir, "string"), "// string") + mustWriteFile(t, filepath.Join(dir, "memory.hpp"), "// memory") + mustWriteFile(t, filepath.Join(dir, "ignore.txt"), "// ignore") + + src := HeaderSource{SearchDirs: []string{dir}, HeaderExts: []string{".hpp", ""}} + got, err := src.WalkHeaders() + require.NoError(t, err) + require.Len(t, got, 3) +} + +func TestWalkHeaders_NoSearchDirs(t *testing.T) { + src := HeaderSource{HeaderExts: []string{".h"}} + _, err := src.WalkHeaders() + require.Error(t, err) + assert.Contains(t, err.Error(), "no SearchDirs") +} + +func TestWalkHeaders_MissingSearchDir(t *testing.T) { + src := HeaderSource{ + SearchDirs: []string{"/nonexistent-dir-c8a2-pr01-test"}, + HeaderExts: []string{".h"}, + } + _, err := src.WalkHeaders() + require.Error(t, 
err) + assert.Contains(t, err.Error(), "does not exist") +} + +func TestWalkHeaders_DedupAcrossSearchDirs(t *testing.T) { + dirA := t.TempDir() + dirB := t.TempDir() + mustWriteFile(t, filepath.Join(dirA, "stdio.h"), "// A") + mustWriteFile(t, filepath.Join(dirB, "stdio.h"), "// B (overrides — but dedup keeps A)") + + src := HeaderSource{SearchDirs: []string{dirA, dirB}, HeaderExts: []string{".h"}} + got, err := src.WalkHeaders() + require.NoError(t, err) + require.Len(t, got, 1) + assert.Equal(t, filepath.Join(dirA, "stdio.h"), got[0].Path, + "first SearchDir should win on duplicate header names") +} + +func TestWalkHeaders_PermissionDeniedSubdir(t *testing.T) { + if os.Geteuid() == 0 { + t.Skip("running as root: chmod 0 does not block reads") + } + dir := t.TempDir() + bad := filepath.Join(dir, "locked") + require.NoError(t, os.MkdirAll(bad, 0o755)) + mustWriteFile(t, filepath.Join(bad, "leaf.h"), "//") + require.NoError(t, os.Chmod(bad, 0o000)) + t.Cleanup(func() { _ = os.Chmod(bad, 0o755) }) + + src := HeaderSource{SearchDirs: []string{dir}, HeaderExts: []string{".h"}} + _, err := src.WalkHeaders() + require.Error(t, err) +} + +func TestWalkHeaders_DeterministicOrder(t *testing.T) { + dir := t.TempDir() + for _, name := range []string{"zebra.h", "alpha.h", "mango.h"} { + mustWriteFile(t, filepath.Join(dir, name), "//") + } + src := HeaderSource{SearchDirs: []string{dir}, HeaderExts: []string{".h"}} + + for range 3 { + got, err := src.WalkHeaders() + require.NoError(t, err) + require.Len(t, got, 3) + assert.Equal(t, "alpha.h", got[0].Name) + assert.Equal(t, "mango.h", got[1].Name) + assert.Equal(t, "zebra.h", got[2].Name) + } +} + +func TestHeaderName(t *testing.T) { + dir := t.TempDir() + src := HeaderSource{SearchDirs: []string{dir}, HeaderExts: []string{".h"}} + + abs := filepath.Join(dir, "sys", "socket.h") + require.NoError(t, os.MkdirAll(filepath.Dir(abs), 0o755)) + mustWriteFile(t, abs, "//") + + assert.Equal(t, "sys/socket.h", src.HeaderName(abs)) + 
assert.Equal(t, "", src.HeaderName("/etc/passwd"), "outside-tree path → empty") +} + +func TestShouldSkipDir(t *testing.T) { + skips := []string{"bits", "internal", "__support", "ext", "experimental", "tr1", "tr2", "parallel", "pstl", "debug", "profile", "decimal"} + for _, s := range skips { + assert.True(t, shouldSkipDir(s), s) + } + keeps := []string{"sys", "linux", "asm", "net"} + for _, s := range keeps { + assert.False(t, shouldSkipDir(s), s) + } +} + +func TestIsHostLinux(t *testing.T) { + got := IsHostLinux() + // We don't know the host, but the function must not panic and must return a bool. + assert.IsType(t, true, got) +} + +func TestContainsDigit(t *testing.T) { + assert.True(t, containsDigit("13")) + assert.True(t, containsDigit("13.2.0")) + assert.True(t, containsDigit("c++23")) + assert.False(t, containsDigit("experimental")) + assert.False(t, containsDigit("")) +} + +func TestDirExists(t *testing.T) { + d := t.TempDir() + assert.True(t, dirExists(d)) + assert.False(t, dirExists(filepath.Join(d, "nope"))) + + // File, not dir + f := filepath.Join(d, "file.txt") + mustWriteFile(t, f, "x") + assert.False(t, dirExists(f)) +} + +func TestDetectGlibcTag_HitFirstCandidate(t *testing.T) { + probe := t.TempDir() + libDir := filepath.Join(probe, "x86_64-linux-gnu") + require.NoError(t, os.MkdirAll(libDir, 0o755)) + + withRoots(t, []string{libDir, "/this/does/not/exist"}) + assert.Equal(t, "glibc-x86_64-linux-gnu", detectGlibcTag()) +} + +func TestDetectGlibcTag_FallbackUnknown(t *testing.T) { + withRoots(t, []string{"/no-libc-1", "/no-libc-2"}) + assert.Equal(t, "glibc-unknown", detectGlibcTag()) +} + +func TestFindLibstdcppRoot_PicksLargestVersion(t *testing.T) { + root := t.TempDir() + for _, v := range []string{"11", "13", "12.2", "experimental"} { + require.NoError(t, os.MkdirAll(filepath.Join(root, v), 0o755)) + } + withCppRoot(t, root) + + dir, version := findLibstdcppRoot() + assert.Equal(t, "13", version) + assert.Equal(t, filepath.Join(root, "13"), 
dir) +} + +func TestFindLibstdcppRoot_NoVersions(t *testing.T) { + root := t.TempDir() + // Only non-versioned entries, plus a stray file. + require.NoError(t, os.MkdirAll(filepath.Join(root, "experimental"), 0o755)) + mustWriteFile(t, filepath.Join(root, "stray-file"), "") + withCppRoot(t, root) + + dir, version := findLibstdcppRoot() + assert.Equal(t, "", dir) + assert.Equal(t, "", version) +} + +func TestFindLibstdcppRoot_RootMissing(t *testing.T) { + withCppRoot(t, filepath.Join(t.TempDir(), "missing-cpp-root")) + dir, version := findLibstdcppRoot() + assert.Equal(t, "", dir) + assert.Equal(t, "", version) +} + +func TestLinuxCppSource_Found(t *testing.T) { + root := t.TempDir() + require.NoError(t, os.MkdirAll(filepath.Join(root, "13"), 0o755)) + withCppRoot(t, root) + + src, err := linuxCppSource() + require.NoError(t, err) + assert.Equal(t, core.LanguageCpp, src.Language) + assert.Equal(t, "libstdc++-13", src.SystemTag) + assert.Contains(t, src.HeaderExts, "") +} + +func TestLinuxCppSource_NotFound(t *testing.T) { + withCppRoot(t, filepath.Join(t.TempDir(), "no-cpp")) + _, err := linuxCppSource() + require.Error(t, err) + assert.Contains(t, err.Error(), "no libstdc++") +} + +// withRoots overrides linuxLibcRoots for the duration of the test and restores +// it via t.Cleanup so other tests see the original list. +func withRoots(t *testing.T, roots []string) { + t.Helper() + original := linuxLibcRoots + linuxLibcRoots = roots + t.Cleanup(func() { linuxLibcRoots = original }) +} + +func withCppRoot(t *testing.T, root string) { + t.Helper() + original := linuxCppRoot + linuxCppRoot = root + t.Cleanup(func() { linuxCppRoot = original }) +} + +// mustWriteFile is a test helper that writes a small file and fails the test on error. 
+func mustWriteFile(t *testing.T, path, content string) { + t.Helper() + require.NoError(t, os.WriteFile(path, []byte(content), 0o644)) +} From 682fa021521d359167f4ead10082d6d93cdd27ed Mon Sep 17 00:00:00 2001 From: shivasurya Date: Sun, 3 May 2026 17:46:23 -0400 Subject: [PATCH 3/3] feat(tools): C/C++ stdlib overlays + generator entry-point MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Adds the user-facing generator binary plus the two hand-curated YAML overlay files that augment tree-sitter extraction with security tags, template return types, and skip rules: - tools/generate_clike_stdlib_registry.go — //go:build cpf_generate_stdlib_registry entry-point binary. Mirrors the layout of generate_go_stdlib_registry.go: flag-parse target / language / output-dir / overlay / base-url, hand off to clikeextract.NewExtractor(cfg).Run(). - tools/c_stdlib_overlay.yaml — 28 hand-curated entries covering format-string sinks (printf family with __attribute__((format))), command-injection sinks (system, popen, exec*), buffer-overflow sinks (strcpy, gets, sprintf), allocation sources (malloc, calloc), tainted sources (getenv, read), plus skip rules for compiler-internal symbols. - tools/cpp_stdlib_overlay.yaml — 55 entries covering STL methods whose template return types tree-sitter cannot substitute: vector at / operator[] / data, basic_string c_str / data, unique_ptr/shared_ptr get/reset/operator*, optional value/value_or, map/unordered_map find/ insert/operator[]/at, std::move / std::forward, ostream/istream stream operators. Throws annotations on at() (std::out_of_range), value() (std::bad_optional_access). End-to-end smoke against this host's /usr/include + /usr/include/c++/13 produced 1875 C headers (8467 functions) and 121 C++ headers (497 classes, 564 functions) — both manifests parsed and statistics check out. 
Run with: go run -tags cpf_generate_stdlib_registry tools/generate_clike_stdlib_registry.go \ --target=linux --language=c --output-dir=/tmp/cpf-c go run -tags cpf_generate_stdlib_registry tools/generate_clike_stdlib_registry.go \ --target=linux --language=cpp --output-dir=/tmp/cpf-cpp Output is local-only in this PR; remote deployment + CDN URL come in PR-03. Loader + engine integration come in PR-02. Co-Authored-By: Claude Opus 4.7 (1M context) --- sast-engine/tools/c_stdlib_overlay.yaml | 260 +++++++++++ sast-engine/tools/cpp_stdlib_overlay.yaml | 404 ++++++++++++++++++ .../tools/generate_clike_stdlib_registry.go | 94 ++++ 3 files changed, 758 insertions(+) create mode 100644 sast-engine/tools/c_stdlib_overlay.yaml create mode 100644 sast-engine/tools/cpp_stdlib_overlay.yaml create mode 100644 sast-engine/tools/generate_clike_stdlib_registry.go diff --git a/sast-engine/tools/c_stdlib_overlay.yaml b/sast-engine/tools/c_stdlib_overlay.yaml new file mode 100644 index 00000000..db94bc3f --- /dev/null +++ b/sast-engine/tools/c_stdlib_overlay.yaml @@ -0,0 +1,260 @@ +# C stdlib overlay — hand-curated augmentations to tree-sitter extraction. +# +# This file complements the generator's automatic header parse with entries +# tree-sitter cannot capture or that need richer metadata for downstream +# security analysis: __attribute__((format)) annotations stripped during +# parsing, security-tag classifications (sinks for command-injection, +# format-string, buffer-overflow), and skip rules for compiler-internal +# symbols. +# +# Schema: see tools/internal/clikeextract/overlay.go. +# Validation: language must equal the generator's --language flag; each +# override must specify exactly one of {function, typedef, constant}. + +schema_version: "1.0.0" +language: "c" + +overrides: + # ========================================================================= + # — format-string sinks and core file-IO surface. 
+ # ========================================================================= + - header: stdio.h + function: printf + return_type: int + confidence: 1.0 + security_tag: format_string_sink + attribute: format(printf, 1, 2) + note: "Variadic format string — CWE-134" + + - header: stdio.h + function: fprintf + return_type: int + confidence: 1.0 + security_tag: format_string_sink + attribute: format(printf, 2, 3) + + - header: stdio.h + function: sprintf + return_type: int + confidence: 1.0 + security_tag: buffer_overflow_sink + note: "Use snprintf instead — CWE-120" + + - header: stdio.h + function: vsprintf + return_type: int + confidence: 1.0 + security_tag: buffer_overflow_sink + + - header: stdio.h + function: gets + return_type: char* + confidence: 1.0 + security_tag: buffer_overflow_sink + note: "Removed in C11; never safe — CWE-242" + + - header: stdio.h + function: scanf + return_type: int + confidence: 1.0 + security_tag: format_string_sink + attribute: format(scanf, 1, 2) + + - header: stdio.h + function: fscanf + return_type: int + confidence: 1.0 + security_tag: format_string_sink + + - header: stdio.h + function: sscanf + return_type: int + confidence: 1.0 + security_tag: format_string_sink + + # ========================================================================= + # — process-control + memory-allocation sinks. 
+ # ========================================================================= + - header: stdlib.h + function: system + return_type: int + params: + - { name: command, type: const char* } + confidence: 1.0 + security_tag: command_injection_sink + note: "Calls /bin/sh -c — CWE-78" + + - header: stdlib.h + function: popen + return_type: FILE* + params: + - { name: command, type: const char* } + - { name: type, type: const char* } + confidence: 1.0 + security_tag: command_injection_sink + + - header: stdlib.h + function: getenv + return_type: char* + params: + - { name: name, type: const char* } + confidence: 1.0 + security_tag: tainted_source + note: "Environment variable — untrusted input source" + + - header: stdlib.h + function: setenv + return_type: int + params: + - { name: name, type: const char* } + - { name: value, type: const char* } + - { name: overwrite, type: int } + confidence: 1.0 + + - header: stdlib.h + function: malloc + return_type: void* + params: + - { name: size, type: size_t } + confidence: 1.0 + security_tag: allocation_source + + - header: stdlib.h + function: calloc + return_type: void* + params: + - { name: nmemb, type: size_t } + - { name: size, type: size_t } + confidence: 1.0 + security_tag: allocation_source + + - header: stdlib.h + function: realloc + return_type: void* + params: + - { name: ptr, type: void* } + - { name: size, type: size_t } + confidence: 1.0 + security_tag: allocation_source + + - header: stdlib.h + function: free + return_type: void + params: + - { name: ptr, type: void* } + confidence: 1.0 + security_tag: deallocation_sink + + # ========================================================================= + # — banned/dangerous string functions. 
+ # ========================================================================= + - header: string.h + function: strcpy + return_type: char* + confidence: 1.0 + security_tag: buffer_overflow_sink + note: "No bound checking — CWE-120" + + - header: string.h + function: strcat + return_type: char* + confidence: 1.0 + security_tag: buffer_overflow_sink + + - header: string.h + function: strncpy + return_type: char* + confidence: 1.0 + note: "Bounded but does not always NUL-terminate" + + - header: string.h + function: strncat + return_type: char* + confidence: 1.0 + + - header: string.h + function: memcpy + return_type: void* + confidence: 1.0 + note: "Undefined behaviour on overlapping regions — use memmove" + + # ========================================================================= + # — exec family + file-descriptor sinks (POSIX). + # ========================================================================= + - header: unistd.h + function: execve + return_type: int + params: + - { name: pathname, type: const char* } + - { name: argv, type: "char* const[]" } + - { name: envp, type: "char* const[]" } + confidence: 1.0 + security_tag: command_injection_sink + + - header: unistd.h + function: execvp + return_type: int + confidence: 1.0 + security_tag: command_injection_sink + + - header: unistd.h + function: execl + return_type: int + confidence: 1.0 + security_tag: command_injection_sink + + - header: unistd.h + function: execlp + return_type: int + confidence: 1.0 + security_tag: command_injection_sink + + - header: unistd.h + function: read + return_type: ssize_t + params: + - { name: fd, type: int } + - { name: buf, type: void* } + - { name: count, type: size_t } + confidence: 1.0 + security_tag: tainted_source + + # ========================================================================= + # — file-open helpers. 
+ # ========================================================================= + - header: fcntl.h + function: open + return_type: int + params: + - { name: pathname, type: const char* } + - { name: flags, type: int } + confidence: 1.0 + + # ========================================================================= + # — variadic-argument macros. + # Tree-sitter sees these as macros and skips them; record their canonical + # form so security rules can reference them. + # ========================================================================= + - header: stdarg.h + typedef: va_list + type: __builtin_va_list + +# Cross-platform aliases. Tracked for use by the loader (PR-02) when a +# project includes a platform-specific spelling — e.g. mingw uses _stdio.h +# in some compatibility shims. +cross_platform_aliases: + - alias: _stdio.h + canonical: stdio.h + - alias: _string.h + canonical: string.h + +# Skip rules. Applied AFTER overrides, so any override on a name that also +# matches a skip is silently dropped (matches the user expectation of +# "skip is the final say"). +skip: + - prefix: __builtin_ + - prefix: __GLIBC_ + - prefix: _GLIBCXX_ + - prefix: _LIBCPP_ + - exact: __asm + - exact: __asm__ diff --git a/sast-engine/tools/cpp_stdlib_overlay.yaml b/sast-engine/tools/cpp_stdlib_overlay.yaml new file mode 100644 index 00000000..8c579b1d --- /dev/null +++ b/sast-engine/tools/cpp_stdlib_overlay.yaml @@ -0,0 +1,404 @@ +# C++ stdlib overlay — hand-curated augmentations to tree-sitter extraction. +# +# Tree-sitter parses C++ headers structurally but cannot substitute template +# parameters into return types (e.g. `std::vector::operator[]` extracts as +# `T&` but the resolver wants `int&` when the receiver is `std::vector`). +# The Phase 2 design accepts this limitation and uses overlay entries to record +# the canonical templated form; runtime substitution happens at resolution +# time (PR-02) via the receiver type. 
+# +# Schema: see tools/internal/clikeextract/overlay.go. + +schema_version: "1.0.0" +language: "cpp" + +overrides: + # ========================================================================= + # — sequence container methods with template return types. + # ========================================================================= + - header: vector + class: std::vector + method: push_back + return_type: void + params: + - { name: value, type: "const T&" } + confidence: 1.0 + + - header: vector + class: std::vector + method: emplace_back + return_type: "T&" + confidence: 1.0 + + - header: vector + class: std::vector + method: pop_back + return_type: void + confidence: 1.0 + + - header: vector + class: std::vector + method: at + return_type: "T&" + params: + - { name: pos, type: size_t } + confidence: 1.0 + throws: std::out_of_range + + - header: vector + class: std::vector + method: "operator[]" + return_type: "T&" + params: + - { name: pos, type: size_t } + confidence: 1.0 + note: "No bounds check — CWE-129 candidate" + + - header: vector + class: std::vector + method: front + return_type: "T&" + confidence: 1.0 + + - header: vector + class: std::vector + method: back + return_type: "T&" + confidence: 1.0 + + - header: vector + class: std::vector + method: data + return_type: "T*" + confidence: 1.0 + + - header: vector + class: std::vector + method: begin + return_type: iterator + confidence: 1.0 + + - header: vector + class: std::vector + method: end + return_type: iterator + confidence: 1.0 + + - header: vector + class: std::vector + method: size + return_type: size_t + confidence: 1.0 + + - header: vector + class: std::vector + method: capacity + return_type: size_t + confidence: 1.0 + + - header: vector + class: std::vector + method: empty + return_type: bool + confidence: 1.0 + + - header: vector + class: std::vector + method: reserve + return_type: void + params: + - { name: n, type: size_t } + confidence: 1.0 + + - header: vector + class: std::vector + 
method: resize + return_type: void + confidence: 1.0 + + - header: vector + class: std::vector + method: clear + return_type: void + confidence: 1.0 + + # ========================================================================= + # — std::basic_string surface. + # ========================================================================= + - header: string + class: std::basic_string + method: c_str + return_type: "const CharT*" + confidence: 1.0 + note: "Critical for use-after-free analysis on string ownership" + + - header: string + class: std::basic_string + method: data + return_type: "const CharT*" + confidence: 1.0 + + - header: string + class: std::basic_string + method: size + return_type: size_t + confidence: 1.0 + + - header: string + class: std::basic_string + method: length + return_type: size_t + confidence: 1.0 + + - header: string + class: std::basic_string + method: empty + return_type: bool + confidence: 1.0 + + - header: string + class: std::basic_string + method: find + return_type: size_t + confidence: 1.0 + + - header: string + class: std::basic_string + method: substr + return_type: "std::basic_string" + confidence: 1.0 + + - header: string + class: std::basic_string + method: append + return_type: "std::basic_string&" + confidence: 1.0 + + - header: string + class: std::basic_string + method: at + return_type: "CharT&" + params: + - { name: pos, type: size_t } + confidence: 1.0 + throws: std::out_of_range + + - header: string + class: std::basic_string + method: "operator[]" + return_type: "CharT&" + params: + - { name: pos, type: size_t } + confidence: 1.0 + + - header: string + class: std::basic_string + method: "operator+=" + return_type: "std::basic_string&" + confidence: 1.0 + + # ========================================================================= + # — smart-pointer accessor methods. 
+ # ========================================================================= + - header: memory + class: std::unique_ptr + method: get + return_type: "T*" + confidence: 1.0 + + - header: memory + class: std::unique_ptr + method: release + return_type: "T*" + confidence: 1.0 + + - header: memory + class: std::unique_ptr + method: reset + return_type: void + confidence: 1.0 + + - header: memory + class: std::unique_ptr + method: "operator*" + return_type: "T&" + confidence: 1.0 + + - header: memory + class: std::unique_ptr + method: "operator->" + return_type: "T*" + confidence: 1.0 + + - header: memory + class: std::shared_ptr + method: get + return_type: "T*" + confidence: 1.0 + + - header: memory + class: std::shared_ptr + method: reset + return_type: void + confidence: 1.0 + + - header: memory + class: std::shared_ptr + method: use_count + return_type: long + confidence: 1.0 + + - header: memory + class: std::shared_ptr + method: "operator*" + return_type: "T&" + confidence: 1.0 + + - header: memory + class: std::shared_ptr + method: "operator->" + return_type: "T*" + confidence: 1.0 + + # ========================================================================= + # — std::move / forward / pair primitives. + # ========================================================================= + - header: utility + function: std::move + return_type: "T&&" + params: + - { name: x, type: "T&" } + confidence: 1.0 + + - header: utility + function: std::forward + return_type: "T&&" + params: + - { name: x, type: "T&" } + confidence: 1.0 + + # ========================================================================= + # / — associative container surface. 
+ # ========================================================================= + - header: map + class: std::map + method: find + return_type: iterator + confidence: 1.0 + + - header: map + class: std::map + method: insert + return_type: "std::pair" + confidence: 1.0 + + - header: map + class: std::map + method: "operator[]" + return_type: "T&" + params: + - { name: key, type: "const Key&" } + confidence: 1.0 + note: "Inserts default-constructed value if key missing" + + - header: map + class: std::map + method: at + return_type: "T&" + params: + - { name: key, type: "const Key&" } + confidence: 1.0 + throws: std::out_of_range + + - header: map + class: std::map + method: size + return_type: size_t + confidence: 1.0 + + - header: map + class: std::map + method: empty + return_type: bool + confidence: 1.0 + + - header: unordered_map + class: std::unordered_map + method: find + return_type: iterator + confidence: 1.0 + + - header: unordered_map + class: std::unordered_map + method: "operator[]" + return_type: "T&" + confidence: 1.0 + + - header: unordered_map + class: std::unordered_map + method: insert + return_type: "std::pair" + confidence: 1.0 + + - header: unordered_map + class: std::unordered_map + method: at + return_type: "T&" + confidence: 1.0 + throws: std::out_of_range + + # ========================================================================= + # <optional> — value access and presence checks. 
+ # ========================================================================= + - header: optional + class: std::optional + method: value + return_type: "T&" + confidence: 1.0 + throws: std::bad_optional_access + + - header: optional + class: std::optional + method: value_or + return_type: "T" + confidence: 1.0 + + - header: optional + class: std::optional + method: has_value + return_type: bool + confidence: 1.0 + + - header: optional + class: std::optional + method: "operator*" + return_type: "T&" + confidence: 1.0 + + # ========================================================================= + # <ostream> / <istream> — stream operators. + # ========================================================================= + - header: ostream + class: std::ostream + method: "operator<<" + return_type: "std::ostream&" + confidence: 1.0 + + - header: istream + class: std::istream + method: "operator>>" + return_type: "std::istream&" + confidence: 1.0 + +# Cross-platform aliases. Tree-sitter sometimes reaches the libstdc++-internal +# namespaces; record canonical aliases so the loader can normalise. +# (None for now — std::__cxx11 stripping happens directly in normalize.go, +# not via the alias mechanism. Reserved for cross-platform header-name +# divergence in PR-03.) 
+cross_platform_aliases: [] + +skip: + - prefix: __builtin_ + - prefix: _GLIBCXX_ + - prefix: _LIBCPP_ + - prefix: __cxxabiv1 diff --git a/sast-engine/tools/generate_clike_stdlib_registry.go b/sast-engine/tools/generate_clike_stdlib_registry.go new file mode 100644 index 00000000..f913041d --- /dev/null +++ b/sast-engine/tools/generate_clike_stdlib_registry.go @@ -0,0 +1,94 @@ +//go:build cpf_generate_stdlib_registry + +// generate_clike_stdlib_registry is a standalone tool that walks installed +// system headers (Linux glibc + libstdc++ in PR-01; Windows + Darwin in PR-03) +// and emits per-header JSON registry files describing functions, classes, +// methods, typedefs, and constants — the input the loader (PR-02) consumes +// when resolving stdlib calls during analysis. +// +// Usage: +// +// go run -tags cpf_generate_stdlib_registry tools/generate_clike_stdlib_registry.go \ +// --target=linux --language=c --output-dir=./out/linux/c/v1 +// +// Flags: +// +// --target "linux" (PR-01); "windows" / "darwin" land in PR-03 +// --language "c" or "cpp" +// --output-dir Directory to write manifest.json + per-header JSON files +// --overlay Path to YAML overlay; defaults to tools/<language>_stdlib_overlay.yaml +// --base-url Override the URL stamped into manifest entries (default +// https://assets.codepathfinder.dev/registries — useful for +// file:// in local development) +// +// Output: <output-dir>/manifest.json + <output-dir>/<header>
_stdlib.json files. +// +// The build tag `cpf_generate_stdlib_registry` excludes this file from the +// regular `go build` / `go test` invocations — matches the existing pattern +// used by tools/generate_go_stdlib_registry.go. +package main + +import ( + "flag" + "fmt" + "os" + "path/filepath" + + "github.com/shivasurya/code-pathfinder/sast-engine/graph/callgraph/core" + "github.com/shivasurya/code-pathfinder/sast-engine/tools/internal/clikeextract" +) + +func main() { + target := flag.String("target", core.PlatformLinux, + "target platform: linux | windows | darwin (windows/darwin land in PR-03)") + language := flag.String("language", "", + "language: c or cpp (required)") + outputDir := flag.String("output-dir", "", + "output directory (required)") + overlayPath := flag.String("overlay", "", + "path to YAML overlay (default: tools/_stdlib_overlay.yaml)") + baseURL := flag.String("base-url", "", + "override base URL stamped into manifest entries (default: assets.codepathfinder.dev/registries)") + flag.Parse() + + if *language == "" { + fail("--language is required (c or cpp)") + } + if *outputDir == "" { + fail("--output-dir is required") + } + + if *overlayPath == "" { + // Auto-detect overlay alongside this binary's source directory. + // Resolve relative to the cwd; callers running `go run` from the + // sast-engine directory will hit `tools/_stdlib_overlay.yaml`. 
+ guess := filepath.Join("tools", *language+"_stdlib_overlay.yaml") + if _, err := os.Stat(guess); err == nil { + *overlayPath = guess + } + } + + cfg := clikeextract.Config{ + Target: *target, + Language: *language, + OutputDir: *outputDir, + OverlayPath: *overlayPath, + BaseURL: *baseURL, + } + if err := cfg.Validate(); err != nil { + fail(err.Error()) + } + + if err := clikeextract.NewExtractor(cfg).Run(); err != nil { + fail("generator failed: %v", err) + } + + fmt.Fprintf(os.Stderr, "wrote registry to %s (target=%s language=%s overlay=%s)\n", + *outputDir, *target, *language, *overlayPath) +} + +func fail(format string, args ...any) { + fmt.Fprintf(os.Stderr, "error: "+format+"\n", args...) + flag.Usage() + os.Exit(2) +}