From d4aae165faf9cf8c8306e66c62b5d17bb0bc280a Mon Sep 17 00:00:00 2001 From: shivasurya Date: Sat, 2 May 2026 10:09:30 -0400 Subject: [PATCH] =?UTF-8?q?feat(graph):=20C=20parser=20=E2=80=94=20functio?= =?UTF-8?q?n=20definitions,=20types,=20decls,=20calls,=20includes?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Convert tree-sitter C AST nodes into graph.Node objects. After this PR, scanning a C project produces a populated CodeGraph for every node type the C parser is responsible for: function definitions, forward declarations, structs, enums, typedefs, variable declarations, includes, and call expressions. # parser_c.go (new) Single file in package graph (matching parser_python.go / parser_golang.go convention). Organised into clearly-marked sections — function definitions, struct/enum/typedef, variable declarations, call expressions, includes, and a small block of internal helpers. Two functions — parseCLikeDeclaration and parseCLikeInclude — accept an isCpp flag so parser_cpp.go (PR-04) can reuse them without duplicating logic. All AST extraction (function metadata, type strings, parameters, struct fields, call info) goes through graph/clike (PR-02). The parser is essentially a thin layer that turns clike's structured info into graph.Node objects with the right Type, Language, Metadata, and SourceLocation. Notable design choices: - Forward declarations: tree-sitter emits `declaration` (not `function_definition`) for prototypes such as `int add(int, int);`. parseCLikeDeclaration detects function_declarator children via isFunctionPrototype and routes them to emitFunctionDeclaration, which produces a function_definition node with Metadata["is_declaration"] = true. This means rule writers can find every callable function under a single Type, with the declaration/definition distinction surfaced as metadata. - Type-reference vs type-declaration: `struct Buffer*` in a parameter is not a struct declaration. 
parseCStructSpecifier and parseCEnumSpecifier short-circuit when the body field is nil, leaving the variable_declaration / parameter to carry the type information. - Multi-declarator declarations: `int a = 1, b = 2, c;` produces three variable_declaration nodes — one per init_declarator child reached via childrenByFieldName (which iterates field-name matches, since ChildByFieldName returns only the first). - Constants for Node.Type, Node.Language, and Metadata keys are declared at the top of the file so consumers (rules, call-graph builder) can reference them by symbol rather than string literal. # parser.go (modified) Two existing cases gained a C branch: - function_definition: C dispatch first, Python second - call_expression: C dispatch first, Go second Five new cases for C/C++ specific node types: - struct_specifier (C only at top level — C++ uses class_specifier) - enum_specifier - type_definition - declaration - preproc_include Java and Python paths are untouched; existing tests pass with zero changes. # Tests testdata/c/example.c covers every node type the parser handles, plus a neighbouring buffer.h with two forward declarations. 
parser_c_test.go runs Initialize() against the directory and asserts: - function definitions emit correct Name, ReturnType, params, modifiers - forward declarations carry Metadata["is_declaration"] = true - struct fields appear in MethodArgumentsType as "name: type" pairs - enum enumerators appear in Metadata["enumerators"] preserving values - typedefs capture both the alias name and the underlying type - multi-declarator declarations emit one node per variable - function-local variables carry their enclosing function name as Scope - system vs project includes are tagged correctly via Metadata - call expressions are linked to their enclosing function via OutgoingEdges Two focused unit tests cover the call-shape branches (arrow method, qualified call) and the isCpp=true path on parseCLikeDeclaration that the integration fixture cannot exercise yet — those branches go live when parser_cpp.go (PR-04) starts dispatching from .cpp files. Co-Authored-By: Claude --- sast-engine/graph/parser.go | 47 +- sast-engine/graph/parser_c.go | 593 +++++++++++++++++++++++++ sast-engine/graph/parser_c_test.go | 464 +++++++++++++++++++ sast-engine/graph/testdata/c/buffer.h | 7 + sast-engine/graph/testdata/c/example.c | 45 ++ 5 files changed, 1152 insertions(+), 4 deletions(-) create mode 100644 sast-engine/graph/parser_c.go create mode 100644 sast-engine/graph/parser_c_test.go create mode 100644 sast-engine/graph/testdata/c/buffer.h create mode 100644 sast-engine/graph/testdata/c/example.c diff --git a/sast-engine/graph/parser.go b/sast-engine/graph/parser.go index 1cfaa632..eac0bfbd 100644 --- a/sast-engine/graph/parser.go +++ b/sast-engine/graph/parser.go @@ -1,17 +1,26 @@ package graph -import sitter "github.com/smacker/go-tree-sitter" +import ( + "github.com/shivasurya/code-pathfinder/sast-engine/graph/clike" + sitter "github.com/smacker/go-tree-sitter" +) // buildGraphFromAST builds a code graph from an Abstract Syntax Tree. 
func buildGraphFromAST(node *sitter.Node, sourceCode []byte, graph *CodeGraph, currentContext *Node, file string) { isJavaSourceFile := isJavaSourceFile(file) isPythonSourceFile := isPythonSourceFile(file) isGoSourceFile := isGoSourceFile(file) + isCFile := clike.IsCSourceFile(file) + isCppFile := clike.IsCppSourceFile(file) switch node.Type() { - // Python-specific node types + // Python and C share the function_definition node type — dispatch by + // language. C/C++ branches come first because the dispatcher is the + // only place these node types are handled. case "function_definition": - if isPythonSourceFile { + if isCFile { + currentContext = parseCFunctionDefinition(node, sourceCode, graph, file) + } else if isPythonSourceFile { currentContext = parsePythonFunctionDefinition(node, sourceCode, graph, file, currentContext) } @@ -112,10 +121,40 @@ func buildGraphFromAST(node *sitter.Node, sourceCode []byte, graph *CodeGraph, c } case "call_expression": - if isGoSourceFile { + if isCFile { + parseCCallExpression(node, sourceCode, graph, file, currentContext) + } else if isGoSourceFile { parseGoCallExpression(node, sourceCode, graph, file, currentContext) } + // C/C++ specific node types. struct_specifier appears in C only at the + // top level (C++ uses class_specifier for the equivalent construct); + // the remaining four are shared between C and C++. 
+ case "struct_specifier": + if isCFile { + parseCStructSpecifier(node, sourceCode, graph, file) + } + + case "enum_specifier": + if isCFile || isCppFile { + parseCEnumSpecifier(node, sourceCode, graph, file) + } + + case "type_definition": + if isCFile || isCppFile { + parseCTypeDefinition(node, sourceCode, graph, file) + } + + case "declaration": + if isCFile || isCppFile { + parseCLikeDeclaration(node, sourceCode, graph, file, currentContext, isCppFile) + } + + case "preproc_include": + if isCFile || isCppFile { + parseCLikeInclude(node, sourceCode, graph, file, isCppFile) + } + case "short_var_declaration": if isGoSourceFile { parseGoShortVarDeclaration(node, sourceCode, graph, file) diff --git a/sast-engine/graph/parser_c.go b/sast-engine/graph/parser_c.go new file mode 100644 index 00000000..e0ac2902 --- /dev/null +++ b/sast-engine/graph/parser_c.go @@ -0,0 +1,593 @@ +package graph + +import ( + "strings" + + "github.com/shivasurya/code-pathfinder/sast-engine/graph/clike" + sitter "github.com/smacker/go-tree-sitter" +) + +// parser_c.go converts tree-sitter C AST nodes into graph.Node objects. The +// dispatcher in buildGraphFromAST (parser.go) selects the parse functions +// declared here for files whose extension routes them to the tree-sitter C +// grammar — every entry point sets Language="c" on the produced node. +// +// All AST extraction (function metadata, type strings, parameter lists, +// struct fields, call info) goes through the graph/clike helper package +// so that parser_cpp.go (PR-04) can reuse the same primitives without +// duplicating logic. Two helpers in this file — parseCLikeDeclaration and +// parseCLikeInclude — are deliberately language-neutral and accept an +// isCpp flag so the C++ parser can call them directly. + +// Language tags used as Node.Language values for the C/C++ parsers. +const ( + languageC = "c" + languageCpp = "cpp" +) + +// Node.Type values produced by the C/C++ parsers. 
+const (
+	// Shared by body-bearing definitions and body-less prototypes; the two
+	// are told apart through Metadata["is_declaration"].
+	nodeTypeFunctionDefinition = "function_definition"
+	nodeTypeStructDeclaration  = "struct_declaration"
+	nodeTypeEnumDeclaration    = "enum_declaration"
+	nodeTypeTypeDefinition     = "type_definition"
+	nodeTypeVariableDecl       = "variable_declaration"
+	nodeTypeCallExpression     = "call_expression"
+	nodeTypeIncludeStatement   = "include_statement"
+)
+
+// Metadata keys used by the C/C++ parsers. Declaring them as constants keeps
+// the keys discoverable for code in this package (rule evaluation, the
+// call-graph builder) and prevents key drift between writers and readers.
+const (
+	// bool; set to true on function nodes built from prototypes, absent on
+	// definitions that have a body.
+	metaIsDeclaration = "is_declaration"
+	// bool; always present on include nodes (true for <...>, false for "...").
+	metaSystemInclude = "system_include"
+	// []string; present only when at least one specifier was found.
+	metaStorageClasses = "storage_classes"
+	// []string of "NAME" or "NAME=VALUE" entries, in source order.
+	metaEnumerators = "enumerators"
+	// string; the typedef's aliased type, duplicated from Node.DataType.
+	metaUnderlyingType = "underlying_type"
+	// bool; set to true on struct/enum nodes that have no tag name.
+	metaIsAnonymous = "is_anonymous"
+)
+
+// =============================================================================
+// Function definitions
+// =============================================================================
+
+// parseCFunctionDefinition converts a tree-sitter `function_definition` (or a
+// top-level forward declaration of the same shape) into a graph.Node. The
+// returned node becomes the currentContext for any nested AST traversal so
+// call expressions inside the body can be linked back to their enclosing
+// function. It returns nil when clike.ExtractFunctionInfo yields no info.
+//
+// IsDeclaration semantics: a function with no compound_statement body
+// (typical in headers, e.g. `int add(int a, int b);`) gets
+// Metadata["is_declaration"] = true. Resolved definitions in .c files do
+// not set this key.
+//
+// Storage-class specifiers are read from node's storage_class_specifier
+// children (e.g. static, inline, extern; which keywords the grammar files
+// under that node type depends on the tree-sitter-c version — TODO confirm
+// for _Noreturn and _Thread_local). They are collected into
+// Metadata["storage_classes"] for downstream rule writers and into
+// Modifier (joined by space) for ergonomic single-string access.
+func parseCFunctionDefinition(node *sitter.Node, sourceCode []byte, graph *CodeGraph, file string) *Node {
+	info := clike.ExtractFunctionInfo(node, sourceCode)
+	if info == nil {
+		return nil
+	}
+
+	storageClasses := collectStorageClassSpecifiers(node, sourceCode)
+	// Both keys are optional: consumers test for presence, so nothing is
+	// written for body-bearing definitions or specifier-free functions.
+	metadata := map[string]any{}
+	if info.IsDeclaration {
+		metadata[metaIsDeclaration] = true
+	}
+	if len(storageClasses) > 0 {
+		metadata[metaStorageClasses] = storageClasses
+	}
+
+	functionNode := &Node{
+		ID:                   GenerateMethodID("function:"+info.Name, info.ParamTypes, file, info.LineNumber),
+		Type:                 nodeTypeFunctionDefinition,
+		Name:                 info.Name,
+		LineNumber:           info.LineNumber,
+		ReturnType:           info.ReturnType,
+		MethodArgumentsType:  info.ParamTypes,
+		MethodArgumentsValue: info.ParamNames,
+		Modifier:             strings.Join(storageClasses, " "),
+		File:                 file,
+		Language:             languageC,
+		SourceLocation:       newSourceLocation(file, node),
+		Metadata:             metadata,
+	}
+	graph.AddNode(functionNode)
+	return functionNode
+}
+
+// =============================================================================
+// Struct / Enum / Typedef
+// =============================================================================
+
+// parseCStructSpecifier records a `struct_specifier` declaration. Anonymous
+// structs (no name child, common when used inline as a typedef target or
+// declaration type) are still recorded so downstream rules can scope them
+// to their declaring location; Metadata["is_anonymous"] = true marks them.
+//
+// Fields are stored in MethodArgumentsType, one "name: type" string per
+// field (just the type when the field has no name) — reusing the existing
+// field on graph.Node avoids a Metadata allocation for what is the most
+// common access pattern.
+func parseCStructSpecifier(node *sitter.Node, sourceCode []byte, graph *CodeGraph, file string) {
+	body := node.ChildByFieldName("body")
+	if body == nil {
+		// `struct S` used as a type reference (e.g. `struct S* p`) — not a
+		// declaration. Skip; the variable_declaration / parameter that
+		// references it carries the type information.
+		return
+	}
+
+	name, isAnonymous := extractTaggedName(node, sourceCode)
+	fields := clike.ExtractStructFields(body, sourceCode)
+	fieldStrings := make([]string, 0, len(fields))
+	for _, f := range fields {
+		if f.Name == "" {
+			fieldStrings = append(fieldStrings, f.TypeStr)
+		} else {
+			fieldStrings = append(fieldStrings, f.Name+": "+f.TypeStr)
+		}
+	}
+
+	metadata := map[string]any{}
+	if isAnonymous {
+		metadata[metaIsAnonymous] = true
+	}
+
+	graph.AddNode(&Node{
+		ID:                  GenerateSha256("struct:" + name + "@" + file + "#" + lineRange(node)),
+		Type:                nodeTypeStructDeclaration,
+		Name:                name,
+		LineNumber:          node.StartPoint().Row + 1,
+		MethodArgumentsType: fieldStrings,
+		File:                file,
+		Language:            languageC,
+		SourceLocation:      newSourceLocation(file, node),
+		Metadata:            metadata,
+	})
+}
+
+// parseCEnumSpecifier records an `enum_specifier`. Enumerators are stored in
+// Metadata["enumerators"] as a []string of "NAME" or "NAME=VALUE" entries —
+// keeping the original source form so rule writers see what authors wrote.
+//
+// The dispatcher calls this for C and C++ files alike, so the Language tag
+// is derived from the file extension instead of being fixed to "c".
+func parseCEnumSpecifier(node *sitter.Node, sourceCode []byte, graph *CodeGraph, file string) {
+	body := node.ChildByFieldName("body")
+	if body == nil {
+		// `enum E` used as a type reference. Skip — same reasoning as
+		// parseCStructSpecifier.
+		return
+	}
+
+	name, isAnonymous := extractTaggedName(node, sourceCode)
+	enumerators := extractEnumerators(body, sourceCode)
+
+	metadata := map[string]any{
+		metaEnumerators: enumerators,
+	}
+	if isAnonymous {
+		metadata[metaIsAnonymous] = true
+	}
+
+	graph.AddNode(&Node{
+		ID:             GenerateSha256("enum:" + name + "@" + file + "#" + lineRange(node)),
+		Type:           nodeTypeEnumDeclaration,
+		Name:           name,
+		LineNumber:     node.StartPoint().Row + 1,
+		File:           file,
+		Language:       languageOfFile(clike.IsCppSourceFile(file)),
+		SourceLocation: newSourceLocation(file, node),
+		Metadata:       metadata,
+	})
+}
+
+// parseCTypeDefinition records a `type_definition` (typedef). The aliased
+// type goes into DataType (e.g. "unsigned long", "struct { int x; int y; }")
+// and is mirrored in Metadata["underlying_type"] so consumers that only
+// inspect Metadata can still find it.
+//
+// Multiple declarators in one typedef (`typedef int a, b, c;`) emit one
+// graph node per alias name. When no identifier can be unwrapped from a
+// declarator, its trimmed source text is used as the alias name.
+//
+// Like parseCEnumSpecifier, this is dispatched for C and C++ files, so the
+// Language tag follows the file extension.
+func parseCTypeDefinition(node *sitter.Node, sourceCode []byte, graph *CodeGraph, file string) {
+	typeNode := node.ChildByFieldName("type")
+	underlying := ""
+	if typeNode != nil {
+		underlying = strings.TrimSpace(typeNode.Content(sourceCode))
+	}
+	language := languageOfFile(clike.IsCppSourceFile(file))
+
+	for _, declarator := range childrenByFieldName(node, "declarator") {
+		aliasName := bareIdentifierName(declarator, sourceCode)
+		if aliasName == "" {
+			aliasName = strings.TrimSpace(declarator.Content(sourceCode))
+		}
+		graph.AddNode(&Node{
+			ID:             GenerateSha256("typedef:" + aliasName + "@" + file + "#" + lineRange(node)),
+			Type:           nodeTypeTypeDefinition,
+			Name:           aliasName,
+			DataType:       underlying,
+			LineNumber:     node.StartPoint().Row + 1,
+			File:           file,
+			Language:       language,
+			SourceLocation: newSourceLocation(file, node),
+			Metadata:       map[string]any{metaUnderlyingType: underlying},
+		})
+	}
+}
+
+// =============================================================================
+// Variable declarations (shared with C++ via isCpp flag)
+// =============================================================================
+
+// parseCLikeDeclaration records every variable introduced by a `declaration`
+// node. The same code handles C and C++ because tree-sitter exposes the
+// node identically in both grammars; the isCpp flag only changes the
+// Language tag on the produced graph nodes.
+//
+// Multi-declarator forms (`int a = 1, b = 2, c;`) emit one graph node per
+// variable. Each declarator is unwrapped with bareIdentifierName so pointer
+// and array wrappers contribute to DataType (via clike.ExtractTypeString)
+// rather than to Name. Declarators with no recoverable name are skipped.
+//
+// Initialisers (the `=` value) are captured as VariableValue when present.
+// The Scope is "global" at translation-unit scope or the enclosing
+// function's name when the declaration sits inside a function body —
+// currentContext (set during AST descent in buildGraphFromAST) carries
+// the latter.
+func parseCLikeDeclaration(node *sitter.Node, sourceCode []byte, graph *CodeGraph, file string, currentContext *Node, isCpp bool) {
+	// A `declaration` node whose declarator chain reaches a function_declarator
+	// is a function prototype (forward declaration). Emit a
+	// function_definition node so callers and call-graph builders find it
+	// alongside actual definitions; Metadata["is_declaration"] = true
+	// distinguishes the prototype from a body-bearing definition.
+	if isFunctionPrototype(node) {
+		emitFunctionDeclaration(node, sourceCode, graph, file, isCpp)
+		return
+	}
+
+	typeNode := node.ChildByFieldName("type")
+	scope := scopeFromContext(currentContext)
+	language := languageOfFile(isCpp)
+	lineNumber := node.StartPoint().Row + 1
+
+	for _, declarator := range childrenByFieldName(node, "declarator") {
+		name, valueText := bareIdentifierAndInitialiser(declarator, sourceCode)
+		if name == "" {
+			continue
+		}
+		dataType := clike.ExtractTypeString(typeNode, declarator, sourceCode)
+
+		// Every variable of one declaration shares the declaration's line
+		// range and source location; the name keeps the IDs distinct.
+		graph.AddNode(&Node{
+			ID:             GenerateSha256("var:" + scope + "::" + name + "@" + file + "#" + lineRange(node)),
+			Type:           nodeTypeVariableDecl,
+			Name:           name,
+			DataType:       dataType,
+			VariableValue:  valueText,
+			Scope:          scope,
+			LineNumber:     lineNumber,
+			File:           file,
+			Language:       language,
+			SourceLocation: newSourceLocation(file, node),
+		})
+	}
+}
+
+// isFunctionPrototype reports whether a `declaration` node carries a
+// function_declarator (a forward declaration like `int add(int, int);`).
+// Multi-declarator declarations (`int x, f();`) are unusual but legal —
+// any declarator being a function_declarator is enough to treat the whole
+// declaration as a prototype. parseCLikeDeclaration returns right after
+// emitting the prototype, so the plain variables of such a mixed
+// declaration are not recorded.
+func isFunctionPrototype(node *sitter.Node) bool {
+	for _, declarator := range childrenByFieldName(node, "declarator") {
+		// Walk down the "declarator" field chain so wrappers such as
+		// pointer_declarator (`int *f(void);`) do not hide the function.
+		for cur := declarator; cur != nil; cur = cur.ChildByFieldName("declarator") {
+			if cur.Type() == "function_declarator" {
+				return true
+			}
+		}
+	}
+	return false
+}
+
+// emitFunctionDeclaration produces a function_definition graph node from a
+// body-less `declaration` (a function prototype). The shape mirrors
+// parseCFunctionDefinition so consumers do not need to special-case
+// declarations vs definitions — the differences are that
+// Metadata["is_declaration"] is always true and that the Language tag
+// follows isCpp. Nothing is emitted when no function name can be extracted.
+func emitFunctionDeclaration(node *sitter.Node, sourceCode []byte, graph *CodeGraph, file string, isCpp bool) {
+	info := clike.ExtractFunctionInfo(node, sourceCode)
+	if info == nil || info.Name == "" {
+		return
+	}
+	storageClasses := collectStorageClassSpecifiers(node, sourceCode)
+	metadata := map[string]any{metaIsDeclaration: true}
+	if len(storageClasses) > 0 {
+		metadata[metaStorageClasses] = storageClasses
+	}
+
+	graph.AddNode(&Node{
+		ID:                   GenerateMethodID("function:"+info.Name, info.ParamTypes, file, info.LineNumber),
+		Type:                 nodeTypeFunctionDefinition,
+		Name:                 info.Name,
+		LineNumber:           info.LineNumber,
+		ReturnType:           info.ReturnType,
+		MethodArgumentsType:  info.ParamTypes,
+		MethodArgumentsValue: info.ParamNames,
+		Modifier:             strings.Join(storageClasses, " "),
+		File:                 file,
+		Language:             languageOfFile(isCpp),
+		SourceLocation:       newSourceLocation(file, node),
+		Metadata:             metadata,
+	})
+}
+
+// =============================================================================
+// Call expressions
+// =============================================================================
+
+// parseCCallExpression records a `call_expression`. The shape (free
+// function vs method-dot vs method-arrow vs qualified) is determined by
+// clike.ExtractCallInfo and stored alongside the target so downstream
+// rule writers can match either on call shape or on the target name.
+//
+// currentContext links the call to its enclosing function via an edge so
+// the call-graph builder (PR-07) can follow callers→callees without a
+// second AST pass. A nil currentContext (call outside any function) still
+// records the node, just without an edge.
+func parseCCallExpression(node *sitter.Node, sourceCode []byte, graph *CodeGraph, file string, currentContext *Node) {
+	info := clike.ExtractCallInfo(node, sourceCode)
+	if info == nil {
+		return
+	}
+
+	metadata := map[string]any{}
+	if info.IsMethod {
+		metadata["is_method"] = true
+	}
+	if info.IsArrow {
+		metadata["is_arrow"] = true
+	}
+	if info.IsQualified {
+		metadata["is_qualified"] = true
+	}
+	if info.Receiver != "" {
+		metadata["receiver"] = info.Receiver
+	}
+
+	// The start column is part of the ID: several calls to the same target
+	// can share one line (`f(f(x))`, `g(); g();`), and name + file + line
+	// range alone would give them identical IDs.
+	position := lineRange(node) + ":" + joinUint(node.StartPoint().Column)
+
+	callNode := &Node{
+		ID:                   GenerateSha256("call:" + info.Target + "@" + file + "#" + position),
+		Type:                 nodeTypeCallExpression,
+		Name:                 info.Target,
+		MethodArgumentsValue: info.Args,
+		LineNumber:           node.StartPoint().Row + 1,
+		File:                 file,
+		Language:             languageC,
+		SourceLocation:       newSourceLocation(file, node),
+		Metadata:             metadata,
+	}
+	graph.AddNode(callNode)
+	if currentContext != nil {
+		graph.AddEdge(currentContext, callNode)
+	}
+}
+
+// =============================================================================
+// Preprocessor includes (shared with C++ via isCpp flag)
+// =============================================================================
+
+// parseCLikeInclude records a `preproc_include` directive. Angle-bracket
+// includes (`<stdio.h>`) are flagged as system includes via
+// Metadata["system_include"] = true; quoted includes (`"myheader.h"`)
+// are project-local. The header path is stored in Name with surrounding
+// quotes/brackets stripped so resolvers can match on the bare path.
+func parseCLikeInclude(node *sitter.Node, sourceCode []byte, graph *CodeGraph, file string, isCpp bool) {
+	target := node.ChildByFieldName("path")
+	if target == nil {
+		return
+	}
+
+	// The path node's type tells <...> from "..."; the raw text still
+	// carries its delimiters at this point.
+	bare, system := normaliseIncludePath(target.Type(), target.Content(sourceCode))
+	if bare == "" {
+		return
+	}
+
+	include := &Node{
+		ID:             GenerateSha256("include:" + bare + "@" + file + "#" + lineRange(node)),
+		Type:           nodeTypeIncludeStatement,
+		Name:           bare,
+		LineNumber:     node.StartPoint().Row + 1,
+		File:           file,
+		Language:       languageOfFile(isCpp),
+		SourceLocation: newSourceLocation(file, node),
+		Metadata:       map[string]any{metaSystemInclude: system},
+	}
+	graph.AddNode(include)
+}
+
+// =============================================================================
+// Internal helpers
+// =============================================================================
+
+// collectStorageClassSpecifiers gathers the trimmed text of every named
+// child of node whose type is storage_class_specifier (for example
+// "static" or "extern"), in source order. The result is nil when node has
+// no such child.
+func collectStorageClassSpecifiers(node *sitter.Node, sourceCode []byte) []string {
+	var specifiers []string
+	total := int(node.NamedChildCount())
+	for idx := 0; idx < total; idx++ {
+		candidate := node.NamedChild(idx)
+		if candidate == nil || candidate.Type() != "storage_class_specifier" {
+			continue
+		}
+		specifiers = append(specifiers, strings.TrimSpace(candidate.Content(sourceCode)))
+	}
+	return specifiers
+}
+
+// childrenByFieldName returns every direct child of node whose field name
+// matches name. tree-sitter exposes ChildByFieldName for the *first* match
+// only, but several C constructs (declaration with multiple init_declarators,
+// type_definition with multiple alias names) repeat the same field — this
+// helper iterates the full child list and yields all of them in order.
+func childrenByFieldName(node *sitter.Node, name string) []*sitter.Node {
+	var matches []*sitter.Node
+	for i := 0; i < int(node.ChildCount()); i++ {
+		if node.FieldNameForChild(i) != name {
+			continue
+		}
+		if c := node.Child(i); c != nil {
+			matches = append(matches, c)
+		}
+	}
+	return matches
+}
+
+// bareIdentifierName unwraps a declarator chain (init_declarator,
+// pointer_declarator, reference_declarator, array_declarator,
+// parenthesized_declarator, and any other wrapper exposing a "declarator"
+// field) to return the inner identifier, field_identifier, type_identifier
+// or primitive_type text. Returns "" when declarator is nil or no
+// identifier is reachable (e.g. abstract declarators).
+//
+// parenthesized_declarator needs its own case: the grammar attaches no
+// field name to the declarator it wraps, so the "declarator" field lookup
+// used for the other wrappers finds nothing. Without it, `int (*arr)[10];`
+// and `typedef void (*cb)(int);` would yield no name.
+func bareIdentifierName(declarator *sitter.Node, sourceCode []byte) string {
+	cur := declarator
+	for cur != nil {
+		switch cur.Type() {
+		case "identifier", "field_identifier", "type_identifier", "primitive_type":
+			return cur.Content(sourceCode)
+		case "init_declarator", "pointer_declarator", "reference_declarator", "array_declarator":
+			cur = cur.ChildByFieldName("declarator")
+			continue
+		case "parenthesized_declarator":
+			cur = parenthesizedInnerDeclarator(cur)
+			continue
+		}
+		// Unrecognised wrapper — try the field-named "declarator" child if
+		// present, otherwise stop walking.
+		if next := cur.ChildByFieldName("declarator"); next != nil && !next.Equal(cur) {
+			cur = next
+			continue
+		}
+		return ""
+	}
+	return ""
+}
+
+// parenthesizedInnerDeclarator returns the declarator wrapped by a
+// parenthesized_declarator: its first named child that is neither a comment
+// nor an ms_call_modifier. Returns nil when there is none.
+func parenthesizedInnerDeclarator(node *sitter.Node) *sitter.Node {
+	for i := 0; i < int(node.NamedChildCount()); i++ {
+		child := node.NamedChild(i)
+		if child == nil {
+			continue
+		}
+		switch child.Type() {
+		case "comment", "ms_call_modifier":
+			continue
+		}
+		return child
+	}
+	return nil
+}
+
+// bareIdentifierAndInitialiser pulls the variable name and the initialiser
+// expression text (when present) out of an init_declarator / declarator
+// chain. The initialiser is the source text of the node held in the
+// init_declarator's "value" field.
+func bareIdentifierAndInitialiser(declarator *sitter.Node, sourceCode []byte) (string, string) {
+	switch {
+	case declarator == nil:
+		return "", ""
+	case declarator.Type() != "init_declarator":
+		// Plain declarator: there is a name but no initialiser.
+		return bareIdentifierName(declarator, sourceCode), ""
+	}
+
+	identifier := bareIdentifierName(declarator.ChildByFieldName("declarator"), sourceCode)
+	initialiser := declarator.ChildByFieldName("value")
+	if initialiser == nil {
+		return identifier, ""
+	}
+	return identifier, strings.TrimSpace(initialiser.Content(sourceCode))
+}
+
+// extractTaggedName reads the "name" field of a struct_specifier or
+// enum_specifier. It returns the tag text and false when the field exists,
+// or "" and true when the construct has no tag (anonymous).
+func extractTaggedName(node *sitter.Node, sourceCode []byte) (string, bool) {
+	if tag := node.ChildByFieldName("name"); tag != nil {
+		return tag.Content(sourceCode), false
+	}
+	return "", true
+}
+
+// extractEnumerators walks the named children of an enumerator_list and
+// produces one string per `enumerator` child: "NAME=VALUE" when the
+// enumerator has a "value" field (value text trimmed), otherwise "NAME".
+// Children of other types and enumerators without a "name" field are
+// ignored. A nil body yields nil.
+func extractEnumerators(body *sitter.Node, sourceCode []byte) []string {
+	if body == nil {
+		return nil
+	}
+	var entries []string
+	total := int(body.NamedChildCount())
+	for idx := 0; idx < total; idx++ {
+		member := body.NamedChild(idx)
+		if member == nil || member.Type() != "enumerator" {
+			continue
+		}
+		label := member.ChildByFieldName("name")
+		if label == nil {
+			continue
+		}
+		text := label.Content(sourceCode)
+		if assigned := member.ChildByFieldName("value"); assigned != nil {
+			text += "=" + strings.TrimSpace(assigned.Content(sourceCode))
+		}
+		entries = append(entries, text)
+	}
+	return entries
+}
+
+// normaliseIncludePath strips the surrounding `<>` or `""` from an include
+// path and returns the bare path plus a flag indicating whether the
+// directive used angle brackets (system include).
+func normaliseIncludePath(pathNodeType, rawPath string) (string, bool) {
+	if pathNodeType == "system_lib_string" {
+		return strings.Trim(rawPath, "<>"), true
+	}
+	if pathNodeType == "string_literal" {
+		return strings.Trim(rawPath, `"`), false
+	}
+	// Any other node type: strip both delimiter shapes and decide from the
+	// leading character.
+	return strings.Trim(rawPath, `<>"`), strings.HasPrefix(rawPath, "<")
+}
+
+// scopeFromContext maps the traversal context to a scope label: the name of
+// currentContext when it is a function_definition node tagged "c" or "cpp",
+// and "global" in every other case (including a nil context).
+func scopeFromContext(currentContext *Node) string {
+	if currentContext == nil || currentContext.Type != nodeTypeFunctionDefinition {
+		return "global"
+	}
+	switch currentContext.Language {
+	case languageC, languageCpp:
+		return currentContext.Name
+	}
+	return "global"
+}
+
+// languageOfFile picks the Language tag for a node: "c" unless isCpp is
+// set, in which case "cpp".
+func languageOfFile(isCpp bool) string {
+	if !isCpp {
+		return languageC
+	}
+	return languageCpp
+}
+
+// lineRange renders the 1-based first and last source rows of node as
+// "start-end". The result is appended to ID strings so that same-named
+// nodes at different positions of a file do not share an ID.
+func lineRange(node *sitter.Node) string {
+	first := joinUint(node.StartPoint().Row + 1)
+	last := joinUint(node.EndPoint().Row + 1)
+	return first + "-" + last
+}
+
+// joinUint renders v in decimal. The digits are written right-to-left into
+// a fixed scratch buffer (a uint32 needs at most ten) and the filled tail
+// is returned.
+func joinUint(v uint32) string {
+	var scratch [11]byte
+	pos := len(scratch)
+	for {
+		pos--
+		scratch[pos] = byte('0' + v%10)
+		v /= 10
+		if v == 0 {
+			break
+		}
+	}
+	return string(scratch[pos:])
+}
+
+// newSourceLocation builds a SourceLocation that lazy-loads the original
+// source from file using the byte range of node.
+func newSourceLocation(file string, node *sitter.Node) *SourceLocation {
+	// Only the file and the byte offsets are stored here; no source text is
+	// read or copied by this function.
+	return &SourceLocation{
+		File:      file,
+		StartByte: node.StartByte(),
+		EndByte:   node.EndByte(),
+	}
+}
diff --git a/sast-engine/graph/parser_c_test.go b/sast-engine/graph/parser_c_test.go
new file mode 100644
index 00000000..5c7d915f
--- /dev/null
+++ b/sast-engine/graph/parser_c_test.go
@@ -0,0 +1,464 @@
+package graph
+
+import (
+	"context"
+	"testing"
+
+	sitter "github.com/smacker/go-tree-sitter"
+	clang "github.com/smacker/go-tree-sitter/c"
+	cpplang "github.com/smacker/go-tree-sitter/cpp"
+)
+
+// parseCSnippetForTest parses C source for unit tests in this package.
+// Lives here (rather than in graph/clike's testhelpers_test.go) because
+// test-only symbols don't cross package boundaries.
+func parseCSnippetForTest(t *testing.T, code string) (*sitter.Tree, *sitter.Node) {
+	t.Helper()
+	return parseSnippetForTest(t, code, false)
+}
+
+// parseSnippetForTest parses code with the tree-sitter C++ grammar when
+// isCpp is true and with the C grammar otherwise, returning the tree and
+// its root node. A parse error fails the test immediately via t.Fatalf.
+// The parser is closed before returning; the returned tree is not closed
+// here.
+func parseSnippetForTest(t *testing.T, code string, isCpp bool) (*sitter.Tree, *sitter.Node) {
+	t.Helper()
+	parser := sitter.NewParser()
+	if isCpp {
+		parser.SetLanguage(cpplang.GetLanguage())
+	} else {
+		parser.SetLanguage(clang.GetLanguage())
+	}
+	defer parser.Close()
+	tree, err := parser.ParseCtx(context.Background(), nil, []byte(code))
+	if err != nil {
+		t.Fatalf("parse: %v", err)
+	}
+	return tree, tree.RootNode()
+}
+
+// findFirstNodeOfType performs a pre-order walk and returns the first
+// descendant whose Type() matches nodeType. Returns nil when no match
+// exists.
+func findFirstNodeOfType(node *sitter.Node, nodeType string) *sitter.Node {
+	if node == nil {
+		return nil
+	}
+	if node.Type() == nodeType {
+		return node
+	}
+	for i := 0; i < int(node.ChildCount()); i++ {
+		if found := findFirstNodeOfType(node.Child(i), nodeType); found != nil {
+			return found
+		}
+	}
+	return nil
+}
+
+// TestParseCEndToEnd parses testdata/c/ as a complete project via Initialize
+// and asserts that every node type the C parser is responsible for —
+// function definitions, forward declarations, structs, enums, typedefs,
+// variable declarations, includes, and call expressions — produces graph
+// nodes with the right Type, Name, Language, and (where applicable)
+// Metadata flags.
+//
+// The test is intentionally written against the public API surface
+// (Initialize → CodeGraph) rather than against individual parse functions,
+// so it doubles as a regression suite for the dispatch wiring in
+// parser.go. If a future refactor moves parse functions into a subpackage
+// or changes the dispatch order, this test will catch behavioural drift.
+func TestParseCEndToEnd(t *testing.T) {
+	graph := Initialize("testdata/c", nil)
+	if graph == nil {
+		t.Fatal("Initialize returned nil")
+	}
+
+	nodes := collectByType(graph)
+
+	t.Run("function_definitions", func(t *testing.T) {
+		fns := nodes[nodeTypeFunctionDefinition]
+		// Definitions: fast, add (definition), process — 3 with bodies in
+		// example.c. The forward declaration of add and the two prototypes
+		// in buffer.h add three body-less function_definition nodes (every
+		// function-shaped node tree-sitter sees becomes a function_definition,
+		// and parser.go dispatches via fileExt before this point — so
+		// declaration-shaped functions in headers also flow through here).
+		if len(fns) < 3 {
+			t.Fatalf("expected at least 3 function_definition nodes, got %d", len(fns))
+		}
+
+		gotByName := map[string]*Node{}
+		for _, n := range fns {
+			gotByName[n.Name] = n
+		}
+
+		// fast(int x) — definition, has body, has static+inline qualifiers
+		fast := gotByName["fast"]
+		if fast == nil {
+			t.Fatal("expected function 'fast' in graph")
+		}
+		if fast.Language != "c" {
+			t.Errorf("fast.Language = %q, want %q", fast.Language, "c")
+		}
+		if fast.ReturnType != "int" {
+			t.Errorf("fast.ReturnType = %q, want %q", fast.ReturnType, "int")
+		}
+		if fast.Modifier != "static inline" {
+			t.Errorf("fast.Modifier = %q, want %q", fast.Modifier, "static inline")
+		}
+		if got, _ := fast.Metadata[metaIsDeclaration].(bool); got {
+			t.Error("fast should not be marked as declaration (has body)")
+		}
+
+		// add(int a, int b) — prototype and definition share this name; gotByName keeps whichever came last
+		add := gotByName["add"]
+		if add == nil {
+			t.Fatal("expected function 'add' in graph")
+		}
+		if len(add.MethodArgumentsValue) != 2 ||
+			add.MethodArgumentsValue[0] != "a" ||
+			add.MethodArgumentsValue[1] != "b" {
+			t.Errorf("add params = %v, want [a, b]", add.MethodArgumentsValue)
+		}
+
+		// process(struct Buffer* buf, size_t_alias n)
+		process := gotByName["process"]
+		if process == nil {
+			t.Fatal("expected function 'process' in graph")
+		}
+		if process.ReturnType != "void" {
+			t.Errorf("process.ReturnType = %q, want %q", process.ReturnType, "void")
+		}
+	})
+
+	t.Run("forward_declaration_marked", func(t *testing.T) {
+		// buffer.h declares compute() and release_all() with no body, and
+		// example.c forward-declares add(int, int). The check below requires
+		// at least two function_definition nodes with is_declaration=true.
+		decls := 0
+		for _, n := range nodes[nodeTypeFunctionDefinition] {
+			if v, _ := n.Metadata[metaIsDeclaration].(bool); v {
+				decls++
+			}
+		}
+		if decls < 2 {
+			t.Errorf("expected ≥2 forward declarations, got %d", decls)
+		}
+	})
+
+	t.Run("struct_declaration", func(t *testing.T) {
+		structs := nodes[nodeTypeStructDeclaration]
+		if len(structs) == 0 {
+			t.Fatal("expected at least one struct_declaration")
+		}
+
+		var buffer *Node
+		for _, n := range structs {
+			if n.Name == "Buffer" {
+				buffer = n
+				break
+			}
+		}
+		if buffer == nil {
+			t.Fatal("expected struct 'Buffer'")
+		}
+		// Buffer has fields: char* data, size_t_alias len, int capacity
+		if len(buffer.MethodArgumentsType) != 3 {
+			t.Errorf("Buffer fields = %v, want 3 entries", buffer.MethodArgumentsType)
+		}
+	})
+
+	t.Run("enum_declaration", func(t *testing.T) {
+		enums := nodes[nodeTypeEnumDeclaration]
+		var color *Node
+		for _, n := range enums {
+			if n.Name == "Color" {
+				color = n
+				break
+			}
+		}
+		if color == nil {
+			t.Fatal("expected enum 'Color'")
+		}
+		enumerators, _ := color.Metadata[metaEnumerators].([]string)
+		want := []string{"RED=0", "GREEN", "BLUE=5"}
+		if len(enumerators) != len(want) {
+			t.Fatalf("Color enumerators = %v, want %v", enumerators, want)
+		}
+		for i, w := range want {
+			if enumerators[i] != w {
+				t.Errorf("enumerator[%d] = %q, want %q", i, enumerators[i], w)
+			}
+		}
+	})
+
+	t.Run("type_definition_unsigned_long", func(t *testing.T) {
+		typedefs := nodes[nodeTypeTypeDefinition]
+		var alias *Node
+		for _, n := range typedefs {
+			if n.Name == "size_t_alias" {
+				alias = n
+				break
+			}
+		}
+		if alias == nil {
+			t.Fatal("expected typedef 'size_t_alias'")
+		}
+		if alias.DataType != "unsigned long" {
+			t.Errorf("size_t_alias.DataType = %q, want %q", alias.DataType, "unsigned long")
+		}
+	})
+
+	t.Run("type_definition_anonymous_struct", func(t *testing.T) {
+		typedefs := nodes[nodeTypeTypeDefinition]
+		var point *Node
+		for _, n := range typedefs {
+			if n.Name == "Point" {
+				point = n
+				break
+			}
+		}
+		if point == nil {
+			t.Fatal("expected typedef 'Point'")
+		}
+		// DataType is expected to carry the underlying struct text; the
+		// check below only verifies that it is non-empty.
+		if point.DataType == "" {
+			t.Error("Point typedef should have non-empty DataType")
+		}
+	})
+
+	t.Run("variable_declarations", func(t *testing.T) {
+		vars := nodes[nodeTypeVariableDecl]
+		// Globals pi (initialised), global_buf (no init) and a, b, c (the
+		// multi-declarator), plus the locals result (in add) and tmp (in
+		// process). The check only requires at least 6 declared variables.
+		if len(vars) < 6 {
+			t.Fatalf("expected ≥6 variable declarations, got %d (%v)", len(vars), names(vars))
+		}
+
+		byName := map[string]*Node{}
+		for _, n := range vars {
+			byName[n.Name] = n
+		}
+
+		// pi is a global float
+		pi := byName["pi"]
+		if pi == nil {
+			t.Fatal("expected variable 'pi'")
+		}
+		if pi.DataType != "const float" {
+			t.Errorf("pi.DataType = %q, want %q", pi.DataType, "const float")
+		}
+		if pi.VariableValue != "3.14f" {
+			t.Errorf("pi.VariableValue = %q, want %q", pi.VariableValue, "3.14f")
+		}
+		if pi.Scope != "global" {
+			t.Errorf("pi.Scope = %q, want %q", pi.Scope, "global")
+		}
+
+		// global_buf is char* with no initialiser
+		buf := byName["global_buf"]
+		if buf == nil {
+			t.Fatal("expected variable 'global_buf'")
+		}
+		if buf.DataType != "char*" {
+			t.Errorf("global_buf.DataType = %q, want %q", buf.DataType, "char*")
+		}
+		if buf.VariableValue != "" {
+			t.Errorf("global_buf.VariableValue = %q, want empty", buf.VariableValue)
+		}
+
+		// Multi-declarator: int a = 1, b = 2, c;
+		for _, n := range []string{"a", "b", "c"} {
+			v := byName[n]
+			if v == nil {
+				t.Errorf("expected variable %q from multi-declarator", n)
+				continue
+			}
+			if v.DataType != "int" {
+				t.Errorf("%s.DataType = %q, want %q", n, v.DataType, "int")
+			}
+		}
+
+		// Function-local tmp inside process()
+		tmp := byName["tmp"]
+		if tmp == nil {
+			t.Fatal("expected function-local variable 'tmp'")
+		}
+		if tmp.Scope != "process" {
+			t.Errorf("tmp.Scope = %q, want %q", tmp.Scope, "process")
+		}
+	})
+
+	t.Run("includes_system_vs_local", func(t *testing.T) {
+		incs := nodes[nodeTypeIncludeStatement]
+		byName := map[string]*Node{}
+		for _, n := range incs {
+			byName[n.Name] = n
+		}
+
+		// <stdio.h> must be flagged as a system include; "buffer.h" must not.
+		stdio := byName["stdio.h"]
+		if stdio == nil {
+			t.Fatal("expected include 'stdio.h'")
+		}
+		if v, _ := stdio.Metadata[metaSystemInclude].(bool); !v {
+			t.Errorf("stdio.h should be system include")
+		}
+
+		buffer := byName["buffer.h"]
+		if buffer == nil {
+			t.Fatal("expected include 'buffer.h'")
+		}
+		if v, _ := buffer.Metadata[metaSystemInclude].(bool); v {
+			t.Errorf("buffer.h should NOT be system include")
+		}
+	})
+
+	t.Run("call_expressions_linked_to_caller", func(t *testing.T) {
+		calls := nodes[nodeTypeCallExpression]
+		// process() body calls malloc, free, add. We expect at least these
+		// 3 names to appear among call_expression nodes.
+		want := map[string]bool{"malloc": false, "free": false, "add": false}
+		for _, n := range calls {
+			if _, ok := want[n.Name]; ok {
+				want[n.Name] = true
+			}
+		}
+		for name, found := range want {
+			if !found {
+				t.Errorf("expected call to %q in graph", name)
+			}
+		}
+
+		// The call to add() inside process() should produce an edge from
+		// the process function node to the add call node.
+		var processFn *Node
+		for _, n := range nodes[nodeTypeFunctionDefinition] {
+			if n.Name == "process" {
+				processFn = n
+				break
+			}
+		}
+		if processFn == nil {
+			t.Fatal("expected process() function node")
+		}
+		hasAddEdge := false
+		for _, e := range processFn.OutgoingEdges {
+			if e.To.Type == nodeTypeCallExpression && e.To.Name == "add" {
+				hasAddEdge = true
+				break
+			}
+		}
+		if !hasAddEdge {
+			t.Error("expected outgoing edge from process() to call_expression 'add'")
+		}
+	})
+}
+
+// TestParseCCallExpression_MethodAndQualified covers call shapes that the
+// example.c integration fixture does not contain: an arrow method call
+// (is_method, is_arrow) and a C++ namespace-qualified call (is_qualified).
+// Each case parses a synthetic snippet, finds its first call_expression,
+// feeds it to parseCCallExpression, and checks the resulting node's Name
+// and boolean Metadata flags.
+func TestParseCCallExpression_MethodAndQualified(t *testing.T) {
+	tests := []struct {
+		name      string
+		code      string
+		isCpp     bool
+		wantName  string
+		wantFlags map[string]bool
+	}{
+		{
+			name:     "arrow method call",
+			code:     "void f(struct Buffer* b) { b->free(); }",
+			wantName: "free",
+			wantFlags: map[string]bool{
+				"is_method": true,
+				"is_arrow":  true,
+			},
+		},
+		{
+			name:     "qualified namespace call",
+			code:     "void f() { ns::do_thing(); }",
+			isCpp:    true,
+			wantName: "ns::do_thing",
+			wantFlags: map[string]bool{
+				"is_qualified": true,
+			},
+		},
+	}
+
+	for _, tt := range tests {
+		t.Run(tt.name, func(t *testing.T) {
+			g := NewCodeGraph()
+			tree, root := parseSnippetForTest(t, tt.code, tt.isCpp)
+			defer tree.Close()
+
+			call := findFirstNodeOfType(root, "call_expression")
+			if call == nil {
+				t.Fatal("call_expression not found")
+			}
+			parseCCallExpression(call, []byte(tt.code), g, "test.c", nil)
+
+			calls := collectByType(g)[nodeTypeCallExpression]
+			if len(calls) != 1 {
+				t.Fatalf("expected 1 call_expression node, got %d", len(calls))
+			}
+			got := calls[0]
+			if got.Name != tt.wantName {
+				t.Errorf("Name = %q, want %q", got.Name, tt.wantName)
+			}
+			for key, want := range tt.wantFlags {
+				v, _ := got.Metadata[key].(bool) // a missing or non-bool entry reads as false
+				if v != want {
+					t.Errorf("Metadata[%q] = %v, want %v", key, v, want)
+				}
+			}
+		})
+	}
+}
+
+// TestParseCLikeDeclaration_IsCppFlag confirms that the isCpp flag flips
+// the Language tag on produced variable_declaration nodes. The branch is
+// otherwise unreachable until parser_cpp.go (PR-04) starts dispatching
+// declaration nodes from .cpp files.
+func TestParseCLikeDeclaration_IsCppFlag(t *testing.T) {
+	code := "int answer = 42;"
+	tree, root := parseCSnippetForTest(t, code)
+	defer tree.Close()
+
+	decl := findFirstNodeOfType(root, "declaration")
+	if decl == nil {
+		t.Fatal("declaration not found")
+	}
+
+	g := NewCodeGraph()
+	parseCLikeDeclaration(decl, []byte(code), g, "test.cpp", nil, true)
+
+	vars := collectByType(g)[nodeTypeVariableDecl]
+	if len(vars) != 1 {
+		t.Fatalf("expected 1 variable_declaration, got %d", len(vars))
+	}
+	if vars[0].Language != "cpp" {
+		t.Errorf("Language = %q, want %q", vars[0].Language, "cpp")
+	}
+}
+
+// collectByType groups every node in the graph by its Type field. Useful
+// for end-to-end assertions that need to enumerate one category at a time.
+func collectByType(g *CodeGraph) map[string][]*Node {
+	out := map[string][]*Node{}
+	for _, n := range g.Nodes {
+		out[n.Type] = append(out[n.Type], n)
+	}
+	return out
+}
+
+// names returns the Name field from every node, useful in error messages
+// when assertions on a category fail.
+func names(nodes []*Node) []string {
+	out := make([]string, 0, len(nodes)) // pre-sized: exactly one name is appended per node
+	for _, n := range nodes {
+		out = append(out, n.Name)
+	}
+	return out
+}
diff --git a/sast-engine/graph/testdata/c/buffer.h b/sast-engine/graph/testdata/c/buffer.h
new file mode 100644
index 00000000..5321fce0
--- /dev/null
+++ b/sast-engine/graph/testdata/c/buffer.h
@@ -0,0 +1,7 @@
+#ifndef BUFFER_H
+#define BUFFER_H
+
+int compute(int a, int b);
+void release_all(void);
+
+#endif
diff --git a/sast-engine/graph/testdata/c/example.c b/sast-engine/graph/testdata/c/example.c
new file mode 100644
index 00000000..68aec60a
--- /dev/null
+++ b/sast-engine/graph/testdata/c/example.c
@@ -0,0 +1,45 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include "buffer.h"
+
+typedef unsigned long size_t_alias;
+typedef struct {
+    int x;
+    int y;
+} Point;
+
+struct Buffer {
+    char* data;
+    size_t_alias len;
+    int capacity;
+};
+
+enum Color {
+    RED = 0,
+    GREEN,
+    BLUE = 5
+};
+
+static const float pi = 3.14f;
+char* global_buf;
+int a = 1, b = 2, c;
+
+int add(int a, int b);
+
+static inline int fast(int x) {
+    return x;
+}
+
+int add(int a, int b) {
+    int result = a + b;
+    return result;
+}
+
+void process(struct Buffer* buf, size_t_alias n) {
+    if (buf == NULL) {
+        return;
+    }
+    char* tmp = malloc(n);
+    free(tmp);
+    add(1, 2);
+}