diff --git a/sast-engine/graph/clike/detection.go b/sast-engine/graph/clike/detection.go
new file mode 100644
index 00000000..afb74d4f
--- /dev/null
+++ b/sast-engine/graph/clike/detection.go
@@ -0,0 +1,193 @@
+// Package clike contains shared helpers for parsing C and C++ source files.
+//
+// The parsing pipeline treats C and C++ as two distinct languages with separate
+// tree-sitter grammars but a large amount of shared structure (declarations,
+// statements, type strings). The helpers in this package live alongside the
+// language-specific siblings (graph/golang, graph/python, graph/java) and
+// provide the cross-cutting primitives — language detection today, AST and
+// type extraction in subsequent PRs.
+package clike
+
+import (
+	"bufio"
+	"os"
+	"path/filepath"
+	"strings"
+	"sync"
+)
+
+// headerLanguageCache stores the C/C++ classification of .h files, keyed by
+// the exact path string passed to CacheHeaderLanguage. Values are bool:
+// true => C++, false => C.
+//
+// Keys are not normalized, so a lookup must spell the path exactly as it was
+// cached; any other spelling of the same file is treated as uncached.
+//
+// .h is shared between C and C++ grammars and the correct grammar can only be
+// chosen by inspecting the file contents. Reading the file once per AST node
+// would be unacceptable, so the worker that drives the parsing pipeline calls
+// CacheHeaderLanguage exactly once per .h file before AST traversal begins;
+// IsCSourceFile and IsCppSourceFile then read the cache without performing I/O.
+var headerLanguageCache sync.Map
+
+// CacheHeaderLanguage records that filename is a C++ header (isCpp == true) or
+// a C header (isCpp == false). Must be called once per .h file in the parsing
+// worker before any IsCSourceFile / IsCppSourceFile lookup for that file.
+func CacheHeaderLanguage(filename string, isCpp bool) {
+	headerLanguageCache.Store(filename, isCpp)
+}
+
+// cachedHeaderIsCpp returns the cached classification of filename. It reports
+// false when filename has no cache entry, which makes C the default for
+// uncached headers in both IsCSourceFile and IsCppSourceFile.
+func cachedHeaderIsCpp(filename string) bool {
+	v, ok := headerLanguageCache.Load(filename)
+	if !ok {
+		return false
+	}
+	isCpp, _ := v.(bool)
+	return isCpp
+}
+
+// IsCSourceFile reports whether filename should be parsed with the C grammar.
+//
+// Source-extension cases (.c) are answered directly. For .h files the answer
+// comes from the header cache populated by CacheHeaderLanguage; an uncached
+// .h falls back to C as the safe default (the grammar overlap means a
+// misclassified C++ header still parses as a structurally-valid translation
+// unit, just with reduced fidelity).
+func IsCSourceFile(filename string) bool {
+	switch filepath.Ext(filename) {
+	case ".c":
+		return true
+	case ".h":
+		return !cachedHeaderIsCpp(filename)
+	}
+	return false
+}
+
+// IsCppSourceFile reports whether filename should be parsed with the C++
+// grammar.
+//
+// .cpp/.cc/.cxx and the C++-only header extensions (.hpp/.hh/.hxx) are
+// answered directly. For .h files the answer comes from the header cache
+// populated by CacheHeaderLanguage; an uncached .h falls back to "not C++"
+// so that IsCSourceFile and IsCppSourceFile remain mutually exclusive.
+func IsCppSourceFile(filename string) bool {
+	switch filepath.Ext(filename) {
+	case ".cpp", ".cc", ".cxx", ".hpp", ".hh", ".hxx":
+		return true
+	case ".h":
+		return cachedHeaderIsCpp(filename)
+	}
+	return false
+}
+
+// cppHeaderIndicators are byte sequences whose presence in the code of a
+// header file (comments excluded) is a strong signal that the header should be
+// parsed as C++ rather than C. The list is intentionally small and
+// conservative. Entries that start with a letter are keywords and only count
+// when they are not the tail of a longer identifier, so C names such as
+// "usb_class " do not match.
+var cppHeaderIndicators = []string{
+	"class ", "namespace ", "template<", "template <",
+	"public:", "private:", "protected:", "::",
+}
+
+// detectCppHeaderScanLines bounds how far DetectCppInHeader reads into a file.
+// 100 lines covers the include guards, license header, and the first few
+// declarations of every real-world header surveyed during the design phase.
+const detectCppHeaderScanLines = 100
+
+// DetectCppInHeader scans the first detectCppHeaderScanLines lines of filename
+// for C++-only indicators (see cppHeaderIndicators). Text inside line and
+// block comments is ignored.
+//
+// This is a best-effort heuristic, not a full preprocessor or parser: string
+// literals are not tokenized, so a comment marker inside a literal hides the
+// text that follows it. It is invoked from the worker exactly once per file
+// and its result is stored via CacheHeaderLanguage; AST-traversal code reads
+// the cache and never calls this function on the hot path.
+//
+// A missing or unreadable file returns false (treated as C) so that downstream
+// parsing fails on an obviously-broken input rather than mislabeling the
+// language. A read error during the scan (including a line longer than the
+// bufio.Scanner token limit) ends the scan early and also yields false.
+func DetectCppInHeader(filename string) bool {
+	f, err := os.Open(filename)
+	if err != nil {
+		return false
+	}
+	defer f.Close()
+
+	scanner := bufio.NewScanner(f)
+	inBlockComment := false
+	for lineCount := 0; lineCount < detectCppHeaderScanLines && scanner.Scan(); lineCount++ {
+		var code string
+		code, inBlockComment = stripCComments(scanner.Text(), inBlockComment)
+		if hasCppIndicator(code) {
+			return true
+		}
+	}
+	return false
+}
+
+// stripCComments returns line with its line and block comments removed.
+// inBlock says whether line starts inside a block comment; the second result
+// says whether the next line does. Each removed block comment is replaced by
+// one space so the tokens around it stay separated.
+func stripCComments(line string, inBlock bool) (string, bool) {
+	var code strings.Builder
+	for line != "" {
+		if inBlock {
+			end := strings.Index(line, "*/")
+			if end < 0 {
+				return code.String(), true
+			}
+			line = line[end+2:]
+			inBlock = false
+			continue
+		}
+		lineStart := strings.Index(line, "//")
+		blockStart := strings.Index(line, "/*")
+		if blockStart < 0 || (lineStart >= 0 && lineStart < blockStart) {
+			// No block comment opens before the end of the code on this line.
+			if lineStart >= 0 {
+				line = line[:lineStart]
+			}
+			code.WriteString(line)
+			return code.String(), false
+		}
+		code.WriteString(line[:blockStart])
+		code.WriteByte(' ')
+		line = line[blockStart+2:]
+		inBlock = true
+	}
+	return code.String(), inBlock
+}
+
+// hasCppIndicator reports whether code contains any entry of
+// cppHeaderIndicators at a position where it can be a C++ token.
+func hasCppIndicator(code string) bool {
+	for _, ind := range cppHeaderIndicators {
+		// Keyword indicators must not continue a longer identifier.
+		needBoundary := ind != "" && isIdentByte(ind[0])
+		for from := 0; from <= len(code); {
+			i := strings.Index(code[from:], ind)
+			if i < 0 {
+				break
+			}
+			i += from
+			if !needBoundary || i == 0 || !isIdentByte(code[i-1]) {
+				return true
+			}
+			from = i + 1
+		}
+	}
+	return false
+}
+
+// isIdentByte reports whether b can appear in an ASCII C identifier.
+func isIdentByte(b byte) bool {
+	return b == '_' || (b >= '0' && b <= '9') || (b >= 'a' && b <= 'z') || (b >= 'A' && b <= 'Z')
+}
diff --git a/sast-engine/graph/clike/detection_test.go b/sast-engine/graph/clike/detection_test.go
new file mode 100644
index 00000000..970c92b3
--- /dev/null
+++ b/sast-engine/graph/clike/detection_test.go
@@ -0,0 +1,187 @@
+package clike
+
+import (
+	"os"
+	"path/filepath"
+	"testing"
+)
+
+// TestDetectCppInHeaderIgnoresNonCode asserts that indicators found only in
+// comments, or as the tail of a longer C identifier, do not mark a header as
+// C++.
+func TestDetectCppInHeaderIgnoresNonCode(t *testing.T) {
+	tests := []struct {
+		name    string
+		content string
+		want    bool
+	}{
+		{"line_comment", "// storage class of x\nint x;\n", false},
+		{"block_comment", "/*\n * namespace layout :: notes\n */\nint x;\n", false},
+		{"identifier_suffix", "struct usb_class {\n\tint id;\n};\n", false},
+		{"code_after_block_comment", "/* doc */ class Foo;\n", true},
+	}
+
+	dir := t.TempDir()
+	for _, tt := range tests {
+		t.Run(tt.name, func(t *testing.T) {
+			path := filepath.Join(dir, tt.name+".h")
+			if err := os.WriteFile(path, []byte(tt.content), 0644); err != nil {
+				t.Fatalf("write: %v", err)
+			}
+			if got := DetectCppInHeader(path); got != tt.want {
+				t.Errorf("DetectCppInHeader(%q) = %v, want %v", tt.name, got, tt.want)
+			}
+		})
+	}
+}
+
+func TestIsCSourceFile(t *testing.T) {
+	defer headerLanguageCache.Delete("/tmp/cache_h_c.h")
+	defer headerLanguageCache.Delete("/tmp/cache_h_cpp.h")
+
+	CacheHeaderLanguage("/tmp/cache_h_c.h", false)
+	CacheHeaderLanguage("/tmp/cache_h_cpp.h", true)
+
+	tests := []struct {
+		name     string
+		filename string
+		want     bool
+	}{
+		{".c file", "main.c", true},
+		{".cpp file", "main.cpp", false},
+		{".cc file", "main.cc", false},
+		{".cxx file", "main.cxx", false},
+		{".hpp file", "main.hpp", false},
+		{".hh file", "main.hh", false},
+		{".hxx file", "main.hxx", false},
+		{".java file", "Main.java", false},
+		{".py file", "main.py", false},
+		{".go file", "main.go", false},
+		{"no extension", "Makefile", false},
+		{".h cached as C", "/tmp/cache_h_c.h", true},
+		{".h cached as C++", "/tmp/cache_h_cpp.h", false},
+		{".h not cached defaults to C", "/tmp/cache_h_unknown.h", true},
+	}
+
+	for _, tt := range tests {
+		t.Run(tt.name, func(t *testing.T) {
+			if got := IsCSourceFile(tt.filename); got != tt.want {
+				t.Errorf("IsCSourceFile(%q) = %v, want %v", tt.filename, got, tt.want)
+			}
+		})
+	}
+}
+
+func TestIsCppSourceFile(t *testing.T) {
+	defer headerLanguageCache.Delete("/tmp/cppcache_c.h")
+	defer headerLanguageCache.Delete("/tmp/cppcache_cpp.h")
+
+	CacheHeaderLanguage("/tmp/cppcache_c.h", false)
+	CacheHeaderLanguage("/tmp/cppcache_cpp.h", true)
+
+	tests := []struct {
+		name     string
+		filename string
+		want     bool
+	}{
+		{".cpp file", "main.cpp", true},
+		{".cc file", "main.cc", true},
+		{".cxx file", "main.cxx", true},
+		{".hpp file", "main.hpp", true},
+		{".hh file", "main.hh", true},
+		{".hxx file", "main.hxx", true},
+		{".c file", "main.c", false},
+		{".java file", "Main.java", false},
+		{".py file", "main.py", false},
+		{".go file", "main.go", false},
+		{".h cached as C", "/tmp/cppcache_c.h", false},
+		{".h cached as C++", "/tmp/cppcache_cpp.h", true},
+		{".h not cached defaults to C", "/tmp/cppcache_unknown.h", false},
+	}
+
+	for _, tt := range tests {
+		t.Run(tt.name, func(t *testing.T) {
+			if got := IsCppSourceFile(tt.filename); got != tt.want {
+				t.Errorf("IsCppSourceFile(%q) = %v, want %v", tt.filename, got, tt.want)
+			}
+		})
+	}
+}
+
+func TestDetectCppInHeader(t *testing.T) {
+	tests := []struct {
+		name    string
+		content string
+		want    bool
+	}{
+		{
+			name: "pure C header with typedef and struct",
+			content: `#ifndef UTILS_H
+#define UTILS_H
+typedef struct Point { int x; int y; } Point;
+int add(int a, int b);
+#endif
+`,
+			want: false,
+		},
+		{
+			name: "C++ class header",
+			content: `#pragma once
+class Foo {
+public:
+    int bar();
+};
+`,
+			want: true,
+		},
+		{
+			name:    "namespace header",
+			content: "namespace mylib {\nint compute();\n}\n",
+			want:    true,
+		},
+		{
+			name:    "template header",
+			content: "template<typename T>\nT identity(T v) { return v; }\n",
+			want:    true,
+		},
+		{
+			name:    "qualified call uses ::",
+			content: "void f() { std::cout << 1; }\n",
+			want:    true,
+		},
+		{
+			name:    "empty file",
+			content: "",
+			want:    false,
+		},
+		{
+			name:    "extern C block (no C++ indicator on first lines)",
+			content: "#ifdef __cplusplus\nextern \"C\" {\n#endif\nint plain_c(void);\n",
+			want:    false,
+		},
+	}
+
+	dir, err := os.MkdirTemp("", "detect_cpp_header")
+	if err != nil {
+		t.Fatalf("temp dir: %v", err)
+	}
+	defer os.RemoveAll(dir)
+
+	for _, tt := range tests {
+		t.Run(tt.name, func(t *testing.T) {
+			path := filepath.Join(dir, "h_"+tt.name+".h")
+			if err := os.WriteFile(path, []byte(tt.content), 0644); err != nil {
+				t.Fatalf("write: %v", err)
+			}
+			if got := DetectCppInHeader(path); got != tt.want {
+				t.Errorf("DetectCppInHeader(%q) = %v, want %v", tt.name, got, tt.want)
+			}
+		})
+	}
+
+	t.Run("missing file returns false", func(t *testing.T) {
+		if DetectCppInHeader(filepath.Join(dir, "does_not_exist.h")) {
+			t.Error("expected false for missing file")
+		}
+	})
+}
diff --git a/sast-engine/graph/initialize.go b/sast-engine/graph/initialize.go
index 57581811..12274298 100644
--- a/sast-engine/graph/initialize.go
+++ b/sast-engine/graph/initialize.go
@@ -7,7 +7,10 @@ import (
 	"sync"
 	"time"
 
+	"github.com/shivasurya/code-pathfinder/sast-engine/graph/clike"
 	sitter "github.com/smacker/go-tree-sitter"
+	clang "github.com/smacker/go-tree-sitter/c"
+	cpplang "github.com/smacker/go-tree-sitter/cpp"
 	"github.com/smacker/go-tree-sitter/golang"
 	"github.com/smacker/go-tree-sitter/java"
 	"github.com/smacker/go-tree-sitter/python"
@@ -90,18 +93,27 @@ func Initialize(directory string, callbacks *ProgressCallbacks) *CodeGraph {
 					continue
 				}
 
-				// Handle tree-sitter based parsing for Java and Python
-				switch fileExt {
-				case ".java":
+				// For .h files, classify as C vs C++ once and cache the result so
+				//
per-AST-node language checks remain zero-I/O during traversal. + if fileExt == ".h" { + clike.CacheHeaderLanguage(file, clike.DetectCppInHeader(file)) + } + + // Handle tree-sitter based parsing for Java, Python, Go, C, and C++. + // C/C++ cases come first because .h is shared across both grammars and + // must route via the cached heuristic, not a simple extension match. + switch { + case clike.IsCSourceFile(file): + parser.SetLanguage(clang.GetLanguage()) + case clike.IsCppSourceFile(file): + parser.SetLanguage(cpplang.GetLanguage()) + case fileExt == ".java": parser.SetLanguage(java.GetLanguage()) - case ".py": + case fileExt == ".py": parser.SetLanguage(python.GetLanguage()) - case ".go": + case fileExt == ".go": parser.SetLanguage(golang.GetLanguage()) default: - // NOTE: This case is currently unreachable because getFiles() only returns - // .java, .py, Dockerfile*, and docker-compose* files. This exists as defensive - // programming in case getFiles() is modified to include additional file types. Log("Unsupported file type:", file) if callbacks != nil && callbacks.OnProgress != nil { callbacks.OnProgress() diff --git a/sast-engine/graph/utils.go b/sast-engine/graph/utils.go index 4924d840..5aaa0354 100644 --- a/sast-engine/graph/utils.go +++ b/sast-engine/graph/utils.go @@ -252,8 +252,9 @@ func extractMethodName(node *sitter.Node, sourceCode []byte, filepath string) (s return methodName, methodID } -// getFiles walks through a directory and returns all source files (Java, Python, Go, Dockerfile, docker-compose). -// It skips vendor/, testdata/, node_modules/, .git/, and directories starting with "_". +// getFiles walks through a directory and returns all source files (Java, Python, Go, C/C++, Dockerfile, docker-compose). +// It skips vendor/, testdata/, node_modules/, .git/, common C/C++ build artifact directories, +// and directories starting with "_". 
func getFiles(directory string) ([]string, error) { var files []string err := filepath.Walk(directory, func(path string, info os.FileInfo, err error) error { @@ -264,15 +265,17 @@ func getFiles(directory string) ([]string, error) { if info.IsDir() { name := info.Name() switch name { - case "vendor", "testdata", "node_modules", ".git": + case "vendor", "testdata", "node_modules", ".git", + "build", "cmake-build-debug", "cmake-build-release", + "third_party", "external", "obj", "bin", "dist", ".cache": return filepath.SkipDir } - if strings.HasPrefix(name, "_") { + if strings.HasPrefix(name, "_") || strings.HasPrefix(name, "cmake-build-") { return filepath.SkipDir } return nil } - // append java, python, go, dockerfile, and docker-compose files + // append java, python, go, c/c++, dockerfile, and docker-compose files ext := filepath.Ext(path) base := filepath.Base(path) baseLower := strings.ToLower(base) @@ -280,6 +283,11 @@ func getFiles(directory string) ([]string, error) { switch { case ext == ".java" || ext == ".py" || ext == ".go": files = append(files, path) + case ext == ".c" || ext == ".h": + files = append(files, path) + case ext == ".cpp" || ext == ".cc" || ext == ".cxx" || + ext == ".hpp" || ext == ".hh" || ext == ".hxx": + files = append(files, path) case strings.HasPrefix(baseLower, "dockerfile"): // Match Dockerfile, Dockerfile.dev, dockerfile, etc. files = append(files, path) diff --git a/sast-engine/graph/utils_test.go b/sast-engine/graph/utils_test.go index cdce1c50..7ebf5e45 100644 --- a/sast-engine/graph/utils_test.go +++ b/sast-engine/graph/utils_test.go @@ -259,6 +259,88 @@ func TestGetFilesComprehensive(t *testing.T) { } } +// TestGetFilesIncludesCAndCpp asserts that getFiles discovers every +// supported C and C++ source/header extension and skips the build-artifact +// directories that are typical in C/C++ projects (build/, cmake-build-*, +// third_party/, external/, obj/, bin/, dist/, .cache/). 
+func TestGetFilesIncludesCAndCpp(t *testing.T) { + dir, err := os.MkdirTemp("", "getfiles_clike") + if err != nil { + t.Fatalf("temp dir: %v", err) + } + defer os.RemoveAll(dir) + + files := []string{ + "a.c", "b.cpp", "c.cc", "d.cxx", + "e.h", "f.hpp", "g.hh", "h.hxx", + "keep.java", "keep.py", "keep.go", + "build/skip.c", + "cmake-build-debug/skip.cpp", + "cmake-build-release/skip.h", + "cmake-build-foo/skip.cpp", + "third_party/skip.c", + "external/skip.h", + "obj/skip.c", + "bin/skip.cpp", + "dist/skip.h", + ".cache/skip.c", + "src/keep.cpp", + "include/keep.hpp", + } + + for _, f := range files { + full := filepath.Join(dir, f) + if err := os.MkdirAll(filepath.Dir(full), 0755); err != nil { + t.Fatalf("mkdir: %v", err) + } + if err := os.WriteFile(full, []byte("// stub\n"), 0644); err != nil { + t.Fatalf("write: %v", err) + } + } + + got, err := getFiles(dir) + if err != nil { + t.Fatalf("getFiles: %v", err) + } + + gotSet := make(map[string]bool, len(got)) + for _, p := range got { + rel, _ := filepath.Rel(dir, p) + gotSet[rel] = true + } + + wantPresent := []string{ + "a.c", "b.cpp", "c.cc", "d.cxx", + "e.h", "f.hpp", "g.hh", "h.hxx", + "keep.java", "keep.py", "keep.go", + filepath.Join("src", "keep.cpp"), + filepath.Join("include", "keep.hpp"), + } + for _, w := range wantPresent { + if !gotSet[w] { + t.Errorf("expected %q in result, missing", w) + } + } + + wantAbsent := []string{ + filepath.Join("build", "skip.c"), + filepath.Join("cmake-build-debug", "skip.cpp"), + filepath.Join("cmake-build-release", "skip.h"), + filepath.Join("cmake-build-foo", "skip.cpp"), + filepath.Join("third_party", "skip.c"), + filepath.Join("external", "skip.h"), + filepath.Join("obj", "skip.c"), + filepath.Join("bin", "skip.cpp"), + filepath.Join("dist", "skip.h"), + filepath.Join(".cache", "skip.c"), + } + for _, w := range wantAbsent { + if gotSet[w] { + t.Errorf("expected %q to be excluded but found in result", w) + } + } +} + func TestGetFilesErrors(t *testing.T) { tests 
:= []struct { name string