Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
120 changes: 120 additions & 0 deletions sast-engine/graph/clike/detection.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,120 @@
// Package clike contains shared helpers for parsing C and C++ source files.
//
// The parsing pipeline treats C and C++ as two distinct languages with separate
// tree-sitter grammars but a large amount of shared structure (declarations,
// statements, type strings). The helpers in this package live alongside the
// language-specific siblings (graph/golang, graph/python, graph/java) and
// provide the cross-cutting primitives — language detection today, AST and
// type extraction in subsequent PRs.
package clike

import (
"bufio"
"os"
"path/filepath"
"strings"
"sync"
)

// headerLanguageCache stores the C/C++ classification of .h files. Keys are
// the filepath.Clean form of the path handed to CacheHeaderLanguage; values
// are bool: true => C++, false => C.
//
// .h is shared between C and C++ grammars and the correct grammar can only be
// chosen by inspecting the file contents. Reading the file once per AST node
// would be unacceptable, so the worker that drives the parsing pipeline calls
// CacheHeaderLanguage once per .h file before AST traversal begins;
// IsCSourceFile and IsCppSourceFile then read the cache without performing I/O.
//
// Keys are normalised lexically only (no working-directory lookup), so
// "./inc/a.h", "inc//a.h" and "inc/x/../a.h" share one entry, but a relative
// path and its absolute form do not. Callers should use one path form
// consistently.
var headerLanguageCache sync.Map

// CacheHeaderLanguage records that filename is a C++ header (isCpp == true) or
// a C header (isCpp == false). It should be called once per .h file in the
// parsing worker before any IsCSourceFile / IsCppSourceFile lookup for that
// file. Calling it again for the same path overwrites the earlier value.
func CacheHeaderLanguage(filename string, isCpp bool) {
	headerLanguageCache.Store(filepath.Clean(filename), isCpp)
}

// cachedHeaderIsCpp looks filename up in headerLanguageCache.
//
// known is false when the header was never cached (or the entry is not a
// bool, which CacheHeaderLanguage never produces); isCpp is then false so
// that unknown headers are treated as C by both public predicates.
func cachedHeaderIsCpp(filename string) (isCpp, known bool) {
	v, ok := headerLanguageCache.Load(filepath.Clean(filename))
	if !ok {
		return false, false
	}
	// Comma-ok assertion: a foreign value type degrades to "unknown" instead
	// of panicking in the middle of AST traversal.
	isCpp, known = v.(bool)
	return isCpp, known
}

// IsCSourceFile reports whether filename should be parsed with the C grammar.
//
// .c is answered directly. For .h files the answer comes from the header
// cache populated by CacheHeaderLanguage; an uncached .h falls back to C as
// the safe default (the grammar overlap means a misclassified C++ header
// still parses as a structurally-valid translation unit, just with reduced
// fidelity). Every other extension yields false.
func IsCSourceFile(filename string) bool {
	switch filepath.Ext(filename) {
	case ".c":
		return true
	case ".h":
		isCpp, _ := cachedHeaderIsCpp(filename)
		return !isCpp
	default:
		return false
	}
}

// IsCppSourceFile reports whether filename should be parsed with the C++
// grammar.
//
// .cpp/.cc/.cxx and the C++-only header extensions (.hpp/.hh/.hxx) are
// answered directly. For .h files the answer comes from the header cache
// populated by CacheHeaderLanguage; an uncached .h falls back to "not C++"
// so that IsCSourceFile and IsCppSourceFile remain mutually exclusive.
// Every other extension yields false.
func IsCppSourceFile(filename string) bool {
	switch filepath.Ext(filename) {
	case ".cpp", ".cc", ".cxx", ".hpp", ".hh", ".hxx":
		return true
	case ".h":
		isCpp, _ := cachedHeaderIsCpp(filename)
		return isCpp
	default:
		return false
	}
}

// cppHeaderIndicators are byte sequences whose presence in the code portion
// of a header line is a strong signal that the header should be parsed as C++
// rather than C. The list is intentionally small and conservative.
//
// Matching is done by lineHasCppIndicator on comment-stripped text, and the
// keyword entries must start at a word boundary, so "device_class " or the
// bit-field "is_public:1" do not count. The heuristic is still not a parser:
// C code that uses a C++ keyword as an identifier (e.g. "struct class *c")
// or an indicator inside a string literal can produce a false positive.
var cppHeaderIndicators = []string{
	"class ", "namespace ", "template<", "template <",
	"public:", "private:", "protected:", "::",
}

// detectCppHeaderScanLines bounds how far DetectCppInHeader reads into a file.
// 100 lines covers the include guards, license header, and the first few
// declarations of every real-world header surveyed during the design phase.
const detectCppHeaderScanLines = 100

// DetectCppInHeader scans the first detectCppHeaderScanLines lines of filename
// for C++-only indicators (see cppHeaderIndicators), ignoring text inside
// // and /* */ comments.
//
// This is a best-effort heuristic, not a full preprocessor or parser. It is
// invoked from the worker exactly once per file and its result is stored via
// CacheHeaderLanguage; AST-traversal code reads the cache and never calls
// this function on the hot path.
//
// A missing or unreadable file returns false (treated as C) so that downstream
// parsing fails on an obviously-broken input rather than mislabeling the
// language.
func DetectCppInHeader(filename string) bool {
	f, err := os.Open(filename)
	if err != nil {
		return false
	}
	defer f.Close()

	// inBlockComment carries an unterminated /* ... across line boundaries so
	// that multi-line license headers are skipped as a whole.
	inBlockComment := false
	scanner := bufio.NewScanner(f)
	for lineCount := 0; lineCount < detectCppHeaderScanLines && scanner.Scan(); lineCount++ {
		var code string
		code, inBlockComment = stripCComments(scanner.Text(), inBlockComment)
		if lineHasCppIndicator(code) {
			return true
		}
	}
	// A scanner error (read failure or an over-long line) simply ends the
	// loop; per the contract above an unreadable file is classified as C.
	return false
}

// stripCComments returns line with // and /* */ comment text removed.
//
// inBlock says whether the line starts inside a block comment; the returned
// bool says whether it ends inside one. Each closed block comment is replaced
// by a single space, mirroring how the C preprocessor treats comments. String
// literals are not interpreted, so a "//" inside a string truncates the line.
func stripCComments(line string, inBlock bool) (string, bool) {
	var b strings.Builder
	for len(line) > 0 {
		if inBlock {
			end := strings.Index(line, "*/")
			if end < 0 {
				return b.String(), true
			}
			line = line[end+2:]
			inBlock = false
			b.WriteByte(' ')
			continue
		}
		blockAt := strings.Index(line, "/*")
		lineAt := strings.Index(line, "//")
		if lineAt >= 0 && (blockAt < 0 || lineAt < blockAt) {
			// A line comment wins when it starts first; the rest is dropped.
			b.WriteString(line[:lineAt])
			return b.String(), false
		}
		if blockAt < 0 {
			b.WriteString(line)
			return b.String(), false
		}
		b.WriteString(line[:blockAt])
		line = line[blockAt+2:]
		inBlock = true
	}
	return b.String(), inBlock
}

// lineHasCppIndicator reports whether code contains any cppHeaderIndicators
// entry. Indicators that begin with an identifier character must not be
// preceded by one, so they only match as whole keywords.
func lineHasCppIndicator(code string) bool {
	for _, ind := range cppHeaderIndicators {
		needBoundary := isIdentByte(ind[0])
		for from := 0; from < len(code); {
			rel := strings.Index(code[from:], ind)
			if rel < 0 {
				break
			}
			at := from + rel
			if !needBoundary || at == 0 || !isIdentByte(code[at-1]) {
				return true
			}
			// Matched inside a longer identifier; keep looking further right.
			from = at + 1
		}
	}
	return false
}

// isIdentByte reports whether c can appear in a C/C++ identifier.
func isIdentByte(c byte) bool {
	return c == '_' ||
		(c >= '0' && c <= '9') ||
		(c >= 'a' && c <= 'z') ||
		(c >= 'A' && c <= 'Z')
}
158 changes: 158 additions & 0 deletions sast-engine/graph/clike/detection_test.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,158 @@
package clike

import (
"os"
"path/filepath"
"testing"
)

func TestIsCSourceFile(t *testing.T) {
defer headerLanguageCache.Delete("/tmp/cache_h_c.h")
defer headerLanguageCache.Delete("/tmp/cache_h_cpp.h")

CacheHeaderLanguage("/tmp/cache_h_c.h", false)
CacheHeaderLanguage("/tmp/cache_h_cpp.h", true)

tests := []struct {
name string
filename string
want bool
}{
{".c file", "main.c", true},
{".cpp file", "main.cpp", false},
{".cc file", "main.cc", false},
{".cxx file", "main.cxx", false},
{".hpp file", "main.hpp", false},
{".hh file", "main.hh", false},
{".hxx file", "main.hxx", false},
{".java file", "Main.java", false},
{".py file", "main.py", false},
{".go file", "main.go", false},
{"no extension", "Makefile", false},
{".h cached as C", "/tmp/cache_h_c.h", true},
{".h cached as C++", "/tmp/cache_h_cpp.h", false},
{".h not cached defaults to C", "/tmp/cache_h_unknown.h", true},
}

for _, tt := range tests {
t.Run(tt.name, func(t *testing.T) {
if got := IsCSourceFile(tt.filename); got != tt.want {
t.Errorf("IsCSourceFile(%q) = %v, want %v", tt.filename, got, tt.want)
}
})
}
}

func TestIsCppSourceFile(t *testing.T) {
defer headerLanguageCache.Delete("/tmp/cppcache_c.h")
defer headerLanguageCache.Delete("/tmp/cppcache_cpp.h")

CacheHeaderLanguage("/tmp/cppcache_c.h", false)
CacheHeaderLanguage("/tmp/cppcache_cpp.h", true)

tests := []struct {
name string
filename string
want bool
}{
{".cpp file", "main.cpp", true},
{".cc file", "main.cc", true},
{".cxx file", "main.cxx", true},
{".hpp file", "main.hpp", true},
{".hh file", "main.hh", true},
{".hxx file", "main.hxx", true},
{".c file", "main.c", false},
{".java file", "Main.java", false},
{".py file", "main.py", false},
{".go file", "main.go", false},
{".h cached as C", "/tmp/cppcache_c.h", false},
{".h cached as C++", "/tmp/cppcache_cpp.h", true},
{".h not cached defaults to C", "/tmp/cppcache_unknown.h", false},
}

for _, tt := range tests {
t.Run(tt.name, func(t *testing.T) {
if got := IsCppSourceFile(tt.filename); got != tt.want {
t.Errorf("IsCppSourceFile(%q) = %v, want %v", tt.filename, got, tt.want)
}
})
}
}

// TestDetectCppInHeader writes small header fixtures to a temporary directory
// and checks how DetectCppInHeader classifies each of them, plus the
// missing-file fallback.
//
// Each case carries its own fixture file name: deriving the name from the
// subtest title would put characters such as ':' and '(' into the path, which
// some filesystems (notably Windows) reject.
func TestDetectCppInHeader(t *testing.T) {
	tests := []struct {
		name    string
		file    string
		content string
		want    bool
	}{
		{
			name: "pure C header with typedef and struct",
			file: "pure_c.h",
			content: `#ifndef UTILS_H
#define UTILS_H
typedef struct Point { int x; int y; } Point;
int add(int a, int b);
#endif
`,
			want: false,
		},
		{
			name: "C++ class header",
			file: "cpp_class.h",
			content: `#pragma once
class Foo {
public:
int bar();
};
`,
			want: true,
		},
		{
			name:    "namespace header",
			file:    "cpp_namespace.h",
			content: "namespace mylib {\nint compute();\n}\n",
			want:    true,
		},
		{
			name:    "template header",
			file:    "cpp_template.h",
			content: "template<typename T>\nT identity(T v) { return v; }\n",
			want:    true,
		},
		{
			name:    "qualified call uses ::",
			file:    "cpp_scope.h",
			content: "void f() { std::cout << 1; }\n",
			want:    true,
		},
		{
			name:    "empty file",
			file:    "empty.h",
			content: "",
			want:    false,
		},
		{
			name:    "extern C block (no C++ indicator on first lines)",
			file:    "extern_c.h",
			content: "#ifdef __cplusplus\nextern \"C\" {\n#endif\nint plain_c(void);\n",
			want:    false,
		},
	}

	// t.TempDir fails the test itself on error and removes the directory
	// when the test (including all subtests) finishes.
	dir := t.TempDir()

	for _, tt := range tests {
		t.Run(tt.name, func(t *testing.T) {
			path := filepath.Join(dir, tt.file)
			if err := os.WriteFile(path, []byte(tt.content), 0o644); err != nil {
				t.Fatalf("write: %v", err)
			}
			if got := DetectCppInHeader(path); got != tt.want {
				t.Errorf("DetectCppInHeader(%q) = %v, want %v", tt.name, got, tt.want)
			}
		})
	}

	t.Run("missing file returns false", func(t *testing.T) {
		if DetectCppInHeader(filepath.Join(dir, "does_not_exist.h")) {
			t.Error("expected false for missing file")
		}
	})
}
28 changes: 20 additions & 8 deletions sast-engine/graph/initialize.go
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,10 @@ import (
"sync"
"time"

"github.com/shivasurya/code-pathfinder/sast-engine/graph/clike"
sitter "github.com/smacker/go-tree-sitter"
clang "github.com/smacker/go-tree-sitter/c"
cpplang "github.com/smacker/go-tree-sitter/cpp"
"github.com/smacker/go-tree-sitter/golang"
"github.com/smacker/go-tree-sitter/java"
"github.com/smacker/go-tree-sitter/python"
Expand Down Expand Up @@ -90,18 +93,27 @@ func Initialize(directory string, callbacks *ProgressCallbacks) *CodeGraph {
continue
}

// Handle tree-sitter based parsing for Java and Python
switch fileExt {
case ".java":
// For .h files, classify as C vs C++ once and cache the result so
// per-AST-node language checks remain zero-I/O during traversal.
if fileExt == ".h" {
clike.CacheHeaderLanguage(file, clike.DetectCppInHeader(file))
}

// Handle tree-sitter based parsing for Java, Python, Go, C, and C++.
// C/C++ cases come first because .h is shared across both grammars and
// must route via the cached heuristic, not a simple extension match.
switch {
case clike.IsCSourceFile(file):
parser.SetLanguage(clang.GetLanguage())
case clike.IsCppSourceFile(file):
parser.SetLanguage(cpplang.GetLanguage())
case fileExt == ".java":
parser.SetLanguage(java.GetLanguage())
case ".py":
case fileExt == ".py":
parser.SetLanguage(python.GetLanguage())
case ".go":
case fileExt == ".go":
parser.SetLanguage(golang.GetLanguage())
default:
// NOTE: This case is currently unreachable because getFiles() only returns
// .java, .py, Dockerfile*, and docker-compose* files. This exists as defensive
// programming in case getFiles() is modified to include additional file types.
Log("Unsupported file type:", file)
if callbacks != nil && callbacks.OnProgress != nil {
callbacks.OnProgress()
Expand Down
18 changes: 13 additions & 5 deletions sast-engine/graph/utils.go
Original file line number Diff line number Diff line change
Expand Up @@ -252,8 +252,9 @@ func extractMethodName(node *sitter.Node, sourceCode []byte, filepath string) (s
return methodName, methodID
}

// getFiles walks through a directory and returns all source files (Java, Python, Go, Dockerfile, docker-compose).
// It skips vendor/, testdata/, node_modules/, .git/, and directories starting with "_".
// getFiles walks through a directory and returns all source files (Java, Python, Go, C/C++, Dockerfile, docker-compose).
// It skips vendor/, testdata/, node_modules/, .git/, common C/C++ build artifact directories,
// and directories starting with "_".
func getFiles(directory string) ([]string, error) {
var files []string
err := filepath.Walk(directory, func(path string, info os.FileInfo, err error) error {
Expand All @@ -264,22 +265,29 @@ func getFiles(directory string) ([]string, error) {
if info.IsDir() {
name := info.Name()
switch name {
case "vendor", "testdata", "node_modules", ".git":
case "vendor", "testdata", "node_modules", ".git",
"build", "cmake-build-debug", "cmake-build-release",
"third_party", "external", "obj", "bin", "dist", ".cache":
return filepath.SkipDir
}
if strings.HasPrefix(name, "_") {
if strings.HasPrefix(name, "_") || strings.HasPrefix(name, "cmake-build-") {
return filepath.SkipDir
}
return nil
}
// append java, python, go, dockerfile, and docker-compose files
// append java, python, go, c/c++, dockerfile, and docker-compose files
ext := filepath.Ext(path)
base := filepath.Base(path)
baseLower := strings.ToLower(base)

switch {
case ext == ".java" || ext == ".py" || ext == ".go":
files = append(files, path)
case ext == ".c" || ext == ".h":
files = append(files, path)
case ext == ".cpp" || ext == ".cc" || ext == ".cxx" ||
ext == ".hpp" || ext == ".hh" || ext == ".hxx":
files = append(files, path)
case strings.HasPrefix(baseLower, "dockerfile"):
// Match Dockerfile, Dockerfile.dev, dockerfile, etc.
files = append(files, path)
Expand Down
Loading
Loading