Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
218 changes: 218 additions & 0 deletions .github/workflows/generate-clike-stdlib.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,218 @@
name: Generate C/C++ Stdlib Registries

# Generates C and C++ stdlib registries for Linux, Windows, and Darwin and
# uploads them to Cloudflare R2 (the same backend that serves the Go stdlib
# registries). Three jobs run in parallel:
#
# - generate-linux ubuntu-latest with libc6-dev + libstdc++-13-dev
# - generate-windows ubuntu-latest with mingw-w64 (Win32 + mingw libstdc++)
# - generate-darwin macos-latest using xcrun's Command Line Tools SDK
#
# The publish job collects the six manifest sets and uploads them to R2 only
# when all three R2 secrets (account ID, access key ID, secret access key) are
# present — running this workflow on a fork without secrets still validates
# that generation works.
#
# Trigger: push to main when generator code or overlays change, or on demand
# via workflow_dispatch.

on:
push:
branches: [main]
paths:
- 'sast-engine/tools/generate_clike_stdlib_registry.go'
- 'sast-engine/tools/internal/clikeextract/**'
- 'sast-engine/tools/c_stdlib_overlay.yaml'
- 'sast-engine/tools/cpp_stdlib_overlay.yaml'
- '.github/workflows/generate-clike-stdlib.yml'
workflow_dispatch:

permissions:
contents: read

jobs:
# Generates the C and C++ registries for the linux target on an Ubuntu runner
# and publishes the result as the clike-registries-linux artifact.
generate-linux:
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v6
- uses: actions/setup-go@v6
with:
go-version: '1.23'
cache: false

# Installs the libc6-dev and libstdc++-13-dev packages through apt.
- name: Install Linux stdlib headers
run: |
sudo apt-get update
sudo apt-get install -y libc6-dev libstdc++-13-dev

# Runs the generator with --target=linux --language=c; output goes to
# /tmp/clike-out/linux/c/v1.
- name: Generate Linux C registry
run: |
cd sast-engine
go run -tags cpf_generate_stdlib_registry ./tools/generate_clike_stdlib_registry.go \
--target=linux --language=c \
--output-dir=/tmp/clike-out/linux/c/v1

# Same generator invocation with --language=cpp; output goes to
# /tmp/clike-out/linux/cpp/v1.
- name: Generate Linux C++ registry
run: |
cd sast-engine
go run -tags cpf_generate_stdlib_registry ./tools/generate_clike_stdlib_registry.go \
--target=linux --language=cpp \
--output-dir=/tmp/clike-out/linux/cpp/v1

# Uploads /tmp/clike-out/linux/ (both the c and cpp output directories) with a
# 7-day retention.
- name: Upload artifact
uses: actions/upload-artifact@v4
with:
name: clike-registries-linux
path: /tmp/clike-out/linux/
retention-days: 7

# Generates the C and C++ registries for the windows target. The job runs on an
# Ubuntu runner and installs mingw-w64 there rather than using a Windows runner.
generate-windows:
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v6
- uses: actions/setup-go@v6
with:
go-version: '1.23'
cache: false

# Installs the mingw-w64 and g++-mingw-w64 packages through apt.
- name: Install mingw-w64 (Win32 + libstdc++ headers on Linux)
run: |
sudo apt-get update
sudo apt-get install -y mingw-w64 g++-mingw-w64

# Runs the generator with --target=windows --language=c; output goes to
# /tmp/clike-out/windows/c/v1.
- name: Generate Windows C registry
run: |
cd sast-engine
go run -tags cpf_generate_stdlib_registry ./tools/generate_clike_stdlib_registry.go \
--target=windows --language=c \
--output-dir=/tmp/clike-out/windows/c/v1

# Same generator invocation with --language=cpp; output goes to
# /tmp/clike-out/windows/cpp/v1.
- name: Generate Windows C++ registry
run: |
cd sast-engine
go run -tags cpf_generate_stdlib_registry ./tools/generate_clike_stdlib_registry.go \
--target=windows --language=cpp \
--output-dir=/tmp/clike-out/windows/cpp/v1

# Uploads /tmp/clike-out/windows/ as clike-registries-windows with a 7-day
# retention.
- name: Upload artifact
uses: actions/upload-artifact@v4
with:
name: clike-registries-windows
path: /tmp/clike-out/windows/
retention-days: 7

# Generates the C and C++ registries for the darwin target on a macOS runner
# and publishes the result as the clike-registries-darwin artifact.
generate-darwin:
runs-on: macos-latest
steps:
- uses: actions/checkout@v6
- uses: actions/setup-go@v6
with:
go-version: '1.23'
cache: false

# Prints the active SDK path, then lists one C header (stdio.h) and one C++
# header (vector) at fixed Command Line Tools locations.
- name: Verify SDK availability
run: |
xcrun --show-sdk-path
ls /Library/Developer/CommandLineTools/SDKs/MacOSX.sdk/usr/include/stdio.h
ls /Library/Developer/CommandLineTools/usr/include/c++/v1/vector

# Runs the generator with --target=darwin --language=c; output goes to
# /tmp/clike-out/darwin/c/v1.
- name: Generate Darwin C registry
run: |
cd sast-engine
go run -tags cpf_generate_stdlib_registry ./tools/generate_clike_stdlib_registry.go \
--target=darwin --language=c \
--output-dir=/tmp/clike-out/darwin/c/v1

# Same generator invocation with --language=cpp; output goes to
# /tmp/clike-out/darwin/cpp/v1.
- name: Generate Darwin C++ registry
run: |
cd sast-engine
go run -tags cpf_generate_stdlib_registry ./tools/generate_clike_stdlib_registry.go \
--target=darwin --language=cpp \
--output-dir=/tmp/clike-out/darwin/cpp/v1

# Uploads /tmp/clike-out/darwin/ as clike-registries-darwin with a 7-day
# retention.
- name: Upload artifact
uses: actions/upload-artifact@v4
with:
name: clike-registries-darwin
path: /tmp/clike-out/darwin/
retention-days: 7

# Depends on all three generate jobs; gathers their artifacts for staging,
# verification and (when credentials exist) upload.
publish:
needs: [generate-linux, generate-windows, generate-darwin]
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v6

# Downloads every artifact whose name matches clike-registries-* into
# /tmp/clike-out. merge-multiple is off, so the artifacts are not merged into
# a single directory.
- name: Download all artifacts
uses: actions/download-artifact@v4
with:
path: /tmp/clike-out
pattern: clike-registries-*
merge-multiple: false

# Copies each platform's downloaded artifact into /tmp/clike-staged/<plat>/.
#
# The generate jobs upload /tmp/clike-out/<plat>/ itself, and upload-artifact
# roots an artifact at the uploaded directory. After download the tree is
# therefore clike-registries-<plat>/{c,cpp}/v1/... with no inner <plat>/ level.
# A nested <plat>/ directory is still honoured when it exists, so either
# artifact layout stages correctly.
- name: Stage registries into the expected layout
  run: |
    mkdir -p /tmp/clike-staged
    for plat in linux windows darwin; do
      src="/tmp/clike-out/clike-registries-$plat"
      # Prefer the nested layout when present; otherwise use the artifact root.
      if [ -d "$src/$plat" ]; then
        src="$src/$plat"
      fi
      mkdir -p "/tmp/clike-staged/$plat"
      # "$src/." copies the directory contents without depending on a glob,
      # so an empty artifact is left for the verify step to report.
      if [ -d "$src" ]; then
        cp -R "$src/." "/tmp/clike-staged/$plat/"
      fi
    done

# Fails the job on the first platform/language combination whose staged
# manifest.json is absent; otherwise prints each manifest's size in bytes.
- name: Verify all 6 manifest.json files
  run: |
    for combo in linux/c linux/cpp windows/c windows/cpp darwin/c darwin/cpp; do
      path="/tmp/clike-staged/$combo/v1/manifest.json"
      if [ ! -f "$path" ]; then
        echo "Missing: $path"
        exit 1
      fi
      echo "OK $combo/v1/manifest.json ($(wc -c < "$path") bytes)"
    done

# Cloudflare R2 upload runs only when all three secrets (R2_ACCOUNT_ID,
# R2_ACCESS_KEY_ID, R2_SECRET_ACCESS_KEY) are configured. On a fork or before
# the secrets are provisioned the workflow still passes generation +
# verification and skips the upload — keeps the pipeline green during operator
# setup.
#
# HAS_R2_CREDS is computed from the secrets in this step's env block and is the
# value the `if:` condition tests. The R2 keys are re-exported under the AWS_*
# names before `aws s3 sync` runs against the account-specific endpoint.
- name: Upload to Cloudflare R2
if: ${{ env.HAS_R2_CREDS == 'true' }}
env:
HAS_R2_CREDS: ${{ secrets.R2_ACCOUNT_ID != '' && secrets.R2_ACCESS_KEY_ID != '' && secrets.R2_SECRET_ACCESS_KEY != '' }}
R2_ACCOUNT_ID: ${{ secrets.R2_ACCOUNT_ID }}
R2_ACCESS_KEY_ID: ${{ secrets.R2_ACCESS_KEY_ID }}
R2_SECRET_ACCESS_KEY: ${{ secrets.R2_SECRET_ACCESS_KEY }}
run: |
export AWS_ACCESS_KEY_ID="$R2_ACCESS_KEY_ID"
export AWS_SECRET_ACCESS_KEY="$R2_SECRET_ACCESS_KEY"
R2_ENDPOINT="https://${R2_ACCOUNT_ID}.r2.cloudflarestorage.com"

for combo in linux/c linux/cpp windows/c windows/cpp darwin/c darwin/cpp; do
echo "Uploading $combo..."
aws s3 sync "/tmp/clike-staged/$combo/v1/" \
"s3://code-pathfinder-assets/registries/$combo/v1/" \
--endpoint-url "$R2_ENDPOINT" \
--delete \
--content-type "application/json" \
--cache-control "public, max-age=86400"
done

# Confirms that each uploaded manifest is reachable through the public CDN
# host. Runs under the same three-secret gate as the upload step, so it is
# skipped whenever the upload was skipped.
- name: Verify CDN URLs
  if: ${{ env.HAS_R2_CREDS == 'true' }}
  env:
    HAS_R2_CREDS: ${{ secrets.R2_ACCOUNT_ID != '' && secrets.R2_ACCESS_KEY_ID != '' && secrets.R2_SECRET_ACCESS_KEY != '' }}
  run: |
    for combo in linux/c linux/cpp windows/c windows/cpp darwin/c darwin/cpp; do
      url="https://assets.codepathfinder.dev/registries/$combo/v1/manifest.json"
      # --max-time bounds a hung connection. `|| true` keeps a transport-level
      # curl failure (reported as status 000) from aborting the script under
      # `bash -e` before the diagnostic below is printed.
      status=$(curl -s -o /dev/null -w "%{http_code}" --max-time 30 "$url" || true)
      echo "$status $url"
      test "$status" = "200" || { echo "URL not reachable: $url"; exit 1; }
    done

# Appends a Markdown heading plus one bullet per staged manifest (with its
# byte size) to the job summary; combinations without a manifest are omitted.
- name: Summary
  run: |
    {
      echo "## C/C++ Stdlib Registry Generation"
      echo ""
      for combo in linux/c linux/cpp windows/c windows/cpp darwin/c darwin/cpp; do
        mf="/tmp/clike-staged/$combo/v1/manifest.json"
        [ -f "$mf" ] || continue
        size=$(wc -c < "$mf")
        echo "- ${combo} manifest.json: ${size} bytes"
      done
    } >> "$GITHUB_STEP_SUMMARY"
18 changes: 13 additions & 5 deletions sast-engine/cmd/scan_stdlib_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -119,11 +119,19 @@ func TestInitClikeStdlib_BarePathTreatedAsFile(t *testing.T) {
require.NotNil(t, cfg.cppLoader)
}

func TestInitClikeStdlib_HTTPSchemeIsStubbed(t *testing.T) {
// HTTP path returns a constructed loader, but LoadManifest fails
// with the PR-03 stub error. Both loaders should be nil after the
// failed load.
cfg, wired := initClikeStdlib(t.TempDir(), "linux", "https://example.test/registries", newTestLogger())
func TestInitClikeStdlib_HTTPSchemeFailsGracefullyOnUnreachableHost(t *testing.T) {
// PR-03 wires HTTP up — when the URL doesn't resolve and there's no
// disk cache to fall back on, both loaders fail to load and stay
// nil. The scan continues under Phase 1 behavior.
//
// Point the cache at a fresh temp dir so a previously populated
// developer cache (e.g. from running another test) cannot serve a
// stale manifest and turn this into a false-positive success.
t.Setenv("XDG_CACHE_HOME", t.TempDir())
t.Setenv("HOME", t.TempDir())
t.Setenv("LOCALAPPDATA", t.TempDir())

cfg, wired := initClikeStdlib(t.TempDir(), "linux", "http://127.0.0.1:1/registries", newTestLogger())
assert.False(t, wired)
assert.Nil(t, cfg.cLoader)
assert.Nil(t, cfg.cppLoader)
Expand Down
109 changes: 100 additions & 9 deletions sast-engine/graph/callgraph/registry/c_stdlib_remote.go
Original file line number Diff line number Diff line change
Expand Up @@ -107,11 +107,57 @@ func (r *CStdlibRegistryRemote) loadManifestFromFile(logger core.CStdlibLogger)
return nil
}

// loadManifestFromHTTP is the PR-03 hook. PR-02 ships it as a deliberate stub
// so the type satisfies CStdlibLoader without any half-built network code
// shipping early.
func (r *CStdlibRegistryRemote) loadManifestFromHTTP(_ core.CStdlibLogger) error {
return errors.New("CStdlibRegistryRemote: HTTP loader not yet implemented; tracked in PR-03")
// loadManifestFromHTTP fetches <baseURL>/<platform>/c/v1/manifest.json and
// installs the result as r.manifest.
//
// Outcomes, in order:
//   - fetch succeeds: the body is parsed, written to the disk cache, and
//     stored in memory. A failed cache write is only logged as a warning.
//   - fetch fails but the disk cache holds a manifest: the cached copy is
//     used whatever its age, and a warning is logged.
//   - fetch fails and the cache has nothing: the fetch error is returned.
//
// A body that cannot be parsed as JSON is returned as an error without
// consulting the cache. A nil logger is tolerated; every log call is guarded.
func (r *CStdlibRegistryRemote) loadManifestFromHTTP(logger core.CStdlibLogger) error {
	manifestURL := joinURL(r.baseURL, r.platform, "c", "v1", "manifest.json")
	if logger != nil {
		logger.Debug("Downloading C stdlib manifest: %s", manifestURL)
	}

	body, fetchErr := fetchURL(r.httpClient, manifestURL)
	if fetchErr != nil {
		// Offline path: any cached manifest is accepted, fresh or not.
		stale, cacheErr := r.diskCache.GetManifest()
		if cacheErr != nil {
			return fmt.Errorf("loadManifestFromHTTP: %w", fetchErr)
		}
		if logger != nil {
			logger.Warning("Network failed for %s; serving cached manifest. Underlying: %v", manifestURL, fetchErr)
		}
		r.cacheMutex.Lock()
		r.manifest = stale
		r.cacheMutex.Unlock()
		return nil
	}

	parsed := new(core.CStdlibManifest)
	if parseErr := json.Unmarshal(body, parsed); parseErr != nil {
		return fmt.Errorf("loadManifestFromHTTP: parsing manifest from %s: %w", manifestURL, parseErr)
	}

	// Persisting is best-effort; the in-memory manifest is set regardless.
	saveErr := r.diskCache.SaveManifest(body)
	if saveErr != nil && logger != nil {
		logger.Warning("Failed to save C manifest to disk cache: %v", saveErr)
	}

	r.cacheMutex.Lock()
	r.manifest = parsed
	r.cacheMutex.Unlock()

	if logger != nil {
		logger.Statistic("Loaded C stdlib manifest over HTTP: %d headers for %s",
			len(parsed.Headers), r.platform)
	}
	return nil
}

// GetHeader retrieves the per-header content, fetching on first reference and
Expand Down Expand Up @@ -179,10 +225,55 @@ func (r *CStdlibRegistryRemote) fetchHeaderFromFile(entry *core.CStdlibHeaderEnt
return &h, nil
}

// fetchHeaderFromHTTP is the PR-03 hook. PR-02 stub keeps the type
// satisfying its interface contract without shipping half-built network code.
func (r *CStdlibRegistryRemote) fetchHeaderFromHTTP(_ *core.CStdlibHeaderEntry) (*core.CStdlibHeader, error) {
return nil, errors.New("CStdlibRegistryRemote: HTTP fetch not yet implemented; tracked in PR-03")
// fetchHeaderFromHTTP downloads one per-header JSON over HTTP, with disk-cache
// freshness checks on the way in and stale-cache fallback on any failure.
//
// The lookup chain:
//  1. Disk cache entry is fresh (per stdlibCacheTTL) and readable → return it,
//     no network.
//  2. Otherwise GET the URL chosen by headerURL (the entry's own URL, or one
//     built from baseURL + entry.File).
//  3. On success: verify the checksum, parse the JSON, persist the raw bytes
//     to the disk cache (best-effort), return the parsed header.
//  4. On a fetch, checksum, or parse failure: return the on-disk copy
//     irrespective of freshness — a stale registry beats no resolution at
//     all. Only when no cached copy can be read is the original failure
//     returned, wrapped with the function name.
//
// Earlier revisions applied step 4 to fetch errors only, which contradicted
// the documented contract for parse failures.
func (r *CStdlibRegistryRemote) fetchHeaderFromHTTP(entry *core.CStdlibHeaderEntry) (*core.CStdlibHeader, error) {
	if r.diskCache.IsFresh(entry.File, stdlibCacheTTL) {
		if cached, err := r.diskCache.GetHeader(entry.File); err == nil {
			return cached, nil
		}
	}

	// staleOr serves whatever copy is on disk, regardless of age, and falls
	// back to reporting cause when the cache cannot supply one.
	staleOr := func(cause error) (*core.CStdlibHeader, error) {
		if cached, cerr := r.diskCache.GetHeader(entry.File); cerr == nil {
			return cached, nil
		}
		return nil, cause
	}

	url := r.headerURL(entry)
	data, err := fetchURL(r.httpClient, url)
	if err != nil {
		return staleOr(fmt.Errorf("fetchHeaderFromHTTP: %w", err))
	}

	if err := verifyChecksum(data, entry.Checksum); err != nil {
		return staleOr(fmt.Errorf("fetchHeaderFromHTTP: %s: %w", entry.Header, err))
	}

	var h core.CStdlibHeader
	if err := json.Unmarshal(data, &h); err != nil {
		return staleOr(fmt.Errorf("fetchHeaderFromHTTP: parsing %s: %w", url, err))
	}

	_ = r.diskCache.SaveHeader(entry.File, data) // best-effort

	return &h, nil
}

// headerURL picks the download location for a header entry. When the manifest
// entry carries no URL of its own, the location is built from the loader's
// baseURL, platform and entry.File; otherwise the embedded URL is returned
// unchanged, which lets a publisher point individual files elsewhere.
func (r *CStdlibRegistryRemote) headerURL(entry *core.CStdlibHeaderEntry) string {
	if entry.URL == "" {
		return joinURL(r.baseURL, r.platform, "c", "v1", entry.File)
	}
	return entry.URL
}

// GetFunction is a convenience accessor: GetHeader followed by a function
Expand Down
Loading
Loading