From 9c0a0292111df8a839783c183e3adf5b68708147 Mon Sep 17 00:00:00 2001 From: AntoineBastide47 <148970403+AntoineBastide47@users.noreply.github.com> Date: Wed, 20 May 2026 14:53:52 +0200 Subject: [PATCH 1/4] feat: faster file indexing --- LLVM/main.cpp | 107 ++++++++++++++++++++++++++++++++++++++++---------- 1 file changed, 87 insertions(+), 20 deletions(-) diff --git a/LLVM/main.cpp b/LLVM/main.cpp index cb01a61..540f8c0 100644 --- a/LLVM/main.cpp +++ b/LLVM/main.cpp @@ -1,7 +1,9 @@ +#include #include #include #include #include +#include #include #include #include @@ -49,6 +51,16 @@ static long long elapsed_ms(const Clock::time_point start, const Clock::time_poi return std::chrono::duration_cast(end - start).count(); } +// Index of the executing OpenMP thread (0 when built without OpenMP). Used to +// route work into per-thread storage that needs no locking. +static int current_thread() { +#if USE_OPENMP + return omp_get_thread_num(); +#else + return 0; +#endif +} + static void log_info(const std::string &message) { std::istringstream lines(message); std::string line; @@ -217,41 +229,96 @@ int main(const int argc, char *argv[]) { extToLang.try_emplace(std::string(ext), lang); } - // Single directory walk: bucket files by language. - // lexically_normal() is pure string arithmetic (no stat syscalls). + // Parallel directory walk: bucket files by language. + // The readdir/stat syscalls and the per-file work (extension parsing plus + // lexically_normal, which is pure string arithmetic) are spread across + // threads via OpenMP tasks — one task per subdirectory, so sibling subtrees + // are walked concurrently. Each thread fills its own bucket set, so the hot + // path is lock-free; a cheap serial merge below deduplicates and assembles + // the global buckets. 
const auto indexingStart = Clock::now(); std::unordered_set seen; std::map>> buckets; +#if USE_OPENMP + const int indexThreads = omp_get_max_threads(); +#else + const int indexThreads = 1; +#endif + std::vector>>> tlBuckets(indexThreads); + + // Resolve roots up front so their normalized input-path strings live in + // stable storage that every spawned task can safely reference. + std::vector> roots; + roots.reserve(rawPaths.size()); for (const auto &rawPath: rawPaths) { const fs::path p(rawPath); if (!fs::exists(p)) { std::cerr << "Path does not exist: " << p << "\n"; continue; } - const std::string inputPath = p.lexically_normal().string(); - - auto try_add = [&](const fs::path &fp) { - const std::string extStr = fp.extension().string(); - if (extStr.size() <= 1) - return; - const auto it = extToLang.find(extStr.substr(1)); - if (it == extToLang.end()) - return; - const std::string norm = fp.lexically_normal().string(); - if (!seen.insert(norm).second) - return; - buckets[it->second].emplace_back(norm, inputPath); - }; + roots.emplace_back(p, p.lexically_normal().string()); + } + // Match a candidate file against the extension table and stash it in the + // current thread's bucket. Deduplication is deferred to the serial merge, so + // this touches only thread-local state and needs no synchronization. + auto try_add = [&](const fs::path &fp, const std::string &inputPath) { + const std::string extStr = fp.extension().string(); + if (extStr.size() <= 1) + return; + const auto it = extToLang.find(extStr.substr(1)); + if (it == extToLang.end()) + return; + tlBuckets[current_thread()][it->second].emplace_back(fp.lexically_normal().string(), inputPath); + }; + + // Non-recursive scan of one directory; each subdirectory is handed to its own + // task. is_directory/is_regular_file consume the file type cached by readdir + // where the platform provides it, avoiding extra stat calls. Errors (e.g. 
+ // permission denied) skip the offending entry rather than aborting the walk. + std::function walk = + [&](const fs::path &dir, const std::string &inputPath) { + std::vector subdirs; + std::error_code wec; + for (fs::directory_iterator it(dir, fs::directory_options::skip_permission_denied, wec), end; + !wec && it != end; it.increment(wec)) { + std::error_code tec; + if (it->is_directory(tec)) + subdirs.push_back(it->path()); + else if (it->is_regular_file(tec)) + try_add(it->path(), inputPath); + } + for (auto &sd: subdirs) { +#if USE_OPENMP +#pragma omp task firstprivate(sd) shared(walk, inputPath) +#endif + walk(sd, inputPath); + } + }; + +#if USE_OPENMP +#pragma omp parallel +#pragma omp single +#endif + for (const auto &[p, inputPath]: roots) { if (fs::is_regular_file(p)) - try_add(p); + try_add(p, inputPath); else if (fs::is_directory(p)) - for (const auto &entry: fs::recursive_directory_iterator(p)) - if (fs::is_regular_file(entry)) - try_add(entry.path()); + walk(p, inputPath); } + // Serial merge: deduplicate across threads and assemble the global buckets, + // then sort each bucket so output is reproducible regardless of the order in + // which threads happened to discover files. + for (auto &tb: tlBuckets) + for (auto &[lang, files]: tb) + for (auto &entry: files) + if (seen.insert(entry.first).second) + buckets[lang].push_back(std::move(entry)); + for (auto &files: buckets | std::views::values) + std::ranges::sort(files); + // Assemble langFiles in kLanguageData order for deterministic output. 
std::vector>>> langFiles; size_t totalFiles = 0; From ca1f6a108d9d8d0044c42d95d1e2465f85558cf8 Mon Sep 17 00:00:00 2001 From: AntoineBastide47 <148970403+AntoineBastide47@users.noreply.github.com> Date: Wed, 20 May 2026 16:17:18 +0200 Subject: [PATCH 2/4] feat: change how cloning works --- LLVM/.gitignore | 3 +- LLVM/clone-orgs.sh | 263 ++++++++++++++++++++++++++++++++++++--------- LLVM/main.cpp | 23 ++-- 3 files changed, 229 insertions(+), 60 deletions(-) diff --git a/LLVM/.gitignore b/LLVM/.gitignore index 7c6db17..76e6b39 100644 --- a/LLVM/.gitignore +++ b/LLVM/.gitignore @@ -5,4 +5,5 @@ build-test ./ignore ./include/LanguageModel.generated.hpp result/** -train/__pycache__ \ No newline at end of file +train/__pycache__ +repos/** \ No newline at end of file diff --git a/LLVM/clone-orgs.sh b/LLVM/clone-orgs.sh index 954a868..d8fe39e 100755 --- a/LLVM/clone-orgs.sh +++ b/LLVM/clone-orgs.sh @@ -1,7 +1,9 @@ #!/usr/bin/env bash # # clone-orgs.sh — clone every (non-archived) repository in one or more GitHub -# orgs in parallel, designed to stay well under GitHub's API and abuse limits. +# orgs in parallel, parse each repo as it lands, and emit a whole-org parse +# before discarding the clone tree. Designed to stay well under GitHub's API +# and abuse limits. # # Usage: # ./clone-orgs.sh [-o ] [ ...] @@ -18,7 +20,18 @@ # SKIP_FORKS skip forked repos (default 0) # RETRIES per-repo retry count (default 3, exponential backoff) # -# Requirements: gh (authenticated — run `gh auth login` once), git, jq. +# Per-org flow: +# 1. List the org's repos. +# 2. Clone them in parallel (bounded by JOBS). The instant a repo is on disk +# its tree is handed to `parser ` in the background, so parsing +# overlaps with the next download. The org's special `.github` repo is +# never parsed. +# 3. Once every clone and every per-repo parse has finished, run one more +# `parser` pass over the whole org folder, then delete that folder and +# move on to the next org. 
+# +# Requirements: gh (authenticated — run `gh auth login` once), git, jq, and the +# built `parser` binary next to this script. # For private repos also run `gh auth setup-git` so plain `git clone` picks # up the gh token via git's credential helper. # @@ -70,6 +83,10 @@ command -v git >/dev/null || die "git not found" command -v jq >/dev/null || die "jq not found" gh auth status >/dev/null 2>&1 || die "not authenticated — run: gh auth login" +SCRIPT_DIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) +PARSER="${SCRIPT_DIR}/parser" +[ -x "${PARSER}" ] || die "parser binary not found / not executable at ${PARSER} (build it first)" + remaining=$(gh api rate_limit -q '.resources.core.remaining' 2>/dev/null || echo 0) log "GitHub REST budget: ${remaining}/5000 remaining" [ "${remaining}" -ge 100 ] || die "REST budget too low (${remaining}); wait or use a different token" @@ -91,28 +108,25 @@ list_repos() { -q "${jq_filter}" } -ALL_REPOS=$(mktemp -t clone-orgs.XXXXXX) -trap 'rm -f "${ALL_REPOS}"' EXIT - -for org in "$@"; do - list_repos "${org}" >> "${ALL_REPOS}" || log "warning: skipping ${org} (listing failed)" -done - -total=$(wc -l < "${ALL_REPOS}" | tr -d ' ') -log "queued ${total} repos across $# org(s); JOBS=${JOBS}, DEPTH=${DEPTH}, OUTPUT=${OUTPUT}" -[ "${total}" -gt 0 ] || { log "nothing to clone"; exit 0; } - CLONE_LOG="${OUTPUT}/clone.log" -: > "${CLONE_LOG}" +PARSE_LOG="${OUTPUT}/parse.log" # one line per repo the consumer has parsed +PARSE_QUEUE_LOG="${OUTPUT}/parse_queue.log" # one line per repo enqueued for parsing +TOTAL_FAIL=0 # Per-repo worker: skip-if-exists, clone, retry on failure with quadratic # backoff. Records its outcome in CLONE_LOG (one short line, append is atomic # for the lengths used here) and stays silent on stdout so the progress bar # rendered by the watcher below is the only thing on the user's terminal. # Always returns 0 so a single bad repo doesn't sink the batch. 
+# +# As soon as the repo is on disk its path is pushed onto PARSE_FIFO. A single +# background consumer (see parse_consumer) drains that queue and parses repos +# one at a time, so parsing overlaps with downloads yet never runs more than one +# parser at once. The org's special `.github` repo is never enqueued. clone_one() { local slug=$1 local dest="${OUTPUT}/${slug}" + local name=${slug##*/} local outcome if [ -d "${dest}/.git" ]; then @@ -137,13 +151,32 @@ clone_one() { fi printf '%s\n' "${outcome}" >> "${CLONE_LOG}" + + # Account for this repo as one parse unit. Repos with parseable content are + # enqueued for the single consumer (which ticks the parse log once parsed); + # everything else — the org `.github` repo, a missing tree, or a failed + # clone — has nothing to parse, so it ticks the parse log here as a resolved + # unit. Either way each repo advances the parse bar exactly once. + case "${outcome}" in + OK*|SKIP*) + if [ "${name}" != ".github" ] && [ -d "${dest}" ]; then + printf 'Q\n' >> "${PARSE_QUEUE_LOG}" + printf '%s\n' "${dest}" > "${PARSE_FIFO}" + else + printf 'S\n' >> "${PARSE_LOG}" + fi + ;; + *) + printf 'S\n' >> "${PARSE_LOG}" + ;; + esac } export -f clone_one -export OUTPUT DEPTH RETRIES CLONE_LOG +export OUTPUT DEPTH RETRIES CLONE_LOG PARSE_LOG PARSE_QUEUE_LOG PARSER # Progress bar matching main.cpp's per-language renderer: 40-wide "[####....]" -# refreshed in place. A background watcher polls the clone log every 200 ms -# and re-renders, so all stdio races stay inside this one process. +# refreshed in place. A background watcher polls the clone log every 200 ms and +# re-renders, so all stdio races stay inside this one process. draw_bar() { local n=$1 tot=$2 local pct=$(( tot > 0 ? 
n * 100 / tot : 0 )) @@ -156,40 +189,168 @@ draw_bar() { } export -f draw_bar -( - while :; do - sleep 0.2 - if [ -f "${CLONE_LOG}" ]; then - n=$(wc -l < "${CLONE_LOG}" 2>/dev/null | tr -d ' ') - [ -z "$n" ] && n=0 - [ "$n" -gt "${total}" ] && n=${total} - draw_bar "$n" "${total}" - fi +# Render the clone and parse progress side-by-side on one in-place line. Both +# denominators come from the caller; the org driver passes a fixed parse total +# (one unit per repo plus the whole-org pass), so the scale holds while cloning. +draw_bars() { + local cn=$1 ct=$2 pn=$3 pt=$4 + local W=20 + local full='########################################' + local empty='........................................' + # Guard the divisions explicitly: bash 3.2's $(( a ? b/c : 0 )) still + # evaluates b/c when c is 0, so a zero denominator must skip the division. + local cf=0 pf=0 + [ "${ct}" -gt 0 ] && cf=$(( cn * 100 / ct * W / 100 )) + [ "${pt}" -gt 0 ] && pf=$(( pn * 100 / pt * W / 100 )) + printf '\r clone [%s%s] %d/%d parse [%s%s] %d/%d ' \ + "${full:0:cf}" "${empty:0:$((W - cf))}" "$cn" "$ct" \ + "${full:0:pf}" "${empty:0:$((W - pf))}" "$pn" "$pt" >&2 +} +export -f draw_bars + +# Single parse consumer: drains repo paths from PARSE_FIFO and parses them one +# at a time, guaranteeing only one parser process is ever live. Holds the FIFO +# open read-write (fd 3) so enqueuing clones never block on a missing reader, +# and stops on the `__DONE__` sentinel the org driver sends once cloning ends. +parse_consumer() { + local repo + exec 3<>"${PARSE_FIFO}" + while IFS= read -r repo <&3; do + [ "${repo}" = "__DONE__" ] && break + [ -d "${repo}" ] && { "${PARSER}" "${repo}" >/dev/null 2>&1 || true; } + printf 'P\n' >> "${PARSE_LOG}" done -) & -WATCHER_PID=$! -# Make sure the watcher dies even if we exit via an error. -trap 'rm -f "${ALL_REPOS}"; kill "${WATCHER_PID}" 2>/dev/null; wait "${WATCHER_PID}" 2>/dev/null || true' EXIT - -# Fan out one slug per worker, bounded by JOBS. 
The `bash -c '… "$@"' _` -# pattern is the portable way to invoke an exported function via xargs. -< "${ALL_REPOS}" xargs -n1 -P "${JOBS}" bash -c 'clone_one "$@"' _ - -# Stop the watcher and paint the final state. -kill "${WATCHER_PID}" 2>/dev/null -wait "${WATCHER_PID}" 2>/dev/null || true - -ok=$(grep -c '^OK ' "${CLONE_LOG}" || true) -sk=$(grep -c '^SKIP ' "${CLONE_LOG}" || true) -ko=$(grep -c '^FAIL ' "${CLONE_LOG}" || true) -finished=$(( ok + sk + ko )) -draw_bar "${finished}" "${total}" -printf '\n' >&2 + exec 3<&- +} + +# Tear down any live progress watcher / parse consumer on exit, even on error. +CURRENT_WATCHER="" +CURRENT_CONSUMER="" +CURRENT_FIFO="" +cleanup() { + [ -n "${CURRENT_WATCHER}" ] && kill "${CURRENT_WATCHER}" 2>/dev/null + [ -n "${CURRENT_CONSUMER}" ] && kill "${CURRENT_CONSUMER}" 2>/dev/null + wait "${CURRENT_WATCHER}" 2>/dev/null || true + wait "${CURRENT_CONSUMER}" 2>/dev/null || true + [ -n "${CURRENT_FIFO}" ] && rm -f "${CURRENT_FIFO}" + rm -f "${CLONE_LOG}" "${PARSE_LOG}" "${PARSE_QUEUE_LOG}" +} +trap cleanup EXIT + +# Clone one org (bounded by JOBS), parse each repo as it lands, then run a +# whole-org parse and delete the org's clone tree. +process_org() { + local org=$1 + local list + list=$(mktemp -t clone-orgs.XXXXXX) + + if ! list_repos "${org}" > "${list}"; then + log "warning: skipping ${org} (listing failed)" + rm -f "${list}" + return + fi + + local total + total=$(wc -l < "${list}" | tr -d ' ') + if [ "${total}" -eq 0 ]; then + log "${org}: nothing to clone" + rm -f "${list}" + return + fi + + # The org's clone tree is OUTPUT/<owner>, where <owner> is gh's canonical + # casing taken from the first slug. + local org_owner org_dir + org_owner=$(head -n1 "${list}" | cut -d/ -f1) + org_dir="${OUTPUT}/${org_owner}" + + log "${org}: queued ${total} repos; JOBS=${JOBS}, DEPTH=${DEPTH}, OUTPUT=${OUTPUT}" + + : > "${CLONE_LOG}" + : > "${PARSE_LOG}" + : > "${PARSE_QUEUE_LOG}" + + # Start the lone parse consumer and the queue it reads from. 
+ PARSE_FIFO=$(mktemp -u -t clone-parse.XXXXXX) + mkfifo "${PARSE_FIFO}" + export PARSE_FIFO + CURRENT_FIFO="${PARSE_FIFO}" + parse_consumer & + CURRENT_CONSUMER=$! + + # One watcher renders both bars and stays alive until parsing has drained, so + # the parse bar keeps advancing through the backlog after cloning finishes. + ( + while :; do + sleep 0.2 + cn=0; pn=0 + [ -f "${CLONE_LOG}" ] && cn=$(wc -l < "${CLONE_LOG}" 2>/dev/null | tr -d ' ') + [ -f "${PARSE_LOG}" ] && pn=$(wc -l < "${PARSE_LOG}" 2>/dev/null | tr -d ' ') + [ -z "${cn}" ] && cn=0; [ -z "${pn}" ] && pn=0 + [ "${cn}" -gt "${total}" ] && cn=${total} + # Parse total is fixed: every repo (one unit each) plus the whole-org pass. + draw_bars "${cn}" "${total}" "${pn}" "$(( total + 1 ))" + done + ) & + CURRENT_WATCHER=$! + + # Fan out one slug per worker, bounded by JOBS. The `bash -c '… "$@"' _` + # pattern is the portable way to invoke an exported function via xargs. + < "${list}" xargs -n1 -P "${JOBS}" bash -c 'clone_one "$@"' _ + + # Cloning done: signal end-of-queue and let the consumer drain the parse + # backlog. The watcher keeps painting both bars throughout, so the whole-org + # pass below stays the only live parser. + printf '%s\n' "__DONE__" > "${PARSE_FIFO}" + wait "${CURRENT_CONSUMER}" 2>/dev/null || true + CURRENT_CONSUMER="" + rm -f "${PARSE_FIFO}" + CURRENT_FIFO="" + + # Whole-org pass: the final "+1" parse unit folded into the parse bar. Drop + # the org `.github` repo first so the walk skips it; output is silenced so the + # bar stays the only thing on screen, and a parse-log tick lifts the bar to + # 100% once it completes. The watcher keeps painting throughout. + local org_parsed=0 + if [ -d "${org_dir}" ]; then + rm -rf "${org_dir}/.github" + # Put the whole-org aggregate in its own subdir alongside the per-repo + # results: ./result/// (e.g. result/repos/dedis/dedis), + # so it doesn't collide with the per-repo dirs at result/repos/dedis/. 
+ "${PARSER}" "${org_dir}" --output "${OUTPUT#./}/${org_owner}/${org_owner}" >/dev/null 2>&1 \ + || log "warning: whole-org parse failed for ${org}" + printf 'P\n' >> "${PARSE_LOG}" + org_parsed=1 + fi + + # Stop the watcher and paint the final state of both bars. + kill "${CURRENT_WATCHER}" 2>/dev/null + wait "${CURRENT_WATCHER}" 2>/dev/null || true + CURRENT_WATCHER="" + + local ok sk ko parsed queued + ok=$(grep -c '^OK ' "${CLONE_LOG}" || true) + sk=$(grep -c '^SKIP ' "${CLONE_LOG}" || true) + ko=$(grep -c '^FAIL ' "${CLONE_LOG}" || true) + parsed=$(wc -l < "${PARSE_LOG}" 2>/dev/null | tr -d ' '); [ -z "${parsed}" ] && parsed=0 + queued=$(wc -l < "${PARSE_QUEUE_LOG}" 2>/dev/null | tr -d ' '); [ -z "${queued}" ] && queued=0 + draw_bars "$(( ok + sk + ko ))" "${total}" "${parsed}" "$(( total + org_parsed ))" + printf '\n' >&2 + log "${org}: ${ok} cloned, ${sk} skipped, ${ko} failed (of ${total}); parsed ${queued} repos$( [ "${org_parsed}" -eq 1 ] && printf ' + whole-org pass' )" + if [ "${ko}" -gt 0 ]; then + grep '^FAIL ' "${CLONE_LOG}" | sed 's/^FAIL / /' >&2 + fi + TOTAL_FAIL=$(( TOTAL_FAIL + ko )) + + # Org tree no longer needed. 
+ [ -d "${org_dir}" ] && rm -rf "${org_dir}" + rm -f "${list}" +} + +for org in "$@"; do + process_org "${org}" +done log "" -log "done: ${ok} cloned, ${sk} skipped, ${ko} failed (of ${total})" -if [ "${ko}" -gt 0 ]; then - log "failed repos (also in ${CLONE_LOG}):" - grep '^FAIL ' "${CLONE_LOG}" | sed 's/^FAIL / /' >&2 -fi -[ "${ko}" -eq 0 ] +log "all orgs done" +[ "${TOTAL_FAIL}" -eq 0 ] diff --git a/LLVM/main.cpp b/LLVM/main.cpp index 540f8c0..54cddad 100644 --- a/LLVM/main.cpp +++ b/LLVM/main.cpp @@ -38,7 +38,7 @@ static const char *kUsage = "\n" "OPTIONS:\n" " --language Only analyze files of the given language\n" - " --output Directory to write results to (default: ./result)\n" + " --output Write results to ./result/ (default: ./result/)\n" " --compiler Override the compiler used for parsing\n" " --extensions Comma-separated list of additional file extensions\n" "\n" @@ -129,7 +129,7 @@ int main(const int argc, char *argv[]) { } std::string compilerOverride; - std::string outputDir = "./result"; + std::string outputName; // when set via --output, names the subdir under ./result auto filterLanguage = LanguageEnum::Unknown; std::vector extraExtensions; std::vector rawPaths; @@ -142,7 +142,7 @@ int main(const int argc, char *argv[]) { std::fprintf(stderr, kUsage, argv[0]); return -1; } - outputDir = argv[++i]; + outputName = argv[++i]; continue; } if (arg == "--compiler") { @@ -194,12 +194,19 @@ int main(const int argc, char *argv[]) { for (const auto &p: rawPaths) inputPaths.push_back(fs::path(p).lexically_normal().string()); - // Derive per-repo output dir: ./result/ - // For relative inputs use the path as-is; for absolute inputs use the last component. + // Output goes under ./result. With --output the subdir is exactly + // <name>; otherwise it is derived from the first input path (relative paths + // as-is, absolute paths by their last component). 
+ std::string outputDir; { - const fs::path ip = fs::path(rawPaths[0]).lexically_normal(); - const fs::path sub = ip.is_relative() ? ip : ip.filename(); - outputDir = (fs::path(outputDir) / sub).string(); + fs::path sub; + if (!outputName.empty()) + sub = fs::path(outputName); + else { + const fs::path ip = fs::path(rawPaths[0]).lexically_normal(); + sub = ip.is_relative() ? ip : ip.filename(); + } + outputDir = (fs::path("./result") / sub).lexically_normal().string(); } std::error_code ec; From aa49e95e227f1b1cb4e70250fe1b95a30b5bc0d5 Mon Sep 17 00:00:00 2001 From: AntoineBastide47 <148970403+AntoineBastide47@users.noreply.github.com> Date: Wed, 20 May 2026 18:13:54 +0200 Subject: [PATCH 3/4] feat: all the parsers --- .gitmodules | 72 +++++ LLVM/CMakeLists.txt | 38 +-- LLVM/extern/tree-sitter/CMakeLists.txt | 76 +++++ LLVM/extern/tree-sitter/tree-sitter | 1 + LLVM/extern/tree-sitter/tree-sitter-c-sharp | 1 + LLVM/extern/tree-sitter/tree-sitter-dart | 1 + LLVM/extern/tree-sitter/tree-sitter-elixir | 1 + LLVM/extern/tree-sitter/tree-sitter-erlang | 1 + LLVM/extern/tree-sitter/tree-sitter-glsl | 1 + LLVM/extern/tree-sitter/tree-sitter-go | 1 + LLVM/extern/tree-sitter/tree-sitter-haskell | 1 + LLVM/extern/tree-sitter/tree-sitter-hlsl | 1 + LLVM/extern/tree-sitter/tree-sitter-java | 1 + .../extern/tree-sitter/tree-sitter-javascript | 1 + LLVM/extern/tree-sitter/tree-sitter-kotlin | 1 + LLVM/extern/tree-sitter/tree-sitter-lua | 1 + LLVM/extern/tree-sitter/tree-sitter-objc | 1 + LLVM/extern/tree-sitter/tree-sitter-ocaml | 1 + LLVM/extern/tree-sitter/tree-sitter-perl | 1 + LLVM/extern/tree-sitter/tree-sitter-php | 1 + LLVM/extern/tree-sitter/tree-sitter-python | 1 + LLVM/extern/tree-sitter/tree-sitter-r | 1 + LLVM/extern/tree-sitter/tree-sitter-ruby | 1 + LLVM/extern/tree-sitter/tree-sitter-rust | 1 + LLVM/extern/tree-sitter/tree-sitter-scala | 1 + LLVM/extern/tree-sitter/tree-sitter-swift | 1 + .../extern/tree-sitter/tree-sitter-typescript | 1 + 
LLVM/include/LanguageData.hpp | 72 ++--- LLVM/include/LanguageParser.hpp | 6 +- LLVM/include/TreeSitterParser.hpp | 24 ++ LLVM/main.cpp | 18 +- LLVM/parsers/parser.go | 89 ------ LLVM/parsers/parser.py | 97 ------ LLVM/src/LanguageParser.cpp | 242 +-------------- LLVM/src/TreeSitterParser.cpp | 278 ++++++++++++++++++ 35 files changed, 540 insertions(+), 496 deletions(-) create mode 100644 LLVM/extern/tree-sitter/CMakeLists.txt create mode 160000 LLVM/extern/tree-sitter/tree-sitter create mode 160000 LLVM/extern/tree-sitter/tree-sitter-c-sharp create mode 160000 LLVM/extern/tree-sitter/tree-sitter-dart create mode 160000 LLVM/extern/tree-sitter/tree-sitter-elixir create mode 160000 LLVM/extern/tree-sitter/tree-sitter-erlang create mode 160000 LLVM/extern/tree-sitter/tree-sitter-glsl create mode 160000 LLVM/extern/tree-sitter/tree-sitter-go create mode 160000 LLVM/extern/tree-sitter/tree-sitter-haskell create mode 160000 LLVM/extern/tree-sitter/tree-sitter-hlsl create mode 160000 LLVM/extern/tree-sitter/tree-sitter-java create mode 160000 LLVM/extern/tree-sitter/tree-sitter-javascript create mode 160000 LLVM/extern/tree-sitter/tree-sitter-kotlin create mode 160000 LLVM/extern/tree-sitter/tree-sitter-lua create mode 160000 LLVM/extern/tree-sitter/tree-sitter-objc create mode 160000 LLVM/extern/tree-sitter/tree-sitter-ocaml create mode 160000 LLVM/extern/tree-sitter/tree-sitter-perl create mode 160000 LLVM/extern/tree-sitter/tree-sitter-php create mode 160000 LLVM/extern/tree-sitter/tree-sitter-python create mode 160000 LLVM/extern/tree-sitter/tree-sitter-r create mode 160000 LLVM/extern/tree-sitter/tree-sitter-ruby create mode 160000 LLVM/extern/tree-sitter/tree-sitter-rust create mode 160000 LLVM/extern/tree-sitter/tree-sitter-scala create mode 160000 LLVM/extern/tree-sitter/tree-sitter-swift create mode 160000 LLVM/extern/tree-sitter/tree-sitter-typescript create mode 100644 LLVM/include/TreeSitterParser.hpp delete mode 100644 LLVM/parsers/parser.go delete mode 
100644 LLVM/parsers/parser.py create mode 100644 LLVM/src/TreeSitterParser.cpp diff --git a/.gitmodules b/.gitmodules index 6d81eec..31736b4 100644 --- a/.gitmodules +++ b/.gitmodules @@ -1,3 +1,75 @@ [submodule "LLVM/extern/Serde"] path = LLVM/extern/Serde url = https://github.com/AntoineBastide47/Serde.git +[submodule "LLVM/extern/tree-sitter/tree-sitter"] + path = LLVM/extern/tree-sitter/tree-sitter + url = https://github.com/tree-sitter/tree-sitter.git +[submodule "LLVM/extern/tree-sitter/tree-sitter-javascript"] + path = LLVM/extern/tree-sitter/tree-sitter-javascript + url = https://github.com/tree-sitter/tree-sitter-javascript.git +[submodule "LLVM/extern/tree-sitter/tree-sitter-typescript"] + path = LLVM/extern/tree-sitter/tree-sitter-typescript + url = https://github.com/tree-sitter/tree-sitter-typescript.git +[submodule "LLVM/extern/tree-sitter/tree-sitter-java"] + path = LLVM/extern/tree-sitter/tree-sitter-java + url = https://github.com/tree-sitter/tree-sitter-java.git +[submodule "LLVM/extern/tree-sitter/tree-sitter-c-sharp"] + path = LLVM/extern/tree-sitter/tree-sitter-c-sharp + url = https://github.com/tree-sitter/tree-sitter-c-sharp.git +[submodule "LLVM/extern/tree-sitter/tree-sitter-rust"] + path = LLVM/extern/tree-sitter/tree-sitter-rust + url = https://github.com/tree-sitter/tree-sitter-rust.git +[submodule "LLVM/extern/tree-sitter/tree-sitter-ruby"] + path = LLVM/extern/tree-sitter/tree-sitter-ruby + url = https://github.com/tree-sitter/tree-sitter-ruby.git +[submodule "LLVM/extern/tree-sitter/tree-sitter-php"] + path = LLVM/extern/tree-sitter/tree-sitter-php + url = https://github.com/tree-sitter/tree-sitter-php.git +[submodule "LLVM/extern/tree-sitter/tree-sitter-perl"] + path = LLVM/extern/tree-sitter/tree-sitter-perl + url = https://github.com/tree-sitter-perl/tree-sitter-perl.git +[submodule "LLVM/extern/tree-sitter/tree-sitter-lua"] + path = LLVM/extern/tree-sitter/tree-sitter-lua + url = 
https://github.com/tree-sitter-grammars/tree-sitter-lua.git +[submodule "LLVM/extern/tree-sitter/tree-sitter-swift"] + path = LLVM/extern/tree-sitter/tree-sitter-swift + url = https://github.com/alex-pinkus/tree-sitter-swift.git +[submodule "LLVM/extern/tree-sitter/tree-sitter-kotlin"] + path = LLVM/extern/tree-sitter/tree-sitter-kotlin + url = https://github.com/fwcd/tree-sitter-kotlin.git +[submodule "LLVM/extern/tree-sitter/tree-sitter-r"] + path = LLVM/extern/tree-sitter/tree-sitter-r + url = https://github.com/r-lib/tree-sitter-r.git +[submodule "LLVM/extern/tree-sitter/tree-sitter-scala"] + path = LLVM/extern/tree-sitter/tree-sitter-scala + url = https://github.com/tree-sitter/tree-sitter-scala.git +[submodule "LLVM/extern/tree-sitter/tree-sitter-haskell"] + path = LLVM/extern/tree-sitter/tree-sitter-haskell + url = https://github.com/tree-sitter/tree-sitter-haskell.git +[submodule "LLVM/extern/tree-sitter/tree-sitter-ocaml"] + path = LLVM/extern/tree-sitter/tree-sitter-ocaml + url = https://github.com/tree-sitter/tree-sitter-ocaml.git +[submodule "LLVM/extern/tree-sitter/tree-sitter-erlang"] + path = LLVM/extern/tree-sitter/tree-sitter-erlang + url = https://github.com/WhatsApp/tree-sitter-erlang.git +[submodule "LLVM/extern/tree-sitter/tree-sitter-elixir"] + path = LLVM/extern/tree-sitter/tree-sitter-elixir + url = https://github.com/elixir-lang/tree-sitter-elixir.git +[submodule "LLVM/extern/tree-sitter/tree-sitter-dart"] + path = LLVM/extern/tree-sitter/tree-sitter-dart + url = https://github.com/UserNobody14/tree-sitter-dart.git +[submodule "LLVM/extern/tree-sitter/tree-sitter-objc"] + path = LLVM/extern/tree-sitter/tree-sitter-objc + url = https://github.com/tree-sitter-grammars/tree-sitter-objc.git +[submodule "LLVM/extern/tree-sitter/tree-sitter-glsl"] + path = LLVM/extern/tree-sitter/tree-sitter-glsl + url = https://github.com/tree-sitter-grammars/tree-sitter-glsl.git +[submodule "LLVM/extern/tree-sitter/tree-sitter-hlsl"] + path = 
LLVM/extern/tree-sitter/tree-sitter-hlsl + url = https://github.com/tree-sitter-grammars/tree-sitter-hlsl.git +[submodule "LLVM/extern/tree-sitter/tree-sitter-go"] + path = LLVM/extern/tree-sitter/tree-sitter-go + url = https://github.com/tree-sitter/tree-sitter-go.git +[submodule "LLVM/extern/tree-sitter/tree-sitter-python"] + path = LLVM/extern/tree-sitter/tree-sitter-python + url = https://github.com/tree-sitter/tree-sitter-python.git diff --git a/LLVM/CMakeLists.txt b/LLVM/CMakeLists.txt index 588fff7..a8050e7 100644 --- a/LLVM/CMakeLists.txt +++ b/LLVM/CMakeLists.txt @@ -54,8 +54,10 @@ add_executable(${PROJECT_NAME} include/LanguageStats.hpp include/LanguageModel.generated.hpp src/LanguageParser.cpp + src/TreeSitterParser.cpp include/LanguageParser.hpp include/LanguageData.hpp + include/TreeSitterParser.hpp ) add_executable(LanguageClassifierTests @@ -91,8 +93,10 @@ add_executable(ParserFixtureTests include/LanguageStats.hpp include/LanguageModel.generated.hpp src/LanguageParser.cpp + src/TreeSitterParser.cpp include/LanguageParser.hpp include/LanguageData.hpp + include/TreeSitterParser.hpp ) target_include_directories(ParserFixtureTests PRIVATE include) target_link_libraries(ParserFixtureTests clangLex clangBasic) @@ -135,8 +139,10 @@ add_executable(DebugLanguageOutputTests include/LanguageStats.hpp include/LanguageModel.generated.hpp src/LanguageParser.cpp + src/TreeSitterParser.cpp include/LanguageParser.hpp include/LanguageData.hpp + include/TreeSitterParser.hpp ) target_include_directories(DebugLanguageOutputTests PRIVATE include) target_link_libraries(DebugLanguageOutputTests clangLex clangBasic) @@ -173,8 +179,10 @@ add_executable(StringLiteralDecodeTests include/LanguageStats.hpp include/LanguageModel.generated.hpp src/LanguageParser.cpp + src/TreeSitterParser.cpp include/LanguageParser.hpp include/LanguageData.hpp + include/TreeSitterParser.hpp ) target_include_directories(StringLiteralDecodeTests PRIVATE include) 
target_link_libraries(StringLiteralDecodeTests clangLex clangBasic) @@ -192,29 +200,6 @@ find_package(Clang REQUIRED CONFIG) include_directories(${LLVM_INCLUDE_DIRS}) target_link_libraries(${PROJECT_NAME} clangLex clangBasic) -target_compile_definitions(${PROJECT_NAME} PRIVATE MATCHERTEXT_PARSERS_DIR="${CMAKE_SOURCE_DIR}/parsers") - -# === Go parser binary === -# Pre-compile parsers/parser.go into a native binary so each per-file invocation -# avoids recompiling, and dodges `go run`'s rule that disallows .go arguments -# from a different directory than the source file. -find_program(GO_EXECUTABLE go) -if (GO_EXECUTABLE) - set(GO_PARSER_BIN "${CMAKE_BINARY_DIR}/matchertext_go_parser") - add_custom_command( - OUTPUT ${GO_PARSER_BIN} - COMMAND ${GO_EXECUTABLE} build -o ${GO_PARSER_BIN} ${CMAKE_SOURCE_DIR}/parsers/parser.go - DEPENDS ${CMAKE_SOURCE_DIR}/parsers/parser.go - COMMENT "Building Go parser binary" - VERBATIM - ) - add_custom_target(go_parser ALL DEPENDS ${GO_PARSER_BIN}) - add_dependencies(${PROJECT_NAME} go_parser) - target_compile_definitions(${PROJECT_NAME} PRIVATE - MATCHERTEXT_GO_PARSER_BIN="${GO_PARSER_BIN}") -else () - message(WARNING "go executable not found; Go parsing will be disabled at runtime") -endif () # === OpenMP === find_package(OpenMP QUIET) @@ -234,6 +219,13 @@ target_link_libraries(ParserFixtureTests Serde) target_link_libraries(DebugLanguageOutputTests Serde) target_link_libraries(StringLiteralDecodeTests Serde) +# === tree-sitter (in-process grammars for ~21 programming languages) === +add_subdirectory(extern/tree-sitter) +target_link_libraries(${PROJECT_NAME} tree-sitter-grammars) +target_link_libraries(ParserFixtureTests tree-sitter-grammars) +target_link_libraries(DebugLanguageOutputTests tree-sitter-grammars) +target_link_libraries(StringLiteralDecodeTests tree-sitter-grammars) + # -------------------------- # Build Configs # -------------------------- diff --git a/LLVM/extern/tree-sitter/CMakeLists.txt 
b/LLVM/extern/tree-sitter/CMakeLists.txt new file mode 100644 index 0000000..62331aa --- /dev/null +++ b/LLVM/extern/tree-sitter/CMakeLists.txt @@ -0,0 +1,76 @@ +# Builds the tree-sitter runtime + one isolated static library per grammar, then +# aggregates them into the `tree-sitter-grammars` INTERFACE target that the main +# binary links. Each grammar bundles its OWN src/tree_sitter/parser.h, so every +# grammar is compiled with ONLY its own src/ on the include path to avoid header +# clashes between grammars and with the core runtime. + +set(TS_ROOT ${CMAKE_CURRENT_SOURCE_DIR}) + +if (NOT EXISTS ${TS_ROOT}/tree-sitter/lib/src/lib.c) + message(FATAL_ERROR + "tree-sitter submodules not initialized.\n" + "Run: git submodule update --init --recursive") +endif () + +# --- core runtime ----------------------------------------------------------- +add_library(tree-sitter-core STATIC ${TS_ROOT}/tree-sitter/lib/src/lib.c) +target_include_directories(tree-sitter-core + PUBLIC ${TS_ROOT}/tree-sitter/lib/include # tree_sitter/api.h for our C++ code + PRIVATE ${TS_ROOT}/tree-sitter/lib/src) # lib.c #includes the rest internally +target_compile_options(tree-sitter-core PRIVATE -w -O2) +set_target_properties(tree-sitter-core PROPERTIES POSITION_INDEPENDENT_CODE ON) + +# --- one static lib per grammar --------------------------------------------- +# `name` becomes target ts_; the language symbol tree_sitter_ lives +# in the grammar's parser.c. `src_dir` is relative to TS_ROOT and holds parser.c +# (+ optional scanner.c/.cc). Generated tables are huge: -O2 (not -O3) and -w. 
+function(add_ts_grammar name src_dir) + set(dir ${TS_ROOT}/${src_dir}) + if (NOT EXISTS ${dir}/parser.c) + message(FATAL_ERROR "tree-sitter grammar '${name}' missing parser.c at ${dir}") + endif () + set(srcs ${dir}/parser.c) + if (EXISTS ${dir}/scanner.c) + list(APPEND srcs ${dir}/scanner.c) + endif () + if (EXISTS ${dir}/scanner.cc) + list(APPEND srcs ${dir}/scanner.cc) + endif () + add_library(ts_${name} STATIC ${srcs}) + target_include_directories(ts_${name} PRIVATE ${dir}) + target_compile_options(ts_${name} PRIVATE -w -O2) + set_target_properties(ts_${name} PROPERTIES POSITION_INDEPENDENT_CODE ON) +endfunction() + +add_ts_grammar(javascript tree-sitter-javascript/src) +add_ts_grammar(typescript tree-sitter-typescript/typescript/src) +add_ts_grammar(java tree-sitter-java/src) +add_ts_grammar(c_sharp tree-sitter-c-sharp/src) +add_ts_grammar(rust tree-sitter-rust/src) +add_ts_grammar(ruby tree-sitter-ruby/src) +add_ts_grammar(php tree-sitter-php/php/src) +add_ts_grammar(perl tree-sitter-perl/src) +add_ts_grammar(lua tree-sitter-lua/src) +add_ts_grammar(swift tree-sitter-swift/src) +add_ts_grammar(kotlin tree-sitter-kotlin/src) +add_ts_grammar(r tree-sitter-r/src) +add_ts_grammar(scala tree-sitter-scala/src) +add_ts_grammar(haskell tree-sitter-haskell/src) +add_ts_grammar(ocaml tree-sitter-ocaml/grammars/ocaml/src) +add_ts_grammar(erlang tree-sitter-erlang/src) +add_ts_grammar(elixir tree-sitter-elixir/src) +add_ts_grammar(dart tree-sitter-dart/src) +add_ts_grammar(objc tree-sitter-objc/src) +add_ts_grammar(glsl tree-sitter-glsl/src) +add_ts_grammar(hlsl tree-sitter-hlsl/src) +add_ts_grammar(go tree-sitter-go/src) +add_ts_grammar(python tree-sitter-python/src) + +# --- aggregate -------------------------------------------------------------- +add_library(tree-sitter-grammars INTERFACE) +target_include_directories(tree-sitter-grammars INTERFACE ${TS_ROOT}/tree-sitter/lib/include) +target_link_libraries(tree-sitter-grammars INTERFACE + tree-sitter-core + 
ts_javascript ts_typescript ts_java ts_c_sharp ts_rust ts_ruby ts_php + ts_perl ts_lua ts_swift ts_kotlin ts_r ts_scala ts_haskell ts_ocaml + ts_erlang ts_elixir ts_dart ts_objc ts_glsl ts_hlsl ts_go ts_python) diff --git a/LLVM/extern/tree-sitter/tree-sitter b/LLVM/extern/tree-sitter/tree-sitter new file mode 160000 index 0000000..b1fa972 --- /dev/null +++ b/LLVM/extern/tree-sitter/tree-sitter @@ -0,0 +1 @@ +Subproject commit b1fa9725cf7bad668ac98b9fd3ec303f1e9076ec diff --git a/LLVM/extern/tree-sitter/tree-sitter-c-sharp b/LLVM/extern/tree-sitter/tree-sitter-c-sharp new file mode 160000 index 0000000..cac6d5f --- /dev/null +++ b/LLVM/extern/tree-sitter/tree-sitter-c-sharp @@ -0,0 +1 @@ +Subproject commit cac6d5fb595f5811a076336682d5d595ac1c9e85 diff --git a/LLVM/extern/tree-sitter/tree-sitter-dart b/LLVM/extern/tree-sitter/tree-sitter-dart new file mode 160000 index 0000000..a9bdfa3 --- /dev/null +++ b/LLVM/extern/tree-sitter/tree-sitter-dart @@ -0,0 +1 @@ +Subproject commit a9bdfa3db2fbc9b9f12c93450d04a671f33a5102 diff --git a/LLVM/extern/tree-sitter/tree-sitter-elixir b/LLVM/extern/tree-sitter/tree-sitter-elixir new file mode 160000 index 0000000..7937d3b --- /dev/null +++ b/LLVM/extern/tree-sitter/tree-sitter-elixir @@ -0,0 +1 @@ +Subproject commit 7937d3b4d65fa574163cfa59394515d3c1cf16f4 diff --git a/LLVM/extern/tree-sitter/tree-sitter-erlang b/LLVM/extern/tree-sitter/tree-sitter-erlang new file mode 160000 index 0000000..e446ec6 --- /dev/null +++ b/LLVM/extern/tree-sitter/tree-sitter-erlang @@ -0,0 +1 @@ +Subproject commit e446ec60022a7cafe157805742b41c04b499cc5d diff --git a/LLVM/extern/tree-sitter/tree-sitter-glsl b/LLVM/extern/tree-sitter/tree-sitter-glsl new file mode 160000 index 0000000..24a6c8e --- /dev/null +++ b/LLVM/extern/tree-sitter/tree-sitter-glsl @@ -0,0 +1 @@ +Subproject commit 24a6c8ef698e4480fecf8340d771fbcb5de8fbb4 diff --git a/LLVM/extern/tree-sitter/tree-sitter-go b/LLVM/extern/tree-sitter/tree-sitter-go new file mode 160000 index 
0000000..2346a3a --- /dev/null +++ b/LLVM/extern/tree-sitter/tree-sitter-go @@ -0,0 +1 @@ +Subproject commit 2346a3ab1bb3857b48b29d779a1ef9799a248cd7 diff --git a/LLVM/extern/tree-sitter/tree-sitter-haskell b/LLVM/extern/tree-sitter/tree-sitter-haskell new file mode 160000 index 0000000..0975ef7 --- /dev/null +++ b/LLVM/extern/tree-sitter/tree-sitter-haskell @@ -0,0 +1 @@ +Subproject commit 0975ef72fc3c47b530309ca93937d7d143523628 diff --git a/LLVM/extern/tree-sitter/tree-sitter-hlsl b/LLVM/extern/tree-sitter/tree-sitter-hlsl new file mode 160000 index 0000000..bab9111 --- /dev/null +++ b/LLVM/extern/tree-sitter/tree-sitter-hlsl @@ -0,0 +1 @@ +Subproject commit bab9111922d53d43668fabb61869bec51bbcb915 diff --git a/LLVM/extern/tree-sitter/tree-sitter-java b/LLVM/extern/tree-sitter/tree-sitter-java new file mode 160000 index 0000000..e10607b --- /dev/null +++ b/LLVM/extern/tree-sitter/tree-sitter-java @@ -0,0 +1 @@ +Subproject commit e10607b45ff745f5f876bfa3e94fbcc6b44bdc11 diff --git a/LLVM/extern/tree-sitter/tree-sitter-javascript b/LLVM/extern/tree-sitter/tree-sitter-javascript new file mode 160000 index 0000000..58404d8 --- /dev/null +++ b/LLVM/extern/tree-sitter/tree-sitter-javascript @@ -0,0 +1 @@ +Subproject commit 58404d8cf191d69f2674a8fd507bd5776f46cb11 diff --git a/LLVM/extern/tree-sitter/tree-sitter-kotlin b/LLVM/extern/tree-sitter/tree-sitter-kotlin new file mode 160000 index 0000000..f66d290 --- /dev/null +++ b/LLVM/extern/tree-sitter/tree-sitter-kotlin @@ -0,0 +1 @@ +Subproject commit f66d2908542e93c0204c6c241f794afe4e9cd5d1 diff --git a/LLVM/extern/tree-sitter/tree-sitter-lua b/LLVM/extern/tree-sitter/tree-sitter-lua new file mode 160000 index 0000000..10fe005 --- /dev/null +++ b/LLVM/extern/tree-sitter/tree-sitter-lua @@ -0,0 +1 @@ +Subproject commit 10fe0054734eec83049514ea2e718b2a56acd0c9 diff --git a/LLVM/extern/tree-sitter/tree-sitter-objc b/LLVM/extern/tree-sitter/tree-sitter-objc new file mode 160000 index 0000000..181a81b --- /dev/null +++ 
b/LLVM/extern/tree-sitter/tree-sitter-objc @@ -0,0 +1 @@ +Subproject commit 181a81b8f23a2d593e7ab4259981f50122909fda diff --git a/LLVM/extern/tree-sitter/tree-sitter-ocaml b/LLVM/extern/tree-sitter/tree-sitter-ocaml new file mode 160000 index 0000000..6902a86 --- /dev/null +++ b/LLVM/extern/tree-sitter/tree-sitter-ocaml @@ -0,0 +1 @@ +Subproject commit 6902a86ab5b3b80c622030210aae2d8cb95eb775 diff --git a/LLVM/extern/tree-sitter/tree-sitter-perl b/LLVM/extern/tree-sitter/tree-sitter-perl new file mode 160000 index 0000000..9b651c0 --- /dev/null +++ b/LLVM/extern/tree-sitter/tree-sitter-perl @@ -0,0 +1 @@ +Subproject commit 9b651c0ca29a2c581e2bb07815e17432ac3820d3 diff --git a/LLVM/extern/tree-sitter/tree-sitter-php b/LLVM/extern/tree-sitter/tree-sitter-php new file mode 160000 index 0000000..3f2465c --- /dev/null +++ b/LLVM/extern/tree-sitter/tree-sitter-php @@ -0,0 +1 @@ +Subproject commit 3f2465c217d0a966d41e584b42d75522f2a3149e diff --git a/LLVM/extern/tree-sitter/tree-sitter-python b/LLVM/extern/tree-sitter/tree-sitter-python new file mode 160000 index 0000000..26855ea --- /dev/null +++ b/LLVM/extern/tree-sitter/tree-sitter-python @@ -0,0 +1 @@ +Subproject commit 26855eabccb19c6abf499fbc5b8dc7cc9ab8bc64 diff --git a/LLVM/extern/tree-sitter/tree-sitter-r b/LLVM/extern/tree-sitter/tree-sitter-r new file mode 160000 index 0000000..0e6ef77 --- /dev/null +++ b/LLVM/extern/tree-sitter/tree-sitter-r @@ -0,0 +1 @@ +Subproject commit 0e6ef7741712c09dc3ee6e81c42e919820cc65ef diff --git a/LLVM/extern/tree-sitter/tree-sitter-ruby b/LLVM/extern/tree-sitter/tree-sitter-ruby new file mode 160000 index 0000000..ad907a6 --- /dev/null +++ b/LLVM/extern/tree-sitter/tree-sitter-ruby @@ -0,0 +1 @@ +Subproject commit ad907a69da0c8a4f7a943a7fe012712208da6dee diff --git a/LLVM/extern/tree-sitter/tree-sitter-rust b/LLVM/extern/tree-sitter/tree-sitter-rust new file mode 160000 index 0000000..77a3747 --- /dev/null +++ b/LLVM/extern/tree-sitter/tree-sitter-rust @@ -0,0 +1 @@ +Subproject 
commit 77a3747266f4d621d0757825e6b11edcbf991ca5 diff --git a/LLVM/extern/tree-sitter/tree-sitter-scala b/LLVM/extern/tree-sitter/tree-sitter-scala new file mode 160000 index 0000000..4d081d9 --- /dev/null +++ b/LLVM/extern/tree-sitter/tree-sitter-scala @@ -0,0 +1 @@ +Subproject commit 4d081d98670ff6e98ca42c085294fc75eec15e1d diff --git a/LLVM/extern/tree-sitter/tree-sitter-swift b/LLVM/extern/tree-sitter/tree-sitter-swift new file mode 160000 index 0000000..a923ac6 --- /dev/null +++ b/LLVM/extern/tree-sitter/tree-sitter-swift @@ -0,0 +1 @@ +Subproject commit a923ac6d78d6f3132d996ecd189a4c7ca13c8d9d diff --git a/LLVM/extern/tree-sitter/tree-sitter-typescript b/LLVM/extern/tree-sitter/tree-sitter-typescript new file mode 160000 index 0000000..75b3874 --- /dev/null +++ b/LLVM/extern/tree-sitter/tree-sitter-typescript @@ -0,0 +1 @@ +Subproject commit 75b3874edb2dc714fb1fd77a32013d0f8699989f diff --git a/LLVM/include/LanguageData.hpp b/LLVM/include/LanguageData.hpp index 6f392d3..62d92a4 100644 --- a/LLVM/include/LanguageData.hpp +++ b/LLVM/include/LanguageData.hpp @@ -1,14 +1,6 @@ #ifndef LANGUAGE_DATA_HPP #define LANGUAGE_DATA_HPP -#ifndef MATCHERTEXT_PARSERS_DIR -#define MATCHERTEXT_PARSERS_DIR "./parsers" -#endif - -#ifndef MATCHERTEXT_GO_PARSER_BIN -#define MATCHERTEXT_GO_PARSER_BIN "./matchertext_go_parser" -#endif - #include #include #include @@ -22,20 +14,42 @@ struct LanguageData { const std::span alias; /// The language's common file extensions const std::span extensions; - /// The language's common compilers - const std::span compilers; - /// The language's compiler command call template - const std::string_view cmdTemplate; }; -#define LANGUAGE_LIST(X) \ - X(C, ("c"), ("c", "h"), (), "") \ - X(CPP, ("cpp", "c++"), ("cc", "cpp", "cxx", "hpp", "hh", "hxx"), (), "") \ - X(Go, ("go"), ("go"), (MATCHERTEXT_GO_PARSER_BIN), R"("{}" "{}")") \ - X(Python, ("python", "py"), ("py", "pyw", "pyi", "pyz", "pyzw"), ("python3", "python"), "{} \"" MATCHERTEXT_PARSERS_DIR 
"/parser.py\" \"{}\"") - -// The code bellow is to make sure the data above ^^^^^ is compiled into the parser instead of being constructed at -// each parser startup for some speed optimizations. +// Every language is parsed in-process (clang for C/C++, tree-sitter otherwise), +// so only aliases and extensions are needed. Order matters: extension lookup is +// first-wins by this list's order (main.cpp's try_emplace), so later entries +// never re-list an extension owned by an earlier one (e.g. Objective-C omits +// .h, which stays C). +#define LANGUAGE_LIST(X) \ + X(C, ("c"), ("c", "h")) \ + X(CPP, ("cpp", "c++"), ("cc", "cpp", "cxx", "hpp", "hh", "hxx")) \ + X(Go, ("go"), ("go")) \ + X(Python, ("python", "py"), ("py", "pyw", "pyi", "pyz", "pyzw")) \ + X(JavaScript, ("javascript", "js"), ("js", "mjs", "cjs", "jsx")) \ + X(TypeScript, ("typescript", "ts"), ("ts", "mts", "cts", "tsx")) \ + X(Java, ("java"), ("java")) \ + X(CSharp, ("csharp", "c#", "cs"), ("cs", "csx")) \ + X(Rust, ("rust", "rs"), ("rs")) \ + X(Ruby, ("ruby", "rb"), ("rb", "rbw", "rake", "gemspec")) \ + X(PHP, ("php"), ("php", "php3", "php4", "php5", "phtml")) \ + X(Perl, ("perl", "pl"), ("pl", "pm", "perl")) \ + X(Lua, ("lua"), ("lua")) \ + X(Swift, ("swift"), ("swift")) \ + X(Kotlin, ("kotlin", "kt"), ("kt", "kts")) \ + X(R, ("r"), ("r", "R")) \ + X(Scala, ("scala"), ("scala", "sc")) \ + X(Haskell, ("haskell", "hs"), ("hs", "lhs")) \ + X(OCaml, ("ocaml", "ml"), ("ml", "mli")) \ + X(Erlang, ("erlang", "erl"), ("erl", "hrl")) \ + X(Elixir, ("elixir", "ex"), ("ex", "exs")) \ + X(Dart, ("dart"), ("dart")) \ + X(Objective_C, ("objc", "objectivec", "objective-c"), ("m", "mm")) \ + X(GLSL, ("glsl"), ("glsl", "vert", "frag", "geom", "comp", "tesc", "tese")) \ + X(HLSL, ("hlsl"), ("hlsl", "fx", "hlsli")) + +// The code below ensures the data above is compiled into the parser instead of +// being constructed at each parser startup, for some speed optimizations. template constexpr std::array sv_array(Ts... 
values) { return {std::string_view{values}...}; @@ -44,23 +58,20 @@ template constexpr std::array s #define STRIP_PARENS(...) __VA_ARGS__ #define AS_ARRAY(x) sv_array(STRIP_PARENS x) -#define MAKE_ARRAYS(name, aliases, extensions, comps, cmd) \ +#define MAKE_ARRAYS(name, aliases, extensions) \ constexpr auto k##name##Aliases = AS_ARRAY(aliases); \ - constexpr auto k##name##Extensions = AS_ARRAY(extensions); \ - constexpr auto k##name##Compilers = AS_ARRAY(comps); + constexpr auto k##name##Extensions = AS_ARRAY(extensions); LANGUAGE_LIST(MAKE_ARRAYS) #undef MAKE_ARRAYS -#define MAKE_ENTRY(name, aliases, extensions, comps, cmd) \ +#define MAKE_ENTRY(name, aliases, extensions) \ std::pair{ \ LanguageEnum::name, \ LanguageData{ \ std::span{k##name##Aliases}, \ - std::span{k##name##Extensions}, \ - std::span{k##name##Compilers}, \ - std::string_view{cmd} \ + std::span{k##name##Extensions} \ } \ }, @@ -68,13 +79,6 @@ constexpr auto kLanguageData = std::array{LANGUAGE_LIST(MAKE_ENTRY)}; #undef MAKE_ENTRY -static constexpr const LanguageData &GetLanguageData(const LanguageEnum lang) { - for (const auto &[key, value]: kLanguageData) - if (key == lang) - return value; - throw std::logic_error{"Unknown Language, make sure that the language is registered in LanguageData.hpp"}; -} - static LanguageEnum GetLanguage(const std::string_view lang) { for (const auto &[key, value]: kLanguageData) if (const auto it = std::ranges::find(value.alias, lang); it != value.alias.end()) diff --git a/LLVM/include/LanguageParser.hpp b/LLVM/include/LanguageParser.hpp index a82b826..1d7e1eb 100644 --- a/LLVM/include/LanguageParser.hpp +++ b/LLVM/include/LanguageParser.hpp @@ -14,13 +14,9 @@ class LanguageParser { public: [[nodiscard]] static bool ExtractData( - LanguageEnum language, const std::string &compilerOverride, const std::string &filePath, Serde::JSON &result + LanguageEnum language, const std::string &filePath, Serde::JSON &result ); static bool ParseLanguage(const std::string &name, 
LanguageEnum &out); - private: - [[nodiscard]] static bool RunBuildCommand( - LanguageEnum language, const std::string &compilerOverride, const std::string &filePath, std::string &out - ); }; #endif //LANGUAGE_PARSER_HPP diff --git a/LLVM/include/TreeSitterParser.hpp b/LLVM/include/TreeSitterParser.hpp new file mode 100644 index 0000000..3eeeb4a --- /dev/null +++ b/LLVM/include/TreeSitterParser.hpp @@ -0,0 +1,24 @@ +#ifndef TREE_SITTER_PARSER_HPP +#define TREE_SITTER_PARSER_HPP + +#include + +#include "JSON.hpp" +#include "LanguageClassifier.hpp" + +/// In-process tree-sitter string/comment extractor. Counterpart of +/// Parser::ParseC_CPP for the 23 programming languages whose grammars are +/// linked into the binary (see extern/tree-sitter). Emits the same JSON +/// contract: an array of {"kind":"string"|"comment","value":"..."}. +namespace TreeSitter { + /// True if `language` is parsed in-process via a linked tree-sitter grammar. + bool IsTreeSitterLanguage(LanguageEnum language); + + /// Parse `path` for `language`, appending {"kind","value"} objects to + /// `result` (made an array if it isn't one). Thread-safe (thread_local + /// parser). Returns false only on unreadable file / unsupported language; + /// parse errors yield whatever was collected (possibly empty).
+ bool Parse(LanguageEnum language, const std::string &path, Serde::JSON &result); +} + +#endif // TREE_SITTER_PARSER_HPP diff --git a/LLVM/main.cpp b/LLVM/main.cpp index 54cddad..a001100 100644 --- a/LLVM/main.cpp +++ b/LLVM/main.cpp @@ -39,7 +39,6 @@ static const char *kUsage = "OPTIONS:\n" " --language Only analyze files of the given language\n" " --output Write results to ./result/ (default: ./result/)\n" - " --compiler Override the compiler used for parsing\n" " --extensions Comma-separated list of additional file extensions\n" "\n" "EXAMPLES:\n" @@ -128,7 +127,6 @@ int main(const int argc, char *argv[]) { log_info(msg.str()); } - std::string compilerOverride; std::string outputName; // when set via --output, names the subdir under ./result auto filterLanguage = LanguageEnum::Unknown; std::vector extraExtensions; @@ -145,15 +143,6 @@ int main(const int argc, char *argv[]) { outputName = argv[++i]; continue; } - if (arg == "--compiler") { - if (i + 1 >= argc) { - std::fprintf(stderr, "--compiler requires a value\n\n"); - std::fprintf(stderr, kUsage, argv[0]); - return -1; - } - compilerOverride = argv[++i]; - continue; - } if (arg == "--extensions") { if (i + 1 >= argc) { std::fprintf(stderr, "--extensions requires a value\n\n"); @@ -375,11 +364,11 @@ int main(const int argc, char *argv[]) { const auto langStart = Clock::now(); #if USE_OPENMP - #pragma omp parallel for schedule(dynamic) default(none) shared(lang, compilerOverride, pls, done, displayedPct, langTotal) + #pragma omp parallel for schedule(dynamic) default(none) shared(lang, pls, done, displayedPct, langTotal) #endif for (const auto &[filePath, inputPath]: lang.second) { try { - if (Serde::JSON result; LanguageParser::ExtractData(lang.first, compilerOverride, filePath, result)) + if (Serde::JSON result; LanguageParser::ExtractData(lang.first, filePath, result)) Parser::GatherStatistics(std::move(result), filePath, inputPath, pls); } catch (const std::exception &e) { #pragma omp critical @@ -389,8 
+378,7 @@ int main(const int argc, char *argv[]) { } const size_t n = ++done; - const int newPct = static_cast(n * 100 / langTotal); - if (newPct > displayedPct.load(std::memory_order_relaxed)) { + if (const int newPct = static_cast(n * 100 / langTotal); newPct > displayedPct.load(std::memory_order_relaxed)) { #pragma omp critical { if (const int cur = displayedPct.load(); newPct > cur) { diff --git a/LLVM/parsers/parser.go b/LLVM/parsers/parser.go deleted file mode 100644 index 122b0ac..0000000 --- a/LLVM/parsers/parser.go +++ /dev/null @@ -1,89 +0,0 @@ -// parser.go -// Go counterpart of Parser::ParseC_CPP. -// Emits a JSON array of {"kind": "string"|"comment", "value": "..."} on stdout. -// Usage: go run parser.go -package main - -import ( - "encoding/json" - "go/scanner" - "go/token" - "os" - "strings" -) - -type Item struct { - Kind string `json:"kind"` - Value string `json:"value"` -} - -// extractStringBody strips the surrounding delimiters of a Go string literal. -// Supports interpreted strings ("...") and raw strings (`...`); preserves -// inner content verbatim — escape sequences are kept as written. 
-func extractStringBody(lit string) string { - if len(lit) < 2 { - return lit - } - first, last := lit[0], lit[len(lit)-1] - if (first == '"' && last == '"') || (first == '`' && last == '`') { - return lit[1 : len(lit)-1] - } - return lit -} - -func parse(path string) []Item { - items := make([]Item, 0) - - src, err := os.ReadFile(path) - if err != nil { - return items - } - - fset := token.NewFileSet() - file := fset.AddFile(path, fset.Base(), len(src)) - var s scanner.Scanner - s.Init(file, src, func(_ token.Position, _ string) {}, scanner.ScanComments) - - var pending strings.Builder - hasPending := false - flushPending := func() { - if hasPending { - items = append(items, Item{Kind: "string", Value: pending.String()}) - pending.Reset() - hasPending = false - } - } - - for { - _, tok, lit := s.Scan() - if tok == token.EOF { - break - } - switch tok { - case token.STRING: - pending.WriteString(extractStringBody(lit)) - hasPending = true - case token.COMMENT: - flushPending() - items = append(items, Item{Kind: "comment", Value: lit}) - case token.SEMICOLON: - // Inserted by the scanner — does not break adjacency for our purposes. - default: - flushPending() - } - } - flushPending() - - return items -} - -func main() { - if len(os.Args) < 2 { - os.Stdout.WriteString("[]") - return - } - items := parse(os.Args[1]) - enc := json.NewEncoder(os.Stdout) - enc.SetEscapeHTML(false) - _ = enc.Encode(items) -} diff --git a/LLVM/parsers/parser.py b/LLVM/parsers/parser.py deleted file mode 100644 index 3c10c70..0000000 --- a/LLVM/parsers/parser.py +++ /dev/null @@ -1,97 +0,0 @@ -#!/usr/bin/env python3 -# -# parser.py -# Python counterpart of Parser::ParseC_CPP. -# Emits a JSON array of {"kind": "string"|"comment", "value": "..."} on stdout. 
-# Usage: parser.py -# parser.py --server (reads file paths from stdin, one per line; writes one JSON array per line) - -import json -import re -import sys -import tokenize - - -_PREFIX_QUOTE = re.compile(r"^([rRbBuUfF]{0,3})('''|\"\"\"|'|\")") - - -def extract_string_body(spelling: str) -> str: - m = _PREFIX_QUOTE.match(spelling) - if not m: - return spelling - quote = m.group(2) - body_start = m.end() - body_end = spelling.rfind(quote) - if body_end <= body_start - len(quote): - return spelling[body_start:] - return spelling[body_start:body_end] - - -def parse(path: str): - items = [] - pending = None # accumulator for adjacent string concatenation - - def flush_pending(): - nonlocal pending - if pending is not None: - items.append({"kind": "string", "value": pending}) - pending = None - - try: - with open(path, "rb") as f: - tokens = tokenize.tokenize(f.readline) - for tok in tokens: - ttype = tok.type - if ttype == tokenize.STRING: - body = extract_string_body(tok.string) - pending = body if pending is None else pending + body - elif ttype == tokenize.COMMENT: - flush_pending() - items.append({"kind": "comment", "value": tok.string}) - elif ttype in ( - tokenize.NL, - tokenize.NEWLINE, - tokenize.INDENT, - tokenize.DEDENT, - tokenize.ENCODING, - tokenize.ENDMARKER, - ): - continue - else: - flush_pending() - flush_pending() - except (tokenize.TokenError, SyntaxError, OSError, UnicodeDecodeError): - # Bail out gracefully — return what we collected so far. 
- flush_pending() - - return items - - -def server_mode(): - """Read file paths from stdin line by line, write one JSON array per line to stdout.""" - for line in sys.stdin: - path = line.rstrip("\n") - if not path: - sys.stdout.write("[]\n") - sys.stdout.flush() - continue - items = parse(path) - json.dump(items, sys.stdout, ensure_ascii=False, separators=(",", ":")) - sys.stdout.write("\n") - sys.stdout.flush() - - -def main() -> int: - if len(sys.argv) >= 2 and sys.argv[1] == "--server": - server_mode() - return 0 - if len(sys.argv) < 2: - sys.stdout.write("[]") - return 0 - items = parse(sys.argv[1]) - json.dump(items, sys.stdout, ensure_ascii=False, separators=(",", ":")) - return 0 - - -if __name__ == "__main__": - sys.exit(main()) diff --git a/LLVM/src/LanguageParser.cpp b/LLVM/src/LanguageParser.cpp index 26c5285..8ad9f23 100644 --- a/LLVM/src/LanguageParser.cpp +++ b/LLVM/src/LanguageParser.cpp @@ -5,200 +5,26 @@ // #include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -extern char **environ; +#include +#include #include "../include/LanguageParser.hpp" -#include "JSON.hpp" -#include "JsonParser.hpp" #include "../include/LanguageData.hpp" #include "../include/Parser.hpp" - -static std::string format(const std::string &tpl, const std::string &a, const std::string &b) { - std::string out; - out.reserve(tpl.size() + a.size() * 2 + b.size() * 2); - - int arg = 0; - for (size_t i = 0; i < tpl.size(); ++i) { - if (i + 1 < tpl.size() && tpl[i] == '{' && tpl[i + 1] == '}') { - if (arg == 0) - out += a; - else if (arg == 1) - out += b; - else if (arg == 2 || arg == 3) - out += b + ".out"; - ++arg; - ++i; - } else { - out += tpl[i]; - } - } - return out; -} - -static bool isAvailable(const std::string_view cmd) { - const std::string check = "command -v " + std::string(cmd) + " >/dev/null 2>&1"; - return std::system(check.c_str()) == 0; -} - -static std::string firstAvailable(const std::span candidates) { - for 
(const auto c: candidates) - if (isAvailable(c)) - return std::string(c); - return ""; -} - -namespace { - // Bidirectional persistent subprocess. One instance per thread per language. - struct PersistentProcess { - int write_fd = -1; - int read_fd = -1; - pid_t pid = -1; - - [[nodiscard]] bool valid() const { - return write_fd >= 0 && read_fd >= 0 && pid > 0; - } - - bool start(const std::string &cmd) { - int to_child[2], from_child[2]; - if (pipe(to_child) < 0) - return false; - if (pipe(from_child) < 0) { - ::close(to_child[0]); - ::close(to_child[1]); - return false; - } - - posix_spawn_file_actions_t fa; - posix_spawn_file_actions_init(&fa); - posix_spawn_file_actions_adddup2(&fa, to_child[0], STDIN_FILENO); - posix_spawn_file_actions_adddup2(&fa, from_child[1], STDOUT_FILENO); - posix_spawn_file_actions_addclose(&fa, to_child[0]); - posix_spawn_file_actions_addclose(&fa, to_child[1]); - posix_spawn_file_actions_addclose(&fa, from_child[0]); - posix_spawn_file_actions_addclose(&fa, from_child[1]); - - // Put the child in its own process group so kill(-pid, SIGKILL) reaches - // both /bin/sh and the python3 it forks, preventing orphaned subprocesses. - posix_spawnattr_t attr; - posix_spawnattr_init(&attr); - posix_spawnattr_setpgroup(&attr, 0); - posix_spawnattr_setflags(&attr, POSIX_SPAWN_SETPGROUP); - - const char *argv[] = {"/bin/sh", "-c", cmd.c_str(), nullptr}; - const int r = posix_spawn(&pid, "/bin/sh", &fa, &attr, const_cast(argv), environ); - posix_spawn_file_actions_destroy(&fa); - posix_spawnattr_destroy(&attr); - - ::close(to_child[0]); - ::close(from_child[1]); - - if (r != 0) { - ::close(to_child[1]); - ::close(from_child[0]); - pid = -1; - return false; - } - - write_fd = to_child[1]; - read_fd = from_child[0]; - return true; - } - - // Send a file path, read back the JSON line. Returns false if the subprocess died or timed out. 
- bool request(const std::string &path, std::string &out, int timeout_ms = 10000) { - const std::string msg = path + "\n"; - const char *p = msg.c_str(); - size_t remaining = msg.size(); - while (remaining > 0) { - const ssize_t n = write(write_fd, p, remaining); - if (n <= 0) - return false; - p += n; - remaining -= static_cast(n); - } - - out.clear(); - char buf[4096]; - const auto deadline = std::chrono::steady_clock::now() + std::chrono::milliseconds(timeout_ms); - - while (true) { - const auto now = std::chrono::steady_clock::now(); - if (now >= deadline) - return false; - - const int wait_ms = static_cast( - std::chrono::duration_cast(deadline - now).count() - ); - struct pollfd pfd = {read_fd, POLLIN, 0}; - if (::poll(&pfd, 1, wait_ms) <= 0) - return false; - if (!(pfd.revents & POLLIN)) - return false; - - const ssize_t n = ::read(read_fd, buf, sizeof(buf)); - if (n <= 0) - return false; - out.append(buf, static_cast(n)); - - if (!out.empty() && out.back() == '\n') { - out.pop_back(); - return true; - } - } - } - - void stop() { - if (write_fd >= 0) { - ::close(write_fd); - write_fd = -1; - } - if (read_fd >= 0) { - ::close(read_fd); - read_fd = -1; - } - if (pid > 0) { - kill(-pid, SIGKILL); // kill entire process group (shell + python3 child) - waitpid(pid, nullptr, 0); - pid = -1; - } - } - }; - - struct ThreadProcesses { - std::map procs; - - ~ThreadProcesses() { - for (auto &[lang, proc]: procs) - proc.stop(); - } - }; - - thread_local ThreadProcesses tl_procs; -} +#include "../include/TreeSitterParser.hpp" bool LanguageParser::ExtractData( - const LanguageEnum language, const std::string &compilerOverride, const std::string &filePath, Serde::JSON &result + const LanguageEnum language, const std::string &filePath, Serde::JSON &result ) { + // C/C++ use the in-process clang lexer; every other supported language uses an + // in-process tree-sitter grammar (see src/TreeSitterParser.cpp). 
if (language == LanguageEnum::C || language == LanguageEnum::CPP) return Parser::ParseC_CPP(filePath, result) && result.IsArray(); - std::string out; - if (!RunBuildCommand(language, compilerOverride, filePath, out)) - return false; + if (TreeSitter::IsTreeSitterLanguage(language)) + return TreeSitter::Parse(language, filePath, result) && result.IsArray(); - result = Serde::JSONParser{out}.Parse(); - return result.IsArray(); + return false; } bool LanguageParser::ParseLanguage(const std::string &name, LanguageEnum &out) { @@ -211,53 +37,3 @@ bool LanguageParser::ParseLanguage(const std::string &name, LanguageEnum &out) { out = GetLanguage(lower); return out != LanguageEnum::Unknown; } - -bool LanguageParser::RunBuildCommand( - const LanguageEnum language, const std::string &compilerOverride, const std::string &filePath, std::string &out -) { - const auto data = GetLanguageData(language); - const std::string cc = compilerOverride.empty() ? firstAvailable(data.compilers) : compilerOverride; - if (cc.empty()) - return false; - - // Script-based parsers use a persistent per-thread subprocess to avoid interpreter startup overhead. - if (language == LanguageEnum::Python) { - auto &proc = tl_procs.procs[language]; - auto try_start = [&]() -> bool { - if (proc.valid()) - return true; - const std::string serverCmd = cc + " \"" MATCHERTEXT_PARSERS_DIR - "/parser.py\" --server"; - return proc.start(serverCmd); - }; - if (!try_start()) - return false; - if (!proc.request(filePath, out)) { - // Subprocess died mid-run; restart and retry once. - proc.stop(); - if (!try_start()) - return false; - return proc.request(filePath, out); - } - return true; - } - - // One-shot popen for other external parsers (e.g. Go binary). 
- std::array < char, 4096 > buffer{}; - const std::string cmd = format(std::string(data.cmdTemplate), cc, filePath); - FILE *pipe = popen(cmd.c_str(), "r"); - if (!pipe) - throw std::runtime_error("popen() failed"); - - while (fgets(buffer.data(), buffer.size(), pipe) != nullptr) - out += buffer.data(); - - int status = pclose(pipe); - if (status == -1) - throw std::runtime_error("pclose() failed"); - - if (WIFEXITED(status) && WEXITSTATUS(status) != 0) - throw std::runtime_error("command failed with exit code " + std::to_string(WEXITSTATUS(status))); - - return true; -} diff --git a/LLVM/src/TreeSitterParser.cpp b/LLVM/src/TreeSitterParser.cpp new file mode 100644 index 0000000..ef36995 --- /dev/null +++ b/LLVM/src/TreeSitterParser.cpp @@ -0,0 +1,278 @@ +#include "../include/TreeSitterParser.hpp" + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +// Each grammar's language function lives in its vendored parser.c. +extern "C" { + const TSLanguage *tree_sitter_javascript(void); + const TSLanguage *tree_sitter_typescript(void); + const TSLanguage *tree_sitter_java(void); + const TSLanguage *tree_sitter_c_sharp(void); + const TSLanguage *tree_sitter_rust(void); + const TSLanguage *tree_sitter_ruby(void); + const TSLanguage *tree_sitter_php(void); + const TSLanguage *tree_sitter_perl(void); + const TSLanguage *tree_sitter_lua(void); + const TSLanguage *tree_sitter_swift(void); + const TSLanguage *tree_sitter_kotlin(void); + const TSLanguage *tree_sitter_r(void); + const TSLanguage *tree_sitter_scala(void); + const TSLanguage *tree_sitter_haskell(void); + const TSLanguage *tree_sitter_ocaml(void); + const TSLanguage *tree_sitter_erlang(void); + const TSLanguage *tree_sitter_elixir(void); + const TSLanguage *tree_sitter_dart(void); + const TSLanguage *tree_sitter_objc(void); + const TSLanguage *tree_sitter_glsl(void); + const TSLanguage *tree_sitter_hlsl(void); + const TSLanguage *tree_sitter_go(void); + 
const TSLanguage *tree_sitter_python(void); +} + +namespace { + // ---- grammar registry ---------------------------------------------------- + const TSLanguage *languageFor(const LanguageEnum lang) { + switch (lang) { + case LanguageEnum::JavaScript: return tree_sitter_javascript(); + case LanguageEnum::TypeScript: return tree_sitter_typescript(); + case LanguageEnum::Java: return tree_sitter_java(); + case LanguageEnum::CSharp: return tree_sitter_c_sharp(); + case LanguageEnum::Rust: return tree_sitter_rust(); + case LanguageEnum::Ruby: return tree_sitter_ruby(); + case LanguageEnum::PHP: return tree_sitter_php(); + case LanguageEnum::Perl: return tree_sitter_perl(); + case LanguageEnum::Lua: return tree_sitter_lua(); + case LanguageEnum::Swift: return tree_sitter_swift(); + case LanguageEnum::Kotlin: return tree_sitter_kotlin(); + case LanguageEnum::R: return tree_sitter_r(); + case LanguageEnum::Scala: return tree_sitter_scala(); + case LanguageEnum::Haskell: return tree_sitter_haskell(); + case LanguageEnum::OCaml: return tree_sitter_ocaml(); + case LanguageEnum::Erlang: return tree_sitter_erlang(); + case LanguageEnum::Elixir: return tree_sitter_elixir(); + case LanguageEnum::Dart: return tree_sitter_dart(); + case LanguageEnum::Objective_C: return tree_sitter_objc(); + case LanguageEnum::GLSL: return tree_sitter_glsl(); + case LanguageEnum::HLSL: return tree_sitter_hlsl(); + case LanguageEnum::Go: return tree_sitter_go(); + case LanguageEnum::Python: return tree_sitter_python(); + default: return nullptr; + } + } + + // ---- node-type classification -------------------------------------------- + enum class Kind { Other, String, Comment }; + + struct Classifier { + std::unordered_set stringTypes; + std::unordered_set commentTypes; + + Kind classify(const char *type) const { + const std::string_view t(type); + if (commentTypes.contains(t)) + return Kind::Comment; + if (stringTypes.contains(t)) + return Kind::String; + // Heuristic fallback — resilient to 
grammar version drift and the many + // string/comment node-type spellings across grammars. + if (t.find("comment") != std::string_view::npos) + return Kind::Comment; + if (t.find("string") != std::string_view::npos || t == "char" || t.find("char_literal") != std::string_view::npos) + return Kind::String; + return Kind::Other; + } + }; + + // Per-language overrides for node types the heuristic misses (no "string"/ + // "comment" substring). Languages absent here rely purely on the heuristic. + const Classifier &classifierFor(const LanguageEnum lang) { + static const std::unordered_map table = [] { + std::unordered_map m; + m[LanguageEnum::Java] = {{"text_block"}, {}}; + m[LanguageEnum::Ruby] = {{"heredoc_body", "heredoc_beginning", "bare_string"}, {}}; + m[LanguageEnum::PHP] = {{"heredoc", "nowdoc", "encapsed_string"}, {}}; + m[LanguageEnum::Perl] = {{"heredoc_content"}, {"=pod"}}; + m[LanguageEnum::Elixir] = {{"charlist", "sigil", "quoted_content"}, {}}; + m[LanguageEnum::Erlang] = {{"sigil"}, {}}; + m[LanguageEnum::OCaml] = {{"quoted_string"}, {}}; + m[LanguageEnum::Haskell] = {{}, {"haddock"}}; + return m; + }(); + static const Classifier kEmpty{}; + const auto it = table.find(lang); + return it == table.end() ? kEmpty : it->second; + } + + // ---- text helpers -------------------------------------------------------- + std::string nodeText(const TSNode node, const std::string &src) { + uint32_t s = ts_node_start_byte(node); + uint32_t e = ts_node_end_byte(node); + if (e > src.size()) + e = static_cast(src.size()); + if (s > e) + s = e; + return src.substr(s, e - s); + } + + bool isQuote(const char c) { return c == '"' || c == '\'' || c == '`'; } + + // Strip surrounding string delimiters (and optional letter/@/$ prefixes), + // handling triple-quote runs. Fallback when the grammar exposes no content + // child. Mirrors the spirit of parser.py / parser.go body extraction. 
+ std::string stripDelimiters(const std::string &s) { + size_t b = 0; + const size_t e = s.size(); + while (b < e && !isQuote(s[b]) && (std::isalpha(static_cast(s[b])) || s[b] == '@' || s[b] == '$')) + ++b; + if (b >= e || !isQuote(s[b])) + return s; // no recognizable quote delimiter + const char q = s[b]; + size_t open = 0; + while (b + open < e && s[b + open] == q) + ++open; + size_t close = 0; + while (close < e - b && s[e - 1 - close] == q) + ++close; + const size_t take = std::min(open, close); + const size_t bodyStart = b + take; + const size_t bodyEnd = e >= take && e - take >= bodyStart ? e - take : bodyStart; + return s.substr(bodyStart, bodyEnd - bodyStart); + } + + // Body of a string node: prefer content/fragment children (delimiters and + // interpolation markers excluded for free); otherwise strip delimiters. + std::string stringBody(const TSNode node, const std::string &src) { + std::string body; + bool found = false; + const uint32_t n = ts_node_named_child_count(node); + for (uint32_t i = 0; i < n; ++i) { + const TSNode c = ts_node_named_child(node, i); + const char *t = ts_node_type(c); + if (std::strstr(t, "content") || std::strstr(t, "fragment")) { + body += nodeText(c, src); + found = true; + } + } + return found ? body : stripDelimiters(nodeText(node, src)); + } + + Serde::JSON makeObj(const char *kind, const std::string &value) { + return Serde::JSON::Object({{"kind", Serde::JSON(kind)}, {"value", Serde::JSON(value)}}); + } + + // ---- tree walk ----------------------------------------------------------- + // Iterative (explicit stack) to avoid recursion-depth blowups on deep trees. + // String/comment nodes are emitted whole and not descended into; ERROR nodes + // are descended through so well-formed descendants are still collected. 
+ void walk(const TSNode root, const std::string &src, const Classifier &cls, Serde::JSON &out) { + std::vector stack; + stack.push_back(root); + while (!stack.empty()) { + const TSNode node = stack.back(); + stack.pop_back(); + + switch (cls.classify(ts_node_type(node))) { + case Kind::Comment: + out.PushBack(makeObj("comment", nodeText(node, src))); + continue; + case Kind::String: + out.PushBack(makeObj("string", stringBody(node, src))); + continue; + case Kind::Other: + break; + } + + const uint32_t count = ts_node_child_count(node); + for (uint32_t i = count; i-- > 0;) + stack.push_back(ts_node_child(node, i)); + } + } + + // Cap per-file parse time so a pathological input (e.g. a huge minified + // bundle, common in large repos) can't wedge a worker thread forever. + // tree-sitter invokes the progress callback periodically while parsing; + // returning true cancels the parse, which then yields a null tree. + constexpr auto kParseTimeout = std::chrono::seconds(5); + + // TSInput read callback: hand tree-sitter the whole remaining buffer at once. + const char *readString(void *payload, const uint32_t byte, TSPoint, uint32_t *bytesRead) { + const auto *s = static_cast(payload); + if (byte >= s->size()) { + *bytesRead = 0; + return ""; + } + *bytesRead = static_cast(s->size() - byte); + return s->data() + byte; + } + + bool parseExpired(TSParseState *state) { + const auto *deadline = static_cast(state->payload); + return std::chrono::steady_clock::now() >= *deadline; + } + + // thread_local TSParser, cleaned up at thread exit (parsing runs under OpenMP). 
+ struct ParserHolder { + TSParser *parser = nullptr; + TSParser *get() { + if (!parser) + parser = ts_parser_new(); + return parser; + } + ~ParserHolder() { + if (parser) + ts_parser_delete(parser); + } + }; +} + +bool TreeSitter::IsTreeSitterLanguage(const LanguageEnum language) { + return languageFor(language) != nullptr; +} + +bool TreeSitter::Parse(const LanguageEnum language, const std::string &path, Serde::JSON &result) { + const TSLanguage *tsLang = languageFor(language); + if (!tsLang) + return false; + + std::ifstream in(path, std::ios::binary); + if (!in) + return false; + const std::string src((std::istreambuf_iterator(in)), std::istreambuf_iterator()); + + if (!result.IsArray()) + result = Serde::JSON::Array(); + + thread_local ParserHolder holder; + TSParser *parser = holder.get(); + if (!ts_parser_set_language(parser, tsLang)) + return false; // ABI mismatch between grammar and core runtime + + auto deadline = std::chrono::steady_clock::now() + kParseTimeout; + TSInput input{}; + input.payload = const_cast(&src); + input.read = readString; + input.encoding = TSInputEncodingUTF8; + input.decode = nullptr; + TSParseOptions opts{}; + opts.payload = &deadline; + opts.progress_callback = parseExpired; + + TSTree *tree = ts_parser_parse_with_options(parser, nullptr, input, opts); + if (!tree) + return true; // timed out or nothing parsed; empty array is valid + + walk(ts_tree_root_node(tree), src, classifierFor(language), result); + ts_tree_delete(tree); + return true; +} From 36e3cd5e7819976d2e1afbfed0de505d5a282a3e Mon Sep 17 00:00:00 2001 From: AntoineBastide47 <148970403+AntoineBastide47@users.noreply.github.com> Date: Wed, 20 May 2026 20:37:48 +0200 Subject: [PATCH 4/4] feat: faster parsing ~5-20% --- LLVM/extern/tree-sitter/CMakeLists.txt | 22 ++++++++- LLVM/src/TreeSitterParser.cpp | 68 +++++++++++++++++++------- 2 files changed, 69 insertions(+), 21 deletions(-) diff --git a/LLVM/extern/tree-sitter/CMakeLists.txt 
b/LLVM/extern/tree-sitter/CMakeLists.txt index 62331aa..0c3d5de 100644 --- a/LLVM/extern/tree-sitter/CMakeLists.txt +++ b/LLVM/extern/tree-sitter/CMakeLists.txt @@ -20,14 +20,30 @@ target_include_directories(tree-sitter-core target_compile_options(tree-sitter-core PRIVATE -w -O2) set_target_properties(tree-sitter-core PROPERTIES POSITION_INDEPENDENT_CODE ON) +# A few grammars (perl, swift) don't commit a generated parser.c, so generate it +# from grammar.json with the tree-sitter CLI. This also self-heals when a +# submodule re-checkout wipes the previously generated (untracked) parser.c. +find_program(TREE_SITTER_CLI tree-sitter HINTS /opt/homebrew/bin /usr/local/bin) + # --- one static lib per grammar --------------------------------------------- # `name` becomes target ts_; the language symbol tree_sitter_ lives # in the grammar's parser.c. `src_dir` is relative to TS_ROOT and holds parser.c # (+ optional scanner.c/.cc). Generated tables are huge: -O2 (not -O3) and -w. function(add_ts_grammar name src_dir) set(dir ${TS_ROOT}/${src_dir}) + get_filename_component(repo_dir ${dir} DIRECTORY) + if (NOT EXISTS ${dir}/parser.c AND EXISTS ${dir}/grammar.json AND TREE_SITTER_CLI) + message(STATUS "tree-sitter: generating parser.c for '${name}'") + execute_process( + COMMAND ${TREE_SITTER_CLI} generate src/grammar.json + WORKING_DIRECTORY ${repo_dir} + RESULT_VARIABLE ts_gen_result OUTPUT_QUIET ERROR_QUIET) + endif () if (NOT EXISTS ${dir}/parser.c) - message(FATAL_ERROR "tree-sitter grammar '${name}' missing parser.c at ${dir}") + message(FATAL_ERROR + "tree-sitter grammar '${name}' missing parser.c at ${dir}.\n" + "Install the tree-sitter CLI (brew install tree-sitter) and re-configure, or run:\n" + " (cd ${repo_dir} && tree-sitter generate src/grammar.json)") endif () set(srcs ${dir}/parser.c) if (EXISTS ${dir}/scanner.c) @@ -38,7 +54,9 @@ function(add_ts_grammar name src_dir) endif () add_library(ts_${name} STATIC ${srcs}) target_include_directories(ts_${name} PRIVATE 
${dir}) - target_compile_options(ts_${name} PRIVATE -w -O2) + # parser.c is mostly static data tables (opt level emits identical data), but + # each lib may also include a hand-written scanner.c that IS hot code, so -O3. + target_compile_options(ts_${name} PRIVATE -w -O3 -DNDEBUG -mcpu=native) set_target_properties(ts_${name} PROPERTIES POSITION_INDEPENDENT_CODE ON) endfunction() diff --git a/LLVM/src/TreeSitterParser.cpp b/LLVM/src/TreeSitterParser.cpp index ef36995..91882fb 100644 --- a/LLVM/src/TreeSitterParser.cpp +++ b/LLVM/src/TreeSitterParser.cpp @@ -4,7 +4,6 @@ #include #include #include -#include #include #include #include @@ -115,7 +114,8 @@ namespace { } // ---- text helpers -------------------------------------------------------- - std::string nodeText(const TSNode node, const std::string &src) { + // A node's source bytes as a view into `src` (no copy). + std::string_view nodeText(const TSNode node, const std::string_view src) { uint32_t s = ts_node_start_byte(node); uint32_t e = ts_node_end_byte(node); if (e > src.size()) @@ -129,8 +129,8 @@ namespace { // Strip surrounding string delimiters (and optional letter/@/$ prefixes), // handling triple-quote runs. Fallback when the grammar exposes no content - // child. Mirrors the spirit of parser.py / parser.go body extraction. - std::string stripDelimiters(const std::string &s) { + // child. Returns a view into the input. Mirrors parser.py / parser.go. + std::string_view stripDelimiters(const std::string_view s) { size_t b = 0; const size_t e = s.size(); while (b < e && !isQuote(s[b]) && (std::isalpha(static_cast(s[b])) || s[b] == '@' || s[b] == '$')) @@ -150,24 +150,38 @@ namespace { return s.substr(bodyStart, bodyEnd - bodyStart); } - // Body of a string node: prefer content/fragment children (delimiters and - // interpolation markers excluded for free); otherwise strip delimiters. 
- std::string stringBody(const TSNode node, const std::string &src) { - std::string body; - bool found = false; + // Body of a string node, returned as a view. Prefer content/fragment children + // (delimiters and interpolation markers excluded for free); otherwise strip + // delimiters off the whole node. The rare multi-child case concatenates into + // `scratch` (reused across nodes) and returns a view into it. + std::string_view stringBody(const TSNode node, const std::string_view src, std::string &scratch) { const uint32_t n = ts_node_named_child_count(node); + int firstContent = -1; + int contentCount = 0; for (uint32_t i = 0; i < n; ++i) { - const TSNode c = ts_node_named_child(node, i); - const char *t = ts_node_type(c); + const char *t = ts_node_type(ts_node_named_child(node, i)); if (std::strstr(t, "content") || std::strstr(t, "fragment")) { - body += nodeText(c, src); - found = true; + if (firstContent < 0) + firstContent = static_cast(i); + ++contentCount; } } - return found ? body : stripDelimiters(nodeText(node, src)); + if (contentCount == 1) + return nodeText(ts_node_named_child(node, static_cast(firstContent)), src); + if (contentCount > 1) { + scratch.clear(); + for (uint32_t i = 0; i < n; ++i) { + const TSNode c = ts_node_named_child(node, i); + const char *t = ts_node_type(c); + if (std::strstr(t, "content") || std::strstr(t, "fragment")) + scratch.append(nodeText(c, src)); + } + return scratch; + } + return stripDelimiters(nodeText(node, src)); } - Serde::JSON makeObj(const char *kind, const std::string &value) { + Serde::JSON makeObj(const char *kind, const std::string_view value) { return Serde::JSON::Object({{"kind", Serde::JSON(kind)}, {"value", Serde::JSON(value)}}); } @@ -175,8 +189,9 @@ namespace { // Iterative (explicit stack) to avoid recursion-depth blowups on deep trees. // String/comment nodes are emitted whole and not descended into; ERROR nodes // are descended through so well-formed descendants are still collected. 
- void walk(const TSNode root, const std::string &src, const Classifier &cls, Serde::JSON &out) { + void walk(const TSNode root, const std::string_view src, const Classifier &cls, Serde::JSON &out) { std::vector stack; + std::string scratch; // reused buffer for multi-child string concatenation stack.push_back(root); while (!stack.empty()) { const TSNode node = stack.back(); @@ -187,7 +202,7 @@ namespace { out.PushBack(makeObj("comment", nodeText(node, src))); continue; case Kind::String: - out.PushBack(makeObj("string", stringBody(node, src))); + out.PushBack(makeObj("string", stringBody(node, src, scratch))); continue; case Kind::Other: break; @@ -245,14 +260,29 @@ bool TreeSitter::Parse(const LanguageEnum language, const std::string &path, Ser if (!tsLang) return false; - std::ifstream in(path, std::ios::binary); + // Bulk read (one allocation + one read) instead of istreambuf_iterator. + std::ifstream in(path, std::ios::binary | std::ios::ate); if (!in) return false; - const std::string src((std::istreambuf_iterator(in)), std::istreambuf_iterator()); + const std::streamoff size = in.tellg(); + if (size < 0) + return false; + std::string src(static_cast(size), '\0'); + in.seekg(0); + in.read(src.data(), size); + src.resize(static_cast(in.gcount())); if (!result.IsArray()) result = Serde::JSON::Array(); + // Skip binary files. Source extensions collide with non-source formats (e.g. + // `.ts` is both TypeScript and MPEG transport stream, which Chromium ships as + // test data). A NUL byte is a reliable binary signal — source essentially + // never contains one — and feeding binary to the GLR parser triggers + // pathological error-recovery blowups the parse timeout can't always catch. + if (std::memchr(src.data(), '\0', src.size()) != nullptr) + return true; + thread_local ParserHolder holder; TSParser *parser = holder.get(); if (!ts_parser_set_language(parser, tsLang))