From 9c0a0292111df8a839783c183e3adf5b68708147 Mon Sep 17 00:00:00 2001
From: AntoineBastide47 <148970403+AntoineBastide47@users.noreply.github.com>
Date: Wed, 20 May 2026 14:53:52 +0200
Subject: [PATCH 1/4] feat: faster file indexing

---
 LLVM/main.cpp | 107 ++++++++++++++++++++++++++++++++++++++++----------
 1 file changed, 87 insertions(+), 20 deletions(-)
diff --git a/LLVM/main.cpp b/LLVM/main.cpp
index cb01a61..540f8c0 100644
--- a/LLVM/main.cpp
+++ b/LLVM/main.cpp
@@ -1,7 +1,9 @@
+#include <algorithm>
 #include <atomic>
 #include <chrono>
 #include <filesystem>
 #include <fstream>
+#include <functional>
 #include <iomanip>
 #include <iostream>
 #include <map>
@@ -49,6 +51,16 @@ static long long elapsed_ms(const Clock::time_point start, const Clock::time_poi
   return std::chrono::duration_cast<std::chrono::milliseconds>(end - start).count();
 }
 
+// Slot index for per-thread storage: the calling thread's OpenMP thread number,
+// or 0 when USE_OPENMP is off, so callers can index such storage without locks.
+static int current_thread() {
+#if !USE_OPENMP
+  return 0;
+#else
+  return omp_get_thread_num();
+#endif
+}
+
 static void log_info(const std::string &message) {
   std::istringstream lines(message);
   std::string line;
@@ -217,41 +229,96 @@ int main(const int argc, char *argv[]) {
         extToLang.try_emplace(std::string(ext), lang);
   }
 
-  // Single directory walk: bucket files by language.
-  // lexically_normal() is pure string arithmetic (no stat syscalls).
+  // Parallel directory walk: bucket files by language.
+  // The readdir/stat syscalls and the per-file work (extension parsing plus
+  // lexically_normal, which is pure string arithmetic) are spread across
+  // threads via OpenMP tasks — one task per subdirectory, so sibling subtrees
+  // are walked concurrently. Each thread fills its own bucket set, so the hot
+  // path is lock-free; a cheap serial merge below deduplicates and assembles
+  // the global buckets.
   const auto indexingStart = Clock::now();
   std::unordered_set<std::string> seen;
   std::map<LanguageEnum, std::vector<std::pair<std::string, std::string>>> buckets;
 
+#if USE_OPENMP
+  const int indexThreads = omp_get_max_threads();
+#else
+  const int indexThreads = 1;
+#endif
+  std::vector<std::map<LanguageEnum, std::vector<std::pair<std::string, std::string>>>> tlBuckets(indexThreads);
+
+  // Resolve roots up front so their normalized input-path strings live in
+  // stable storage that every spawned task can safely reference.
+  std::vector<std::pair<fs::path, std::string>> roots;
+  roots.reserve(rawPaths.size());
   for (const auto &rawPath: rawPaths) {
     const fs::path p(rawPath);
     if (!fs::exists(p)) {
       std::cerr << "Path does not exist: " << p << "\n";
       continue;
     }
-    const std::string inputPath = p.lexically_normal().string();
-
-    auto try_add = [&](const fs::path &fp) {
-      const std::string extStr = fp.extension().string();
-      if (extStr.size() <= 1)
-        return;
-      const auto it = extToLang.find(extStr.substr(1));
-      if (it == extToLang.end())
-        return;
-      const std::string norm = fp.lexically_normal().string();
-      if (!seen.insert(norm).second)
-        return;
-      buckets[it->second].emplace_back(norm, inputPath);
-    };
+    roots.emplace_back(p, p.lexically_normal().string());
+  }
 
+  // Thread-local collector: look the file's extension up in extToLang and, on a
+  // hit, append (normalized path, input root) to the calling thread's bucket.
+  // No dedup and no locking here — the serial merge after the walk handles that.
+  auto try_add = [&](const fs::path &fp, const std::string &inputPath) {
+    const std::string extStr = fp.extension().string();
+    if (extStr.size() > 1) {
+      if (const auto it = extToLang.find(extStr.substr(1)); it != extToLang.end()) {
+        auto &bucket = tlBuckets[current_thread()][it->second];
+        bucket.emplace_back(fp.lexically_normal().string(), inputPath);
+      }
+    }
+  };
+
+  // Non-recursive scan of one directory; each subdirectory is handed to its own
+  // task. Entry types come from the readdir cache where the platform provides
+  // it. Directory symlinks are not descended (as with the default
+  // recursive_directory_iterator), so cycles cannot loop; errors skip the entry.
+  std::function<void(const fs::path &, const std::string &)> walk =
+      [&](const fs::path &dir, const std::string &inputPath) {
+    std::vector<fs::path> subdirs;
+    std::error_code wec;
+    for (fs::directory_iterator it(dir, fs::directory_options::skip_permission_denied, wec), end;
+         !wec && it != end; it.increment(wec)) {
+      std::error_code tec;
+      if (!it->is_symlink(tec) && it->is_directory(tec))
+        subdirs.push_back(it->path());
+      else if (it->is_regular_file(tec))
+        try_add(it->path(), inputPath);
+    }
+    for (auto &sd: subdirs) {
+#if USE_OPENMP
+#pragma omp task firstprivate(sd) shared(walk, inputPath)
+#endif
+      walk(sd, inputPath);
+    }
+  };
+
+#if USE_OPENMP
+#pragma omp parallel
+#pragma omp single
+#endif
+  for (const auto &[p, inputPath]: roots) {
     if (fs::is_regular_file(p))
-      try_add(p);
+      try_add(p, inputPath);
     else if (fs::is_directory(p))
-      for (const auto &entry: fs::recursive_directory_iterator(p))
-        if (fs::is_regular_file(entry))
-          try_add(entry.path());
+      walk(p, inputPath);
   }
 
+  // Serial merge: pool every thread's finds per language, then sort each bucket
+  // and drop repeated paths, so the result never depends on thread scheduling.
+  for (auto &tb: tlBuckets)
+    for (auto &[lang, files]: tb)
+      for (auto &entry: files)
+        buckets[lang].push_back(std::move(entry));
+  for (auto &files: buckets | std::views::values) {
+    std::ranges::sort(files);
+    std::erase_if(files, [&](const auto &e) { return !seen.insert(e.first).second; });
+  }
+
   // Assemble langFiles in kLanguageData order for deterministic output.
   std::vector<std::pair<LanguageEnum, std::vector<std::pair<std::string, std::string>>>> langFiles;
   size_t totalFiles = 0;

From ca1f6a108d9d8d0044c42d95d1e2465f85558cf8 Mon Sep 17 00:00:00 2001
From: AntoineBastide47 <148970403+AntoineBastide47@users.noreply.github.com>
Date: Wed, 20 May 2026 16:17:18 +0200
Subject: [PATCH 2/4] feat: change how cloning works

---
 LLVM/.gitignore    |   3 +-
 LLVM/clone-orgs.sh | 263 ++++++++++++++++++++++++++++++++++++---------
 LLVM/main.cpp      |  23 ++--
 3 files changed, 229 insertions(+), 60 deletions(-)

diff --git a/LLVM/.gitignore b/LLVM/.gitignore
index 7c6db17..76e6b39 100644
--- a/LLVM/.gitignore
+++ b/LLVM/.gitignore
@@ -5,4 +5,5 @@ build-test
 ./ignore
 ./include/LanguageModel.generated.hpp
 result/**
-train/__pycache__
\ No newline at end of file
+train/__pycache__
+repos/**
\ No newline at end of file
diff --git a/LLVM/clone-orgs.sh b/LLVM/clone-orgs.sh
index 954a868..d8fe39e 100755
--- a/LLVM/clone-orgs.sh
+++ b/LLVM/clone-orgs.sh
@@ -1,7 +1,9 @@
 #!/usr/bin/env bash
 #
 # clone-orgs.sh — clone every (non-archived) repository in one or more GitHub
-# orgs in parallel, designed to stay well under GitHub's API and abuse limits.
+# orgs in parallel, parse each repo as it lands, and emit a whole-org parse
+# before discarding the clone tree. Designed to stay well under GitHub's API
+# and abuse limits.
 #
 # Usage:
 #   ./clone-orgs.sh [-o <output-dir>] <org1> [<org2> ...]
@@ -18,7 +20,18 @@
 #   SKIP_FORKS    skip forked repos     (default 0)
 #   RETRIES       per-repo retry count  (default 3, exponential backoff)
 #
-# Requirements: gh (authenticated — run `gh auth login` once), git, jq.
+# Per-org flow:
+#   1. List the org's repos.
+#   2. Clone them in parallel (bounded by JOBS). The instant a repo is on disk
+#      its tree is handed to `parser <repo>` in the background, so parsing
+#      overlaps with the next download. The org's special `.github` repo is
+#      never parsed.
+#   3. Once every clone and every per-repo parse has finished, run one more
+#      `parser` pass over the whole org folder, then delete that folder and
+#      move on to the next org.
+#
+# Requirements: gh (authenticated — run `gh auth login` once), git, jq, and the
+# built `parser` binary next to this script.
 # For private repos also run `gh auth setup-git` so plain `git clone` picks
 # up the gh token via git's credential helper.
 #
@@ -70,6 +83,10 @@ command -v git >/dev/null || die "git not found"
 command -v jq  >/dev/null || die "jq not found"
 gh auth status >/dev/null 2>&1 || die "not authenticated — run: gh auth login"
 
+SCRIPT_DIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)
+PARSER="${SCRIPT_DIR}/parser"
+[ -x "${PARSER}" ] || die "parser binary not found / not executable at ${PARSER} (build it first)"
+
 remaining=$(gh api rate_limit -q '.resources.core.remaining' 2>/dev/null || echo 0)
 log "GitHub REST budget: ${remaining}/5000 remaining"
 [ "${remaining}" -ge 100 ] || die "REST budget too low (${remaining}); wait or use a different token"
@@ -91,28 +108,25 @@ list_repos() {
     -q "${jq_filter}"
 }
 
-ALL_REPOS=$(mktemp -t clone-orgs.XXXXXX)
-trap 'rm -f "${ALL_REPOS}"' EXIT
-
-for org in "$@"; do
-  list_repos "${org}" >> "${ALL_REPOS}" || log "warning: skipping ${org} (listing failed)"
-done
-
-total=$(wc -l < "${ALL_REPOS}" | tr -d ' ')
-log "queued ${total} repos across $# org(s); JOBS=${JOBS}, DEPTH=${DEPTH}, OUTPUT=${OUTPUT}"
-[ "${total}" -gt 0 ] || { log "nothing to clone"; exit 0; }
-
 CLONE_LOG="${OUTPUT}/clone.log"
-: > "${CLONE_LOG}"
+PARSE_LOG="${OUTPUT}/parse.log"             # one line per repo the consumer has parsed
+PARSE_QUEUE_LOG="${OUTPUT}/parse_queue.log" # one line per repo enqueued for parsing
+TOTAL_FAIL=0
 
 # Per-repo worker: skip-if-exists, clone, retry on failure with quadratic
 # backoff. Records its outcome in CLONE_LOG (one short line, append is atomic
 # for the lengths used here) and stays silent on stdout so the progress bar
 # rendered by the watcher below is the only thing on the user's terminal.
 # Always returns 0 so a single bad repo doesn't sink the batch.
+#
+# As soon as the repo is on disk its path is pushed onto PARSE_FIFO. A single
+# background consumer (see parse_consumer) drains that queue and parses repos
+# one at a time, so parsing overlaps with downloads yet never runs more than one
+# parser at once. The org's special `.github` repo is never enqueued.
 clone_one() {
   local slug=$1
   local dest="${OUTPUT}/${slug}"
+  local name=${slug##*/}
   local outcome
 
   if [ -d "${dest}/.git" ]; then
@@ -137,13 +151,32 @@ clone_one() {
   fi
 
   printf '%s\n' "${outcome}" >> "${CLONE_LOG}"
+
+  # Account for this repo as one parse unit. Repos with parseable content are
+  # enqueued for the single consumer (which ticks the parse log once parsed);
+  # everything else — the org `.github` repo, a missing tree, or a failed
+  # clone — has nothing to parse, so it ticks the parse log here as a resolved
+  # unit. Either way each repo advances the parse bar exactly once.
+  case "${outcome}" in
+    OK*|SKIP*)
+      if [ "${name}" != ".github" ] && [ -d "${dest}" ]; then
+        printf 'Q\n' >> "${PARSE_QUEUE_LOG}"
+        printf '%s\n' "${dest}" > "${PARSE_FIFO}"
+      else
+        printf 'S\n' >> "${PARSE_LOG}"
+      fi
+      ;;
+    *)
+      printf 'S\n' >> "${PARSE_LOG}"
+      ;;
+  esac
 }
 export -f clone_one
-export OUTPUT DEPTH RETRIES CLONE_LOG
+export OUTPUT DEPTH RETRIES CLONE_LOG PARSE_LOG PARSE_QUEUE_LOG PARSER
 
 # Progress bar matching main.cpp's per-language renderer: 40-wide "[####....]"
-# refreshed in place. A background watcher polls the clone log every 200 ms
-# and re-renders, so all stdio races stay inside this one process.
+# refreshed in place. A background watcher polls the clone log every 200 ms and
+# re-renders, so all stdio races stay inside this one process.
 draw_bar() {
   local n=$1 tot=$2
   local pct=$(( tot > 0 ? n * 100 / tot : 0 ))
@@ -156,40 +189,168 @@ draw_bar() {
 }
 export -f draw_bar
 
-(
-  while :; do
-    sleep 0.2
-    if [ -f "${CLONE_LOG}" ]; then
-      n=$(wc -l < "${CLONE_LOG}" 2>/dev/null | tr -d ' ')
-      [ -z "$n" ] && n=0
-      [ "$n" -gt "${total}" ] && n=${total}
-      draw_bar "$n" "${total}"
-    fi
+# Render the clone and parse progress side-by-side on one in-place line. The
+# caller passes each bar's numerator and denominator (cn/ct for clones, pn/pt
+# for parses); every bar is W cells wide and a zero denominator draws it empty.
+draw_bars() {
+  local cn=$1 ct=$2 pn=$3 pt=$4
+  local W=20
+  local full='########################################'
+  local empty='........................................'
+  # Guard the divisions explicitly: bash 3.2's $(( a ? b/c : 0 )) still
+  # evaluates b/c when c is 0, so a zero total must never reach the arithmetic.
+  local cf=0 pf=0
+  if [ "${ct}" -gt 0 ]; then cf=$(( cn * 100 / ct * W / 100 )); fi
+  if [ "${pt}" -gt 0 ]; then pf=$(( pn * 100 / pt * W / 100 )); fi
+  printf '\r  clone [%s%s] %d/%d  parse [%s%s] %d/%d   ' \
+    "${full:0:cf}" "${empty:0:$((W - cf))}" "$cn" "$ct" \
+    "${full:0:pf}" "${empty:0:$((W - pf))}" "$pn" "$pt" >&2
+}
+export -f draw_bars
+
+# Single parse consumer: drains repo paths from PARSE_FIFO and parses them one
+# at a time, guaranteeing only one parser process is ever live. Holds the FIFO
+# open read-write (fd 3) so enqueuing clones never block on a missing reader,
+# and stops on the `__DONE__` sentinel the org driver sends once cloning ends.
+parse_consumer() {
+  local repo
+  exec 3<>"${PARSE_FIFO}"
+  while IFS= read -r repo <&3; do
+    [ "${repo}" = "__DONE__" ] && break
+    [ -d "${repo}" ] && { "${PARSER}" "${repo}" >/dev/null 2>&1 || true; }
+    printf 'P\n' >> "${PARSE_LOG}"
   done
-) &
-WATCHER_PID=$!
-# Make sure the watcher dies even if we exit via an error.
-trap 'rm -f "${ALL_REPOS}"; kill "${WATCHER_PID}" 2>/dev/null; wait "${WATCHER_PID}" 2>/dev/null || true' EXIT
-
-# Fan out one slug per worker, bounded by JOBS. The `bash -c '… "$@"' _`
-# pattern is the portable way to invoke an exported function via xargs.
-< "${ALL_REPOS}" xargs -n1 -P "${JOBS}" bash -c 'clone_one "$@"' _
-
-# Stop the watcher and paint the final state.
-kill "${WATCHER_PID}" 2>/dev/null
-wait "${WATCHER_PID}" 2>/dev/null || true
-
-ok=$(grep -c '^OK   ' "${CLONE_LOG}" || true)
-sk=$(grep -c '^SKIP ' "${CLONE_LOG}" || true)
-ko=$(grep -c '^FAIL ' "${CLONE_LOG}" || true)
-finished=$(( ok + sk + ko ))
-draw_bar "${finished}" "${total}"
-printf '\n' >&2
+  exec 3<&-
+}
+
+# Tear down any live progress watcher / parse consumer on exit, even on error.
+CURRENT_WATCHER=""
+CURRENT_CONSUMER=""
+CURRENT_FIFO=""
+cleanup() {
+  [ -n "${CURRENT_WATCHER}" ] && kill "${CURRENT_WATCHER}" 2>/dev/null
+  [ -n "${CURRENT_CONSUMER}" ] && kill "${CURRENT_CONSUMER}" 2>/dev/null
+  wait "${CURRENT_WATCHER}" 2>/dev/null || true
+  wait "${CURRENT_CONSUMER}" 2>/dev/null || true
+  [ -n "${CURRENT_FIFO}" ] && rm -f "${CURRENT_FIFO}"
+  rm -f "${CLONE_LOG}" "${PARSE_LOG}" "${PARSE_QUEUE_LOG}"
+}
+trap cleanup EXIT
+
+# Clone one org (bounded by JOBS), parse each repo as it lands, then run a
+# whole-org parse and delete the org's clone tree.
+process_org() {
+  local org=$1
+  local list
+  list=$(mktemp -t clone-orgs.XXXXXX)
+
+  if ! list_repos "${org}" > "${list}"; then
+    log "warning: skipping ${org} (listing failed)"
+    rm -f "${list}"
+    return
+  fi
+
+  local total
+  total=$(wc -l < "${list}" | tr -d ' ')
+  if [ "${total}" -eq 0 ]; then
+    log "${org}: nothing to clone"
+    rm -f "${list}"
+    return
+  fi
+
+  # The org's clone tree is OUTPUT/<owner>, where <owner> is gh's canonical
+  # casing taken from the first slug.
+  local org_owner org_dir
+  org_owner=$(head -n1 "${list}" | cut -d/ -f1)
+  org_dir="${OUTPUT}/${org_owner}"
+
+  log "${org}: queued ${total} repos; JOBS=${JOBS}, DEPTH=${DEPTH}, OUTPUT=${OUTPUT}"
+
+  : > "${CLONE_LOG}"
+  : > "${PARSE_LOG}"
+  : > "${PARSE_QUEUE_LOG}"
+
+  # Start the lone parse consumer and the queue it reads from.
+  PARSE_FIFO=$(mktemp -u -t clone-parse.XXXXXX)
+  mkfifo "${PARSE_FIFO}"
+  export PARSE_FIFO
+  CURRENT_FIFO="${PARSE_FIFO}"
+  parse_consumer &
+  CURRENT_CONSUMER=$!
+
+  # One watcher renders both bars and stays alive until parsing has drained, so
+  # the parse bar keeps advancing through the backlog after cloning finishes.
+  (
+    while :; do
+      sleep 0.2
+      cn=0; pn=0
+      [ -f "${CLONE_LOG}" ] && cn=$(wc -l < "${CLONE_LOG}" 2>/dev/null | tr -d ' ')
+      [ -f "${PARSE_LOG}" ] && pn=$(wc -l < "${PARSE_LOG}" 2>/dev/null | tr -d ' ')
+      [ -z "${cn}" ] && cn=0; [ -z "${pn}" ] && pn=0
+      [ "${cn}" -gt "${total}" ] && cn=${total}
+      # Parse total is fixed: every repo (one unit each) plus the whole-org pass.
+      draw_bars "${cn}" "${total}" "${pn}" "$(( total + 1 ))"
+    done
+  ) &
+  CURRENT_WATCHER=$!
+
+  # Fan out one slug per worker, bounded by JOBS. The `bash -c '… "$@"' _`
+  # pattern is the portable way to invoke an exported function via xargs.
+  < "${list}" xargs -n1 -P "${JOBS}" bash -c 'clone_one "$@"' _
+
+  # Cloning done: signal end-of-queue and wait for the consumer to drain the
+  # parse backlog, so the whole-org pass below is the only live parser. The
+  # watcher keeps painting both bars throughout.
+  printf '%s\n' "__DONE__" > "${PARSE_FIFO}"
+  wait "${CURRENT_CONSUMER}" 2>/dev/null || true
+  CURRENT_CONSUMER=""
+  rm -f "${PARSE_FIFO}"
+  CURRENT_FIFO=""
+
+  # Whole-org pass: the final "+1" parse unit folded into the parse bar. Drop
+  # the org `.github` repo first so the walk skips it; output is silenced so the
+  # bar stays the only thing on screen, and a parse-log tick lifts the bar to
+  # 100% once it completes. The watcher keeps painting throughout.
+  local org_parsed=0
+  if [ -n "${org_owner}" ] && [ -d "${org_dir}" ]; then
+    rm -rf "${org_dir}/.github"
+    # Put the whole-org aggregate in its own subdir alongside the per-repo
+    # results: ./result/<output-root>/<org>/<org> (e.g. result/repos/dedis/dedis),
+    # so it doesn't collide with the per-repo dirs at result/repos/dedis/<repo>.
+    "${PARSER}" "${org_dir}" --output "${OUTPUT#./}/${org_owner}/${org_owner}" >/dev/null 2>&1 \
+      || log "warning: whole-org parse failed for ${org}"
+    printf 'P\n' >> "${PARSE_LOG}"
+    org_parsed=1
+  fi
+
+  # Stop the watcher and paint the final state of both bars.
+  kill "${CURRENT_WATCHER}" 2>/dev/null
+  wait "${CURRENT_WATCHER}" 2>/dev/null || true
+  CURRENT_WATCHER=""
+
+  local ok sk ko parsed queued
+  ok=$(grep -c '^OK   ' "${CLONE_LOG}" || true)
+  sk=$(grep -c '^SKIP ' "${CLONE_LOG}" || true)
+  ko=$(grep -c '^FAIL ' "${CLONE_LOG}" || true)
+  parsed=$(wc -l < "${PARSE_LOG}" 2>/dev/null | tr -d ' '); [ -z "${parsed}" ] && parsed=0
+  queued=$(wc -l < "${PARSE_QUEUE_LOG}" 2>/dev/null | tr -d ' '); [ -z "${queued}" ] && queued=0
+  draw_bars "$(( ok + sk + ko ))" "${total}" "${parsed}" "$(( total + org_parsed ))"
+  printf '\n' >&2
+  log "${org}: ${ok} cloned, ${sk} skipped, ${ko} failed (of ${total}); parsed ${queued} repos$( [ "${org_parsed}" -eq 1 ] && printf ' + whole-org pass' )"
+  if [ "${ko}" -gt 0 ]; then
+    grep '^FAIL ' "${CLONE_LOG}" | sed 's/^FAIL  /  /' >&2
+  fi
+  TOTAL_FAIL=$(( TOTAL_FAIL + ko ))
+
+  # Org tree no longer needed. An empty owner would make org_dir OUTPUT itself.
+  [ -n "${org_owner}" ] && [ -d "${org_dir}" ] && rm -rf "${org_dir}"
+  rm -f "${list}"
+}
+
+for org in "$@"; do
+  process_org "${org}"
+done
 
 log ""
-log "done: ${ok} cloned, ${sk} skipped, ${ko} failed (of ${total})"
-if [ "${ko}" -gt 0 ]; then
-  log "failed repos (also in ${CLONE_LOG}):"
-  grep '^FAIL ' "${CLONE_LOG}" | sed 's/^FAIL  /  /' >&2
-fi
-[ "${ko}" -eq 0 ]
+log "all orgs done"
+[ "${TOTAL_FAIL}" -eq 0 ]
diff --git a/LLVM/main.cpp b/LLVM/main.cpp
index 540f8c0..54cddad 100644
--- a/LLVM/main.cpp
+++ b/LLVM/main.cpp
@@ -38,7 +38,7 @@ static const char *kUsage =
     "\n"
     "OPTIONS:\n"
     "    --language <lang>              Only analyze files of the given language\n"
-    "    --output <dir>                 Directory to write results to (default: ./result)\n"
+    "    --output <name>                Write results to ./result/<name> (default: ./result/<input-path>)\n"
     "    --compiler <compiler>          Override the compiler used for parsing\n"
     "    --extensions <ext1,ext2,...>   Comma-separated list of additional file extensions\n"
     "\n"
@@ -129,7 +129,7 @@ int main(const int argc, char *argv[]) {
   }
 
   std::string compilerOverride;
-  std::string outputDir = "./result";
+  std::string outputName; // when set via --output, names the subdir under ./result
   auto filterLanguage = LanguageEnum::Unknown;
   std::vector<std::string_view> extraExtensions;
   std::vector<std::string> rawPaths;
@@ -142,7 +142,7 @@ int main(const int argc, char *argv[]) {
         std::fprintf(stderr, kUsage, argv[0]);
         return -1;
       }
-      outputDir = argv[++i];
+      outputName = argv[++i];
       continue;
     }
     if (arg == "--compiler") {
@@ -194,12 +194,19 @@ int main(const int argc, char *argv[]) {
   for (const auto &p: rawPaths)
     inputPaths.push_back(fs::path(p).lexically_normal().string());
 
-  // Derive per-repo output dir: ./result/<repo_path>
-  // For relative inputs use the path as-is; for absolute inputs use the last component.
+  // Output goes under ./result. With --output <name> the subdir is exactly
+  // <name>; otherwise it is derived from the first input path (relative paths
+  // as-is, absolute paths by their last component).
+  std::string outputDir;
   {
-    const fs::path ip = fs::path(rawPaths[0]).lexically_normal();
-    const fs::path sub = ip.is_relative() ? ip : ip.filename();
-    outputDir = (fs::path(outputDir) / sub).string();
+    fs::path sub;
+    if (!outputName.empty())
+      sub = fs::path(outputName).relative_path(); // absolute names stay under ./result
+    else {
+      const fs::path ip = fs::path(rawPaths[0]).lexically_normal();
+      sub = ip.is_relative() ? ip : ip.filename();
+    }
+    outputDir = (fs::path("./result") / sub).lexically_normal().string();
   }
 
   std::error_code ec;

From aa49e95e227f1b1cb4e70250fe1b95a30b5bc0d5 Mon Sep 17 00:00:00 2001
From: AntoineBastide47 <148970403+AntoineBastide47@users.noreply.github.com>
Date: Wed, 20 May 2026 18:13:54 +0200
Subject: [PATCH 3/4] feat: all the parsers

---
 .gitmodules                                   |  72 +++++
 LLVM/CMakeLists.txt                           |  38 +--
 LLVM/extern/tree-sitter/CMakeLists.txt        |  76 +++++
 LLVM/extern/tree-sitter/tree-sitter           |   1 +
 LLVM/extern/tree-sitter/tree-sitter-c-sharp   |   1 +
 LLVM/extern/tree-sitter/tree-sitter-dart      |   1 +
 LLVM/extern/tree-sitter/tree-sitter-elixir    |   1 +
 LLVM/extern/tree-sitter/tree-sitter-erlang    |   1 +
 LLVM/extern/tree-sitter/tree-sitter-glsl      |   1 +
 LLVM/extern/tree-sitter/tree-sitter-go        |   1 +
 LLVM/extern/tree-sitter/tree-sitter-haskell   |   1 +
 LLVM/extern/tree-sitter/tree-sitter-hlsl      |   1 +
 LLVM/extern/tree-sitter/tree-sitter-java      |   1 +
 .../extern/tree-sitter/tree-sitter-javascript |   1 +
 LLVM/extern/tree-sitter/tree-sitter-kotlin    |   1 +
 LLVM/extern/tree-sitter/tree-sitter-lua       |   1 +
 LLVM/extern/tree-sitter/tree-sitter-objc      |   1 +
 LLVM/extern/tree-sitter/tree-sitter-ocaml     |   1 +
 LLVM/extern/tree-sitter/tree-sitter-perl      |   1 +
 LLVM/extern/tree-sitter/tree-sitter-php       |   1 +
 LLVM/extern/tree-sitter/tree-sitter-python    |   1 +
 LLVM/extern/tree-sitter/tree-sitter-r         |   1 +
 LLVM/extern/tree-sitter/tree-sitter-ruby      |   1 +
 LLVM/extern/tree-sitter/tree-sitter-rust      |   1 +
 LLVM/extern/tree-sitter/tree-sitter-scala     |   1 +
 LLVM/extern/tree-sitter/tree-sitter-swift     |   1 +
 .../extern/tree-sitter/tree-sitter-typescript |   1 +
 LLVM/include/LanguageData.hpp                 |  72 ++---
 LLVM/include/LanguageParser.hpp               |   6 +-
 LLVM/include/TreeSitterParser.hpp             |  24 ++
 LLVM/main.cpp                                 |  18 +-
 LLVM/parsers/parser.go                        |  89 ------
 LLVM/parsers/parser.py                        |  97 ------
 LLVM/src/LanguageParser.cpp                   | 242 +--------------
 LLVM/src/TreeSitterParser.cpp                 | 278 ++++++++++++++++++
 35 files changed, 540 insertions(+), 496 deletions(-)
 create mode 100644 LLVM/extern/tree-sitter/CMakeLists.txt
 create mode 160000 LLVM/extern/tree-sitter/tree-sitter
 create mode 160000 LLVM/extern/tree-sitter/tree-sitter-c-sharp
 create mode 160000 LLVM/extern/tree-sitter/tree-sitter-dart
 create mode 160000 LLVM/extern/tree-sitter/tree-sitter-elixir
 create mode 160000 LLVM/extern/tree-sitter/tree-sitter-erlang
 create mode 160000 LLVM/extern/tree-sitter/tree-sitter-glsl
 create mode 160000 LLVM/extern/tree-sitter/tree-sitter-go
 create mode 160000 LLVM/extern/tree-sitter/tree-sitter-haskell
 create mode 160000 LLVM/extern/tree-sitter/tree-sitter-hlsl
 create mode 160000 LLVM/extern/tree-sitter/tree-sitter-java
 create mode 160000 LLVM/extern/tree-sitter/tree-sitter-javascript
 create mode 160000 LLVM/extern/tree-sitter/tree-sitter-kotlin
 create mode 160000 LLVM/extern/tree-sitter/tree-sitter-lua
 create mode 160000 LLVM/extern/tree-sitter/tree-sitter-objc
 create mode 160000 LLVM/extern/tree-sitter/tree-sitter-ocaml
 create mode 160000 LLVM/extern/tree-sitter/tree-sitter-perl
 create mode 160000 LLVM/extern/tree-sitter/tree-sitter-php
 create mode 160000 LLVM/extern/tree-sitter/tree-sitter-python
 create mode 160000 LLVM/extern/tree-sitter/tree-sitter-r
 create mode 160000 LLVM/extern/tree-sitter/tree-sitter-ruby
 create mode 160000 LLVM/extern/tree-sitter/tree-sitter-rust
 create mode 160000 LLVM/extern/tree-sitter/tree-sitter-scala
 create mode 160000 LLVM/extern/tree-sitter/tree-sitter-swift
 create mode 160000 LLVM/extern/tree-sitter/tree-sitter-typescript
 create mode 100644 LLVM/include/TreeSitterParser.hpp
 delete mode 100644 LLVM/parsers/parser.go
 delete mode 100644 LLVM/parsers/parser.py
 create mode 100644 LLVM/src/TreeSitterParser.cpp

diff --git a/.gitmodules b/.gitmodules
index 6d81eec..31736b4 100644
--- a/.gitmodules
+++ b/.gitmodules
@@ -1,3 +1,75 @@
 [submodule "LLVM/extern/Serde"]
 	path = LLVM/extern/Serde
 	url = https://github.com/AntoineBastide47/Serde.git
+[submodule "LLVM/extern/tree-sitter/tree-sitter"]
+	path = LLVM/extern/tree-sitter/tree-sitter
+	url = https://github.com/tree-sitter/tree-sitter.git
+[submodule "LLVM/extern/tree-sitter/tree-sitter-javascript"]
+	path = LLVM/extern/tree-sitter/tree-sitter-javascript
+	url = https://github.com/tree-sitter/tree-sitter-javascript.git
+[submodule "LLVM/extern/tree-sitter/tree-sitter-typescript"]
+	path = LLVM/extern/tree-sitter/tree-sitter-typescript
+	url = https://github.com/tree-sitter/tree-sitter-typescript.git
+[submodule "LLVM/extern/tree-sitter/tree-sitter-java"]
+	path = LLVM/extern/tree-sitter/tree-sitter-java
+	url = https://github.com/tree-sitter/tree-sitter-java.git
+[submodule "LLVM/extern/tree-sitter/tree-sitter-c-sharp"]
+	path = LLVM/extern/tree-sitter/tree-sitter-c-sharp
+	url = https://github.com/tree-sitter/tree-sitter-c-sharp.git
+[submodule "LLVM/extern/tree-sitter/tree-sitter-rust"]
+	path = LLVM/extern/tree-sitter/tree-sitter-rust
+	url = https://github.com/tree-sitter/tree-sitter-rust.git
+[submodule "LLVM/extern/tree-sitter/tree-sitter-ruby"]
+	path = LLVM/extern/tree-sitter/tree-sitter-ruby
+	url = https://github.com/tree-sitter/tree-sitter-ruby.git
+[submodule "LLVM/extern/tree-sitter/tree-sitter-php"]
+	path = LLVM/extern/tree-sitter/tree-sitter-php
+	url = https://github.com/tree-sitter/tree-sitter-php.git
+[submodule "LLVM/extern/tree-sitter/tree-sitter-perl"]
+	path = LLVM/extern/tree-sitter/tree-sitter-perl
+	url = https://github.com/tree-sitter-perl/tree-sitter-perl.git
+[submodule "LLVM/extern/tree-sitter/tree-sitter-lua"]
+	path = LLVM/extern/tree-sitter/tree-sitter-lua
+	url = https://github.com/tree-sitter-grammars/tree-sitter-lua.git
+[submodule "LLVM/extern/tree-sitter/tree-sitter-swift"]
+	path = LLVM/extern/tree-sitter/tree-sitter-swift
+	url = https://github.com/alex-pinkus/tree-sitter-swift.git
+[submodule "LLVM/extern/tree-sitter/tree-sitter-kotlin"]
+	path = LLVM/extern/tree-sitter/tree-sitter-kotlin
+	url = https://github.com/fwcd/tree-sitter-kotlin.git
+[submodule "LLVM/extern/tree-sitter/tree-sitter-r"]
+	path = LLVM/extern/tree-sitter/tree-sitter-r
+	url = https://github.com/r-lib/tree-sitter-r.git
+[submodule "LLVM/extern/tree-sitter/tree-sitter-scala"]
+	path = LLVM/extern/tree-sitter/tree-sitter-scala
+	url = https://github.com/tree-sitter/tree-sitter-scala.git
+[submodule "LLVM/extern/tree-sitter/tree-sitter-haskell"]
+	path = LLVM/extern/tree-sitter/tree-sitter-haskell
+	url = https://github.com/tree-sitter/tree-sitter-haskell.git
+[submodule "LLVM/extern/tree-sitter/tree-sitter-ocaml"]
+	path = LLVM/extern/tree-sitter/tree-sitter-ocaml
+	url = https://github.com/tree-sitter/tree-sitter-ocaml.git
+[submodule "LLVM/extern/tree-sitter/tree-sitter-erlang"]
+	path = LLVM/extern/tree-sitter/tree-sitter-erlang
+	url = https://github.com/WhatsApp/tree-sitter-erlang.git
+[submodule "LLVM/extern/tree-sitter/tree-sitter-elixir"]
+	path = LLVM/extern/tree-sitter/tree-sitter-elixir
+	url = https://github.com/elixir-lang/tree-sitter-elixir.git
+[submodule "LLVM/extern/tree-sitter/tree-sitter-dart"]
+	path = LLVM/extern/tree-sitter/tree-sitter-dart
+	url = https://github.com/UserNobody14/tree-sitter-dart.git
+[submodule "LLVM/extern/tree-sitter/tree-sitter-objc"]
+	path = LLVM/extern/tree-sitter/tree-sitter-objc
+	url = https://github.com/tree-sitter-grammars/tree-sitter-objc.git
+[submodule "LLVM/extern/tree-sitter/tree-sitter-glsl"]
+	path = LLVM/extern/tree-sitter/tree-sitter-glsl
+	url = https://github.com/tree-sitter-grammars/tree-sitter-glsl.git
+[submodule "LLVM/extern/tree-sitter/tree-sitter-hlsl"]
+	path = LLVM/extern/tree-sitter/tree-sitter-hlsl
+	url = https://github.com/tree-sitter-grammars/tree-sitter-hlsl.git
+[submodule "LLVM/extern/tree-sitter/tree-sitter-go"]
+	path = LLVM/extern/tree-sitter/tree-sitter-go
+	url = https://github.com/tree-sitter/tree-sitter-go.git
+[submodule "LLVM/extern/tree-sitter/tree-sitter-python"]
+	path = LLVM/extern/tree-sitter/tree-sitter-python
+	url = https://github.com/tree-sitter/tree-sitter-python.git
diff --git a/LLVM/CMakeLists.txt b/LLVM/CMakeLists.txt
index 588fff7..a8050e7 100644
--- a/LLVM/CMakeLists.txt
+++ b/LLVM/CMakeLists.txt
@@ -54,8 +54,10 @@ add_executable(${PROJECT_NAME}
     include/LanguageStats.hpp
     include/LanguageModel.generated.hpp
     src/LanguageParser.cpp
+    src/TreeSitterParser.cpp
     include/LanguageParser.hpp
     include/LanguageData.hpp
+    include/TreeSitterParser.hpp
 )
 
 add_executable(LanguageClassifierTests
@@ -91,8 +93,10 @@ add_executable(ParserFixtureTests
     include/LanguageStats.hpp
     include/LanguageModel.generated.hpp
     src/LanguageParser.cpp
+    src/TreeSitterParser.cpp
     include/LanguageParser.hpp
     include/LanguageData.hpp
+    include/TreeSitterParser.hpp
 )
 target_include_directories(ParserFixtureTests PRIVATE include)
 target_link_libraries(ParserFixtureTests clangLex clangBasic)
@@ -135,8 +139,10 @@ add_executable(DebugLanguageOutputTests
     include/LanguageStats.hpp
     include/LanguageModel.generated.hpp
     src/LanguageParser.cpp
+    src/TreeSitterParser.cpp
     include/LanguageParser.hpp
     include/LanguageData.hpp
+    include/TreeSitterParser.hpp
 )
 target_include_directories(DebugLanguageOutputTests PRIVATE include)
 target_link_libraries(DebugLanguageOutputTests clangLex clangBasic)
@@ -173,8 +179,10 @@ add_executable(StringLiteralDecodeTests
     include/LanguageStats.hpp
     include/LanguageModel.generated.hpp
     src/LanguageParser.cpp
+    src/TreeSitterParser.cpp
     include/LanguageParser.hpp
     include/LanguageData.hpp
+    include/TreeSitterParser.hpp
 )
 target_include_directories(StringLiteralDecodeTests PRIVATE include)
 target_link_libraries(StringLiteralDecodeTests clangLex clangBasic)
@@ -192,29 +200,6 @@ find_package(Clang REQUIRED CONFIG)
 
 include_directories(${LLVM_INCLUDE_DIRS})
 target_link_libraries(${PROJECT_NAME} clangLex clangBasic)
-target_compile_definitions(${PROJECT_NAME} PRIVATE MATCHERTEXT_PARSERS_DIR="${CMAKE_SOURCE_DIR}/parsers")
-
-# === Go parser binary ===
-# Pre-compile parsers/parser.go into a native binary so each per-file invocation
-# avoids recompiling, and dodges `go run`'s rule that disallows .go arguments
-# from a different directory than the source file.
-find_program(GO_EXECUTABLE go)
-if (GO_EXECUTABLE)
-    set(GO_PARSER_BIN "${CMAKE_BINARY_DIR}/matchertext_go_parser")
-    add_custom_command(
-        OUTPUT ${GO_PARSER_BIN}
-        COMMAND ${GO_EXECUTABLE} build -o ${GO_PARSER_BIN} ${CMAKE_SOURCE_DIR}/parsers/parser.go
-        DEPENDS ${CMAKE_SOURCE_DIR}/parsers/parser.go
-        COMMENT "Building Go parser binary"
-        VERBATIM
-    )
-    add_custom_target(go_parser ALL DEPENDS ${GO_PARSER_BIN})
-    add_dependencies(${PROJECT_NAME} go_parser)
-    target_compile_definitions(${PROJECT_NAME} PRIVATE
-        MATCHERTEXT_GO_PARSER_BIN="${GO_PARSER_BIN}")
-else ()
-    message(WARNING "go executable not found; Go parsing will be disabled at runtime")
-endif ()
 
 # === OpenMP ===
 find_package(OpenMP QUIET)
@@ -234,6 +219,13 @@ target_link_libraries(ParserFixtureTests Serde)
 target_link_libraries(DebugLanguageOutputTests Serde)
 target_link_libraries(StringLiteralDecodeTests Serde)
 
+# === tree-sitter (in-process grammars for 23 programming languages) ===
+add_subdirectory(extern/tree-sitter)
+target_link_libraries(${PROJECT_NAME} tree-sitter-grammars)
+target_link_libraries(ParserFixtureTests tree-sitter-grammars)
+target_link_libraries(DebugLanguageOutputTests tree-sitter-grammars)
+target_link_libraries(StringLiteralDecodeTests tree-sitter-grammars)
+
 # --------------------------
 # Build Configs
 # --------------------------
diff --git a/LLVM/extern/tree-sitter/CMakeLists.txt b/LLVM/extern/tree-sitter/CMakeLists.txt
new file mode 100644
index 0000000..62331aa
--- /dev/null
+++ b/LLVM/extern/tree-sitter/CMakeLists.txt
@@ -0,0 +1,76 @@
+# Builds the tree-sitter runtime + one isolated static library per grammar, then
+# aggregates them into the `tree-sitter-grammars` INTERFACE target that the main
+# binary links. Each grammar bundles its OWN src/tree_sitter/parser.h, so every
+# grammar is compiled with ONLY its own src/ on the include path to avoid header
+# clashes between grammars and with the core runtime.
+
+set(TS_ROOT ${CMAKE_CURRENT_SOURCE_DIR})
+
+if (NOT EXISTS ${TS_ROOT}/tree-sitter/lib/src/lib.c)
+  message(FATAL_ERROR
+    "tree-sitter submodules not initialized.\n"
+    "Run: git submodule update --init --recursive")
+endif ()
+
+# --- core runtime -----------------------------------------------------------
+add_library(tree-sitter-core STATIC ${TS_ROOT}/tree-sitter/lib/src/lib.c)
+target_include_directories(tree-sitter-core
+    PUBLIC  ${TS_ROOT}/tree-sitter/lib/include   # tree_sitter/api.h for our C++ code
+    PRIVATE ${TS_ROOT}/tree-sitter/lib/src)      # lib.c #includes the rest internally
+target_compile_options(tree-sitter-core PRIVATE -w -O2)
+set_target_properties(tree-sitter-core PROPERTIES POSITION_INDEPENDENT_CODE ON)
+
+# --- one static lib per grammar ---------------------------------------------
+# `name` becomes target ts_<name>; the language symbol tree_sitter_<name> lives
+# in the grammar's parser.c. `src_dir` is relative to TS_ROOT and holds parser.c
+# (+ optional scanner.c/.cc). Generated tables are huge: -O2 (not -O3) and -w.
+function(add_ts_grammar name src_dir)
+  set(grammar_dir ${TS_ROOT}/${src_dir})
+  if (NOT EXISTS ${grammar_dir}/parser.c)
+    message(FATAL_ERROR "tree-sitter grammar '${name}' missing parser.c at ${grammar_dir}")
+  endif ()
+  # parser.c is mandatory; an external scanner (C or C++) is added when present.
+  set(grammar_srcs ${grammar_dir}/parser.c)
+  foreach (scanner_file scanner.c scanner.cc)
+    if (EXISTS ${grammar_dir}/${scanner_file})
+      list(APPEND grammar_srcs ${grammar_dir}/${scanner_file})
+    endif ()
+  endforeach ()
+  add_library(ts_${name} STATIC ${grammar_srcs})
+  target_include_directories(ts_${name} PRIVATE ${grammar_dir})
+  target_compile_options(ts_${name} PRIVATE -w -O2)
+  set_target_properties(ts_${name} PROPERTIES POSITION_INDEPENDENT_CODE ON)
+endfunction()
+
+add_ts_grammar(javascript tree-sitter-javascript/src)
+add_ts_grammar(typescript tree-sitter-typescript/typescript/src)
+add_ts_grammar(java       tree-sitter-java/src)
+add_ts_grammar(c_sharp    tree-sitter-c-sharp/src)
+add_ts_grammar(rust       tree-sitter-rust/src)
+add_ts_grammar(ruby       tree-sitter-ruby/src)
+add_ts_grammar(php        tree-sitter-php/php/src)
+add_ts_grammar(perl       tree-sitter-perl/src)
+add_ts_grammar(lua        tree-sitter-lua/src)
+add_ts_grammar(swift      tree-sitter-swift/src)
+add_ts_grammar(kotlin     tree-sitter-kotlin/src)
+add_ts_grammar(r          tree-sitter-r/src)
+add_ts_grammar(scala      tree-sitter-scala/src)
+add_ts_grammar(haskell    tree-sitter-haskell/src)
+add_ts_grammar(ocaml      tree-sitter-ocaml/grammars/ocaml/src)
+add_ts_grammar(erlang     tree-sitter-erlang/src)
+add_ts_grammar(elixir     tree-sitter-elixir/src)
+add_ts_grammar(dart       tree-sitter-dart/src)
+add_ts_grammar(objc       tree-sitter-objc/src)
+add_ts_grammar(glsl       tree-sitter-glsl/src)
+add_ts_grammar(hlsl       tree-sitter-hlsl/src)
+add_ts_grammar(go         tree-sitter-go/src)
+add_ts_grammar(python     tree-sitter-python/src)
+
+# --- aggregate --------------------------------------------------------------
+add_library(tree-sitter-grammars INTERFACE)
+target_include_directories(tree-sitter-grammars INTERFACE ${TS_ROOT}/tree-sitter/lib/include)
+target_link_libraries(tree-sitter-grammars INTERFACE
+    tree-sitter-core
+    ts_javascript ts_typescript ts_java ts_c_sharp ts_rust ts_ruby ts_php
+    ts_perl ts_lua ts_swift ts_kotlin ts_r ts_scala ts_haskell ts_ocaml
+    ts_erlang ts_elixir ts_dart ts_objc ts_glsl ts_hlsl ts_go ts_python)
diff --git a/LLVM/extern/tree-sitter/tree-sitter b/LLVM/extern/tree-sitter/tree-sitter
new file mode 160000
index 0000000..b1fa972
--- /dev/null
+++ b/LLVM/extern/tree-sitter/tree-sitter
@@ -0,0 +1 @@
+Subproject commit b1fa9725cf7bad668ac98b9fd3ec303f1e9076ec
diff --git a/LLVM/extern/tree-sitter/tree-sitter-c-sharp b/LLVM/extern/tree-sitter/tree-sitter-c-sharp
new file mode 160000
index 0000000..cac6d5f
--- /dev/null
+++ b/LLVM/extern/tree-sitter/tree-sitter-c-sharp
@@ -0,0 +1 @@
+Subproject commit cac6d5fb595f5811a076336682d5d595ac1c9e85
diff --git a/LLVM/extern/tree-sitter/tree-sitter-dart b/LLVM/extern/tree-sitter/tree-sitter-dart
new file mode 160000
index 0000000..a9bdfa3
--- /dev/null
+++ b/LLVM/extern/tree-sitter/tree-sitter-dart
@@ -0,0 +1 @@
+Subproject commit a9bdfa3db2fbc9b9f12c93450d04a671f33a5102
diff --git a/LLVM/extern/tree-sitter/tree-sitter-elixir b/LLVM/extern/tree-sitter/tree-sitter-elixir
new file mode 160000
index 0000000..7937d3b
--- /dev/null
+++ b/LLVM/extern/tree-sitter/tree-sitter-elixir
@@ -0,0 +1 @@
+Subproject commit 7937d3b4d65fa574163cfa59394515d3c1cf16f4
diff --git a/LLVM/extern/tree-sitter/tree-sitter-erlang b/LLVM/extern/tree-sitter/tree-sitter-erlang
new file mode 160000
index 0000000..e446ec6
--- /dev/null
+++ b/LLVM/extern/tree-sitter/tree-sitter-erlang
@@ -0,0 +1 @@
+Subproject commit e446ec60022a7cafe157805742b41c04b499cc5d
diff --git a/LLVM/extern/tree-sitter/tree-sitter-glsl b/LLVM/extern/tree-sitter/tree-sitter-glsl
new file mode 160000
index 0000000..24a6c8e
--- /dev/null
+++ b/LLVM/extern/tree-sitter/tree-sitter-glsl
@@ -0,0 +1 @@
+Subproject commit 24a6c8ef698e4480fecf8340d771fbcb5de8fbb4
diff --git a/LLVM/extern/tree-sitter/tree-sitter-go b/LLVM/extern/tree-sitter/tree-sitter-go
new file mode 160000
index 0000000..2346a3a
--- /dev/null
+++ b/LLVM/extern/tree-sitter/tree-sitter-go
@@ -0,0 +1 @@
+Subproject commit 2346a3ab1bb3857b48b29d779a1ef9799a248cd7
diff --git a/LLVM/extern/tree-sitter/tree-sitter-haskell b/LLVM/extern/tree-sitter/tree-sitter-haskell
new file mode 160000
index 0000000..0975ef7
--- /dev/null
+++ b/LLVM/extern/tree-sitter/tree-sitter-haskell
@@ -0,0 +1 @@
+Subproject commit 0975ef72fc3c47b530309ca93937d7d143523628
diff --git a/LLVM/extern/tree-sitter/tree-sitter-hlsl b/LLVM/extern/tree-sitter/tree-sitter-hlsl
new file mode 160000
index 0000000..bab9111
--- /dev/null
+++ b/LLVM/extern/tree-sitter/tree-sitter-hlsl
@@ -0,0 +1 @@
+Subproject commit bab9111922d53d43668fabb61869bec51bbcb915
diff --git a/LLVM/extern/tree-sitter/tree-sitter-java b/LLVM/extern/tree-sitter/tree-sitter-java
new file mode 160000
index 0000000..e10607b
--- /dev/null
+++ b/LLVM/extern/tree-sitter/tree-sitter-java
@@ -0,0 +1 @@
+Subproject commit e10607b45ff745f5f876bfa3e94fbcc6b44bdc11
diff --git a/LLVM/extern/tree-sitter/tree-sitter-javascript b/LLVM/extern/tree-sitter/tree-sitter-javascript
new file mode 160000
index 0000000..58404d8
--- /dev/null
+++ b/LLVM/extern/tree-sitter/tree-sitter-javascript
@@ -0,0 +1 @@
+Subproject commit 58404d8cf191d69f2674a8fd507bd5776f46cb11
diff --git a/LLVM/extern/tree-sitter/tree-sitter-kotlin b/LLVM/extern/tree-sitter/tree-sitter-kotlin
new file mode 160000
index 0000000..f66d290
--- /dev/null
+++ b/LLVM/extern/tree-sitter/tree-sitter-kotlin
@@ -0,0 +1 @@
+Subproject commit f66d2908542e93c0204c6c241f794afe4e9cd5d1
diff --git a/LLVM/extern/tree-sitter/tree-sitter-lua b/LLVM/extern/tree-sitter/tree-sitter-lua
new file mode 160000
index 0000000..10fe005
--- /dev/null
+++ b/LLVM/extern/tree-sitter/tree-sitter-lua
@@ -0,0 +1 @@
+Subproject commit 10fe0054734eec83049514ea2e718b2a56acd0c9
diff --git a/LLVM/extern/tree-sitter/tree-sitter-objc b/LLVM/extern/tree-sitter/tree-sitter-objc
new file mode 160000
index 0000000..181a81b
--- /dev/null
+++ b/LLVM/extern/tree-sitter/tree-sitter-objc
@@ -0,0 +1 @@
+Subproject commit 181a81b8f23a2d593e7ab4259981f50122909fda
diff --git a/LLVM/extern/tree-sitter/tree-sitter-ocaml b/LLVM/extern/tree-sitter/tree-sitter-ocaml
new file mode 160000
index 0000000..6902a86
--- /dev/null
+++ b/LLVM/extern/tree-sitter/tree-sitter-ocaml
@@ -0,0 +1 @@
+Subproject commit 6902a86ab5b3b80c622030210aae2d8cb95eb775
diff --git a/LLVM/extern/tree-sitter/tree-sitter-perl b/LLVM/extern/tree-sitter/tree-sitter-perl
new file mode 160000
index 0000000..9b651c0
--- /dev/null
+++ b/LLVM/extern/tree-sitter/tree-sitter-perl
@@ -0,0 +1 @@
+Subproject commit 9b651c0ca29a2c581e2bb07815e17432ac3820d3
diff --git a/LLVM/extern/tree-sitter/tree-sitter-php b/LLVM/extern/tree-sitter/tree-sitter-php
new file mode 160000
index 0000000..3f2465c
--- /dev/null
+++ b/LLVM/extern/tree-sitter/tree-sitter-php
@@ -0,0 +1 @@
+Subproject commit 3f2465c217d0a966d41e584b42d75522f2a3149e
diff --git a/LLVM/extern/tree-sitter/tree-sitter-python b/LLVM/extern/tree-sitter/tree-sitter-python
new file mode 160000
index 0000000..26855ea
--- /dev/null
+++ b/LLVM/extern/tree-sitter/tree-sitter-python
@@ -0,0 +1 @@
+Subproject commit 26855eabccb19c6abf499fbc5b8dc7cc9ab8bc64
diff --git a/LLVM/extern/tree-sitter/tree-sitter-r b/LLVM/extern/tree-sitter/tree-sitter-r
new file mode 160000
index 0000000..0e6ef77
--- /dev/null
+++ b/LLVM/extern/tree-sitter/tree-sitter-r
@@ -0,0 +1 @@
+Subproject commit 0e6ef7741712c09dc3ee6e81c42e919820cc65ef
diff --git a/LLVM/extern/tree-sitter/tree-sitter-ruby b/LLVM/extern/tree-sitter/tree-sitter-ruby
new file mode 160000
index 0000000..ad907a6
--- /dev/null
+++ b/LLVM/extern/tree-sitter/tree-sitter-ruby
@@ -0,0 +1 @@
+Subproject commit ad907a69da0c8a4f7a943a7fe012712208da6dee
diff --git a/LLVM/extern/tree-sitter/tree-sitter-rust b/LLVM/extern/tree-sitter/tree-sitter-rust
new file mode 160000
index 0000000..77a3747
--- /dev/null
+++ b/LLVM/extern/tree-sitter/tree-sitter-rust
@@ -0,0 +1 @@
+Subproject commit 77a3747266f4d621d0757825e6b11edcbf991ca5
diff --git a/LLVM/extern/tree-sitter/tree-sitter-scala b/LLVM/extern/tree-sitter/tree-sitter-scala
new file mode 160000
index 0000000..4d081d9
--- /dev/null
+++ b/LLVM/extern/tree-sitter/tree-sitter-scala
@@ -0,0 +1 @@
+Subproject commit 4d081d98670ff6e98ca42c085294fc75eec15e1d
diff --git a/LLVM/extern/tree-sitter/tree-sitter-swift b/LLVM/extern/tree-sitter/tree-sitter-swift
new file mode 160000
index 0000000..a923ac6
--- /dev/null
+++ b/LLVM/extern/tree-sitter/tree-sitter-swift
@@ -0,0 +1 @@
+Subproject commit a923ac6d78d6f3132d996ecd189a4c7ca13c8d9d
diff --git a/LLVM/extern/tree-sitter/tree-sitter-typescript b/LLVM/extern/tree-sitter/tree-sitter-typescript
new file mode 160000
index 0000000..75b3874
--- /dev/null
+++ b/LLVM/extern/tree-sitter/tree-sitter-typescript
@@ -0,0 +1 @@
+Subproject commit 75b3874edb2dc714fb1fd77a32013d0f8699989f
diff --git a/LLVM/include/LanguageData.hpp b/LLVM/include/LanguageData.hpp
index 6f392d3..62d92a4 100644
--- a/LLVM/include/LanguageData.hpp
+++ b/LLVM/include/LanguageData.hpp
@@ -1,14 +1,6 @@
 #ifndef LANGUAGE_DATA_HPP
 #define LANGUAGE_DATA_HPP
 
-#ifndef MATCHERTEXT_PARSERS_DIR
-#define MATCHERTEXT_PARSERS_DIR "./parsers"
-#endif
-
-#ifndef MATCHERTEXT_GO_PARSER_BIN
-#define MATCHERTEXT_GO_PARSER_BIN "./matchertext_go_parser"
-#endif
-
 #include <array>
 #include <span>
 #include <string_view>
@@ -22,20 +14,42 @@ struct LanguageData {
   const std::span<const std::string_view> alias;
   /// The language's common file extensions
   const std::span<const std::string_view> extensions;
-  /// The language's common compilers
-  const std::span<const std::string_view> compilers;
-  /// The language's compiler command call template
-  const std::string_view cmdTemplate;
 };
 
-#define LANGUAGE_LIST(X)                                                     \
-  X(C, ("c"), ("c", "h"), (), "")                                            \
-  X(CPP, ("cpp", "c++"), ("cc", "cpp", "cxx", "hpp", "hh", "hxx"), (), "")   \
-  X(Go, ("go"), ("go"), (MATCHERTEXT_GO_PARSER_BIN), R"("{}" "{}")")         \
-  X(Python, ("python", "py"), ("py", "pyw", "pyi", "pyz", "pyzw"), ("python3", "python"), "{} \"" MATCHERTEXT_PARSERS_DIR "/parser.py\" \"{}\"")
-
-// The code bellow is to make sure the data above ^^^^^ is compiled into the parser instead of being constructed at
-// each parser startup for some speed optimizations.
+// Every language is parsed in-process (clang for C/C++, tree-sitter otherwise),
+// so only aliases and extensions are needed. Order matters: extension lookup is
+// first-wins by this list's order (main.cpp's try_emplace), so later entries
+// never re-list an extension owned by an earlier one (e.g. Objective-C omits
+// .h, which stays C).
+#define LANGUAGE_LIST(X)                                                          \
+  X(C, ("c"), ("c", "h"))                                                         \
+  X(CPP, ("cpp", "c++"), ("cc", "cpp", "cxx", "hpp", "hh", "hxx"))                \
+  X(Go, ("go"), ("go"))                                                           \
+  X(Python, ("python", "py"), ("py", "pyw", "pyi", "pyz", "pyzw"))                \
+  X(JavaScript, ("javascript", "js"), ("js", "mjs", "cjs", "jsx"))                \
+  X(TypeScript, ("typescript", "ts"), ("ts", "mts", "cts", "tsx"))                \
+  X(Java, ("java"), ("java"))                                                     \
+  X(CSharp, ("csharp", "c#", "cs"), ("cs", "csx"))                                \
+  X(Rust, ("rust", "rs"), ("rs"))                                                 \
+  X(Ruby, ("ruby", "rb"), ("rb", "rbw", "rake", "gemspec"))                       \
+  X(PHP, ("php"), ("php", "php3", "php4", "php5", "phtml"))                        \
+  X(Perl, ("perl", "pl"), ("pl", "pm", "perl"))                                   \
+  X(Lua, ("lua"), ("lua"))                                                        \
+  X(Swift, ("swift"), ("swift"))                                                  \
+  X(Kotlin, ("kotlin", "kt"), ("kt", "kts"))                                      \
+  X(R, ("r"), ("r", "R"))                                                         \
+  X(Scala, ("scala"), ("scala", "sc"))                                            \
+  X(Haskell, ("haskell", "hs"), ("hs", "lhs"))                                    \
+  X(OCaml, ("ocaml", "ml"), ("ml", "mli"))                                        \
+  X(Erlang, ("erlang", "erl"), ("erl", "hrl"))                                    \
+  X(Elixir, ("elixir", "ex"), ("ex", "exs"))                                      \
+  X(Dart, ("dart"), ("dart"))                                                     \
+  X(Objective_C, ("objc", "objectivec", "objective-c"), ("m", "mm"))              \
+  X(GLSL, ("glsl"), ("glsl", "vert", "frag", "geom", "comp", "tesc", "tese"))     \
+  X(HLSL, ("hlsl"), ("hlsl", "fx", "hlsli"))
+
+// The code below ensures the data above is compiled into the parser instead of
+// being constructed at each parser startup, for some speed optimizations.
 
 template<typename... Ts> constexpr std::array<std::string_view, sizeof...(Ts)> sv_array(Ts... values) {
   return {std::string_view{values}...};
@@ -44,23 +58,20 @@ template<typename... Ts> constexpr std::array<std::string_view, sizeof...(Ts)> s
 #define STRIP_PARENS(...) __VA_ARGS__
 #define AS_ARRAY(x) sv_array(STRIP_PARENS x)
 
-#define MAKE_ARRAYS(name, aliases, extensions, comps, cmd)                      \
+#define MAKE_ARRAYS(name, aliases, extensions)                                  \
   constexpr auto k##name##Aliases = AS_ARRAY(aliases);                          \
-  constexpr auto k##name##Extensions = AS_ARRAY(extensions);                    \
-  constexpr auto k##name##Compilers  = AS_ARRAY(comps);
+  constexpr auto k##name##Extensions = AS_ARRAY(extensions);
 
 LANGUAGE_LIST(MAKE_ARRAYS)
 
 #undef MAKE_ARRAYS
 
-#define MAKE_ENTRY(name, aliases, extensions, comps, cmd)                       \
+#define MAKE_ENTRY(name, aliases, extensions)                                   \
   std::pair{                                                                    \
     LanguageEnum::name,                                                         \
     LanguageData{                                                               \
       std::span<const std::string_view>{k##name##Aliases},                      \
-      std::span<const std::string_view>{k##name##Extensions},                   \
-      std::span<const std::string_view>{k##name##Compilers},                    \
-      std::string_view{cmd}                                                     \
+      std::span<const std::string_view>{k##name##Extensions}                    \
     }                                                                           \
   },
 
@@ -68,13 +79,6 @@ constexpr auto kLanguageData = std::array{LANGUAGE_LIST(MAKE_ENTRY)};
 
 #undef MAKE_ENTRY
 
-static constexpr const LanguageData &GetLanguageData(const LanguageEnum lang) {
-  for (const auto &[key, value]: kLanguageData)
-    if (key == lang)
-      return value;
-  throw std::logic_error{"Unknown Language, make sure that the language is registered in LanguageData.hpp"};
-}
-
 static LanguageEnum GetLanguage(const std::string_view lang) {
   for (const auto &[key, value]: kLanguageData)
     if (const auto it = std::ranges::find(value.alias, lang); it != value.alias.end())
diff --git a/LLVM/include/LanguageParser.hpp b/LLVM/include/LanguageParser.hpp
index a82b826..1d7e1eb 100644
--- a/LLVM/include/LanguageParser.hpp
+++ b/LLVM/include/LanguageParser.hpp
@@ -14,13 +14,9 @@
 class LanguageParser {
   public:
     [[nodiscard]] static bool ExtractData(
-      LanguageEnum language, const std::string &compilerOverride, const std::string &filePath, Serde::JSON &result
+      LanguageEnum language, const std::string &filePath, Serde::JSON &result
     );
     static bool ParseLanguage(const std::string &name, LanguageEnum &out);
-  private:
-    [[nodiscard]] static bool RunBuildCommand(
-      LanguageEnum language, const std::string &compilerOverride, const std::string &filePath, std::string &out
-    );
 };
 
 #endif //LANGUAGE_PARSER_HPP
diff --git a/LLVM/include/TreeSitterParser.hpp b/LLVM/include/TreeSitterParser.hpp
new file mode 100644
index 0000000..3eeeb4a
--- /dev/null
+++ b/LLVM/include/TreeSitterParser.hpp
@@ -0,0 +1,24 @@
+#ifndef TREE_SITTER_PARSER_HPP
+#define TREE_SITTER_PARSER_HPP
+
+#include <string>
+
+#include "JSON.hpp"
+#include "LanguageClassifier.hpp"
+
+/// In-process tree-sitter string/comment extractor. Counterpart of
+/// Parser::ParseC_CPP for the 23 programming languages whose grammars are
+/// linked into the binary (see extern/tree-sitter). Emits the same JSON
+/// contract: an array of {"kind":"string"|"comment","value":"..."}.
+namespace TreeSitter {
+  /// True if `language` is parsed in-process via a linked tree-sitter grammar.
+  bool IsTreeSitterLanguage(LanguageEnum language);
+
+  /// Parse `path` for `language`, appending {"kind","value"} objects to
+  /// `result` (made an array if it isn't one). Thread-safe (thread_local
+  /// parser). Returns false only on unreadable file / unsupported language;
+  /// parse errors yield whatever was collected (possibly empty).
+  bool Parse(LanguageEnum language, const std::string &path, Serde::JSON &result);
+}
+
+#endif // TREE_SITTER_PARSER_HPP
diff --git a/LLVM/main.cpp b/LLVM/main.cpp
index 54cddad..a001100 100644
--- a/LLVM/main.cpp
+++ b/LLVM/main.cpp
@@ -39,7 +39,6 @@ static const char *kUsage =
     "OPTIONS:\n"
     "    --language <lang>              Only analyze files of the given language\n"
     "    --output <name>                Write results to ./result/<name> (default: ./result/<input-path>)\n"
-    "    --compiler <compiler>          Override the compiler used for parsing\n"
     "    --extensions <ext1,ext2,...>   Comma-separated list of additional file extensions\n"
     "\n"
     "EXAMPLES:\n"
@@ -128,7 +127,6 @@ int main(const int argc, char *argv[]) {
     log_info(msg.str());
   }
 
-  std::string compilerOverride;
   std::string outputName; // when set via --output, names the subdir under ./result
   auto filterLanguage = LanguageEnum::Unknown;
   std::vector<std::string_view> extraExtensions;
@@ -145,15 +143,6 @@ int main(const int argc, char *argv[]) {
       outputName = argv[++i];
       continue;
     }
-    if (arg == "--compiler") {
-      if (i + 1 >= argc) {
-        std::fprintf(stderr, "--compiler requires a value\n\n");
-        std::fprintf(stderr, kUsage, argv[0]);
-        return -1;
-      }
-      compilerOverride = argv[++i];
-      continue;
-    }
     if (arg == "--extensions") {
       if (i + 1 >= argc) {
         std::fprintf(stderr, "--extensions requires a value\n\n");
@@ -375,11 +364,11 @@ int main(const int argc, char *argv[]) {
       const auto langStart = Clock::now();
 
       #if USE_OPENMP
-      #pragma omp parallel for schedule(dynamic) default(none) shared(lang, compilerOverride, pls, done, displayedPct, langTotal)
+      #pragma omp parallel for schedule(dynamic) default(none) shared(lang, pls, done, displayedPct, langTotal)
       #endif
       for (const auto &[filePath, inputPath]: lang.second) {
         try {
-          if (Serde::JSON result; LanguageParser::ExtractData(lang.first, compilerOverride, filePath, result))
+          if (Serde::JSON result; LanguageParser::ExtractData(lang.first, filePath, result))
             Parser::GatherStatistics(std::move(result), filePath, inputPath, pls);
         } catch (const std::exception &e) {
           #pragma omp critical
@@ -389,8 +378,7 @@ int main(const int argc, char *argv[]) {
         }
 
         const size_t n = ++done;
-        const int newPct = static_cast<int>(n * 100 / langTotal);
-        if (newPct > displayedPct.load(std::memory_order_relaxed)) {
+        if (const int newPct = static_cast<int>(n * 100 / langTotal); newPct > displayedPct.load(std::memory_order_relaxed)) {
           #pragma omp critical
           {
             if (const int cur = displayedPct.load(); newPct > cur) {
diff --git a/LLVM/parsers/parser.go b/LLVM/parsers/parser.go
deleted file mode 100644
index 122b0ac..0000000
--- a/LLVM/parsers/parser.go
+++ /dev/null
@@ -1,89 +0,0 @@
-// parser.go
-// Go counterpart of Parser::ParseC_CPP.
-// Emits a JSON array of {"kind": "string"|"comment", "value": "..."} on stdout.
-// Usage: go run parser.go <path>
-package main
-
-import (
-	"encoding/json"
-	"go/scanner"
-	"go/token"
-	"os"
-	"strings"
-)
-
-type Item struct {
-	Kind  string `json:"kind"`
-	Value string `json:"value"`
-}
-
-// extractStringBody strips the surrounding delimiters of a Go string literal.
-// Supports interpreted strings ("...") and raw strings (`...`); preserves
-// inner content verbatim — escape sequences are kept as written.
-func extractStringBody(lit string) string {
-	if len(lit) < 2 {
-		return lit
-	}
-	first, last := lit[0], lit[len(lit)-1]
-	if (first == '"' && last == '"') || (first == '`' && last == '`') {
-		return lit[1 : len(lit)-1]
-	}
-	return lit
-}
-
-func parse(path string) []Item {
-	items := make([]Item, 0)
-
-	src, err := os.ReadFile(path)
-	if err != nil {
-		return items
-	}
-
-	fset := token.NewFileSet()
-	file := fset.AddFile(path, fset.Base(), len(src))
-	var s scanner.Scanner
-	s.Init(file, src, func(_ token.Position, _ string) {}, scanner.ScanComments)
-
-	var pending strings.Builder
-	hasPending := false
-	flushPending := func() {
-		if hasPending {
-			items = append(items, Item{Kind: "string", Value: pending.String()})
-			pending.Reset()
-			hasPending = false
-		}
-	}
-
-	for {
-		_, tok, lit := s.Scan()
-		if tok == token.EOF {
-			break
-		}
-		switch tok {
-		case token.STRING:
-			pending.WriteString(extractStringBody(lit))
-			hasPending = true
-		case token.COMMENT:
-			flushPending()
-			items = append(items, Item{Kind: "comment", Value: lit})
-		case token.SEMICOLON:
-			// Inserted by the scanner — does not break adjacency for our purposes.
-		default:
-			flushPending()
-		}
-	}
-	flushPending()
-
-	return items
-}
-
-func main() {
-	if len(os.Args) < 2 {
-		os.Stdout.WriteString("[]")
-		return
-	}
-	items := parse(os.Args[1])
-	enc := json.NewEncoder(os.Stdout)
-	enc.SetEscapeHTML(false)
-	_ = enc.Encode(items)
-}
diff --git a/LLVM/parsers/parser.py b/LLVM/parsers/parser.py
deleted file mode 100644
index 3c10c70..0000000
--- a/LLVM/parsers/parser.py
+++ /dev/null
@@ -1,97 +0,0 @@
-#!/usr/bin/env python3
-#
-# parser.py
-# Python counterpart of Parser::ParseC_CPP.
-# Emits a JSON array of {"kind": "string"|"comment", "value": "..."} on stdout.
-# Usage: parser.py <path>
-#        parser.py --server   (reads file paths from stdin, one per line; writes one JSON array per line)
-
-import json
-import re
-import sys
-import tokenize
-
-
-_PREFIX_QUOTE = re.compile(r"^([rRbBuUfF]{0,3})('''|\"\"\"|'|\")")
-
-
-def extract_string_body(spelling: str) -> str:
-    m = _PREFIX_QUOTE.match(spelling)
-    if not m:
-        return spelling
-    quote = m.group(2)
-    body_start = m.end()
-    body_end = spelling.rfind(quote)
-    if body_end <= body_start - len(quote):
-        return spelling[body_start:]
-    return spelling[body_start:body_end]
-
-
-def parse(path: str):
-    items = []
-    pending = None  # accumulator for adjacent string concatenation
-
-    def flush_pending():
-        nonlocal pending
-        if pending is not None:
-            items.append({"kind": "string", "value": pending})
-            pending = None
-
-    try:
-        with open(path, "rb") as f:
-            tokens = tokenize.tokenize(f.readline)
-            for tok in tokens:
-                ttype = tok.type
-                if ttype == tokenize.STRING:
-                    body = extract_string_body(tok.string)
-                    pending = body if pending is None else pending + body
-                elif ttype == tokenize.COMMENT:
-                    flush_pending()
-                    items.append({"kind": "comment", "value": tok.string})
-                elif ttype in (
-                    tokenize.NL,
-                    tokenize.NEWLINE,
-                    tokenize.INDENT,
-                    tokenize.DEDENT,
-                    tokenize.ENCODING,
-                    tokenize.ENDMARKER,
-                ):
-                    continue
-                else:
-                    flush_pending()
-            flush_pending()
-    except (tokenize.TokenError, SyntaxError, OSError, UnicodeDecodeError):
-        # Bail out gracefully — return what we collected so far.
-        flush_pending()
-
-    return items
-
-
-def server_mode():
-    """Read file paths from stdin line by line, write one JSON array per line to stdout."""
-    for line in sys.stdin:
-        path = line.rstrip("\n")
-        if not path:
-            sys.stdout.write("[]\n")
-            sys.stdout.flush()
-            continue
-        items = parse(path)
-        json.dump(items, sys.stdout, ensure_ascii=False, separators=(",", ":"))
-        sys.stdout.write("\n")
-        sys.stdout.flush()
-
-
-def main() -> int:
-    if len(sys.argv) >= 2 and sys.argv[1] == "--server":
-        server_mode()
-        return 0
-    if len(sys.argv) < 2:
-        sys.stdout.write("[]")
-        return 0
-    items = parse(sys.argv[1])
-    json.dump(items, sys.stdout, ensure_ascii=False, separators=(",", ":"))
-    return 0
-
-
-if __name__ == "__main__":
-    sys.exit(main())
diff --git a/LLVM/src/LanguageParser.cpp b/LLVM/src/LanguageParser.cpp
index 26c5285..8ad9f23 100644
--- a/LLVM/src/LanguageParser.cpp
+++ b/LLVM/src/LanguageParser.cpp
@@ -5,200 +5,26 @@
 //
 
 #include <algorithm>
-#include <array>
-#include <cerrno>
-#include <chrono>
-#include <csignal>
-#include <cstdio>
-#include <map>
-#include <poll.h>
-#include <spawn.h>
-#include <sys/wait.h>
-#include <unistd.h>
-
-extern char **environ;
+#include <cctype>
+#include <string>
 
 #include "../include/LanguageParser.hpp"
-#include "JSON.hpp"
-#include "JsonParser.hpp"
 #include "../include/LanguageData.hpp"
 #include "../include/Parser.hpp"
-
-static std::string format(const std::string &tpl, const std::string &a, const std::string &b) {
-  std::string out;
-  out.reserve(tpl.size() + a.size() * 2 + b.size() * 2);
-
-  int arg = 0;
-  for (size_t i = 0; i < tpl.size(); ++i) {
-    if (i + 1 < tpl.size() && tpl[i] == '{' && tpl[i + 1] == '}') {
-      if (arg == 0)
-        out += a;
-      else if (arg == 1)
-        out += b;
-      else if (arg == 2 || arg == 3)
-        out += b + ".out";
-      ++arg;
-      ++i;
-    } else {
-      out += tpl[i];
-    }
-  }
-  return out;
-}
-
-static bool isAvailable(const std::string_view cmd) {
-  const std::string check = "command -v " + std::string(cmd) + " >/dev/null 2>&1";
-  return std::system(check.c_str()) == 0;
-}
-
-static std::string firstAvailable(const std::span<const std::string_view> candidates) {
-  for (const auto c: candidates)
-    if (isAvailable(c))
-      return std::string(c);
-  return "";
-}
-
-namespace {
-  // Bidirectional persistent subprocess. One instance per thread per language.
-  struct PersistentProcess {
-    int write_fd = -1;
-    int read_fd = -1;
-    pid_t pid = -1;
-
-    [[nodiscard]] bool valid() const {
-      return write_fd >= 0 && read_fd >= 0 && pid > 0;
-    }
-
-    bool start(const std::string &cmd) {
-      int to_child[2], from_child[2];
-      if (pipe(to_child) < 0)
-        return false;
-      if (pipe(from_child) < 0) {
-        ::close(to_child[0]);
-        ::close(to_child[1]);
-        return false;
-      }
-
-      posix_spawn_file_actions_t fa;
-      posix_spawn_file_actions_init(&fa);
-      posix_spawn_file_actions_adddup2(&fa, to_child[0], STDIN_FILENO);
-      posix_spawn_file_actions_adddup2(&fa, from_child[1], STDOUT_FILENO);
-      posix_spawn_file_actions_addclose(&fa, to_child[0]);
-      posix_spawn_file_actions_addclose(&fa, to_child[1]);
-      posix_spawn_file_actions_addclose(&fa, from_child[0]);
-      posix_spawn_file_actions_addclose(&fa, from_child[1]);
-
-      // Put the child in its own process group so kill(-pid, SIGKILL) reaches
-      // both /bin/sh and the python3 it forks, preventing orphaned subprocesses.
-      posix_spawnattr_t attr;
-      posix_spawnattr_init(&attr);
-      posix_spawnattr_setpgroup(&attr, 0);
-      posix_spawnattr_setflags(&attr, POSIX_SPAWN_SETPGROUP);
-
-      const char *argv[] = {"/bin/sh", "-c", cmd.c_str(), nullptr};
-      const int r = posix_spawn(&pid, "/bin/sh", &fa, &attr, const_cast<char **>(argv), environ);
-      posix_spawn_file_actions_destroy(&fa);
-      posix_spawnattr_destroy(&attr);
-
-      ::close(to_child[0]);
-      ::close(from_child[1]);
-
-      if (r != 0) {
-        ::close(to_child[1]);
-        ::close(from_child[0]);
-        pid = -1;
-        return false;
-      }
-
-      write_fd = to_child[1];
-      read_fd = from_child[0];
-      return true;
-    }
-
-    // Send a file path, read back the JSON line. Returns false if the subprocess died or timed out.
-    bool request(const std::string &path, std::string &out, int timeout_ms = 10000) {
-      const std::string msg = path + "\n";
-      const char *p = msg.c_str();
-      size_t remaining = msg.size();
-      while (remaining > 0) {
-        const ssize_t n = write(write_fd, p, remaining);
-        if (n <= 0)
-          return false;
-        p += n;
-        remaining -= static_cast<size_t>(n);
-      }
-
-      out.clear();
-      char buf[4096];
-      const auto deadline = std::chrono::steady_clock::now() + std::chrono::milliseconds(timeout_ms);
-
-      while (true) {
-        const auto now = std::chrono::steady_clock::now();
-        if (now >= deadline)
-          return false;
-
-        const int wait_ms = static_cast<int>(
-          std::chrono::duration_cast<std::chrono::milliseconds>(deadline - now).count()
-        );
-        struct pollfd pfd = {read_fd, POLLIN, 0};
-        if (::poll(&pfd, 1, wait_ms) <= 0)
-          return false;
-        if (!(pfd.revents & POLLIN))
-          return false;
-
-        const ssize_t n = ::read(read_fd, buf, sizeof(buf));
-        if (n <= 0)
-          return false;
-        out.append(buf, static_cast<size_t>(n));
-
-        if (!out.empty() && out.back() == '\n') {
-          out.pop_back();
-          return true;
-        }
-      }
-    }
-
-    void stop() {
-      if (write_fd >= 0) {
-        ::close(write_fd);
-        write_fd = -1;
-      }
-      if (read_fd >= 0) {
-        ::close(read_fd);
-        read_fd = -1;
-      }
-      if (pid > 0) {
-        kill(-pid, SIGKILL); // kill entire process group (shell + python3 child)
-        waitpid(pid, nullptr, 0);
-        pid = -1;
-      }
-    }
-  };
-
-  struct ThreadProcesses {
-    std::map<LanguageEnum, PersistentProcess> procs;
-
-    ~ThreadProcesses() {
-      for (auto &[lang, proc]: procs)
-        proc.stop();
-    }
-  };
-
-  thread_local ThreadProcesses tl_procs;
-}
+#include "../include/TreeSitterParser.hpp"
 
 bool LanguageParser::ExtractData(
-  const LanguageEnum language, const std::string &compilerOverride, const std::string &filePath, Serde::JSON &result
+  const LanguageEnum language, const std::string &filePath, Serde::JSON &result
 ) {
+  // Dispatch by language: C and C++ go to the in-process clang lexer, anything
+  // else needs an in-process tree-sitter grammar (see src/TreeSitterParser.cpp).
   if (language == LanguageEnum::C || language == LanguageEnum::CPP)
     return Parser::ParseC_CPP(filePath, result) && result.IsArray();
 
-  std::string out;
-  if (!RunBuildCommand(language, compilerOverride, filePath, out))
-    return false;
+  if (!TreeSitter::IsTreeSitterLanguage(language))
+    return false; // no parser available for this language
 
-  result = Serde::JSONParser{out}.Parse();
-  return result.IsArray();
+  return TreeSitter::Parse(language, filePath, result) && result.IsArray();
 }
 
 bool LanguageParser::ParseLanguage(const std::string &name, LanguageEnum &out) {
@@ -211,53 +37,3 @@ bool LanguageParser::ParseLanguage(const std::string &name, LanguageEnum &out) {
   out = GetLanguage(lower);
   return out != LanguageEnum::Unknown;
 }
-
-bool LanguageParser::RunBuildCommand(
-  const LanguageEnum language, const std::string &compilerOverride, const std::string &filePath, std::string &out
-) {
-  const auto data = GetLanguageData(language);
-  const std::string cc = compilerOverride.empty() ? firstAvailable(data.compilers) : compilerOverride;
-  if (cc.empty())
-    return false;
-
-  // Script-based parsers use a persistent per-thread subprocess to avoid interpreter startup overhead.
-  if (language == LanguageEnum::Python) {
-    auto &proc = tl_procs.procs[language];
-    auto try_start = [&]() -> bool {
-      if (proc.valid())
-        return true;
-      const std::string serverCmd = cc + " \"" MATCHERTEXT_PARSERS_DIR
-      "/parser.py\" --server";
-      return proc.start(serverCmd);
-    };
-    if (!try_start())
-      return false;
-    if (!proc.request(filePath, out)) {
-      // Subprocess died mid-run; restart and retry once.
-      proc.stop();
-      if (!try_start())
-        return false;
-      return proc.request(filePath, out);
-    }
-    return true;
-  }
-
-  // One-shot popen for other external parsers (e.g. Go binary).
-  std::array < char, 4096 > buffer{};
-  const std::string cmd = format(std::string(data.cmdTemplate), cc, filePath);
-  FILE *pipe = popen(cmd.c_str(), "r");
-  if (!pipe)
-    throw std::runtime_error("popen() failed");
-
-  while (fgets(buffer.data(), buffer.size(), pipe) != nullptr)
-    out += buffer.data();
-
-  int status = pclose(pipe);
-  if (status == -1)
-    throw std::runtime_error("pclose() failed");
-
-  if (WIFEXITED(status) && WEXITSTATUS(status) != 0)
-    throw std::runtime_error("command failed with exit code " + std::to_string(WEXITSTATUS(status)));
-
-  return true;
-}
diff --git a/LLVM/src/TreeSitterParser.cpp b/LLVM/src/TreeSitterParser.cpp
new file mode 100644
index 0000000..ef36995
--- /dev/null
+++ b/LLVM/src/TreeSitterParser.cpp
@@ -0,0 +1,278 @@
+#include "../include/TreeSitterParser.hpp"
+
+#include <cctype>
+#include <chrono>
+#include <cstring>
+#include <fstream>
+#include <iterator>
+#include <string>
+#include <string_view>
+#include <unordered_map>
+#include <unordered_set>
+#include <vector>
+
+#include <tree_sitter/api.h>
+
+// Each grammar's language function lives in its vendored parser.c.
+extern "C" {
+  const TSLanguage *tree_sitter_javascript(void);
+  const TSLanguage *tree_sitter_typescript(void);
+  const TSLanguage *tree_sitter_java(void);
+  const TSLanguage *tree_sitter_c_sharp(void);
+  const TSLanguage *tree_sitter_rust(void);
+  const TSLanguage *tree_sitter_ruby(void);
+  const TSLanguage *tree_sitter_php(void);
+  const TSLanguage *tree_sitter_perl(void);
+  const TSLanguage *tree_sitter_lua(void);
+  const TSLanguage *tree_sitter_swift(void);
+  const TSLanguage *tree_sitter_kotlin(void);
+  const TSLanguage *tree_sitter_r(void);
+  const TSLanguage *tree_sitter_scala(void);
+  const TSLanguage *tree_sitter_haskell(void);
+  const TSLanguage *tree_sitter_ocaml(void);
+  const TSLanguage *tree_sitter_erlang(void);
+  const TSLanguage *tree_sitter_elixir(void);
+  const TSLanguage *tree_sitter_dart(void);
+  const TSLanguage *tree_sitter_objc(void);
+  const TSLanguage *tree_sitter_glsl(void);
+  const TSLanguage *tree_sitter_hlsl(void);
+  const TSLanguage *tree_sitter_go(void);
+  const TSLanguage *tree_sitter_python(void);
+}
+
+namespace {
+  // ---- grammar registry: LanguageEnum -> linked grammar, nullptr if none ----
+  const TSLanguage *languageFor(const LanguageEnum lang) {
+    switch (lang) {
+      case LanguageEnum::JavaScript: return tree_sitter_javascript();
+      case LanguageEnum::TypeScript: return tree_sitter_typescript();
+      case LanguageEnum::Java: return tree_sitter_java();
+      case LanguageEnum::CSharp: return tree_sitter_c_sharp();
+      case LanguageEnum::Rust: return tree_sitter_rust();
+      case LanguageEnum::Ruby: return tree_sitter_ruby();
+      case LanguageEnum::PHP: return tree_sitter_php();
+      case LanguageEnum::Perl: return tree_sitter_perl();
+      case LanguageEnum::Lua: return tree_sitter_lua();
+      case LanguageEnum::Swift: return tree_sitter_swift();
+      case LanguageEnum::Kotlin: return tree_sitter_kotlin();
+      case LanguageEnum::R: return tree_sitter_r();
+      case LanguageEnum::Scala: return tree_sitter_scala();
+      case LanguageEnum::Haskell: return tree_sitter_haskell();
+      case LanguageEnum::OCaml: return tree_sitter_ocaml();
+      case LanguageEnum::Erlang: return tree_sitter_erlang();
+      case LanguageEnum::Elixir: return tree_sitter_elixir();
+      case LanguageEnum::Dart: return tree_sitter_dart();
+      case LanguageEnum::Objective_C: return tree_sitter_objc();
+      case LanguageEnum::GLSL: return tree_sitter_glsl();
+      case LanguageEnum::HLSL: return tree_sitter_hlsl();
+      case LanguageEnum::Go: return tree_sitter_go();
+      case LanguageEnum::Python: return tree_sitter_python();
+      default: return nullptr; // every other enumerator: no tree-sitter grammar here
+    }
+  }
+
+  // ---- node-type classification --------------------------------------------
+  enum class Kind { Other, String, Comment };
+
+  struct Classifier {
+    std::unordered_set<std::string_view> stringTypes;
+    std::unordered_set<std::string_view> commentTypes;
+
+    // Exact per-language names win; substring heuristics cover grammar drift.
+    Kind classify(const char *type) const {
+      const std::string_view name(type);
+      const auto has = [name](const std::string_view part) { return name.find(part) != std::string_view::npos; };
+      if (commentTypes.contains(name))
+        return Kind::Comment;
+      if (stringTypes.contains(name))
+        return Kind::String;
+      if (has("comment"))
+        return Kind::Comment;
+      if (has("string") || name == "char" || has("char_literal"))
+        return Kind::String;
+      return Kind::Other;
+    }
+  };
+
+  // Per-language classifier: exact node-type names the substring heuristic misses
+  // (no "string"/"comment" in the name). Unlisted languages get an empty one.
+  const Classifier &classifierFor(const LanguageEnum lang) {
+    static const std::unordered_map<LanguageEnum, Classifier> table = [] {
+      std::unordered_map<LanguageEnum, Classifier> m;
+      m[LanguageEnum::Java] = {{"text_block"}, {}};
+      m[LanguageEnum::Ruby] = {{"heredoc_body", "heredoc_beginning", "bare_string"}, {}};
+      m[LanguageEnum::PHP] = {{"heredoc", "nowdoc", "encapsed_string"}, {}};
+      m[LanguageEnum::Perl] = {{"heredoc_content"}, {"=pod", "pod"}}; // NOTE(review): POD node looks to be `pod` in tree-sitter-perl; confirm
+      m[LanguageEnum::Elixir] = {{"charlist", "sigil", "quoted_content"}, {}};
+      m[LanguageEnum::Erlang] = {{"sigil"}, {}};
+      m[LanguageEnum::OCaml] = {{"quoted_string"}, {}};
+      m[LanguageEnum::Haskell] = {{}, {"haddock"}};
+      return m;
+    }();
+    static const Classifier kEmpty{};
+    const auto it = table.find(lang);
+    return it == table.end() ? kEmpty : it->second;
+  }
+
+  // ---- text helpers --------------------------------------------------------
+  std::string nodeText(const TSNode node, const std::string &src) {
+    uint32_t s = ts_node_start_byte(node);
+    uint32_t e = ts_node_end_byte(node);
+    if (e > src.size())
+      e = static_cast<uint32_t>(src.size());
+    if (s > e)
+      s = e;
+    return src.substr(s, e - s);
+  }
+
+  bool isQuote(const char c) { return std::string_view("\"'`").find(c) != std::string_view::npos; }
+
+  // Strip surrounding string delimiters (and optional letter/@/$ prefixes),
+  // handling triple-quote runs. Fallback when the grammar exposes no content
+  // child. Mirrors the spirit of parser.py / parser.go body extraction.
+  std::string stripDelimiters(const std::string &s) {
+    size_t b = 0;
+    const size_t e = s.size();
+    while (b < e && !isQuote(s[b]) && (std::isalpha(static_cast<unsigned char>(s[b])) || s[b] == '@' || s[b] == '$'))
+      ++b;
+    if (b >= e || !isQuote(s[b]))
+      return s; // no recognizable quote delimiter
+    const char q = s[b];
+    size_t open = 0;
+    while (b + open < e && s[b + open] == q)
+      ++open;
+    size_t close = 0;
+    while (close < e - b && s[e - 1 - close] == q)
+      ++close;
+    const size_t take = std::min(open, close);
+    const size_t bodyStart = b + take;
+    const size_t bodyEnd = e >= take && e - take >= bodyStart ? e - take : bodyStart;
+    return s.substr(bodyStart, bodyEnd - bodyStart);
+  }
+
+  // Body of a string node: prefer content/fragment children (delimiters and
+  // interpolation markers excluded for free); otherwise strip delimiters.
+  std::string stringBody(const TSNode node, const std::string &src) {
+    std::string body;
+    bool found = false;
+    const uint32_t n = ts_node_named_child_count(node);
+    for (uint32_t i = 0; i < n; ++i) {
+      const TSNode c = ts_node_named_child(node, i);
+      const char *t = ts_node_type(c);
+      if (std::strstr(t, "content") || std::strstr(t, "fragment")) {
+        body += nodeText(c, src);
+        found = true;
+      }
+    }
+    return found ? body : stripDelimiters(nodeText(node, src));
+  }
+
+  Serde::JSON makeObj(const char *kind, const std::string &value) {
+    return Serde::JSON::Object({{"kind", Serde::JSON(kind)}, {"value", Serde::JSON(value)}});
+  }
+
+  // ---- tree walk -----------------------------------------------------------
+  // Iterative (explicit stack) to avoid recursion-depth blowups on deep trees.
+  // String/comment nodes are emitted whole and not descended into; ERROR nodes
+  // are descended through so well-formed descendants are still collected.
+  void walk(const TSNode root, const std::string &src, const Classifier &cls, Serde::JSON &out) {
+    std::vector<TSNode> stack;
+    stack.push_back(root);
+    while (!stack.empty()) {
+      const TSNode node = stack.back();
+      stack.pop_back();
+      // Named nodes only: an anonymous token such as the keyword `string` is not a literal.
+      switch (ts_node_is_named(node) ? cls.classify(ts_node_type(node)) : Kind::Other) {
+        case Kind::Comment:
+          out.PushBack(makeObj("comment", nodeText(node, src)));
+          continue;
+        case Kind::String:
+          out.PushBack(makeObj("string", stringBody(node, src)));
+          continue;
+        case Kind::Other:
+          break;
+      }
+      // Push children last-to-first so they are popped in source order.
+      const uint32_t count = ts_node_child_count(node);
+      for (uint32_t i = count; i-- > 0;)
+        stack.push_back(ts_node_child(node, i));
+    }
+  }
+
+  // Cap per-file parse time so a pathological input (e.g. a huge minified
+  // bundle, common in large repos) can't wedge a worker thread forever.
+  // tree-sitter invokes the progress callback periodically while parsing;
+  // returning true cancels the parse, which then yields a null tree.
+  constexpr auto kParseTimeout = std::chrono::seconds(5);
+
+  // TSInput read callback. `payload` is the std::string holding the whole file;
+  // everything from offset `byte` to the end is handed to tree-sitter as a single
+  // chunk. Once `byte` is at or past the end, an empty chunk is reported instead
+  // (*bytesRead == 0).
+  const char *readString(void *payload, const uint32_t byte, TSPoint, uint32_t *bytesRead) {
+    const auto &text = *static_cast<const std::string *>(payload);
+    const bool exhausted = byte >= text.size();
+    *bytesRead = exhausted ? 0u : static_cast<uint32_t>(text.size() - byte);
+    return exhausted ? "" : text.data() + byte;
+  }
+
+  bool parseExpired(TSParseState *state) { // progress callback: true cancels the parse
+    using TimePoint = std::chrono::steady_clock::time_point;
+    return *static_cast<const TimePoint *>(state->payload) <= std::chrono::steady_clock::now();
+  }
+
+  // thread_local TSParser holder: the parser is created lazily and deleted at
+  // thread exit (parsing runs under OpenMP). Non-copyable so it is never deleted twice.
+  struct ParserHolder {
+    TSParser *parser = nullptr;
+    ParserHolder() = default;
+    ParserHolder(const ParserHolder &) = delete;
+    ParserHolder &operator=(const ParserHolder &) = delete;
+    TSParser *get() { return parser ? parser : (parser = ts_parser_new()); }
+    ~ParserHolder() {
+      if (parser)
+        ts_parser_delete(parser);
+    }
+  };
+}
+
+bool TreeSitter::IsTreeSitterLanguage(const LanguageEnum language) {
+  return static_cast<bool>(languageFor(language)); // true iff a grammar is registered
+}
+
+bool TreeSitter::Parse(const LanguageEnum language, const std::string &path, Serde::JSON &result) {
+  const TSLanguage *tsLang = languageFor(language);
+  if (!tsLang)
+    return false;
+
+  std::ifstream in(path, std::ios::binary);
+  if (!in)
+    return false;
+  const std::string src((std::istreambuf_iterator<char>(in)), std::istreambuf_iterator<char>());
+
+  if (!result.IsArray())
+    result = Serde::JSON::Array();
+
+  thread_local ParserHolder holder;
+  TSParser *parser = holder.get();
+  if (!ts_parser_set_language(parser, tsLang))
+    return false; // ABI mismatch between grammar and core runtime
+
+  auto deadline = std::chrono::steady_clock::now() + kParseTimeout;
+  TSInput input{};
+  input.payload = const_cast<std::string *>(&src);
+  input.read = readString;
+  input.encoding = TSInputEncodingUTF8;
+  input.decode = nullptr;
+  TSParseOptions opts{};
+  opts.payload = &deadline;
+  opts.progress_callback = parseExpired;
+
+  TSTree *tree = ts_parser_parse_with_options(parser, nullptr, input, opts);
+  if (!tree)
+    return true; // timed out or nothing parsed; empty array is valid
+
+  walk(ts_tree_root_node(tree), src, classifierFor(language), result);
+  ts_tree_delete(tree);
+  return true;
+}

From 36e3cd5e7819976d2e1afbfed0de505d5a282a3e Mon Sep 17 00:00:00 2001
From: AntoineBastide47 <148970403+AntoineBastide47@users.noreply.github.com>
Date: Wed, 20 May 2026 20:37:48 +0200
Subject: [PATCH 4/4] feat: faster parsing ~5-20%

---
 LLVM/extern/tree-sitter/CMakeLists.txt | 22 ++++++++-
 LLVM/src/TreeSitterParser.cpp          | 68 +++++++++++++++++++-------
 2 files changed, 69 insertions(+), 21 deletions(-)

diff --git a/LLVM/extern/tree-sitter/CMakeLists.txt b/LLVM/extern/tree-sitter/CMakeLists.txt
index 62331aa..0c3d5de 100644
--- a/LLVM/extern/tree-sitter/CMakeLists.txt
+++ b/LLVM/extern/tree-sitter/CMakeLists.txt
@@ -20,14 +20,30 @@ target_include_directories(tree-sitter-core
 target_compile_options(tree-sitter-core PRIVATE -w -O2)
 set_target_properties(tree-sitter-core PROPERTIES POSITION_INDEPENDENT_CODE ON)
 
+# A few grammars (perl, swift) don't commit a generated parser.c, so generate it
+# from grammar.json with the tree-sitter CLI. This also self-heals when a
+# submodule re-checkout wipes the previously generated (untracked) parser.c.
+find_program(TREE_SITTER_CLI tree-sitter HINTS /opt/homebrew/bin /usr/local/bin)
+
 # --- one static lib per grammar ---------------------------------------------
 # `name` becomes target ts_<name>; the language symbol tree_sitter_<name> lives
 # in the grammar's parser.c. `src_dir` is relative to TS_ROOT and holds parser.c
 # (+ optional scanner.c/.cc). Generated tables are huge: -O2 (not -O3) and -w.
 function(add_ts_grammar name src_dir)
   set(dir ${TS_ROOT}/${src_dir})
+  get_filename_component(repo_dir ${dir} DIRECTORY)
+  if (NOT EXISTS ${dir}/parser.c AND EXISTS ${dir}/grammar.json AND TREE_SITTER_CLI)
+    message(STATUS "tree-sitter: generating parser.c for '${name}'")
+    execute_process(
+      COMMAND ${TREE_SITTER_CLI} generate src/grammar.json
+      WORKING_DIRECTORY ${repo_dir}
+      RESULT_VARIABLE ts_gen_result OUTPUT_QUIET ERROR_QUIET)
+  endif ()
   if (NOT EXISTS ${dir}/parser.c)
-    message(FATAL_ERROR "tree-sitter grammar '${name}' missing parser.c at ${dir}")
+    message(FATAL_ERROR
+      "tree-sitter grammar '${name}' missing parser.c at ${dir}.\n"
+      "Install the tree-sitter CLI (brew install tree-sitter) and re-configure, or run:\n"
+      "  (cd ${repo_dir} && tree-sitter generate src/grammar.json)")
   endif ()
   set(srcs ${dir}/parser.c)
   if (EXISTS ${dir}/scanner.c)
@@ -38,7 +54,16 @@ function(add_ts_grammar name src_dir)
   endif ()
   add_library(ts_${name} STATIC ${srcs})
   target_include_directories(ts_${name} PRIVATE ${dir})
-  target_compile_options(ts_${name} PRIVATE -w -O2)
+  # parser.c is mostly static data tables (opt level emits identical data), but
+  # each lib may also include a hand-written scanner.c that IS hot code, so -O3.
+  target_compile_options(ts_${name} PRIVATE -w -O3 -DNDEBUG)
+  # Tune for the build host. x86 toolchains spell this -march=native (Clang
+  # rejects -mcpu= there); every other target keeps -mcpu=native as before.
+  if (CMAKE_SYSTEM_PROCESSOR MATCHES "^(x86_64|amd64|AMD64|i[3-6]86)$")
+    target_compile_options(ts_${name} PRIVATE -march=native)
+  else ()
+    target_compile_options(ts_${name} PRIVATE -mcpu=native)
+  endif ()
   set_target_properties(ts_${name} PROPERTIES POSITION_INDEPENDENT_CODE ON)
 endfunction()
 
diff --git a/LLVM/src/TreeSitterParser.cpp b/LLVM/src/TreeSitterParser.cpp
index ef36995..91882fb 100644
--- a/LLVM/src/TreeSitterParser.cpp
+++ b/LLVM/src/TreeSitterParser.cpp
@@ -4,7 +4,6 @@
 #include <chrono>
 #include <cstring>
 #include <fstream>
-#include <iterator>
 #include <string>
 #include <string_view>
 #include <unordered_map>
@@ -115,7 +114,8 @@ namespace {
   }
 
   // ---- text helpers --------------------------------------------------------
-  std::string nodeText(const TSNode node, const std::string &src) {
+  // A node's source bytes as a view into `src` (no copy).
+  std::string_view nodeText(const TSNode node, const std::string_view src) {
     uint32_t s = ts_node_start_byte(node);
     uint32_t e = ts_node_end_byte(node);
     if (e > src.size())
@@ -129,8 +129,8 @@ namespace {
 
   // Strip surrounding string delimiters (and optional letter/@/$ prefixes),
   // handling triple-quote runs. Fallback when the grammar exposes no content
-  // child. Mirrors the spirit of parser.py / parser.go body extraction.
-  std::string stripDelimiters(const std::string &s) {
+  // child. Returns a view into the input. Mirrors parser.py / parser.go.
+  std::string_view stripDelimiters(const std::string_view s) {
     size_t b = 0;
     const size_t e = s.size();
     while (b < e && !isQuote(s[b]) && (std::isalpha(static_cast<unsigned char>(s[b])) || s[b] == '@' || s[b] == '$'))
@@ -150,24 +150,38 @@ namespace {
     return s.substr(bodyStart, bodyEnd - bodyStart);
   }
 
-  // Body of a string node: prefer content/fragment children (delimiters and
-  // interpolation markers excluded for free); otherwise strip delimiters.
-  std::string stringBody(const TSNode node, const std::string &src) {
-    std::string body;
-    bool found = false;
+  // Body of a string node, returned as a view. Prefer content/fragment children
+  // (delimiters and interpolation markers excluded for free); otherwise strip
+  // delimiters off the whole node. One content child is returned as a view into
+  // `src`. Several are concatenated, in a single pass, into `scratch` (reused
+  // across nodes); the returned view then points into `scratch` and is only
+  // valid until the next call.
+  std::string_view stringBody(const TSNode node, const std::string_view src, std::string &scratch) {
     const uint32_t n = ts_node_named_child_count(node);
+    std::string_view first; // text of the first content child, if any
+    uint32_t pieces = 0;
     for (uint32_t i = 0; i < n; ++i) {
       const TSNode c = ts_node_named_child(node, i);
       const char *t = ts_node_type(c);
-      if (std::strstr(t, "content") || std::strstr(t, "fragment")) {
-        body += nodeText(c, src);
-        found = true;
-      }
+      if (!std::strstr(t, "content") && !std::strstr(t, "fragment"))
+        continue;
+      const std::string_view text = nodeText(c, src);
+      // Common case: a lone piece is returned straight from `src`, no copy.
+      if (++pieces == 1) {
+        first = text;
+        continue;
+      }
+      if (pieces == 2)
+        scratch.assign(first); // second piece found: switch to the buffer
+      scratch.append(text);
     }
-    return found ? body : stripDelimiters(nodeText(node, src));
+    // No content child at all: fall back to trimming the delimiters.
+    if (pieces == 0)
+      return stripDelimiters(nodeText(node, src));
+    return pieces == 1 ? first : std::string_view(scratch);
   }
 
-  Serde::JSON makeObj(const char *kind, const std::string &value) {
+  Serde::JSON makeObj(const char *kind, const std::string_view value) {
     return Serde::JSON::Object({{"kind", Serde::JSON(kind)}, {"value", Serde::JSON(value)}});
   }
 
@@ -175,8 +189,9 @@ namespace {
   // Iterative (explicit stack) to avoid recursion-depth blowups on deep trees.
   // String/comment nodes are emitted whole and not descended into; ERROR nodes
   // are descended through so well-formed descendants are still collected.
-  void walk(const TSNode root, const std::string &src, const Classifier &cls, Serde::JSON &out) {
+  void walk(const TSNode root, const std::string_view src, const Classifier &cls, Serde::JSON &out) {
     std::vector<TSNode> stack;
+    std::string scratch; // reused buffer for multi-child string concatenation
     stack.push_back(root);
     while (!stack.empty()) {
       const TSNode node = stack.back();
@@ -187,7 +202,7 @@ namespace {
           out.PushBack(makeObj("comment", nodeText(node, src)));
           continue;
         case Kind::String:
-          out.PushBack(makeObj("string", stringBody(node, src)));
+          out.PushBack(makeObj("string", stringBody(node, src, scratch)));
           continue;
         case Kind::Other:
           break;
@@ -245,14 +260,29 @@ bool TreeSitter::Parse(const LanguageEnum language, const std::string &path, Ser
   if (!tsLang)
     return false;
 
-  std::ifstream in(path, std::ios::binary);
+  // Bulk read (one allocation + one read) instead of istreambuf_iterator.
+  std::ifstream in(path, std::ios::binary | std::ios::ate);
   if (!in)
     return false;
-  const std::string src((std::istreambuf_iterator<char>(in)), std::istreambuf_iterator<char>());
+  const std::streamoff size = in.tellg();
+  if (size < 0)
+    return false;
+  std::string src(static_cast<size_t>(size), '\0');
+  in.seekg(0);
+  in.read(src.data(), size);
+  src.resize(static_cast<size_t>(in.gcount()));
 
   if (!result.IsArray())
     result = Serde::JSON::Array();
 
+  // Skip binary files. Source extensions collide with non-source formats (e.g.
+  // `.ts` is both TypeScript and MPEG transport stream, which Chromium ships as
+  // test data). A NUL byte is a reliable binary signal — source essentially
+  // never contains one — and feeding binary to the GLR parser triggers
+  // pathological error-recovery blowups the parse timeout can't always catch.
+  if (std::memchr(src.data(), '\0', src.size()) != nullptr)
+    return true;
+
   thread_local ParserHolder holder;
   TSParser *parser = holder.get();
   if (!ts_parser_set_language(parser, tsLang))