From 5fa4e71b0a5daeec1ac12161344a685cc7715039 Mon Sep 17 00:00:00 2001 From: Brendan Kellam Date: Tue, 12 May 2026 12:12:07 -0700 Subject: [PATCH 1/2] feat(backend): write changed-path Bloom filters to commit-graph Writes --changed-paths when building the commit-graph, which adds Bloom filters that let git cheaply skip commits that did not touch a given path. This dramatically accelerates `git log -- <path>` and modestly helps `git blame` on large repos. Existing repos that already have a Bloom-less commit-graph get a one-time `--split=replace` rewrite on their next fetch, gated on a new `commitGraphChangedPathsBackfilledAt` timestamp stored in repo metadata. Subsequent fetches do a cheap incremental write. Also moves the `writeCommitGraph` call out of `cloneRepository` and into `RepoIndexManager.indexRepository` so clone and fetch paths handle the commit-graph symmetrically. Drops `--write-commit-graph` from the fetch invocation since that flag does not honor `--changed-paths`. Co-Authored-By: Claude Opus 4.7 (1M context) --- packages/backend/src/git.ts | 33 +++++++++++------ packages/backend/src/repoIndexManager.test.ts | 3 ++ packages/backend/src/repoIndexManager.ts | 36 ++++++++++++++++++- packages/shared/src/types.ts | 8 +++++ .../codePreviewPanel/codePreview.tsx | 2 +- 5 files changed, 70 insertions(+), 12 deletions(-) diff --git a/packages/backend/src/git.ts b/packages/backend/src/git.ts index c803d18d6..d827a5714 100644 --- a/packages/backend/src/git.ts +++ b/packages/backend/src/git.ts @@ -105,12 +105,6 @@ export const cloneRepository = async ( keys: ["remote.origin.url"], signal, }); - - // @note: operations that need to iterate over a lot of commits (e.g., rev-list --count) - // can be slow on larger repositories. Commit graphs are a acceleration structure that - // speed up these operations. 
- // @see: https://git-scm.com/docs/commit-graph - await writeCommitGraph({ path, signal }); } catch (error: unknown) { const baseLog = `Failed to clone repository: ${path}`; @@ -151,9 +145,6 @@ export const fetchRepository = async ( "+refs/heads/*:refs/heads/*", "--prune", "--progress", - // On fetch, ensure the commit graph is up to date. - // @see: https://git-scm.com/docs/commit-graph - "--write-commit-graph" ]); // Update HEAD to match the remote's default branch. This handles the case where the remote's @@ -490,20 +481,42 @@ export const isRepoEmpty = async ({ * Writes or updates the commit-graph file for the repository. * This pre-computes commit metadata to speed up operations like * rev-list --count, log, and merge-base. + * + * Also writes changed-path Bloom filters (--changed-paths), which let git + * quickly skip commits that didn't touch a given path. This accelerates + * `git log -- <path>` dramatically and `git blame` modestly on large repos. + * + * For incremental writes (the default), Bloom filters are only computed for + * commits being added in this write. Repos that already have a Bloom-less + * commit-graph from a prior version need a one-time `forceBackfill: true` + * write to backfill filters for their historical commits. + * + * @see: https://git-scm.com/docs/commit-graph */ export const writeCommitGraph = async ({ path, + forceBackfill, onProgress, signal, }: { path: string, + forceBackfill?: boolean, onProgress?: onProgressFn, signal?: AbortSignal, }): Promise<void> => { const git = createGitClientForPath(path, onProgress, signal); try { - await git.raw(['commit-graph', 'write', '--reachable']); + const args = ['commit-graph', 'write', '--reachable', '--changed-paths']; + if (forceBackfill) { + // --split=replace consolidates any existing layers into a single new layer, + // which forces git to recompute Bloom filters for every commit (not just commits + // added since the last write). 
Used for the one-time migration of repos that have + // a Bloom-less commit-graph from before --changed-paths was enabled. + // @see: https://git-scm.com/docs/git-commit-graph#Documentation/git-commit-graph.txt-write + args.push('--split=replace'); + } + await git.raw(args); } catch (error) { // Don't throw an exception here since this is just a performance optimization. logger.debug(`Failed to write commit-graph for ${path}:`, error); diff --git a/packages/backend/src/repoIndexManager.test.ts b/packages/backend/src/repoIndexManager.test.ts index 3a65d092f..deb2ebdfd 100644 --- a/packages/backend/src/repoIndexManager.test.ts +++ b/packages/backend/src/repoIndexManager.test.ts @@ -51,6 +51,7 @@ vi.mock('./git.js', () => ({ isRepoEmpty: vi.fn().mockResolvedValue(false), unsetGitConfig: vi.fn(), upsertGitConfig: vi.fn(), + writeCommitGraph: vi.fn(), })); vi.mock('./zoekt.js', () => ({ @@ -178,6 +179,8 @@ const createMockSettings = (): Settings => ({ enablePublicAccess: false, experiment_repoDrivenPermissionSyncIntervalMs: 1000 * 60 * 60 * 24, experiment_userDrivenPermissionSyncIntervalMs: 1000 * 60 * 60 * 24, + repoDrivenPermissionSyncIntervalMs: 1000 * 60 * 60 * 24, + userDrivenPermissionSyncIntervalMs: 1000 * 60 * 60 * 24, maxAccountPermissionSyncJobConcurrency: 8, maxRepoPermissionSyncJobConcurrency: 8, }); diff --git a/packages/backend/src/repoIndexManager.ts b/packages/backend/src/repoIndexManager.ts index 254019b24..0fb055a31 100644 --- a/packages/backend/src/repoIndexManager.ts +++ b/packages/backend/src/repoIndexManager.ts @@ -8,7 +8,7 @@ import { Redis } from 'ioredis'; import micromatch from 'micromatch'; import Redlock, { ExecutionError } from 'redlock'; import { INDEX_CACHE_DIR, REPOS_CACHE_DIR, WORKER_STOP_GRACEFUL_TIMEOUT_MS } from './constants.js'; -import { cloneRepository, fetchRepository, getBranches, getCommitHashForRefName, getLatestCommitTimestamp, getLocalDefaultBranch, getTags, isPathAValidGitRepoRoot, isRepoEmpty, unsetGitConfig, 
upsertGitConfig } from './git.js'; +import { cloneRepository, fetchRepository, getBranches, getCommitHashForRefName, getLatestCommitTimestamp, getLocalDefaultBranch, getTags, isPathAValidGitRepoRoot, isRepoEmpty, unsetGitConfig, upsertGitConfig, writeCommitGraph } from './git.js'; import { captureEvent } from './posthog.js'; import { PromClient } from './promClient.js'; import { RepoWithConnections, Settings } from "./types.js"; @@ -396,6 +396,19 @@ export class RepoIndexManager { const fetchDuration_s = durationMs / 1000; logger.debug(`Fetched ${repo.name} (id: ${repo.id}) in ${fetchDuration_s}s`); + + // Update the commit-graph after fetch. Force a full backfill the first time we + // see this repo after the --changed-paths rollout, so historical commits get + // Bloom filters. Subsequent fetches do a cheap incremental write. + const needsBackfill = !metadata.commitGraphChangedPathsBackfilledAt; + if (needsBackfill) { + logger.debug(`Backfilling changed-path Bloom filters for ${repo.name} (id: ${repo.id})...`); + } + await writeCommitGraph({ + path: repoPath, + forceBackfill: needsBackfill, + signal, + }); } else if (!isReadOnly) { logger.debug(`Cloning ${repo.name} (id: ${repo.id})...`); @@ -411,6 +424,27 @@ export class RepoIndexManager { const cloneDuration_s = durationMs / 1000; logger.debug(`Cloned ${repo.name} (id: ${repo.id}) in ${cloneDuration_s}s`); + + // Write the commit-graph for the freshly cloned repo. + await writeCommitGraph({ + path: repoPath, + signal, + }); + } + + // Record that this repo's commit-graph now includes changed-path Bloom filters + // for its full history (either freshly written during clone, or backfilled above + // during fetch). 
+ if (!isReadOnly && !metadata.commitGraphChangedPathsBackfilledAt) { + await this.db.repo.update({ + where: { id: repo.id }, + data: { + metadata: { + ...metadata, + commitGraphChangedPathsBackfilledAt: new Date().toISOString(), + } satisfies RepoMetadata, + }, + }); } // Regardless of clone or fetch, always upsert the git config for the repo. diff --git a/packages/shared/src/types.ts b/packages/shared/src/types.ts index b0291a57b..f5de58476 100644 --- a/packages/shared/src/types.ts +++ b/packages/shared/src/types.ts @@ -32,6 +32,14 @@ export const repoMetadataSchema = z.object({ */ indexedRevisions: z.array(z.string()).optional(), + /** + * Timestamp of when changed-path Bloom filters were written into the + * commit-graph for this repo's full history. Undefined means the one-time + * backfill has not yet run, so historical commits still lack Bloom filters. + * @see writeCommitGraph in packages/backend/src/git.ts + */ + commitGraphChangedPathsBackfilledAt: z.string().datetime().optional(), + /** * Code host specific metadata, keyed by code host type. */ diff --git a/packages/web/src/app/(app)/search/components/codePreviewPanel/codePreview.tsx b/packages/web/src/app/(app)/search/components/codePreviewPanel/codePreview.tsx index 4d9ef76ba..4149bd6b3 100644 --- a/packages/web/src/app/(app)/search/components/codePreviewPanel/codePreview.tsx +++ b/packages/web/src/app/(app)/search/components/codePreviewPanel/codePreview.tsx @@ -112,7 +112,7 @@ export const CodePreview = ({ return (
-
+
{/* Gutter icon */}
From ed2f828b216ac01367c924825e194a816cb58b72 Mon Sep 17 00:00:00 2001 From: Brendan Kellam Date: Tue, 12 May 2026 12:42:41 -0700 Subject: [PATCH 2/2] changelog --- CHANGELOG.md | 1 + 1 file changed, 1 insertion(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 37b29a349..93ace9904 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -23,6 +23,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ### Changed - Reduced the log verbosity of the worker by changing various log messages from info to debug. [#1179](https://github.com/sourcebot-dev/sourcebot/pull/1179) - [EE] Switched symbol hover detection to use Lezer highlight tags, broadening identifier coverage. [#1194](https://github.com/sourcebot-dev/sourcebot/pull/1194) +- Improved git history and blame performance on large repositories. [#1198](https://github.com/sourcebot-dev/sourcebot/pull/1198) ## [4.17.1] - 2026-05-04