lzehrung · lzehrung · May 23, 2026 · May 22, 2026 · May 22, 2026 · May 22, 2026
diff --git a/README.md b/README.md
@@ -194,7 +194,7 @@ The supported package import surface is the root export, `@lzehrung/codegraph`.
 ## Common workflows
 
 - Repo triage: run `codegraph inspect ./src --limit 20`, then follow with `codegraph hotspots ./src --limit 20` or `codegraph unresolved` to focus the next pass.
-- Duplicate cleanup: run `codegraph duplicates ./src --min-confidence medium` before refactors to find shared extraction candidates.
+- Duplicate cleanup: run `codegraph duplicates ./src --min-confidence medium` before refactors to find grouped extraction candidates.
 - Symbol navigation: use `codegraph goto <file> <line> <column>` and `codegraph refs --file <file> --line <line> --col <column> --pretty` when a question is about definitions or semantic usages rather than matching strings.
 - PR review: run `codegraph impact --base origin/main --head HEAD --pretty` for a ranked map, `codegraph review --base origin/main --head HEAD --summary` for a compact reviewer handoff with actionable candidate tests, or redirect plain `review` output when a downstream tool needs the full JSON bundle.
 - Worktree review: run `codegraph impact --base HEAD --head WORKTREE --pretty` for current staged and unstaged tracked-file changes, then `codegraph review --base HEAD --head WORKTREE --summary` for a compact handoff. Use `--head STAGED` to compare `HEAD` against the current index.
@@ -287,7 +287,7 @@ npm run test:ci
 
 `npm run test:ci` writes a Vitest JSON timing report and prints a slow-test summary. Tests over 2 seconds are review-required, and tests over 10 seconds should be treated as integration-tier candidates unless they have a documented reason.
 
-Use `npm run test:fast` for the shorter non-integration suite. Use `npm run test:integration` for CLI and native-runtime integration coverage.
+Use `npm test` or `npm run test:fast` for the shorter non-integration suite. Use `npm run test:integration` for CLI and native-runtime integration coverage.
 
 If you are touching the native workspace directly, also run `npm run build:native` and `npm run test:native`. Benchmark harness coverage lives behind `npm run test:bench`.
 

diff --git a/codegraph-skill/codegraph/SKILL.md b/codegraph-skill/codegraph/SKILL.md
@@ -214,9 +214,13 @@ For git-provider impact and git-scoped review/index/graph commands, `WORKTREE` c
 - Duplicate and near-duplicate code:
   `codegraph duplicates --root . ./src --min-confidence medium`
   Covers indexed symbols, semantic chunks, and text chunks.
+  Reports grouped findings by default so overlapping symbol/chunk variants collapse into one clone.
+  Emits duplicate JSON with `schemaVersion: 2`.
+  Bounds per-group `variants` by default; `rawPairCount` and `omittedVariantCount` report how much evidence is hidden.
   A single positional directory becomes the project root unless `--root` is set.
   Use `--include-small` for tiny helpers.
   Use `--include-same-file` for local clone cleanup.
+  Use `--raw-pairs` to include low-level scored unit-pair suggestions.
 - Unresolved project imports:
   `codegraph unresolved`
   Excludes graph-only document/template link edges plus known runtime/package externals: supported-language standard libraries, URL imports, and dependencies declared in nearby manifests such as `package.json`, Python, PHP, Rust, Go, Zig, Ruby, Java/Kotlin, .NET, C/C++, and Swift package manifests.

diff --git a/docs/cli.md b/docs/cli.md
@@ -150,6 +150,7 @@ codegraph chunk config.yaml --language yaml --min-tokens 100 --max-tokens 300
 # Detect duplicate and near-duplicate code units
 codegraph duplicates ./src --min-confidence medium --limit 20
 codegraph duplicates --root . ./src ./packages/app --include-same-file
+codegraph duplicates ./src --raw-pairs
 codegraph duplicates --help
 
 # Go to definition
@@ -166,14 +167,18 @@ codegraph grep --query '(function_declaration name: (identifier) @name)'
 codegraph grep --pattern 'eval\(' --ignore-case
 ```
 
-`duplicates` always reports scored exact, renamed, near, and weak clone candidates as JSON.
+`duplicates` always reports grouped exact, renamed, near, and weak clone candidates as JSON.
 
 - It combines indexed symbols, semantic chunks, and text chunks.
-- It reports project-relative paths, confidence, clone type, metrics, omission counts, and pair stats.
+- It emits `schemaVersion: 2` for grouped duplicate output.
+- It reports project-relative paths, confidence, clone type, metrics, variant counts, omission counts, and pair stats.
+- Groups collapse overlapping symbol/chunk variants so one underlying clone appears as one finding.
+- Group `variants` are bounded by default; use `rawPairCount` and `omittedVariantCount` to see hidden evidence counts.
 - A single positional directory becomes the project root unless `--root` is set.
 - Use `--root . ./src` for scoped scans with repository-relative paths.
 - Use `--include-small` for tiny helpers.
 - Use `--include-same-file` for non-overlapping clones inside one file.
+- Use `--raw-pairs` when debugging the low-level pair evidence behind each group.
 
 `search`, `explain`, `artifact`, and `mcp` each support command-specific `--help` output so agents do not have to infer their options from the top-level help. `search` is deterministic and vectorless. It returns ranked results with project-relative stable handles, rank reasons, evidence, graph neighbors, follow-up commands, result counts, per-packet limits, and omission counts. `explain` resolves file paths, symbol names, SQL object names, and search handles, including file/chunk/graph handles, into bounded packets with symbols, dependencies, reverse dependencies, references, snippets, SQL object relation facts, changed-context review tasks/candidate tests, explicit limits, omission counts, and follow-ups. Generated follow-up and suggested-question commands POSIX-shell-quote dynamic arguments when needed. SQL object names resolve by exact name first; unqualified basenames resolve only when unique, so handles or schema-qualified names are preferred. Reference and snippet omission counts are lower bounds after the bounded navigation scan reaches its cap. `artifact build` writes `codegraph.sqlite`, self-describing project-relative `graph.json`, `CODEGRAPH_REPORT.md`, `questions.json`, and `manifest.json` by default; suggested questions use unique IDs backed by stable handles when a handle is available. Use artifact flags to select a subset. `--force` permits non-empty output directories, removes recognizable stale Codegraph artifacts, preserves unrelated operator files, and refuses unrecognized reserved-name collisions. Artifact contents exclude their own output directory and linked outside-root files. `mcp serve` exposes `search`, `get_file`, `get_symbol`, `goto`, `refs`, `deps`, `rdeps`, `path`, `impact`, `review`, `query_sqlite`, and `artifact_build` over stdio by default or Streamable HTTP with `--port <number>`. HTTP serves `/mcp`, binds to `127.0.0.1` unless `--host <host>` is passed, validates the Host header, allows loopback Host headers for wildcard binds, and rejects oversized request bodies. MCP file and artifact paths are confined to `--root` after realpath resolution; tools are read-only by default, `query_sqlite` is row- and byte-bounded and rejects synthetic payload functions, and `--allow-build` enables artifact output only. `chunk` uses semantic Tree-sitter chunking for registered source and stylesheet languages, Vue and Svelte block-aware chunking for single-file components, and text chunking for JSON, YAML, and unsupported extensions. Use `--text` to force text chunking.
 

diff --git a/docs/library-api.md b/docs/library-api.md
@@ -218,7 +218,10 @@ The integration examples demonstrate semantic chunking with type-based filtering
 `findDuplicates()` scans a built `ProjectIndex` for exact, renamed, near, and weak clone candidates.
 
 - It uses indexed symbols, semantic chunks, and text chunks.
-- Results include confidence, score, clone type, metrics, omission counts, and pair stats.
+- Grouped duplicate output uses `schemaVersion: 2`.
+- Results include grouped findings, confidence, score, clone type, metrics, omission counts, and pair stats.
+- Group `variants` are bounded by default and expose hidden evidence through `rawPairCount` and `omittedVariantCount`.
+- Raw unit-pair suggestions and full group variants are available when `includeRawPairs` is enabled.
 - Paths are project-relative when the index has a project root.
 
 ```ts
@@ -231,14 +234,15 @@ const duplicates = await findDuplicates(index, {
   limit: 20,
 });
 
-console.log(duplicates.suggestions);
+console.log(duplicates.groups);
 ```
 
 Useful options:
 
 - `minConfidence`: `high`, `medium`, or `low`; default `medium`.
 - `includeSameFile`: report non-overlapping clones in the same file.
 - `includeSmall`: include units below the default token floor.
+- `includeRawPairs`: include low-level symbol/chunk pair evidence as `suggestions` and return full group variants.
 - `minTokens` and `maxTokens`: tune unit and fallback chunk bounds.
 
 Tests:

diff --git a/package.json b/package.json
@@ -26,7 +26,7 @@
     "lint": "npx eslint \"src/**/*.ts\" \"tests/**/*.test.ts\"",
     "lint:fix": "npx eslint \"src/**/*.ts\" \"tests/**/*.test.ts\" --fix",
     "prepare": "node ./scripts/prepare-package.mjs",
-    "test": "node ./scripts/ensure-dist-for-tests.mjs && vitest",
+    "test": "npm run test:fast",
     "test:all": "npm run test:native && npm run test:ci",
     "test:fast": "node ./scripts/ensure-dist-for-tests.mjs && vitest run --exclude tests/bench-harness.test.ts --exclude tests/cli-regressions.test.ts --exclude tests/impact-cli.test.ts --exclude tests/esm-language-loading.test.ts --exclude tests/native-fallback-reporting.test.ts --exclude tests/native-tree-sitter.test.ts --exclude tests/native-semantic-parity.test.ts --exclude tests/native-worker-parity.test.ts --exclude tests/native-parser-ownership.test.ts --exclude tests/detailed-symbol-native-only.test.ts",
     "test:integration": "node ./scripts/ensure-dist-for-tests.mjs && vitest run tests/cli-regressions.test.ts tests/impact-cli.test.ts tests/esm-language-loading.test.ts tests/native-fallback-reporting.test.ts tests/native-tree-sitter.test.ts tests/native-semantic-parity.test.ts tests/native-worker-parity.test.ts tests/native-parser-ownership.test.ts tests/detailed-symbol-native-only.test.ts",

diff --git a/src/agent-tools.ts b/src/agent-tools.ts
@@ -13,7 +13,7 @@ import { analyzeImpactFromDiff } from "./impact/index.js";
 import type { CompactImpactReport, ImpactOptions, ImpactReport } from "./impact/types.js";
 import type { Edge, Range } from "./types.js";
 import { collectGraph } from "./graph-builder.js";
-import { getDependencies, getReverseDependencies } from "./graphs/queries.js";
+import { getDependencies, getReverseDependencies, type DependencyNode } from "./graphs/queries.js";
 import { getHotspots } from "./graphs/hotspots.js";
 import type { NativeRuntimeMode } from "./native/treeSitterNative.js";
 import { fileExists } from "./util/workspace.js";
@@ -92,6 +92,30 @@ export type ToolDependencyEntry = {
   depth: number;
 };
 
+type ToolDependencyOptions = ToolRuntimeOptions & {
+  depth?: number;
+  limit?: number;
+};
+
+type ToolDependencyListResult =
+  | {
+      status: "ok";
+      file: string;
+      entries: ToolDependencyEntry[];
+      truncated: boolean;
+    }
+  | {
+      status: "not_found";
+      file: string;
+      reason: "file_not_found" | "file_not_indexed";
+      error: string;
+    }
+  | {
+      status: "error";
+      error: string;
+      reason?: "outside_project_root";
+    };
+
 /** File-level hotspot metrics returned by `tool_getHotspots`. */
 export type ToolHotspotEntry = {
   file: string;
@@ -342,12 +366,7 @@ export async function tool_getGraph(
 export async function tool_getDependencies(
   root: string,
   filePath: string,
-  options: {
-    depth?: number;
-    limit?: number;
-    index?: ProjectIndex;
-    native?: NativeRuntimeMode;
-  } = {},
+  options: ToolDependencyOptions = {},
 ): Promise<
   | {
       status: "ok";
@@ -367,37 +386,18 @@ export async function tool_getDependencies(
       reason?: "outside_project_root";
     }
 > {
-  try {
-    const resolvedFile = resolveToolFileInput(root, filePath);
-    if (resolvedFile.status === "error") {
-      return resolvedFile;
-    }
-
-    const index = await getToolIndex(root, options);
-    const missing = await getToolMissingFileResult(index, resolvedFile.absPath, resolvedFile.relativeFile);
-    if (missing) {
-      return missing;
-    }
-
-    const limit = getToolLimit(options.limit) ?? 20;
-    const dependencies = getDependencies(index.graph, resolvedFile.absPath, {
-      ...(options.depth !== undefined ? { depth: options.depth } : {}),
-      limit: limit + 1,
-    });
-    const limited = boundAgentList(dependencies, limit).items.map((entry) => ({
-      file: normalizeToolFileOutput(root, entry.file),
-      depth: entry.depth,
-    }));
-
-    return {
-      status: "ok",
-      file: resolvedFile.relativeFile,
-      dependencies: limited,
-      truncated: dependencies.length !== limited.length,
-    };
-  } catch (error) {
-    return { status: "error", error: String(error) };
+  const result = await collectToolDependencyEntries(root, filePath, options, (index, file, traversalOptions) =>
+    getDependencies(index.graph, file, traversalOptions),
+  );
+  if (result.status !== "ok") {
+    return result;
   }
+  return {
+    status: "ok",
+    file: result.file,
+    dependencies: result.entries,
+    truncated: result.truncated,
+  };
 }
 
 /**
@@ -406,12 +406,7 @@ export async function tool_getDependencies(
 export async function tool_getReverseDependencies(
   root: string,
   filePath: string,
-  options: {
-    depth?: number;
-    limit?: number;
-    index?: ProjectIndex;
-    native?: NativeRuntimeMode;
-  } = {},
+  options: ToolDependencyOptions = {},
 ): Promise<
   | {
       status: "ok";
@@ -431,6 +426,30 @@ export async function tool_getReverseDependencies(
       reason?: "outside_project_root";
     }
 > {
+  const result = await collectToolDependencyEntries(root, filePath, options, (index, file, traversalOptions) =>
+    getReverseDependencies(index.graph, file, traversalOptions),
+  );
+  if (result.status !== "ok") {
+    return result;
+  }
+  return {
+    status: "ok",
+    file: result.file,
+    dependents: result.entries,
+    truncated: result.truncated,
+  };
+}
+
+async function collectToolDependencyEntries(
+  root: string,
+  filePath: string,
+  options: ToolDependencyOptions,
+  collectEntries: (
+    index: ProjectIndex,
+    file: string,
+    traversalOptions: { depth?: number; limit: number },
+  ) => DependencyNode[],
+): Promise<ToolDependencyListResult> {
   try {
     const resolvedFile = resolveToolFileInput(root, filePath);
     if (resolvedFile.status === "error") {
@@ -444,20 +463,20 @@ export async function tool_getReverseDependencies(
     }
 
     const limit = getToolLimit(options.limit) ?? 20;
-    const dependents = getReverseDependencies(index.graph, resolvedFile.absPath, {
+    const entries = collectEntries(index, resolvedFile.absPath, {
       ...(options.depth !== undefined ? { depth: options.depth } : {}),
       limit: limit + 1,
     });
-    const limited = boundAgentList(dependents, limit).items.map((entry) => ({
+    const limited = boundAgentList(entries, limit).items.map((entry) => ({
       file: normalizeToolFileOutput(root, entry.file),
       depth: entry.depth,
     }));
 
     return {
       status: "ok",
       file: resolvedFile.relativeFile,
-      dependents: limited,
-      truncated: dependents.length !== limited.length,
+      entries: limited,
+      truncated: entries.length !== limited.length,
     };
   } catch (error) {
     return { status: "error", error: String(error) };

diff --git a/src/agent/artifact.ts b/src/agent/artifact.ts
@@ -4,6 +4,7 @@ import { getHotspots } from "../graphs/hotspots.js";
 import { type SymbolNode } from "../graphs/symbol-graph.js";
 import { defNodeId } from "../graphs/symbol-graph.js";
 import { queryGraphSqliteRaw, writeGraphSqlite } from "../sqlite.js";
+import { isPlainRecord } from "../util/guards.js";
 import { isFilePathWithinRoot, normalizePath, toProjectRelativePath } from "../util/paths.js";
 import { formatAgentSqlHandle, formatAgentSymbolHandle } from "./handles.js";
 import { createAgentFileLookup, normalizeAgentFilePath } from "./normalize.js";
@@ -266,9 +267,9 @@ async function fileExists(filePath: string): Promise<boolean> {
 
 async function readCodegraphManifest(outDir: string): Promise<ArtifactManifest | undefined> {
   const value = await readJsonIfPresent(path.join(outDir, MANIFEST_FILE));
-  if (!isRecord(value)) return undefined;
+  if (!isPlainRecord(value)) return undefined;
   if (value.schemaVersion !== 1 || value.graphJsonSchema !== "codegraph.graph-json") return undefined;
-  if (!isRecord(value.artifacts)) return undefined;
+  if (!isPlainRecord(value.artifacts)) return undefined;
   const artifacts: CodegraphArtifactBuildResult["artifacts"] = {};
   for (const [key, artifactFile] of Object.entries(value.artifacts)) {
     if (typeof artifactFile !== "string") continue;
@@ -297,11 +298,11 @@ async function isRecognizedCodegraphArtifact(filePath: string, fileName: string)
   }
   if (fileName === GRAPH_JSON_FILE) {
     const value = await readJsonIfPresent(filePath);
-    return isRecord(value) && value.format === "codegraph.graph-json";
+    return isPlainRecord(value) && value.format === "codegraph.graph-json";
   }
   if (fileName === QUESTIONS_FILE) {
     const value = await readJsonIfPresent(filePath);
-    return isRecord(value) && value.format === "codegraph.questions";
+    return isPlainRecord(value) && value.format === "codegraph.questions";
   }
   if (fileName === REPORT_FILE) {
     try {
@@ -370,10 +371,6 @@ async function readJsonIfPresent(filePath: string): Promise<unknown> {
   }
 }
 
-function isRecord(value: unknown): value is Record<string, unknown> {
-  return typeof value === "object" && value !== null && !Array.isArray(value);
-}
-
 async function readDirectoryIfPresent(outDir: string): Promise<string[]> {
   try {
     return await fs.readdir(outDir);

diff --git a/src/cli/doctor.ts b/src/cli/doctor.ts
@@ -11,6 +11,7 @@ import {
   pathExists,
   type CodegraphPackageIdentity,
 } from "./packageInfo.js";
+import { isPlainRecord } from "../util/guards.js";
 
 type IndexedArtifactReport = {
   type: "jsonGraph" | "sqliteGraph" | "diskCache" | "artifactBundle" | "unknown";
@@ -40,9 +41,9 @@ function statIfExists(filePath: string): fs.Stats | null {
 function readArtifactManifest(dirPath: string): { artifacts: Record<string, string> } | null {
   try {
     const parsed = JSON.parse(fs.readFileSync(path.join(dirPath, "manifest.json"), "utf8"));
-    if (!isRecord(parsed)) return null;
+    if (!isPlainRecord(parsed)) return null;
     if (parsed.schemaVersion !== 1 || parsed.graphJsonSchema !== "codegraph.graph-json") return null;
-    if (!isRecord(parsed.artifacts)) return null;
+    if (!isPlainRecord(parsed.artifacts)) return null;
     const artifacts: Record<string, string> = {};
     for (const [key, value] of Object.entries(parsed.artifacts)) {
       if (typeof value === "string") artifacts[key] = value;
@@ -53,10 +54,6 @@ function readArtifactManifest(dirPath: string): { artifacts: Record<string, stri
   }
 }
 
-function isRecord(value: unknown): value is Record<string, unknown> {
-  return typeof value === "object" && value !== null && !Array.isArray(value);
-}
-
 function detectIndexedArtifactType(filePath: string, stats: fs.Stats | null): IndexedArtifactReport["type"] {
   if (stats?.isDirectory() && readArtifactManifest(filePath)) {
     return "artifactBundle";

diff --git a/src/cli/duplicates.ts b/src/cli/duplicates.ts
@@ -32,6 +32,7 @@ function parseDuplicateDetectionOptions(context: DuplicatesCommandContext): Dupl
     maxBucketSize: parsePositiveIntegerOption(context.getOpt("--max-bucket-size"), "--max-bucket-size", 200),
     ...(context.hasFlag("--include-same-file") ? { includeSameFile: true } : {}),
     ...(context.hasFlag("--include-small") ? { includeSmall: true } : {}),
+    ...(context.hasFlag("--raw-pairs") ? { includeRawPairs: true } : {}),
   };
 
   if (options.maxTokens !== undefined && options.minTokens !== undefined && options.maxTokens < options.minTokens) {