BYK · BYK · May 20, 2026 · May 20, 2026
diff --git a/.lore.md b/.lore.md
diff --git a/packages/core/eval/baselines.ts b/packages/core/eval/baselines.ts
@@ -121,56 +121,85 @@ Conversation to summarize:
 /**
  * Simulate compaction: LLM-summarize the prefix that falls outside
  * the tail window, then return summary + tail.
+ *
+ * Iterative: the first pass compacts whenever the total exceeds
+ * `tailBudgetTokens`; later passes run only while the total still exceeds
+ * `compactionThreshold` (modeled on Claude Code's ~83.5% auto-compact point).
+ * Each cycle replaces the prefix with a summary, losing more detail.
  */
 export async function compactionBaseline(
   turns: ConversationTurn[],
   tailBudgetTokens: number = 80_000,
   llm: EvalLLMClient,
+  modelContextWindow: number = 200_000,
 ): Promise<string> {
-  const total = totalTokens(turns);
-
-  // If everything fits, no compaction needed
-  if (total <= tailBudgetTokens) {
-    return renderConversation(turns);
-  }
-
-  // Find the tail window cutoff
-  let tailTokens = 0;
-  let cutoff = turns.length;
-  for (let i = turns.length - 1; i >= 0; i--) {
-    const turnTokens =
-      turns[i].tokens ?? estimateTokens(renderTurn(turns[i]));
-    if (tailTokens + turnTokens > tailBudgetTokens) {
-      cutoff = i + 1;
-      break;
+  // Match Claude Code's autoCompactThreshold: effectiveContextWindow * 0.835
+  const compactionThreshold = Math.floor(
+    (modelContextWindow - Math.min(32_000, modelContextWindow * 0.15)) * 0.835,
+  );
+  const maxCompactions = 4; // safety cap
+  let currentTurns = turns;
+  let compactionCount = 0;
+
+  while (compactionCount < maxCompactions) {
+    const total = totalTokens(currentTurns);
+
+    // Stop once the total fits the tail budget (checked on every pass) or,
+    // after at least one compaction, fits the compaction threshold.
+    if (compactionCount > 0 && total <= compactionThreshold) break;
+    if (total <= tailBudgetTokens) break;
+
+    // Find the tail window cutoff
+    let tailTokens = 0;
+    let cutoff = currentTurns.length;
+    for (let i = currentTurns.length - 1; i >= 0; i--) {
+      const turnTokens =
+        currentTurns[i].tokens ?? estimateTokens(renderTurn(currentTurns[i]));
+      if (tailTokens + turnTokens > tailBudgetTokens) {
+        cutoff = i + 1;
+        break;
+      }
+      tailTokens += turnTokens;
+      if (i === 0) cutoff = 0;
     }
-    tailTokens += turnTokens;
-    if (i === 0) cutoff = 0;
-  }
 
-  const prefix = turns.slice(0, cutoff);
-  const tail = turns.slice(cutoff);
-
-  if (prefix.length === 0) {
-    return renderConversation(tail);
+    const prefix = currentTurns.slice(0, cutoff);
+    const tail = currentTurns.slice(cutoff);
+
+    if (prefix.length === 0) break;
+
+    // Summarize the prefix via LLM
+    const prefixText = renderConversation(prefix);
+    const userPrompt = COMPACTION_USER_TEMPLATE.replace(
+      "{{conversation}}",
+      () => prefixText, // replacer fn: `$&`, `$'`, `$$` in the text stay literal
+    );
+
+    const result = await llm.prompt(COMPACTION_SYSTEM, userPrompt, {
+      maxTokens: 4096,
+      temperature: 0,
+    });
+
+    // Replace prefix with a synthetic summary turn + keep tail
+    const summaryTurn: ConversationTurn = {
+      role: "assistant",
+      content: [{ type: "text", text: `## Compacted Summary (pass ${compactionCount + 1})\n\n${result.text}` }],
+      tokens: estimateTokens(result.text),
+    };
+    currentTurns = [summaryTurn, ...tail];
+    compactionCount++;
+
+    console.log(
+      `  [compaction] pass ${compactionCount}: ${prefix.length} turns summarized → ${estimateTokens(result.text)} tok, ${currentTurns.length} turns remaining (${totalTokens(currentTurns)} tok)`,
+    );
   }
 
-  // Summarize the prefix via LLM
-  const prefixText = renderConversation(prefix);
-  const userPrompt = COMPACTION_USER_TEMPLATE.replace(
-    "{{conversation}}",
-    prefixText,
-  );
-
-  const result = await llm.prompt(COMPACTION_SYSTEM, userPrompt, {
-    maxTokens: 4096,
-    temperature: 0,
-  });
+  // Final render. No branch on `compactionCount` is needed: `currentTurns` is
+  // either the original input (nothing was compacted) or the synthetic summary
+  // turn followed by the kept tail, and both cases render through the same
+  // call below.
 
-  return (
-    `## Compacted Summary of Earlier Conversation\n\n${result.text}\n\n` +
-    `---\n\n## Recent Conversation\n\n${renderConversation(tail)}`
-  );
+  return renderConversation(currentTurns);
 }
 
 // ---------------------------------------------------------------------------
@@ -226,26 +255,19 @@ export function memoryOnlyConfigOverrides(): Record<string, unknown> {
 export function buildQAPrompt(
   context: string,
   question: string,
-  mode: "baseline" | "lore",
+  _mode: "baseline" | "lore", // no longer read: every mode shares the preamble below
 ): string {
-  const preamble =
-    mode === "lore"
-      ? "Here are distilled observations and knowledge from a coding session. " +
-        "If the observations don't have enough detail, use the recall tool to search for it."
-      : "Here is context from a past coding session.";
-
   return (
-    `${preamble}\n\n${context}\n\n` +
+    `Here is context from a past coding session.\n\n${context}\n\n` +
     `Question: ${question}\n\n` +
     `Answer concisely and specifically. Include exact values, file paths, and names when known.`
   );
 }
 
 export const QA_SYSTEM =
   "You are answering questions about past coding sessions. " +
-  "You have a recall tool available — USE IT to search your memory for specific details " +
-  "(file paths, branch names, error messages, version numbers, test counts, etc.). " +
-  "Always invoke recall before answering unless the answer is already in your system context. " +
-  "When recall returns results with source IDs (t:xxx), you can recall those IDs to get " +
-  "the full original message with exact details. " +
-  "Be specific and factual. If you don't have enough information even after recall, say so.";
+  "Do your best to come up with the exact and correct answer. " +
+  "Use all the tools available to you to find it. " +
+  "Be specific and factual — include exact file paths, error messages, " +
+  "version numbers, and names when known. " +
+  "If you don't have enough information, say so."; // tool-agnostic wording: the recall-specific instructions were dropped
diff --git a/packages/core/eval/harness.ts b/packages/core/eval/harness.ts
@@ -832,6 +832,23 @@ export async function runScenario(
       }
     }
 
+    // Backfill embeddings for distillations and temporal messages created
+    // during replay. The startup backfill runs before any content exists;
+    // this ensures vector search works for QA questions.
+    if (gateway.isReal !== false) {
+      try {
+        const { embedding } = await import("@loreai/core");
+        const kn = await embedding.backfillEmbeddings();
+        const dist = await embedding.backfillDistillationEmbeddings();
+        console.log(
+          `  [embedding] post-replay backfill: ${kn} knowledge, ${dist} distillations` +
+          ` (available=${embedding.isAvailable()})`,
+        );
+      } catch (err) {
+        console.warn("  Warning: post-replay embedding backfill failed:", err);
+      }
+    }
+
     // Collect all turns across sessions for baseline context building
     const allTurns = scenario.sessions.flatMap((s) => s.turns);
 

diff --git a/packages/core/src/config.ts b/packages/core/src/config.ts
@@ -147,9 +147,9 @@ export const LoreConfig = z.object({
        *  Default: 1.5. Set to 1.0 to disable. */
       vectorBoostWeight: z.number().min(1).max(5).default(1.5),
       /** Minimum meaningful query terms (after stopword removal) to activate
-       *  vector boost. Short keyword queries (1-2 terms) are left unweighted
-       *  since BM25 excels there. Default: 3. */
-      vectorBoostMinTerms: z.number().min(1).max(10).default(3),
+       *  vector boost. Single-term queries are left unweighted since BM25
+       *  excels there. Default: 2. */
+      vectorBoostMinTerms: z.number().min(1).max(10).default(2),
       /** Vector embedding search.
        *  Supports multiple providers:
        *  - "local" (default): @huggingface/transformers + nomic-embed-text-v1.5, no API key needed.
@@ -184,25 +184,25 @@ export const LoreConfig = z.object({
       recall: z
         .object({
           /** Total character budget for recall output. Controls how much context the
-           *  recall results consume. ~2K tokens at 8000 chars. Default: 8000. */
-          charBudget: z.number().min(2000).max(20000).default(8000),
+           *  recall results consume. ~3K tokens at 12000 chars. Default: 12000. */
+          charBudget: z.number().min(2000).max(20000).default(12000),
           /** Minimum RRF score relative to top result. Results below
            *  topScore * relevanceFloor are dropped. Default: 0.15.
            *  Set to 0 to disable score-based cutoff. */
           relevanceFloor: z.number().min(0).max(1).default(0.15),
           /** Max results to show in recall output. Default: 15. */
           maxResults: z.number().min(3).max(30).default(15),
         })
-        .default({ charBudget: 8000, relevanceFloor: 0.15, maxResults: 15 }),
+        .default({ charBudget: 12000, relevanceFloor: 0.15, maxResults: 15 }),
     })
     .default({
       ftsWeights: { title: 6.0, content: 2.0, category: 3.0 },
       recallLimit: 10,
       queryExpansion: true,
       vectorBoostWeight: 1.5,
-      vectorBoostMinTerms: 3,
+      vectorBoostMinTerms: 2,
       embeddings: { enabled: true, provider: "local" as const, model: "nomic-ai/nomic-embed-text-v1.5", dimensions: 768 },
-      recall: { charBudget: 8000, relevanceFloor: 0.15, maxResults: 15 },
+      recall: { charBudget: 12000, relevanceFloor: 0.15, maxResults: 15 },
     }),
   cache: z
     .object({

diff --git a/packages/core/src/gradient.ts b/packages/core/src/gradient.ts
@@ -683,6 +683,9 @@ type Distillation = {
   token_count: number;
   created_at: number;
   session_id: string;
+  r_compression: number | null;
+  c_norm: number | null;
+  source_ids: string[];
 };
 
 // Load non-archived distillations for the in-context prefix.
@@ -694,12 +697,16 @@ function loadDistillations(
 ): Distillation[] {
   const pid = ensureProject(projectPath);
   const query = sessionID
-    ? "SELECT id, observations, generation, token_count, created_at, session_id FROM distillations WHERE project_id = ? AND session_id = ? AND archived = 0 ORDER BY created_at ASC"
-    : "SELECT id, observations, generation, token_count, created_at, session_id FROM distillations WHERE project_id = ? AND archived = 0 ORDER BY created_at ASC";
+    ? "SELECT id, observations, generation, token_count, created_at, session_id, r_compression, c_norm, source_ids FROM distillations WHERE project_id = ? AND session_id = ? AND archived = 0 ORDER BY created_at ASC"
+    : "SELECT id, observations, generation, token_count, created_at, session_id, r_compression, c_norm, source_ids FROM distillations WHERE project_id = ? AND archived = 0 ORDER BY created_at ASC";
   const params = sessionID ? [pid, sessionID] : [pid];
-  return db()
+  const rows = db()
     .query(query)
-    .all(...params) as Distillation[];
+    .all(...params) as Array<Omit<Distillation, "source_ids"> & { source_ids: string }>;
+  return rows.map((r) => ({
+    ...r,
+    source_ids: r.source_ids ? JSON.parse(r.source_ids) : [],
+  }));
 }
 
 // Cached distillation loader — avoids hitting the DB on every transform() call.
@@ -1480,8 +1487,21 @@ function tryFitStable(input: {
     const windowSize = rawWindowCache!.pinnedRawCount + newMessages;
     const pinnedIdx = Math.max(0, input.messages.length - windowSize);
 
+    // Ensure the pinned window starts with a user message when a prefix is
+    // present — the prefix ends with assistant so a leading assistant in the
+    // raw window would create back-to-back assistants (#424).
+    let adjustedPinnedIdx = pinnedIdx;
+    if (input.prefix.length > 0) {
+      while (
+        adjustedPinnedIdx < input.messages.length &&
+        input.messages[adjustedPinnedIdx].info.role === "assistant"
+      ) {
+        adjustedPinnedIdx++;
+      }
+    }
+
     // Measure the token cost of the pinned window.
-    const pinnedWindow = input.messages.slice(pinnedIdx);
+    const pinnedWindow = input.messages.slice(adjustedPinnedIdx);
     const pinnedTokens = pinnedWindow.reduce(
       (sum, m) => sum + estimateMessage(m),
       0,
@@ -1961,6 +1981,17 @@ function transformInner(input: {
     olderTokens += est;
   }
 
+  // Ensure role alternation at the prefix/raw boundary: drop leading assistant
+  // messages from the older tail so the raw window starts with user (#424).
+  while (
+    olderMessages.length > 0 &&
+    nuclearPrefix.length > 0 &&
+    olderMessages[0].info.role === "assistant"
+  ) {
+    olderTokens -= estimateMessage(olderMessages[0]);
+    olderMessages.shift();
+  }
+
   const nuclearRaw = [...olderMessages, ...currentTurn];
   const nuclearRawTokens = olderTokens + currentTurnTokens;
 
@@ -2134,6 +2165,22 @@ function tryFit(input: {
     if (i === 0) cutoff = 0;
   }
 
+  // Ensure role alternation at the prefix/raw boundary: the distilled prefix
+  // ends with an assistant message, so the raw window must start with a user.
+  // The backward budget scan is purely token-based and can land on any role.
+  // If the cutoff produces a raw window starting with assistant(s), advance it
+  // past them — otherwise loreMessagesToGateway produces back-to-back assistants
+  // and the API rejects with "tool_use ids found without tool_result" (#424).
+  if (input.prefix.length > 0) {
+    while (
+      cutoff < olderMessages.length &&
+      olderMessages[cutoff].info.role === "assistant"
+    ) {
+      olderTokens -= estimateMessage(olderMessages[cutoff]);
+      cutoff++;
+    }
+  }
+
   const rawMessages = [...olderMessages.slice(cutoff), ...currentTurn];
   const rawTokens = olderTokens + currentTurnTokens;
 

diff --git a/packages/core/src/prompt.ts b/packages/core/src/prompt.ts
@@ -478,10 +478,16 @@ Produce update/delete ops to reduce entry count to at most ${input.targetMax}. P
 
 // Format distillations for injection into the message context.
 // Observations are plain event-log text — inject them directly under a header.
+// Optional metadata (id, r_compression, source_ids) adds drill-down hints so
+// the model knows how lossy each distillation is and can use recall to fetch
+// the full original messages.
 export function formatDistillations(
   distillations: Array<{
     observations: string;
     generation: number;
+    id?: string;
+    r_compression?: number | null;
+    source_ids?: string[];
   }>,
 ): string {
   if (!distillations.length) return "";
@@ -493,20 +499,39 @@ export function formatDistillations(
   if (meta.length) {
     sections.push("### Earlier Work (summarized)");
     for (const d of meta) {
-      sections.push(d.observations.trim());
+      sections.push(formatOneDistillation(d));
     }
   }
 
   if (recent.length) {
     sections.push("### Recent Work (distilled)");
     for (const d of recent) {
-      sections.push(d.observations.trim());
+      sections.push(formatOneDistillation(d));
     }
   }
 
   return sections.join("\n\n");
 }
 
+/** Format one distillation; entries with an `id` get a `(d:<id> | …)` header line. */
+function formatOneDistillation(d: {
+  observations: string;
+  id?: string;
+  r_compression?: number | null;
+  source_ids?: string[];
+}): string {
+  const body = d.observations.trim();
+  if (!d.id) return body; // no id → plain text, no metadata header
+
+  // Header tags in order: id, optional "lossy" flag, optional source count.
+  const tags = [`d:${d.id}`];
+  if (d.r_compression != null && d.r_compression < 1.0) tags.push("lossy");
+  const n = d.source_ids?.length ?? 0;
+  if (n > 0) tags.push(n > 1 ? `${n} sources` : `${n} source`);
+
+  return `(${tags.join(" | ")})\n${body}`;
+}
+
 // Strict Markdown skeleton for the /compact session summary. Task-oriented
 // sections so the next agent starting from the compacted context has a clear
 // "where am I, what's next, what's blocked" briefing. Derived from upstream