From 739e5632c579e92e8f916c93f074f0b03407d025 Mon Sep 17 00:00:00 2001 From: kuitos Date: Fri, 1 May 2026 00:46:49 +0800 Subject: [PATCH] fix: avoid unrelated memory recall --- README.md | 8 + .../2026-04-27-memory-task-eval-design.md | 60 +++++ package.json | 1 + src/recall.ts | 75 +++++- test/evals/cases/effect-coverage.json | 232 ++++++++++++++++++ test/evals/cases/sanitized-replay.json | 45 ++++ test/evals/fixtures.ts | 178 ++++++++++++++ test/evals/harness.ts | 128 ++++++++++ test/evals/judges.ts | 41 ++++ test/evals/report.ts | 21 ++ test/evals/run.ts | 12 + test/evals/task-eval.test.ts | 67 +++++ test/recall.test.ts | 41 ++++ 13 files changed, 907 insertions(+), 2 deletions(-) create mode 100644 docs/superpowers/specs/2026-04-27-memory-task-eval-design.md create mode 100644 test/evals/cases/effect-coverage.json create mode 100644 test/evals/cases/sanitized-replay.json create mode 100644 test/evals/fixtures.ts create mode 100644 test/evals/harness.ts create mode 100644 test/evals/judges.ts create mode 100644 test/evals/report.ts create mode 100644 test/evals/run.ts create mode 100644 test/evals/task-eval.test.ts diff --git a/README.md b/README.md index 48b8e15..a00c07d 100644 --- a/README.md +++ b/README.md @@ -268,12 +268,20 @@ Supported memory types: # Run tests bun test +# Run the focused memory-effect task eval harness +bun test test/evals/task-eval.test.ts + +# Print a memory-effect eval report +bun run eval:memory + # Build published artifacts bun run build # Release: push to main triggers semantic-release → npm publish ``` +Task eval fixtures now support both in-code synthetic cases and file-backed replay cases under `test/evals/cases/*.json`. + ## 📄 License [MIT](LICENSE) © [kuitos](https://github.com/kuitos) diff --git a/docs/superpowers/specs/2026-04-27-memory-task-eval-design.md b/docs/superpowers/specs/2026-04-27-memory-task-eval-design.md new file mode 100644 index 0000000..3265aa4 --- /dev/null +++ b/docs/superpowers/specs/2026-04-27-memory-task-eval-design.md @@ -0,0 +1,60 @@ +# Memory Task Eval Design + +## Goal +Add a lightweight offline evaluation layer that compares memory-on and memory-off plugin behavior for realistic task-shaped inputs, using deterministic rule checks by default and leaving an interface for future judge implementations. + +## Scope +- In scope: + - Synthetic task fixtures shaped like real plugin messages + - A reusable harness that seeds memories, runs plugin hooks, and captures system prompts + - Rule-based checks for expected inclusions and exclusions in memory-on and memory-off runs + - A judge interface that can support future optional LLM scoring +- Out of scope: + - Live model invocation + - Production telemetry + - Large benchmark datasets + +## Approach Options +1. Extend `test/index.test.ts` + - Lowest setup cost, but poor reuse and weak structure once cases grow. +2. Add a dedicated `test/evals/` harness + - Recommended. Provides a typed case schema, reusable execution path, and clean future extensions. +3. Build a standalone CLI benchmark + - Overkill for the first version and unnecessary for CI. + +## Recommended Design +Create a dedicated task-eval layer under `test/evals/`: + +- `fixtures.ts` + - Declares the synthetic case schema and a small initial case set. +- `harness.ts` + - Creates a temp git repo, seeds memories, runs `MemoryPlugin`, feeds messages through `messages.transform`, then through `system.transform`, and returns both memory-on and memory-off outputs. +- `judges.ts` + - Exposes a rule-based judge for CI and a future generic judge interface. +- `task-eval.test.ts` + - Runs the fixture set through the harness and asserts pass/fail with helpful diagnostics. + +## Data Flow +1. A fixture defines memories, messages, and expected checks. +2. The harness creates a temp repo and seeds memory files with `saveMemory()`. +3. The harness runs the plugin once with memory enabled and once with `OPENCODE_MEMORY_IGNORE=1`. +4. The judge compares the resulting system prompts against the fixture's expected inclusions and exclusions. +5. The test reports the first failing expectation with both prompts attached for debugging. + +## Error Handling +- Missing or malformed fixtures should fail fast with descriptive assertion messages. +- The harness should restore `OPENCODE_MEMORY_IGNORE` after each run to avoid cross-test leakage. +- Temp repos should always be cleaned up in `afterEach`. + +## Testing Strategy +- Follow TDD: add task-eval tests first, verify failure, then implement the harness. +- Start with a small synthetic suite covering: + - preference recall in memory-on mode + - memory-off suppression + - tool-reference filtering interaction with recent completed tools +- Run targeted tests first, then the broader suite if needed. + +## Future Extensions +- Add adapters that convert sanitized real transcripts into the same fixture schema. +- Add a non-default judge implementation that scores generated assistant answers with an external LLM. +- Add lightweight summary reporting if the case set grows enough to justify aggregate metrics. diff --git a/package.json b/package.json index e6cf6eb..5efd501 100644 --- a/package.json +++ b/package.json @@ -19,6 +19,7 @@ "dist" ], "scripts": { + "eval:memory": "bun test/evals/run.ts", "build": "tsc -p tsconfig.json", "prepack": "npm run build" }, diff --git a/src/recall.ts b/src/recall.ts index 86fc1e2..5f1e7a2 100644 --- a/src/recall.ts +++ b/src/recall.ts @@ -17,9 +17,79 @@ const MAX_MEMORY_LINES = 200 const MAX_MEMORY_BYTES = 4096 const encoder = new TextEncoder() +const QUERY_STOP_WORDS = new Set([ + "the", + "and", + "for", + "with", + "this", + "that", + "what", + "when", + "where", + "which", + "who", + "why", + "how", + "should", + "would", + "could", + "please", + "about", + "again", + "into", + "from", + "have", + "know", + "need", + "only", + "over", + "tell", + "than", + "then", + "them", + "they", + "will", + "your", + "you", + "are", + "can", + "did", + "has", + "her", + "him", + "his", + "its", + "not", + "our", + "out", + "she", + "was", + "were", + "all", + "any", + "but", + "get", + "had", + "in", + "is", + "it", + "of", + "on", + "or", + "to", +]) function tokenizeQuery(query: string): string[] { - return [...new Set(query.toLowerCase().split(/\s+/).map((token) => token.trim()).filter((token) => token.length >= 2))] + return [ + ...new Set( + query + .toLowerCase() + .split(/[^a-z0-9_]+/) + .map((token) => token.trim()) + .filter((token) => token.length >= 2 && !QUERY_STOP_WORDS.has(token)), + ), + ] } function readMemoryContent(filePath: string): string { @@ -121,7 +191,8 @@ export function recallRelevantMemories( } }).filter(({ header, content }) => !isToolReferenceMemory(header, content, recentTools)) - if (terms.length > 0 && scored.some((s) => s.score > 0)) { + if (terms.length > 0) { + if (!scored.some((s) => s.score > 0)) return [] scored.sort((a, b) => b.score - a.score || b.header.mtimeMs - a.header.mtimeMs) } else { scored.sort((a, b) => b.header.mtimeMs - a.header.mtimeMs) diff --git a/test/evals/cases/effect-coverage.json b/test/evals/cases/effect-coverage.json new file mode 100644 index 0000000..d5b7824 --- /dev/null +++ b/test/evals/cases/effect-coverage.json @@ -0,0 +1,232 @@ +{ + "cases": [ + { + "id": "negative-unrelated-query-does-not-recall-recent-feedback", + "description": "ordinary unrelated requests should not inject the most recent memory just because no keyword matched", + "memories": [ + { + "fileName": "feedback_database_tests", + "name": "Database Test Preference", + "description": "Do not mock database tests", + "type": "feedback", + "content": "Never mock the database in integration tests.\n\n**Why:** Mocked tests hid a broken migration.\n**How to apply:** Use a real test database for DB-facing tests.", + "mtime": "2026-04-30T00:00:00.000Z" + } + ], + "messages": [ + { + "role": "user", + "parts": [ + { + "type": "text", + "text": "Please rename the CLI flag in the README." + } + ] + } + ], + "checks": { + "onContains": [ + "## MEMORY.md", + "Database Test Preference" + ], + "onNotContains": [ + "## Recalled Memories", + "Never mock the database in integration tests." + ], + "offContains": [ + "# Auto Memory" + ], + "offNotContains": [ + "## MEMORY.md", + "Database Test Preference", + "Never mock the database in integration tests." + ] + } + }, + { + "id": "competition-prefers-token-rotation-memories", + "description": "when many memories compete, query-relevant token rotation memories should displace distractors", + "memories": [ + { + "fileName": "auth_rotation_owner", + "name": "Auth Token Rotation Owner", + "description": "Auth token rotation is owned by platform security", + "type": "project", + "content": "Platform security owns auth token rotation incidents.\n\n**Why:** They operate the rotation service.\n**How to apply:** Route auth token rotation questions to that context.", + "mtime": "2026-04-01T00:00:00.000Z" + }, + { + "fileName": "auth_rotation_deadline", + "name": "Auth Token Rotation Deadline", + "description": "Auth token rotation deadline is 2026-05-15", + "type": "project", + "content": "Auth token rotation cleanup must land by 2026-05-15.\n\n**Why:** Downstream services depend on the new signing keys.\n**How to apply:** Prefer low-risk auth token rotation changes.", + "mtime": "2026-04-02T00:00:00.000Z" + }, + { + "fileName": "auth_rotation_dashboard", + "name": "Auth Token Rotation Dashboard", + "description": "Auth token rotation metrics live in the security dashboard", + "type": "reference", + "content": "The auth token rotation dashboard is security.example/d/token-rotation.", + "mtime": "2026-04-03T00:00:00.000Z" + }, + { + "fileName": "auth_rotation_tests", + "name": "Auth Token Rotation Tests", + "description": "Auth token rotation tests must cover rollback", + "type": "feedback", + "content": "Auth token rotation changes must include rollback-path tests.\n\n**Why:** Prior incidents failed during key rollback.\n**How to apply:** Check rollback behavior before approving.", + "mtime": "2026-04-04T00:00:00.000Z" + }, + { + "fileName": "auth_rotation_rollout", + "name": "Auth Token Rotation Rollout", + "description": "Auth token rotation rollout is staged by region", + "type": "project", + "content": "Auth token rotation rolls out by region, starting with us-east.\n\n**Why:** It limits blast radius.\n**How to apply:** Avoid all-region rollout assumptions.", + "mtime": "2026-04-05T00:00:00.000Z" + }, + { + "fileName": "frontend_theme", + "name": "Frontend Theme", + "description": "Dashboard theme preference", + "type": "feedback", + "content": "Use compact tables for dashboard screens.", + "mtime": "2026-04-30T00:00:00.000Z" + }, + { + "fileName": "mobile_release", + "name": "Mobile Release", + "description": "Mobile release branch timing", + "type": "project", + "content": "Mobile release branch cuts on Fridays.", + "mtime": "2026-04-29T00:00:00.000Z" + } + ], + "messages": [ + { + "role": "user", + "parts": [ + { + "type": "text", + "text": "What should I know before touching auth token rotation?" + } + ] + } + ], + "checks": { + "onContains": [ + "Auth token rotation cleanup must land by 2026-05-15.", + "Auth token rotation changes must include rollback-path tests.", + "Auth token rotation rolls out by region", + "The auth token rotation dashboard is security.example/d/token-rotation.", + "Platform security owns auth token rotation incidents." + ], + "onNotContains": [ + "Use compact tables for dashboard screens.", + "Mobile release branch cuts on Fridays." + ], + "offContains": [ + "# Auto Memory" + ], + "offNotContains": [ + "Auth token rotation cleanup must land by 2026-05-15.", + "Use compact tables for dashboard screens.", + "Mobile release branch cuts on Fridays." + ] + } + }, + { + "id": "stale-memory-includes-age-warning", + "description": "old recalled memories should include an age warning before the agent relies on them", + "memories": [ + { + "fileName": "legacy_flag", + "name": "Legacy Flag", + "description": "Legacy auth flag was used by old rollout", + "type": "project", + "content": "The legacy_auth_rollout flag controlled the auth rollout.\n\n**Why:** It was the switch used during the original migration.\n**How to apply:** Verify the flag still exists before recommending it.", + "mtime": "2024-01-01T00:00:00.000Z" + } + ], + "messages": [ + { + "role": "user", + "parts": [ + { + "type": "text", + "text": "Is there old context about the legacy auth flag?" + } + ] + } + ], + "checks": { + "onContains": [ + "Legacy Flag", + "This memory is", + "days old", + "Verify against current code before asserting as fact.", + "legacy_auth_rollout flag controlled the auth rollout." + ], + "offContains": [ + "# Auto Memory" + ], + "offNotContains": [ + "legacy_auth_rollout flag controlled the auth rollout.", + "This memory is" + ] + } + }, + { + "id": "already-surfaced-memory-is-not-recalled-again", + "description": "memory already shown in a prior system prompt should not be re-injected after messages.transform sees it", + "memories": [ + { + "fileName": "billing_regression_checklist", + "name": "Billing Regression Checklist", + "description": "Billing handler edits require the regression checklist", + "type": "feedback", + "content": "Use the billing regression checklist before editing billing handlers.\n\n**Why:** Billing regressions are costly.\n**How to apply:** Run the checklist before changing billing handler code." + } + ], + "messages": [ + { + "role": "system", + "parts": [ + { + "type": "text", + "text": "# Auto Memory\n\n## Recalled Memories\n\n### Billing Regression Checklist (feedback)\nUse the billing regression checklist before editing billing handlers." + } + ] + }, + { + "role": "user", + "parts": [ + { + "type": "text", + "text": "I'm editing billing handlers; what should I check?" + } + ] + } + ], + "checks": { + "onContains": [ + "## MEMORY.md", + "Billing Regression Checklist" + ], + "onNotContains": [ + "## Recalled Memories", + "Run the checklist before changing billing handler code." + ], + "offContains": [ + "# Auto Memory" + ], + "offNotContains": [ + "## MEMORY.md", + "Run the checklist before changing billing handler code." + ] + } + } + ] +} diff --git a/test/evals/cases/sanitized-replay.json b/test/evals/cases/sanitized-replay.json new file mode 100644 index 0000000..5f5cb80 --- /dev/null +++ b/test/evals/cases/sanitized-replay.json @@ -0,0 +1,45 @@ +{ + "cases": [ + { + "id": "sanitized-replay-merge-freeze", + "description": "sanitized replay fixture showing project context recall from a file-backed case", + "memories": [ + { + "fileName": "project_freeze", + "name": "Merge Freeze", + "description": "Merge freeze starts 2026-04-10", + "type": "project", + "content": "Mobile team cutting release branch.\n\n**Why:** Prevent destabilizing the release cut.\n**How to apply:** Hold non-critical PRs until the freeze lifts." + } + ], + "messages": [ + { + "role": "user", + "parts": [ + { + "type": "text", + "text": "Can we merge this cleanup PR next week, or is there any release constraint I should factor in?" + } + ] + } + ], + "checks": { + "onContains": [ + "## MEMORY.md", + "Merge Freeze", + "## Recalled Memories", + "Hold non-critical PRs until the freeze lifts." + ], + "offContains": [ + "# Auto Memory" + ], + "offNotContains": [ + "## MEMORY.md", + "Merge Freeze", + "## Recalled Memories", + "Hold non-critical PRs until the freeze lifts." + ] + } + } + ] +} diff --git a/test/evals/fixtures.ts b/test/evals/fixtures.ts new file mode 100644 index 0000000..be3e37c --- /dev/null +++ b/test/evals/fixtures.ts @@ -0,0 +1,178 @@ +import { readdirSync, readFileSync } from "fs" +import { join } from "path" +import { fileURLToPath } from "url" +import { z } from "zod" + +export type SeedMemory = { + fileName: string + name: string + description: string + type: "user" | "feedback" | "project" | "reference" + content: string + mtime?: string +} + +export type EvalMessagePart = + | { type: "text"; text: string } + | { type: "tool"; tool: string; state: { status: "completed" | "error" } } + +export type EvalMessage = { + role: "system" | "user" | "assistant" + sessionID?: string + parts: EvalMessagePart[] +} + +export type TaskEvalChecks = { + onContains?: string[] + onNotContains?: string[] + offContains?: string[] + offNotContains?: string[] +} + +export type TaskEvalCase = { + id: string + description: string + memories: SeedMemory[] + messages: EvalMessage[] + checks: TaskEvalChecks +} + +const SeedMemorySchema = z.object({ + fileName: z.string().min(1), + name: z.string().min(1), + description: z.string(), + type: z.enum(["user", "feedback", "project", "reference"]), + content: z.string(), + mtime: z.string().optional(), +}) + +const EvalMessagePartSchema = z.union([ + z.object({ + type: z.literal("text"), + text: z.string(), + }), + z.object({ + type: z.literal("tool"), + tool: z.string().min(1), + state: z.object({ + status: z.enum(["completed", "error"]), + }), + }), +]) + +const EvalMessageSchema = z.object({ + role: z.enum(["system", "user", "assistant"]), + sessionID: z.string().optional(), + parts: z.array(EvalMessagePartSchema), +}) + +const TaskEvalChecksSchema = z.object({ + onContains: z.array(z.string()).optional(), + onNotContains: z.array(z.string()).optional(), + offContains: z.array(z.string()).optional(), + offNotContains: z.array(z.string()).optional(), +}) + +const TaskEvalCaseSchema = z.object({ + id: z.string().min(1), + description: z.string().min(1), + memories: z.array(SeedMemorySchema), + messages: z.array(EvalMessageSchema), + checks: TaskEvalChecksSchema, +}) + +const TaskEvalFixtureFileSchema = z.object({ + cases: z.array(TaskEvalCaseSchema).min(1), +}) + +export const TASK_EVAL_CASES: TaskEvalCase[] = [ + { + id: "feedback-recall-on-off-delta", + description: "memory-on should surface testing guidance while memory-off should suppress it", + memories: [ + { + fileName: "feedback_testing", + name: "Testing Approach", + description: "Always use integration tests", + type: "feedback", + content: + "Never mock the database.\n\n**Why:** Mocked tests masked a broken migration.\n**How to apply:** Use a real test database for DB-facing tests.", + }, + ], + messages: [ + { + role: "user", + parts: [{ type: "text", text: "Should I mock the database in these tests?" }], + }, + ], + checks: { + onContains: ["## MEMORY.md", "Testing Approach", "## Recalled Memories", "Never mock the database."], + offContains: ["# Auto Memory"], + offNotContains: ["## MEMORY.md", "Testing Approach", "## Recalled Memories", "Never mock the database."], + }, + }, + { + id: "completed-tool-filters-reference-recall", + description: "completed tool usage should suppress tool reference recall body but still allow other relevant recall", + memories: [ + { + fileName: "grep_ref", + name: "Grep Tool API", + description: "Usage reference for grep tool", + type: "reference", + content: "Use grep -r --include='*.ts' when searching TypeScript files.", + }, + { + fileName: "search_project", + name: "Project Search Policy", + description: "Codebase search guidance", + type: "project", + content: + "Prefer repo-local search first.\n\n**Why:** It keeps exploration faster and more reproducible.\n**How to apply:** Start with rg before broader tools.", + }, + ], + messages: [ + { + role: "user", + parts: [{ type: "text", text: "Search the codebase for where this behavior is implemented." }], + }, + { + role: "assistant", + parts: [{ type: "tool", tool: "grep", state: { status: "completed" } }], + }, + ], + checks: { + onContains: ["## MEMORY.md", "Grep Tool API", "Project Search Policy", "## Recalled Memories", "Prefer repo-local search first."], + onNotContains: ["Use grep -r --include='*.ts' when searching TypeScript files."], + offContains: ["# Auto Memory"], + offNotContains: [ + "## MEMORY.md", + "Grep Tool API", + "Project Search Policy", + "## Recalled Memories", + "Prefer repo-local search first.", + "Use grep -r --include='*.ts' when searching TypeScript files.", + ], + }, + }, +] + +export function loadTaskEvalCasesFromDir(dirPath: string): TaskEvalCase[] { + const fileNames = readdirSync(dirPath, { encoding: "utf-8" }) + .filter((fileName) => fileName.endsWith(".json")) + .sort() + + const cases: TaskEvalCase[] = [] + for (const fileName of fileNames) { + const raw = readFileSync(join(dirPath, fileName), "utf-8") + const parsed = TaskEvalFixtureFileSchema.parse(JSON.parse(raw)) + cases.push(...parsed.cases) + } + + return cases +} + +const CASES_DIR = fileURLToPath(new URL("./cases", import.meta.url)) + +export const FILE_BACKED_TASK_EVAL_CASES = loadTaskEvalCasesFromDir(CASES_DIR) +export const ALL_TASK_EVAL_CASES = [...TASK_EVAL_CASES, ...FILE_BACKED_TASK_EVAL_CASES] diff --git a/test/evals/harness.ts b/test/evals/harness.ts new file mode 100644 index 0000000..2ded580 --- /dev/null +++ b/test/evals/harness.ts @@ -0,0 +1,128 @@ +import { mkdtempSync, mkdirSync, rmSync, utimesSync } from "fs" +import { tmpdir } from "os" +import { join } from "path" +import { MemoryPlugin } from "../../src/index.js" +import { saveMemory } from "../../src/memory.js" +import type { EvalMessage, TaskEvalCase } from "./fixtures.js" +import type { TaskEvalJudge, TaskEvalJudgeResult } from "./judges.js" + +type MessageTransform = ( + input: {}, + output: RuntimeMessagesOutput, +) => Promise + +type SystemTransform = ( + input: { model: unknown; sessionID?: string }, + output: { system: string[] }, +) => Promise + +export type TaskEvalResult = TaskEvalJudgeResult & { + caseID: string + description: string + onPrompt: string + offPrompt: string +} + +type RuntimeMessage = { + info: { role: string; sessionID?: string } + parts: Array<{ type: string; text?: string; tool?: string; state?: { status: string } }> +} + +type RuntimeMessagesOutput = { + messages: RuntimeMessage[] +} + +function makeTempGitRepo(): string { + const root = mkdtempSync(join(tmpdir(), "task-eval-")) + mkdirSync(join(root, ".git"), { recursive: true }) + return root +} + +function cloneMessages(messages: EvalMessage[]): EvalMessage[] { + return JSON.parse(JSON.stringify(messages)) as EvalMessage[] +} + +function materializeMessages(messages: EvalMessage[], sessionID: string): RuntimeMessagesOutput { + return { + messages: cloneMessages(messages).map((message) => ({ + info: { + role: message.role, + sessionID: message.sessionID ?? sessionID, + }, + parts: message.parts.map((part) => ({ ...part })), + })), + } +} + +async function renderSystemPrompt( + worktree: string, + messages: EvalMessage[], + sessionID: string, + ignoreMemory: boolean, +): Promise { + const plugin = await MemoryPlugin({ worktree } as never) + const messagesTransform = plugin["experimental.chat.messages.transform"] as unknown as MessageTransform + const systemTransform = plugin["experimental.chat.system.transform"] as unknown as SystemTransform + const transformedMessages = materializeMessages(messages, sessionID) + const originalIgnore = process.env.OPENCODE_MEMORY_IGNORE + + try { + if (ignoreMemory) process.env.OPENCODE_MEMORY_IGNORE = "1" + else delete process.env.OPENCODE_MEMORY_IGNORE + + await messagesTransform({}, transformedMessages) + + const output = { system: [] as string[] } + await systemTransform({ model: "test-model", sessionID }, output) + return output.system.join("\n\n") + } finally { + if (originalIgnore === undefined) delete process.env.OPENCODE_MEMORY_IGNORE + else process.env.OPENCODE_MEMORY_IGNORE = originalIgnore + } +} + +export async function runTaskEvalCase(taskCase: TaskEvalCase, judge: TaskEvalJudge): Promise { + const repo = makeTempGitRepo() + const originalClaudeConfigDir = process.env.CLAUDE_CONFIG_DIR + process.env.CLAUDE_CONFIG_DIR = join(repo, ".claude-test") + + try { + for (const memory of taskCase.memories) { + const filePath = saveMemory(repo, memory.fileName, memory.name, memory.description, memory.type, memory.content) + if (memory.mtime) { + const mtime = new Date(memory.mtime) + utimesSync(filePath, mtime, mtime) + } + } + + const onPrompt = await renderSystemPrompt(repo, taskCase.messages, `${taskCase.id}:on`, false) + const offPrompt = await renderSystemPrompt(repo, taskCase.messages, `${taskCase.id}:off`, true) + const judged = await judge({ taskCase, onPrompt, offPrompt }) + + return { + caseID: taskCase.id, + description: taskCase.description, + onPrompt, + offPrompt, + passed: judged.passed, + failures: judged.failures, + } + } finally { + if (originalClaudeConfigDir === undefined) delete process.env.CLAUDE_CONFIG_DIR + else process.env.CLAUDE_CONFIG_DIR = originalClaudeConfigDir + rmSync(repo, { recursive: true, force: true }) + } +} + +export async function runTaskEvalSuite( + taskCases: readonly TaskEvalCase[], + judge: TaskEvalJudge, +): Promise { + const results: TaskEvalResult[] = [] + + for (const taskCase of taskCases) { + results.push(await runTaskEvalCase(taskCase, judge)) + } + + return results +} diff --git a/test/evals/judges.ts b/test/evals/judges.ts new file mode 100644 index 0000000..f3093e8 --- /dev/null +++ b/test/evals/judges.ts @@ -0,0 +1,41 @@ +import type { TaskEvalCase } from "./fixtures.js" + +export type TaskEvalJudgeInput = { + taskCase: TaskEvalCase + onPrompt: string + offPrompt: string +} + +export type TaskEvalJudgeResult = { + passed: boolean + failures: string[] +} + +export type TaskEvalJudge = (input: TaskEvalJudgeInput) => TaskEvalJudgeResult | Promise + +function checkContains(label: string, haystack: string, needles: readonly string[], failures: string[]): void { + for (const needle of needles) { + if (!haystack.includes(needle)) failures.push(`${label} missing expected text: ${needle}`) + } +} + +function checkNotContains(label: string, haystack: string, needles: readonly string[], failures: string[]): void { + for (const needle of needles) { + if (haystack.includes(needle)) failures.push(`${label} unexpectedly contained text: ${needle}`) + } +} + +export function ruleBasedJudge({ taskCase, onPrompt, offPrompt }: TaskEvalJudgeInput): TaskEvalJudgeResult { + const failures: string[] = [] + const { checks } = taskCase + + checkContains("memory-on prompt", onPrompt, checks.onContains ?? [], failures) + checkNotContains("memory-on prompt", onPrompt, checks.onNotContains ?? [], failures) + checkContains("memory-off prompt", offPrompt, checks.offContains ?? [], failures) + checkNotContains("memory-off prompt", offPrompt, checks.offNotContains ?? [], failures) + + return { + passed: failures.length === 0, + failures, + } +} diff --git a/test/evals/report.ts b/test/evals/report.ts new file mode 100644 index 0000000..7e14a91 --- /dev/null +++ b/test/evals/report.ts @@ -0,0 +1,21 @@ +import type { TaskEvalResult } from "./harness.js" + +export function formatTaskEvalReport(results: readonly TaskEvalResult[]): string { + const passed = results.filter((result) => result.passed).length + const lines = [`Task eval: ${passed}/${results.length} passed`] + + for (const result of results) { + const status = result.passed ? "pass" : "fail" + lines.push(`[${status}] ${result.caseID} - ${result.description}`) + + for (const failure of result.failures) { + lines.push(` - ${failure}`) + } + } + + return lines.join("\n") +} + +export function hasTaskEvalFailures(results: readonly TaskEvalResult[]): boolean { + return results.some((result) => !result.passed) +} diff --git a/test/evals/run.ts b/test/evals/run.ts new file mode 100644 index 0000000..aa36908 --- /dev/null +++ b/test/evals/run.ts @@ -0,0 +1,12 @@ +import { ALL_TASK_EVAL_CASES } from "./fixtures.js" +import { runTaskEvalSuite } from "./harness.js" +import { ruleBasedJudge } from "./judges.js" +import { formatTaskEvalReport, hasTaskEvalFailures } from "./report.js" + +const results = await runTaskEvalSuite(ALL_TASK_EVAL_CASES, ruleBasedJudge) + +console.log(formatTaskEvalReport(results)) + +if (hasTaskEvalFailures(results)) { + process.exitCode = 1 +} diff --git a/test/evals/task-eval.test.ts b/test/evals/task-eval.test.ts new file mode 100644 index 0000000..faa72b3 --- /dev/null +++ b/test/evals/task-eval.test.ts @@ -0,0 +1,67 @@ +import { describe, expect, test } from "bun:test" +import { FILE_BACKED_TASK_EVAL_CASES, TASK_EVAL_CASES } from "./fixtures.js" +import { runTaskEvalCase, runTaskEvalSuite } from "./harness.js" +import { ruleBasedJudge } from "./judges.js" +import { formatTaskEvalReport } from "./report.js" + +describe("offline task eval harness", () => { + test("built-in synthetic fixtures pass with the rule-based judge", async () => { + const results = await runTaskEvalSuite(TASK_EVAL_CASES, ruleBasedJudge) + const failures = results.filter((result) => !result.passed) + + expect(failures).toEqual([]) + }) + + test("supports custom judges without changing the harness", async () => { + const result = await runTaskEvalCase(TASK_EVAL_CASES[0]!, ({ onPrompt, offPrompt, taskCase }) => { + const failures: string[] = [] + const expectedName = taskCase.memories[0]!.name + + if (!onPrompt.includes(expectedName)) failures.push(`memory-on prompt did not include ${expectedName}`) + if (offPrompt.includes(expectedName)) failures.push(`memory-off prompt leaked ${expectedName}`) + + return { + passed: failures.length === 0, + failures, + } + }) + + expect(result.passed).toBe(true) + expect(result.failures).toEqual([]) + }) + + test("loads file-backed replay fixtures and runs them through the same harness", async () => { + expect(FILE_BACKED_TASK_EVAL_CASES.length).toBeGreaterThan(0) + + const results = await runTaskEvalSuite(FILE_BACKED_TASK_EVAL_CASES, ruleBasedJudge) + const failures = results.filter((result) => !result.passed) + + expect(failures).toEqual([]) + }) + + test("formats a readable report for pass and fail results", () => { + const report = formatTaskEvalReport([ + { + caseID: "passing-case", + description: "passes cleanly", + passed: true, + failures: [], + onPrompt: "on", + offPrompt: "off", + }, + { + caseID: "failing-case", + description: "shows failures", + passed: false, + failures: ["memory-on prompt missing expected text: Important Memory"], + onPrompt: "on", + offPrompt: "off", + }, + ]) + + expect(report).toContain("Task eval: 1/2 passed") + expect(report).toContain("[pass] passing-case") + expect(report).toContain("[fail] failing-case") + expect(report).toContain("memory-on prompt missing expected text: Important Memory") + }) +}) diff --git a/test/recall.test.ts b/test/recall.test.ts index 45f6ed3..e53756f 100644 --- a/test/recall.test.ts +++ b/test/recall.test.ts @@ -93,6 +93,47 @@ describe("recallRelevantMemories", () => { expect(result[0]!.fileName).toBe("auth.md") }) + test("returns no memories when a query has no matches", () => { + const repo = makeTempGitRepo() + const memDir = getMemoryDir(repo) + + writeMemoryFile( + memDir, + "testing.md", + { name: "Testing Preference", description: "Database test guidance", type: "feedback" }, + "Never mock database integration tests.", + new Date("2026-04-30"), + ) + + const result = recallRelevantMemories(repo, "rename README CLI flag") + + expect(result).toEqual([]) + }) + + test("keeps meaningful short query terms", () => { + const repo = makeTempGitRepo() + const memDir = getMemoryDir(repo) + + writeMemoryFile( + memDir, + "go_user.md", + { name: "Go Expertise", description: "User writes Go services", type: "user" }, + "User has deep Go backend experience.", + new Date("2026-04-01"), + ) + writeMemoryFile( + memDir, + "recent_unrelated.md", + { name: "Recent Unrelated", description: "Latest unrelated note", type: "project" }, + "Latest project note about release coordination.", + new Date("2026-04-30"), + ) + + const result = recallRelevantMemories(repo, "Go") + + expect(result[0]!.fileName).toBe("go_user.md") + }) + test("matches query against frontmatter name", () => { const repo = makeTempGitRepo() const memDir = getMemoryDir(repo)