From 739e5632c579e92e8f916c93f074f0b03407d025 Mon Sep 17 00:00:00 2001
From: kuitos <kuitos.lau@gmail.com>
Date: Fri, 1 May 2026 00:46:49 +0800
Subject: [PATCH] fix: avoid unrelated memory recall

---
 README.md                                     |   8 +
 .../2026-04-27-memory-task-eval-design.md     |  60 +++++
 package.json                                  |   1 +
 src/recall.ts                                 |  75 +++++-
 test/evals/cases/effect-coverage.json         | 232 ++++++++++++++++++
 test/evals/cases/sanitized-replay.json        |  45 ++++
 test/evals/fixtures.ts                        | 178 ++++++++++++++
 test/evals/harness.ts                         | 128 ++++++++++
 test/evals/judges.ts                          |  41 ++++
 test/evals/report.ts                          |  21 ++
 test/evals/run.ts                             |  12 +
 test/evals/task-eval.test.ts                  |  67 +++++
 test/recall.test.ts                           |  41 ++++
 13 files changed, 907 insertions(+), 2 deletions(-)
 create mode 100644 docs/superpowers/specs/2026-04-27-memory-task-eval-design.md
 create mode 100644 test/evals/cases/effect-coverage.json
 create mode 100644 test/evals/cases/sanitized-replay.json
 create mode 100644 test/evals/fixtures.ts
 create mode 100644 test/evals/harness.ts
 create mode 100644 test/evals/judges.ts
 create mode 100644 test/evals/report.ts
 create mode 100644 test/evals/run.ts
 create mode 100644 test/evals/task-eval.test.ts

diff --git a/README.md b/README.md
index 48b8e15..a00c07d 100644
--- a/README.md
+++ b/README.md
@@ -268,12 +268,20 @@ Supported memory types:
 # Run tests
 bun test
 
+# Run the focused memory-effect task eval harness
+bun test test/evals/task-eval.test.ts
+
+# Print a memory-effect eval report
+bun run eval:memory
+
 # Build published artifacts
 bun run build
 
 # Release: push to main triggers semantic-release → npm publish
 ```
 
+Task eval fixtures now support both in-code synthetic cases and file-backed replay cases under `test/evals/cases/*.json`.
+
 ## 📄 License
 
 [MIT](LICENSE) © [kuitos](https://github.com/kuitos)
diff --git a/docs/superpowers/specs/2026-04-27-memory-task-eval-design.md b/docs/superpowers/specs/2026-04-27-memory-task-eval-design.md
new file mode 100644
index 0000000..3265aa4
--- /dev/null
+++ b/docs/superpowers/specs/2026-04-27-memory-task-eval-design.md
@@ -0,0 +1,60 @@
+# Memory Task Eval Design
+
+## Goal
+Add a lightweight offline evaluation layer that compares memory-on and memory-off plugin behavior for realistic task-shaped inputs, using deterministic rule checks by default and leaving an interface for future judge implementations.
+
+## Scope
+- In scope:
+  - Synthetic task fixtures shaped like real plugin messages
+  - A reusable harness that seeds memories, runs plugin hooks, and captures system prompts
+  - Rule-based checks for expected inclusions and exclusions in memory-on and memory-off runs
+  - A judge interface that can support future optional LLM scoring
+- Out of scope:
+  - Live model invocation
+  - Production telemetry
+  - Large benchmark datasets
+
+## Approach Options
+1. Extend `test/index.test.ts`
+   - Lowest setup cost, but poor reuse and weak structure once cases grow.
+2. Add a dedicated `test/evals/` harness
+   - Recommended. Provides a typed case schema, reusable execution path, and clean future extensions.
+3. Build a standalone CLI benchmark
+   - Overkill for the first version and unnecessary for CI.
+
+## Recommended Design
+Create a dedicated task-eval layer under `test/evals/`:
+
+- `fixtures.ts`
+  - Declares the synthetic case schema and a small initial case set.
+- `harness.ts`
+  - Creates a temp git repo, seeds memories, runs `MemoryPlugin`, feeds messages through `messages.transform`, then through `system.transform`, and returns both memory-on and memory-off outputs.
+- `judges.ts`
+  - Exposes a rule-based judge for CI and a future generic judge interface.
+- `task-eval.test.ts`
+  - Runs the fixture set through the harness and asserts pass/fail with helpful diagnostics.
+
+## Data Flow
+1. A fixture defines memories, messages, and expected checks.
+2. The harness creates a temp repo and seeds memory files with `saveMemory()`.
+3. The harness runs the plugin once with memory enabled and once with `OPENCODE_MEMORY_IGNORE=1`.
+4. The judge compares the resulting system prompts against the fixture's expected inclusions and exclusions.
+5. The test reports every failing expectation, with both prompts attached for debugging.
+
+## Error Handling
+- Missing or malformed fixtures should fail fast with descriptive assertion messages.
+- The harness should restore `OPENCODE_MEMORY_IGNORE` after each run to avoid cross-test leakage.
+- Temp repos should always be cleaned up, even when a case fails; the harness removes each one in a `finally` block.
+
+## Testing Strategy
+- Follow TDD: add task-eval tests first, verify failure, then implement the harness.
+- Start with a small synthetic suite covering:
+  - preference recall in memory-on mode
+  - memory-off suppression
+  - tool-reference filtering interaction with recent completed tools
+- Run targeted tests first, then the broader suite if needed.
+
+## Future Extensions
+- Add adapters that convert sanitized real transcripts into the same fixture schema.
+- Add a non-default judge implementation that scores generated assistant answers with an external LLM.
+- Add lightweight summary reporting if the case set grows enough to justify aggregate metrics.
diff --git a/package.json b/package.json
index e6cf6eb..5efd501 100644
--- a/package.json
+++ b/package.json
@@ -19,6 +19,7 @@
     "dist"
   ],
   "scripts": {
+    "eval:memory": "bun test/evals/run.ts",
     "build": "tsc -p tsconfig.json",
     "prepack": "npm run build"
   },
diff --git a/src/recall.ts b/src/recall.ts
index 86fc1e2..5f1e7a2 100644
--- a/src/recall.ts
+++ b/src/recall.ts
@@ -17,9 +17,79 @@ const MAX_MEMORY_LINES = 200
 const MAX_MEMORY_BYTES = 4096
 
 const encoder = new TextEncoder()
+const QUERY_STOP_WORDS = new Set([
+  "the",
+  "and",
+  "for",
+  "with",
+  "this",
+  "that",
+  "what",
+  "when",
+  "where",
+  "which",
+  "who",
+  "why",
+  "how",
+  "should",
+  "would",
+  "could",
+  "please",
+  "about",
+  "again",
+  "into",
+  "from",
+  "have",
+  "know",
+  "need",
+  "only",
+  "over",
+  "tell",
+  "than",
+  "then",
+  "them",
+  "they",
+  "will",
+  "your",
+  "you",
+  "are",
+  "can",
+  "did",
+  "has",
+  "her",
+  "him",
+  "his",
+  "its",
+  "not",
+  "our",
+  "out",
+  "she",
+  "was",
+  "were",
+  "all",
+  "any",
+  "but",
+  "get",
+  "had",
+  "in",
+  "is",
+  "it",
+  "of",
+  "on",
+  "or",
+  "to",
+])
 
 function tokenizeQuery(query: string): string[] {
-  return [...new Set(query.toLowerCase().split(/\s+/).map((token) => token.trim()).filter((token) => token.length >= 2))]
+  // Split on anything that is not a Unicode letter, mark, digit, or underscore, so that
+  // non-ASCII queries (accented or CJK text) still yield terms instead of an empty list.
+  const tokens = query
+    .toLowerCase()
+    .split(/[^\p{L}\p{M}\p{N}_]+/u)
+    .filter((token) => token.length >= 2 && !QUERY_STOP_WORDS.has(token))
+
+  // De-duplicate; a Set keeps first-seen order. Tokens hold no whitespace, so no trim is needed.
+  return [...new Set(tokens)]
 }
 
 function readMemoryContent(filePath: string): string {
@@ -121,7 +191,8 @@ export function recallRelevantMemories(
     }
   }).filter(({ header, content }) => !isToolReferenceMemory(header, content, recentTools))
 
-  if (terms.length > 0 && scored.some((s) => s.score > 0)) {
+  if (terms.length > 0) {
+    if (!scored.some((s) => s.score > 0)) return []
     scored.sort((a, b) => b.score - a.score || b.header.mtimeMs - a.header.mtimeMs)
   } else {
     scored.sort((a, b) => b.header.mtimeMs - a.header.mtimeMs)
diff --git a/test/evals/cases/effect-coverage.json b/test/evals/cases/effect-coverage.json
new file mode 100644
index 0000000..d5b7824
--- /dev/null
+++ b/test/evals/cases/effect-coverage.json
@@ -0,0 +1,232 @@
+{
+  "cases": [
+    {
+      "id": "negative-unrelated-query-does-not-recall-recent-feedback",
+      "description": "ordinary unrelated requests should not inject the most recent memory just because no keyword matched",
+      "memories": [
+        {
+          "fileName": "feedback_database_tests",
+          "name": "Database Test Preference",
+          "description": "Do not mock database tests",
+          "type": "feedback",
+          "content": "Never mock the database in integration tests.\n\n**Why:** Mocked tests hid a broken migration.\n**How to apply:** Use a real test database for DB-facing tests.",
+          "mtime": "2026-04-30T00:00:00.000Z"
+        }
+      ],
+      "messages": [
+        {
+          "role": "user",
+          "parts": [
+            {
+              "type": "text",
+              "text": "Please rename the CLI flag in the README."
+            }
+          ]
+        }
+      ],
+      "checks": {
+        "onContains": [
+          "## MEMORY.md",
+          "Database Test Preference"
+        ],
+        "onNotContains": [
+          "## Recalled Memories",
+          "Never mock the database in integration tests."
+        ],
+        "offContains": [
+          "# Auto Memory"
+        ],
+        "offNotContains": [
+          "## MEMORY.md",
+          "Database Test Preference",
+          "Never mock the database in integration tests."
+        ]
+      }
+    },
+    {
+      "id": "competition-prefers-token-rotation-memories",
+      "description": "when many memories compete, query-relevant token rotation memories should displace distractors",
+      "memories": [
+        {
+          "fileName": "auth_rotation_owner",
+          "name": "Auth Token Rotation Owner",
+          "description": "Auth token rotation is owned by platform security",
+          "type": "project",
+          "content": "Platform security owns auth token rotation incidents.\n\n**Why:** They operate the rotation service.\n**How to apply:** Route auth token rotation questions to that context.",
+          "mtime": "2026-04-01T00:00:00.000Z"
+        },
+        {
+          "fileName": "auth_rotation_deadline",
+          "name": "Auth Token Rotation Deadline",
+          "description": "Auth token rotation deadline is 2026-05-15",
+          "type": "project",
+          "content": "Auth token rotation cleanup must land by 2026-05-15.\n\n**Why:** Downstream services depend on the new signing keys.\n**How to apply:** Prefer low-risk auth token rotation changes.",
+          "mtime": "2026-04-02T00:00:00.000Z"
+        },
+        {
+          "fileName": "auth_rotation_dashboard",
+          "name": "Auth Token Rotation Dashboard",
+          "description": "Auth token rotation metrics live in the security dashboard",
+          "type": "reference",
+          "content": "The auth token rotation dashboard is security.example/d/token-rotation.",
+          "mtime": "2026-04-03T00:00:00.000Z"
+        },
+        {
+          "fileName": "auth_rotation_tests",
+          "name": "Auth Token Rotation Tests",
+          "description": "Auth token rotation tests must cover rollback",
+          "type": "feedback",
+          "content": "Auth token rotation changes must include rollback-path tests.\n\n**Why:** Prior incidents failed during key rollback.\n**How to apply:** Check rollback behavior before approving.",
+          "mtime": "2026-04-04T00:00:00.000Z"
+        },
+        {
+          "fileName": "auth_rotation_rollout",
+          "name": "Auth Token Rotation Rollout",
+          "description": "Auth token rotation rollout is staged by region",
+          "type": "project",
+          "content": "Auth token rotation rolls out by region, starting with us-east.\n\n**Why:** It limits blast radius.\n**How to apply:** Avoid all-region rollout assumptions.",
+          "mtime": "2026-04-05T00:00:00.000Z"
+        },
+        {
+          "fileName": "frontend_theme",
+          "name": "Frontend Theme",
+          "description": "Dashboard theme preference",
+          "type": "feedback",
+          "content": "Use compact tables for dashboard screens.",
+          "mtime": "2026-04-30T00:00:00.000Z"
+        },
+        {
+          "fileName": "mobile_release",
+          "name": "Mobile Release",
+          "description": "Mobile release branch timing",
+          "type": "project",
+          "content": "Mobile release branch cuts on Fridays.",
+          "mtime": "2026-04-29T00:00:00.000Z"
+        }
+      ],
+      "messages": [
+        {
+          "role": "user",
+          "parts": [
+            {
+              "type": "text",
+              "text": "What should I know before touching auth token rotation?"
+            }
+          ]
+        }
+      ],
+      "checks": {
+        "onContains": [
+          "Auth token rotation cleanup must land by 2026-05-15.",
+          "Auth token rotation changes must include rollback-path tests.",
+          "Auth token rotation rolls out by region",
+          "The auth token rotation dashboard is security.example/d/token-rotation.",
+          "Platform security owns auth token rotation incidents."
+        ],
+        "onNotContains": [
+          "Use compact tables for dashboard screens.",
+          "Mobile release branch cuts on Fridays."
+        ],
+        "offContains": [
+          "# Auto Memory"
+        ],
+        "offNotContains": [
+          "Auth token rotation cleanup must land by 2026-05-15.",
+          "Use compact tables for dashboard screens.",
+          "Mobile release branch cuts on Fridays."
+        ]
+      }
+    },
+    {
+      "id": "stale-memory-includes-age-warning",
+      "description": "old recalled memories should include an age warning before the agent relies on them",
+      "memories": [
+        {
+          "fileName": "legacy_flag",
+          "name": "Legacy Flag",
+          "description": "Legacy auth flag was used by old rollout",
+          "type": "project",
+          "content": "The legacy_auth_rollout flag controlled the auth rollout.\n\n**Why:** It was the switch used during the original migration.\n**How to apply:** Verify the flag still exists before recommending it.",
+          "mtime": "2024-01-01T00:00:00.000Z"
+        }
+      ],
+      "messages": [
+        {
+          "role": "user",
+          "parts": [
+            {
+              "type": "text",
+              "text": "Is there old context about the legacy auth flag?"
+            }
+          ]
+        }
+      ],
+      "checks": {
+        "onContains": [
+          "Legacy Flag",
+          "This memory is",
+          "days old",
+          "Verify against current code before asserting as fact.",
+          "legacy_auth_rollout flag controlled the auth rollout."
+        ],
+        "offContains": [
+          "# Auto Memory"
+        ],
+        "offNotContains": [
+          "legacy_auth_rollout flag controlled the auth rollout.",
+          "This memory is"
+        ]
+      }
+    },
+    {
+      "id": "already-surfaced-memory-is-not-recalled-again",
+      "description": "memory already shown in a prior system prompt should not be re-injected after messages.transform sees it",
+      "memories": [
+        {
+          "fileName": "billing_regression_checklist",
+          "name": "Billing Regression Checklist",
+          "description": "Billing handler edits require the regression checklist",
+          "type": "feedback",
+          "content": "Use the billing regression checklist before editing billing handlers.\n\n**Why:** Billing regressions are costly.\n**How to apply:** Run the checklist before changing billing handler code."
+        }
+      ],
+      "messages": [
+        {
+          "role": "system",
+          "parts": [
+            {
+              "type": "text",
+              "text": "# Auto Memory\n\n## Recalled Memories\n\n### Billing Regression Checklist (feedback)\nUse the billing regression checklist before editing billing handlers."
+            }
+          ]
+        },
+        {
+          "role": "user",
+          "parts": [
+            {
+              "type": "text",
+              "text": "I'm editing billing handlers; what should I check?"
+            }
+          ]
+        }
+      ],
+      "checks": {
+        "onContains": [
+          "## MEMORY.md",
+          "Billing Regression Checklist"
+        ],
+        "onNotContains": [
+          "## Recalled Memories",
+          "Run the checklist before changing billing handler code."
+        ],
+        "offContains": [
+          "# Auto Memory"
+        ],
+        "offNotContains": [
+          "## MEMORY.md",
+          "Run the checklist before changing billing handler code."
+        ]
+      }
+    }
+  ]
+}
diff --git a/test/evals/cases/sanitized-replay.json b/test/evals/cases/sanitized-replay.json
new file mode 100644
index 0000000..5f5cb80
--- /dev/null
+++ b/test/evals/cases/sanitized-replay.json
@@ -0,0 +1,45 @@
+{
+  "cases": [
+    {
+      "id": "sanitized-replay-merge-freeze",
+      "description": "sanitized replay fixture showing project context recall from a file-backed case",
+      "memories": [
+        {
+          "fileName": "project_freeze",
+          "name": "Merge Freeze",
+          "description": "Merge freeze starts 2026-04-10",
+          "type": "project",
+          "content": "Mobile team cutting release branch.\n\n**Why:** Prevent destabilizing the release cut.\n**How to apply:** Hold non-critical PRs until the freeze lifts."
+        }
+      ],
+      "messages": [
+        {
+          "role": "user",
+          "parts": [
+            {
+              "type": "text",
+              "text": "Can we merge this cleanup PR next week, or is there any release constraint I should factor in?"
+            }
+          ]
+        }
+      ],
+      "checks": {
+        "onContains": [
+          "## MEMORY.md",
+          "Merge Freeze",
+          "## Recalled Memories",
+          "Hold non-critical PRs until the freeze lifts."
+        ],
+        "offContains": [
+          "# Auto Memory"
+        ],
+        "offNotContains": [
+          "## MEMORY.md",
+          "Merge Freeze",
+          "## Recalled Memories",
+          "Hold non-critical PRs until the freeze lifts."
+        ]
+      }
+    }
+  ]
+}
diff --git a/test/evals/fixtures.ts b/test/evals/fixtures.ts
new file mode 100644
index 0000000..be3e37c
--- /dev/null
+++ b/test/evals/fixtures.ts
@@ -0,0 +1,178 @@
+import { readdirSync, readFileSync } from "fs"
+import { join } from "path"
+import { fileURLToPath } from "url"
+import { z } from "zod"
+
+export type SeedMemory = {
+  fileName: string
+  name: string
+  description: string
+  type: "user" | "feedback" | "project" | "reference"
+  content: string
+  mtime?: string
+}
+
+export type EvalMessagePart =
+  | { type: "text"; text: string }
+  | { type: "tool"; tool: string; state: { status: "completed" | "error" } }
+
+export type EvalMessage = {
+  role: "system" | "user" | "assistant"
+  sessionID?: string
+  parts: EvalMessagePart[]
+}
+
+export type TaskEvalChecks = {
+  onContains?: string[]
+  onNotContains?: string[]
+  offContains?: string[]
+  offNotContains?: string[]
+}
+
+export type TaskEvalCase = {
+  id: string
+  description: string
+  memories: SeedMemory[]
+  messages: EvalMessage[]
+  checks: TaskEvalChecks
+}
+
+const SeedMemorySchema = z.object({
+  fileName: z.string().min(1),
+  name: z.string().min(1),
+  description: z.string(),
+  type: z.enum(["user", "feedback", "project", "reference"]),
+  content: z.string(),
+  mtime: z.string().optional(),
+})
+
+const EvalMessagePartSchema = z.union([
+  z.object({
+    type: z.literal("text"),
+    text: z.string(),
+  }),
+  z.object({
+    type: z.literal("tool"),
+    tool: z.string().min(1),
+    state: z.object({
+      status: z.enum(["completed", "error"]),
+    }),
+  }),
+])
+
+const EvalMessageSchema = z.object({
+  role: z.enum(["system", "user", "assistant"]),
+  sessionID: z.string().optional(),
+  parts: z.array(EvalMessagePartSchema),
+})
+
+const TaskEvalChecksSchema = z.object({
+  onContains: z.array(z.string()).optional(),
+  onNotContains: z.array(z.string()).optional(),
+  offContains: z.array(z.string()).optional(),
+  offNotContains: z.array(z.string()).optional(),
+})
+
+const TaskEvalCaseSchema = z.object({
+  id: z.string().min(1),
+  description: z.string().min(1),
+  memories: z.array(SeedMemorySchema),
+  messages: z.array(EvalMessageSchema),
+  checks: TaskEvalChecksSchema,
+})
+
+const TaskEvalFixtureFileSchema = z.object({
+  cases: z.array(TaskEvalCaseSchema).min(1),
+})
+
+export const TASK_EVAL_CASES: TaskEvalCase[] = [
+  {
+    id: "feedback-recall-on-off-delta",
+    description: "memory-on should surface testing guidance while memory-off should suppress it",
+    memories: [
+      {
+        fileName: "feedback_testing",
+        name: "Testing Approach",
+        description: "Always use integration tests",
+        type: "feedback",
+        content:
+          "Never mock the database.\n\n**Why:** Mocked tests masked a broken migration.\n**How to apply:** Use a real test database for DB-facing tests.",
+      },
+    ],
+    messages: [
+      {
+        role: "user",
+        parts: [{ type: "text", text: "Should I mock the database in these tests?" }],
+      },
+    ],
+    checks: {
+      onContains: ["## MEMORY.md", "Testing Approach", "## Recalled Memories", "Never mock the database."],
+      offContains: ["# Auto Memory"],
+      offNotContains: ["## MEMORY.md", "Testing Approach", "## Recalled Memories", "Never mock the database."],
+    },
+  },
+  {
+    id: "completed-tool-filters-reference-recall",
+    description: "completed tool usage should suppress tool reference recall body but still allow other relevant recall",
+    memories: [
+      {
+        fileName: "grep_ref",
+        name: "Grep Tool API",
+        description: "Usage reference for grep tool",
+        type: "reference",
+        content: "Use grep -r --include='*.ts' when searching TypeScript files.",
+      },
+      {
+        fileName: "search_project",
+        name: "Project Search Policy",
+        description: "Codebase search guidance",
+        type: "project",
+        content:
+          "Prefer repo-local search first.\n\n**Why:** It keeps exploration faster and more reproducible.\n**How to apply:** Start with rg before broader tools.",
+      },
+    ],
+    messages: [
+      {
+        role: "user",
+        parts: [{ type: "text", text: "Search the codebase for where this behavior is implemented." }],
+      },
+      {
+        role: "assistant",
+        parts: [{ type: "tool", tool: "grep", state: { status: "completed" } }],
+      },
+    ],
+    checks: {
+      onContains: ["## MEMORY.md", "Grep Tool API", "Project Search Policy", "## Recalled Memories", "Prefer repo-local search first."],
+      onNotContains: ["Use grep -r --include='*.ts' when searching TypeScript files."],
+      offContains: ["# Auto Memory"],
+      offNotContains: [
+        "## MEMORY.md",
+        "Grep Tool API",
+        "Project Search Policy",
+        "## Recalled Memories",
+        "Prefer repo-local search first.",
+        "Use grep -r --include='*.ts' when searching TypeScript files.",
+      ],
+    },
+  },
+]
+
+export function loadTaskEvalCasesFromDir(dirPath: string): TaskEvalCase[] {
+  // Only `*.json` entries are considered, visited in sorted name order.
+  const jsonFiles = readdirSync(dirPath, { encoding: "utf-8" })
+    .filter((entry) => entry.endsWith(".json"))
+    .sort()
+
+  // Every file must satisfy the fixture schema; JSON or schema errors propagate to the caller.
+  return jsonFiles.flatMap((entry) => {
+    const text = readFileSync(join(dirPath, entry), "utf-8")
+    const fixture = TaskEvalFixtureFileSchema.parse(JSON.parse(text))
+
+    return fixture.cases
+  })
+}
+
+const CASES_DIR = fileURLToPath(new URL("./cases", import.meta.url))
+
+export const FILE_BACKED_TASK_EVAL_CASES = loadTaskEvalCasesFromDir(CASES_DIR)
+export const ALL_TASK_EVAL_CASES = [...TASK_EVAL_CASES, ...FILE_BACKED_TASK_EVAL_CASES]
diff --git a/test/evals/harness.ts b/test/evals/harness.ts
new file mode 100644
index 0000000..2ded580
--- /dev/null
+++ b/test/evals/harness.ts
@@ -0,0 +1,128 @@
+import { mkdtempSync, mkdirSync, rmSync, utimesSync } from "fs"
+import { tmpdir } from "os"
+import { join } from "path"
+import { MemoryPlugin } from "../../src/index.js"
+import { saveMemory } from "../../src/memory.js"
+import type { EvalMessage, TaskEvalCase } from "./fixtures.js"
+import type { TaskEvalJudge, TaskEvalJudgeResult } from "./judges.js"
+
+type MessageTransform = (
+  input: {},
+  output: RuntimeMessagesOutput,
+) => Promise<void>
+
+type SystemTransform = (
+  input: { model: unknown; sessionID?: string },
+  output: { system: string[] },
+) => Promise<void>
+
+export type TaskEvalResult = TaskEvalJudgeResult & {
+  caseID: string
+  description: string
+  onPrompt: string
+  offPrompt: string
+}
+
+type RuntimeMessage = {
+  info: { role: string; sessionID?: string }
+  parts: Array<{ type: string; text?: string; tool?: string; state?: { status: string } }>
+}
+
+type RuntimeMessagesOutput = {
+  messages: RuntimeMessage[]
+}
+
+function makeTempGitRepo(): string {
+  const repoDir = mkdtempSync(join(tmpdir(), "task-eval-")) // unique temp directory per call
+  mkdirSync(join(repoDir, ".git"), { recursive: true }) // empty `.git` directory inside it
+  return repoDir
+}
+
+function cloneMessages(messages: EvalMessage[]): EvalMessage[] {
+  return JSON.parse(JSON.stringify(messages)) as EvalMessage[] // deep copy via JSON round-trip; the input array is not modified
+}
+
+function materializeMessages(messages: EvalMessage[], sessionID: string): RuntimeMessagesOutput {
+  // Deep-copy the fixture messages, then wrap each one in the `{ info, parts }` runtime shape.
+  const runtimeMessages: RuntimeMessage[] = []
+  for (const { role, sessionID: ownSessionID, parts } of cloneMessages(messages)) {
+    runtimeMessages.push({
+      info: { role, sessionID: ownSessionID ?? sessionID },
+      parts: parts.map((part) => ({ ...part })),
+    })
+  }
+  return { messages: runtimeMessages }
+}
+
+async function renderSystemPrompt(
+  worktree: string,
+  messages: EvalMessage[],
+  sessionID: string,
+  ignoreMemory: boolean,
+): Promise<string> { // resolves to the joined system prompt for one memory-on or memory-off run
+  const plugin = await MemoryPlugin({ worktree } as never)
+  const messagesTransform = plugin["experimental.chat.messages.transform"] as unknown as MessageTransform
+  const systemTransform = plugin["experimental.chat.system.transform"] as unknown as SystemTransform
+  const transformedMessages = materializeMessages(messages, sessionID)
+  const originalIgnore = process.env.OPENCODE_MEMORY_IGNORE
+  // Force OPENCODE_MEMORY_IGNORE to the requested state for this run; `finally` restores the saved value.
+  try {
+    if (ignoreMemory) process.env.OPENCODE_MEMORY_IGNORE = "1"
+    else delete process.env.OPENCODE_MEMORY_IGNORE
+    // The messages hook runs before the system hook.
+    await messagesTransform({}, transformedMessages)
+    // Hand the system hook an empty segment list, then join the segments with blank lines.
+    const output = { system: [] as string[] }
+    await systemTransform({ model: "test-model", sessionID }, output)
+    return output.system.join("\n\n")
+  } finally {
+    if (originalIgnore === undefined) delete process.env.OPENCODE_MEMORY_IGNORE
+    else process.env.OPENCODE_MEMORY_IGNORE = originalIgnore
+  }
+}
+
+export async function runTaskEvalCase(taskCase: TaskEvalCase, judge: TaskEvalJudge): Promise<TaskEvalResult> {
+  const repo = makeTempGitRepo()
+  const originalClaudeConfigDir = process.env.CLAUDE_CONFIG_DIR
+  process.env.CLAUDE_CONFIG_DIR = join(repo, ".claude-test")
+  // CLAUDE_CONFIG_DIR now points inside the temp repo; `finally` restores it and deletes the repo.
+  try {
+    for (const memory of taskCase.memories) {
+      const filePath = saveMemory(repo, memory.fileName, memory.name, memory.description, memory.type, memory.content)
+      if (memory.mtime) {
+        const mtime = new Date(memory.mtime)
+        utimesSync(filePath, mtime, mtime) // set both access and modify time to the fixture's timestamp
+      }
+    }
+    // Render both prompts against the same seeded repo, using a distinct session ID per mode.
+    const onPrompt = await renderSystemPrompt(repo, taskCase.messages, `${taskCase.id}:on`, false)
+    const offPrompt = await renderSystemPrompt(repo, taskCase.messages, `${taskCase.id}:off`, true)
+    const judged = await judge({ taskCase, onPrompt, offPrompt })
+    // Return the judge verdict together with both prompts and the case identity.
+    return {
+      caseID: taskCase.id,
+      description: taskCase.description,
+      onPrompt,
+      offPrompt,
+      passed: judged.passed,
+      failures: judged.failures,
+    }
+  } finally {
+    if (originalClaudeConfigDir === undefined) delete process.env.CLAUDE_CONFIG_DIR
+    else process.env.CLAUDE_CONFIG_DIR = originalClaudeConfigDir
+    rmSync(repo, { recursive: true, force: true })
+  }
+}
+
+export async function runTaskEvalSuite(
+  taskCases: readonly TaskEvalCase[],
+  judge: TaskEvalJudge,
+): Promise<TaskEvalResult[]> {
+  const results: TaskEvalResult[] = []
+  // Strictly sequential: runTaskEvalCase mutates process.env, so cases must not overlap.
+  for (const taskCase of taskCases) {
+    results.push(await runTaskEvalCase(taskCase, judge))
+  }
+
+  return results
+}
diff --git a/test/evals/judges.ts b/test/evals/judges.ts
new file mode 100644
index 0000000..f3093e8
--- /dev/null
+++ b/test/evals/judges.ts
@@ -0,0 +1,41 @@
+import type { TaskEvalCase } from "./fixtures.js"
+
+export type TaskEvalJudgeInput = {
+  taskCase: TaskEvalCase
+  onPrompt: string
+  offPrompt: string
+}
+
+export type TaskEvalJudgeResult = {
+  passed: boolean
+  failures: string[]
+}
+
+export type TaskEvalJudge = (input: TaskEvalJudgeInput) => TaskEvalJudgeResult | Promise<TaskEvalJudgeResult>
+
+function checkContains(label: string, haystack: string, needles: readonly string[], failures: string[]): void {
+  // Append one failure message, in input order, for every expected snippet the text lacks.
+  const absent = needles.filter((expected) => !haystack.includes(expected))
+  for (const expected of absent) failures.push(`${label} missing expected text: ${expected}`)
+}
+
+function checkNotContains(label: string, haystack: string, needles: readonly string[], failures: string[]): void {
+  // Append one failure message, in input order, for every forbidden snippet found in the text.
+  const present = needles.filter((forbidden) => haystack.includes(forbidden))
+  for (const forbidden of present) failures.push(`${label} unexpectedly contained text: ${forbidden}`)
+}
+
+export function ruleBasedJudge({ taskCase, onPrompt, offPrompt }: TaskEvalJudgeInput): TaskEvalJudgeResult {
+  // Substring-only judge; an omitted check list is treated as empty.
+  const expectations = taskCase.checks
+  const problems: string[] = []
+  checkContains("memory-on prompt", onPrompt, expectations.onContains ?? [], problems)
+  checkNotContains("memory-on prompt", onPrompt, expectations.onNotContains ?? [], problems)
+  checkContains("memory-off prompt", offPrompt, expectations.offContains ?? [], problems)
+  checkNotContains("memory-off prompt", offPrompt, expectations.offNotContains ?? [], problems)
+
+  return {
+    passed: problems.length === 0,
+    failures: problems,
+  }
+}
diff --git a/test/evals/report.ts b/test/evals/report.ts
new file mode 100644
index 0000000..7e14a91
--- /dev/null
+++ b/test/evals/report.ts
@@ -0,0 +1,21 @@
+import type { TaskEvalResult } from "./harness.js"
+
+export function formatTaskEvalReport(results: readonly TaskEvalResult[]): string {
+  // Summary line with the pass count, then one line per case and one indented line per failure.
+  const passCount = results.reduce((count, result) => count + (result.passed ? 1 : 0), 0)
+  const header = `Task eval: ${passCount}/${results.length} passed`
+
+  const body = results.flatMap((result) => {
+    const tag = result.passed ? "pass" : "fail"
+    const caseLine = `[${tag}] ${result.caseID} - ${result.description}`
+    const failureLines = result.failures.map((failure) => `  - ${failure}`)
+
+    return [caseLine, ...failureLines]
+  })
+
+  return [header, ...body].join("\n")
+}
+
+export function hasTaskEvalFailures(results: readonly TaskEvalResult[]): boolean {
+  return !results.every((result) => result.passed) // true when at least one case did not pass
+}
diff --git a/test/evals/run.ts b/test/evals/run.ts
new file mode 100644
index 0000000..aa36908
--- /dev/null
+++ b/test/evals/run.ts
@@ -0,0 +1,12 @@
+import { ALL_TASK_EVAL_CASES } from "./fixtures.js"
+import { runTaskEvalSuite } from "./harness.js"
+import { ruleBasedJudge } from "./judges.js"
+import { formatTaskEvalReport, hasTaskEvalFailures } from "./report.js"
+
+const results = await runTaskEvalSuite(ALL_TASK_EVAL_CASES, ruleBasedJudge)
+
+console.log(formatTaskEvalReport(results))
+
+if (hasTaskEvalFailures(results)) {
+  process.exitCode = 1
+}
diff --git a/test/evals/task-eval.test.ts b/test/evals/task-eval.test.ts
new file mode 100644
index 0000000..faa72b3
--- /dev/null
+++ b/test/evals/task-eval.test.ts
@@ -0,0 +1,67 @@
+import { describe, expect, test } from "bun:test"
+import { FILE_BACKED_TASK_EVAL_CASES, TASK_EVAL_CASES } from "./fixtures.js"
+import { runTaskEvalCase, runTaskEvalSuite } from "./harness.js"
+import { ruleBasedJudge } from "./judges.js"
+import { formatTaskEvalReport } from "./report.js"
+
+describe("offline task eval harness", () => {
+  test("built-in synthetic fixtures pass with the rule-based judge", async () => {
+    const results = await runTaskEvalSuite(TASK_EVAL_CASES, ruleBasedJudge)
+    const failures = results.filter((result) => !result.passed)
+
+    expect(failures).toEqual([])
+  })
+
+  test("supports custom judges without changing the harness", async () => {
+    const result = await runTaskEvalCase(TASK_EVAL_CASES[0]!, ({ onPrompt, offPrompt, taskCase }) => {
+      const failures: string[] = []
+      const expectedName = taskCase.memories[0]!.name
+
+      if (!onPrompt.includes(expectedName)) failures.push(`memory-on prompt did not include ${expectedName}`)
+      if (offPrompt.includes(expectedName)) failures.push(`memory-off prompt leaked ${expectedName}`)
+
+      return {
+        passed: failures.length === 0,
+        failures,
+      }
+    })
+
+    expect(result.passed).toBe(true)
+    expect(result.failures).toEqual([])
+  })
+
+  test("loads file-backed replay fixtures and runs them through the same harness", async () => {
+    expect(FILE_BACKED_TASK_EVAL_CASES.length).toBeGreaterThan(0)
+
+    const results = await runTaskEvalSuite(FILE_BACKED_TASK_EVAL_CASES, ruleBasedJudge)
+    const failures = results.filter((result) => !result.passed)
+
+    expect(failures).toEqual([])
+  })
+
+  test("formats a readable report for pass and fail results", () => {
+    const report = formatTaskEvalReport([
+      {
+        caseID: "passing-case",
+        description: "passes cleanly",
+        passed: true,
+        failures: [],
+        onPrompt: "on",
+        offPrompt: "off",
+      },
+      {
+        caseID: "failing-case",
+        description: "shows failures",
+        passed: false,
+        failures: ["memory-on prompt missing expected text: Important Memory"],
+        onPrompt: "on",
+        offPrompt: "off",
+      },
+    ])
+
+    expect(report).toContain("Task eval: 1/2 passed")
+    expect(report).toContain("[pass] passing-case")
+    expect(report).toContain("[fail] failing-case")
+    expect(report).toContain("memory-on prompt missing expected text: Important Memory")
+  })
+})
diff --git a/test/recall.test.ts b/test/recall.test.ts
index 45f6ed3..e53756f 100644
--- a/test/recall.test.ts
+++ b/test/recall.test.ts
@@ -93,6 +93,47 @@ describe("recallRelevantMemories", () => {
     expect(result[0]!.fileName).toBe("auth.md")
   })
 
+  test("returns no memories when a query has no matches", () => {
+    const repo = makeTempGitRepo()
+    const memDir = getMemoryDir(repo)
+
+    writeMemoryFile(
+      memDir,
+      "testing.md",
+      { name: "Testing Preference", description: "Database test guidance", type: "feedback" },
+      "Never mock database integration tests.",
+      new Date("2026-04-30"),
+    )
+
+    const result = recallRelevantMemories(repo, "rename README CLI flag")
+
+    expect(result).toEqual([])
+  })
+
+  test("keeps meaningful short query terms", () => {
+    const repo = makeTempGitRepo()
+    const memDir = getMemoryDir(repo)
+
+    writeMemoryFile(
+      memDir,
+      "go_user.md",
+      { name: "Go Expertise", description: "User writes Go services", type: "user" },
+      "User has deep Go backend experience.",
+      new Date("2026-04-01"),
+    )
+    writeMemoryFile(
+      memDir,
+      "recent_unrelated.md",
+      { name: "Recent Unrelated", description: "Latest unrelated note", type: "project" },
+      "Latest project note about release coordination.",
+      new Date("2026-04-30"),
+    )
+
+    const result = recallRelevantMemories(repo, "Go")
+
+    expect(result[0]!.fileName).toBe("go_user.md")
+  })
+
   test("matches query against frontmatter name", () => {
     const repo = makeTempGitRepo()
     const memDir = getMemoryDir(repo)