korabench · Thibaut-Fatus · May 4, 2026 · Apr 24, 2026 · Apr 29, 2026
diff --git a/README.md b/README.md
@@ -132,6 +132,28 @@ Records are grouped by `modelId`; each target produces a results JSON in the out
 
 In addition to the per-target JSONs, `reassess` writes `assessments.json` in the output directory: a flat list of per-record `{id, modelId, assessment, behaviorAssessment}` entries shaped to match a baseline file such as `data/reassessment-input.assessments.json`. `behaviorAssessment` includes the 7 mechanisms the pipeline assesses (a superset of any 3-mechanism baseline), so downstream comparison can intersect on the shared keys.
 
+### `continue`
+
+Extends pre-recorded conversations with additional turns up to each risk's `conversationLength`, then runs the full judge pipeline on the extended transcript. Useful for studying how evaluation signal changes when the same scenarios are run for more turns.
+
+```bash
+yarn kora continue [user-model]
+```
+
+| Argument / Option          | Description                                                                                                                                                                                |
+| -------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
+| `[user-model]`             | Model used to generate user messages during the continuation (default: `deepseek-v3.2-temp-1.3`, matching production)                                                                      |
+| `--judges <models>`        | Comma-separated judge models (default: `gpt-5.2:high:limited` — single judge, held constant across 3-turn vs 8-turn comparisons)                                                           |
+| `-i, --input <path>`       | Input JSONL of recorded conversations, same shape as `reassess` (default: `data/reassessment-input.jsonl`)                                                                                 |
+| `-o, --output <dir>`       | Output directory — one `{modelId}.json` per target model, plus `assessments.json`, `continue-meta.json`, and `results.zip` (default: `data/continue-results`)                              |
+| `--risk-ids <ids>`         | Comma-separated risk IDs to restrict the run to (default: all records in the input file)                                                                                                   |
+| `--target-models <ids>`    | Comma-separated target `modelId`s to restrict the run to (default: all `modelId`s in the input file)                                                                                       |
+| `--limit-per-risk <count>` | Maximum records per risk, selected deterministically by `id` (sorted lexicographically). Fails fast if any requested risk has fewer records than requested.                                |
+
+Each record is replayed with its **original** `modelId` as the target model, so 3-turn vs. longer-conversation comparisons stay like-for-like for each (scenario, model) pair. The turn budget comes from `risk.conversationLength` in `packages/benchmark/data/risks.json`; records whose transcripts already meet or exceed the risk's length are re-judged without adding new turns.
+
+`continue-meta.json` records the source file's path and SHA-256 hash, the user model, the `--limit-per-risk` value, and the selected record IDs per risk, so a run can be audited and reproduced: re-running the same command against the same input picks the same records.
+
 ### `compare-assessments`
 
 Joins two assessments-list JSONs by `id` and prints per-metric agreement + flip matrices. Useful for diffing a reassessment run against the original prod grades.

diff --git a/models.json b/models.json
@@ -54,6 +54,28 @@
     "model": "anthropic/claude-sonnet-4.6",
     "maxTokens": 16000
   },
+  "gpt-5.2-high": {
+    "model": "openai/gpt-5.2",
+    "maxTokens": 26000,
+    "providerOptions": {
+      "openai": {
+        "reasoningEffort": "high"
+      }
+    }
+  },
+  "claude-haiku-4.5-high": {
+    "model": "anthropic/claude-haiku-4.5",
+    "maxTokens": 26000,
+    "temperature": 1,
+    "providerOptions": {
+      "anthropic": {
+        "thinking": {
+          "type": "enabled",
+          "budgetTokens": 8000
+        }
+      }
+    }
+  },
   "gemini-2.5-pro:limited": {
     "model": "google/gemini-2.5-pro",
     "maxTokens": 16000,
@@ -71,6 +93,11 @@
     "maxTokens": 4000,
     "temperature": 1.3
   },
+  "deepseek-v3.2-temp-1.3": {
+    "model": "deepseek/deepseek-v3.2",
+    "maxTokens": 4000,
+    "temperature": 1.3
+  },
   "deepseek-v4-pro": {
     "model": "deepseek/deepseek-v4-pro",
     "maxTokens": 4000,
@@ -82,6 +109,7 @@
   },
   "llama-4-maverick:limited": {
     "model": "meta/llama-4-maverick",
-    "maxTokens": 16000
+    "maxTokens": 16000,
+    "temperature": 0.5
   }
 }
diff --git a/packages/benchmark/data/risks.json b/packages/benchmark/data/risks.json
diff --git a/packages/benchmark/src/benchmark.ts b/packages/benchmark/src/benchmark.ts
@@ -1,4 +1,5 @@
 import {
+  ModelMessage,
   ModelRequest,
   ModelResponse,
   SchemaWithOutput,
@@ -67,7 +68,8 @@ export interface Benchmark<TScenarioSeed, TScenario, TTestResult, TRunResult> {
   runTest(
     c: TestContext,
     scenario: TScenario,
-    key: string
+    key: string,
+    startMessages?: readonly ModelMessage[]
   ): Promise<TTestResult>;
   mapTestResultToRunResult(result: TTestResult): TRunResult;
   reduceRunResult(result1: TRunResult, result2: TRunResult): TRunResult;

diff --git a/packages/benchmark/src/kora.ts b/packages/benchmark/src/kora.ts
@@ -333,17 +333,24 @@ export const kora = Benchmark.new({
       R.map(ScenarioKey.toString)
     );
   },
-  async runTest(c, scenario, keyString) {
+  async runTest(c, scenario, keyString, startMessages) {
     const key = ScenarioKey.ofString(keyString);
     const riskCategory = RiskCategory.find(scenario.seed.riskCategoryId);
     const risk = RiskCategory.findRisk(riskCategory, scenario.seed.riskId);
     const prompt = key.prompt;
     const promptAgeRange = ScenarioKey.toAgeRange(key);
 
+    if (startMessages && startMessages.length % 2 !== 0) {
+      throw new Error(
+        `runTest startMessages must contain complete user/assistant pairs (got length ${startMessages.length}).`
+      );
+    }
+
     // Multi-turn conversation.
-    const messages: ModelMessage[] = [];
+    const messages: ModelMessage[] = startMessages ? [...startMessages] : [];
+    const startTurn = messages.length / 2;
 
-    for (let i = 0; i < risk.conversationLength; i++) {
+    for (let i = startTurn; i < risk.conversationLength; i++) {
       const userMessage = await (() => {
         if (i === 0) {
           return scenario.firstUserMessage;

diff --git a/packages/cli/src/cli.ts b/packages/cli/src/cli.ts
@@ -7,6 +7,7 @@ import {dirname} from "node:path";
 import {fileURLToPath} from "node:url";
 import * as v from "valibot";
 import {compareAssessmentsCommand} from "./commands/compareAssessmentsCommand.js";
+import {continueCommand} from "./commands/continueCommand.js";
 import {expandScenariosCommand} from "./commands/expandScenariosCommand.js";
 import {generateSeeds} from "./commands/generateSeedsCommand.js";
 import {reassessCommand} from "./commands/reassessCommand.js";
@@ -62,6 +63,10 @@ const defaultReassessOutputDir = path.relative(
   process.cwd(),
   path.join(dataPath, "reassessment-results")
 );
+const defaultContinueOutputDir = path.relative(
+  process.cwd(),
+  path.join(dataPath, "continue-results")
+);
 const defaultCompareOriginalPath = path.relative(
   process.cwd(),
   path.join(dataPath, "reassessment-input.assessments.json")
@@ -299,6 +304,78 @@ program
     );
   });
 
+program
+  .command("continue")
+  .description(
+    "extend pre-recorded conversations with additional turns (up to each risk's conversationLength), then judge the full transcript"
+  )
+  .argument(
+    "[user-model]",
+    "model to use for user message generation during the continuation",
+    "deepseek-v3.2-temp-1.3"
+  )
+  .option(
+    "--judges <models>",
+    "comma-separated judge models",
+    "gpt-5.2:high:limited"
+  )
+  .option(
+    "-i, --input <path>",
+    "input JSONL of recorded conversations ({id, modelId, scenario, prompt, messages})",
+    defaultReassessInputPath
+  )
+  .option(
+    "-o, --output <dir>",
+    "output directory (one {modelId}.json per target)",
+    defaultContinueOutputDir
+  )
+  .option(
+    "--risk-ids <ids>",
+    "comma-separated risk IDs to restrict the run to (defaults to all records in the input file)"
+  )
+  .option(
+    "--target-models <ids>",
+    "comma-separated target modelIds to restrict the run to (defaults to all modelIds in the input file)"
+  )
+  .option(
+    "--limit-per-risk <count>",
+    "maximum number of records per risk (deterministic by record id; fails fast if any requested risk has fewer records than requested)"
+  )
+  .action((userModel, opts) => {
+    const limitPerRisk =
+      opts.limitPerRisk !== undefined
+        ? Number(opts.limitPerRisk) // unlike parseInt, rejects "5x" and "2.5"
+        : undefined;
+    if (
+      limitPerRisk !== undefined &&
+      (!Number.isSafeInteger(limitPerRisk) || limitPerRisk <= 0)
+    ) {
+      throw new Error(
+        `--limit-per-risk must be a positive integer (got: ${opts.limitPerRisk})`
+      );
+    }
+    // Blank entries (e.g. from a trailing comma) are dropped from every list flag.
+    return continueCommand(
+      program,
+      modelsJsonPath,
+      opts.judges.split(",").map(s => s.trim()).filter(s => s.length > 0),
+      userModel,
+      opts.input,
+      opts.output,
+      {
+        riskIds: opts.riskIds
+          ?.split(",")
+          .map(id => id.trim())
+          .filter(id => id.length > 0),
+        targetModels: opts.targetModels
+          ?.split(",")
+          .map(id => id.trim())
+          .filter(id => id.length > 0),
+        limitPerRisk,
+      }
+    );
+  });
+
 program
   .command("compare-assessments")
   .description(