Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
22 changes: 22 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -132,6 +132,28 @@ Records are grouped by `modelId`; each target produces a results JSON in the out

In addition to the per-target JSONs, `reassess` writes `assessments.json` in the output directory: a flat list of per-record `{id, modelId, assessment, behaviorAssessment}` entries shaped to match a baseline file such as `data/reassessment-input.assessments.json`. `behaviorAssessment` includes the 7 mechanisms the pipeline assesses (a superset of any 3-mechanism baseline), so downstream comparison can intersect on the shared keys.

### `continue`

Extends pre-recorded conversations with additional turns up to each risk's `conversationLength`, then runs the full judge pipeline on the extended transcript. Useful for studying how evaluation signal changes when the same scenarios are run for more turns.

```bash
yarn kora continue [user-model]
```

| Argument / Option | Description |
| -------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
| `[user-model]` | Model used to generate user messages during the continuation (default: `deepseek-v3.2-temp-1.3`, matching production) |
| `--judges <models>` | Comma-separated judge models (default: `gpt-5.2:high:limited` — single judge, held constant across 3-turn vs 8-turn comparisons) |
| `-i, --input <path>` | Input JSONL of recorded conversations, same shape as `reassess` (default: `data/reassessment-input.jsonl`) |
| `-o, --output <dir>` | Output directory — one `{modelId}.json` per target model, plus `assessments.json`, `continue-meta.json`, and `results.zip` (default: `data/continue-results`) |
| `--risk-ids <ids>` | Comma-separated risk IDs to restrict the run to (default: all records in the input file) |
| `--target-models <ids>` | Comma-separated target `modelId`s to restrict the run to (default: all `modelId`s in the input file) |
| `--limit-per-risk <count>` | Maximum records per risk, selected deterministically by `id` (sorted lexicographically). Fails fast if any requested risk has fewer records than the limit.                                |

Each record is replayed with its **original** `modelId` as the target model, so 3-turn-vs-longer comparisons stay apples-to-apples per (scenario, model). The turn budget comes from `risk.conversationLength` in `packages/benchmark/data/risks.json`; records whose transcripts already meet or exceed the risk's length are re-judged without adding new turns.

`continue-meta.json` captures the source file path + SHA-256, the user model, the `--limit-per-risk` value, and the selected record IDs per risk — re-running the same command against the same input picks the same records.

### `compare-assessments`

Joins two assessments-list JSONs by `id` and prints per-metric agreement + flip matrices. Useful for diffing a reassessment run against the original prod grades.
Expand Down
30 changes: 29 additions & 1 deletion models.json
Original file line number Diff line number Diff line change
Expand Up @@ -54,6 +54,28 @@
"model": "anthropic/claude-sonnet-4.6",
"maxTokens": 16000
},
"gpt-5.2-high": {
"model": "openai/gpt-5.2",
"maxTokens": 26000,
"providerOptions": {
"openai": {
"reasoningEffort": "high"
}
}
},
"claude-haiku-4.5-high": {
"model": "anthropic/claude-haiku-4.5",
"maxTokens": 26000,
"temperature": 1,
"providerOptions": {
"anthropic": {
"thinking": {
"type": "enabled",
"budgetTokens": 8000
}
}
}
},
"gemini-2.5-pro:limited": {
"model": "google/gemini-2.5-pro",
"maxTokens": 16000,
Expand All @@ -71,6 +93,11 @@
"maxTokens": 4000,
"temperature": 1.3
},
"deepseek-v3.2-temp-1.3": {
"model": "deepseek/deepseek-v3.2",
"maxTokens": 4000,
"temperature": 1.3
},
"deepseek-v4-pro": {
"model": "deepseek/deepseek-v4-pro",
"maxTokens": 4000,
Expand All @@ -82,6 +109,7 @@
},
"llama-4-maverick:limited": {
"model": "meta/llama-4-maverick",
"maxTokens": 16000
"maxTokens": 16000,
"temperature": 0.5
}
}
20 changes: 10 additions & 10 deletions packages/benchmark/data/risks.json

Large diffs are not rendered by default.

4 changes: 3 additions & 1 deletion packages/benchmark/src/benchmark.ts
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
import {
ModelMessage,
ModelRequest,
ModelResponse,
SchemaWithOutput,
Expand Down Expand Up @@ -67,7 +68,8 @@ export interface Benchmark<TScenarioSeed, TScenario, TTestResult, TRunResult> {
runTest(
c: TestContext,
scenario: TScenario,
key: string
key: string,
startMessages?: readonly ModelMessage[]
): Promise<TTestResult>;
mapTestResultToRunResult(result: TTestResult): TRunResult;
reduceRunResult(result1: TRunResult, result2: TRunResult): TRunResult;
Expand Down
13 changes: 10 additions & 3 deletions packages/benchmark/src/kora.ts
Original file line number Diff line number Diff line change
Expand Up @@ -333,17 +333,24 @@ export const kora = Benchmark.new({
R.map(ScenarioKey.toString)
);
},
async runTest(c, scenario, keyString) {
async runTest(c, scenario, keyString, startMessages) {
const key = ScenarioKey.ofString(keyString);
const riskCategory = RiskCategory.find(scenario.seed.riskCategoryId);
const risk = RiskCategory.findRisk(riskCategory, scenario.seed.riskId);
const prompt = key.prompt;
const promptAgeRange = ScenarioKey.toAgeRange(key);

if (startMessages && startMessages.length % 2 !== 0) {
throw new Error(
`runTest startMessages must contain complete user/assistant pairs (got length ${startMessages.length}).`
);
}

// Multi-turn conversation.
const messages: ModelMessage[] = [];
const messages: ModelMessage[] = startMessages ? [...startMessages] : [];
const startTurn = messages.length / 2;

for (let i = 0; i < risk.conversationLength; i++) {
for (let i = startTurn; i < risk.conversationLength; i++) {
const userMessage = await (() => {
if (i === 0) {
return scenario.firstUserMessage;
Expand Down
77 changes: 77 additions & 0 deletions packages/cli/src/cli.ts
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@ import {dirname} from "node:path";
import {fileURLToPath} from "node:url";
import * as v from "valibot";
import {compareAssessmentsCommand} from "./commands/compareAssessmentsCommand.js";
import {continueCommand} from "./commands/continueCommand.js";
import {expandScenariosCommand} from "./commands/expandScenariosCommand.js";
import {generateSeeds} from "./commands/generateSeedsCommand.js";
import {reassessCommand} from "./commands/reassessCommand.js";
Expand Down Expand Up @@ -62,6 +63,10 @@ const defaultReassessOutputDir = path.relative(
process.cwd(),
path.join(dataPath, "reassessment-results")
);
// Default value for the `continue` command's `-o, --output` option: the
// "continue-results" directory under dataPath, relative to the current working directory.
const defaultContinueOutputDir = path.relative(process.cwd(), path.join(dataPath, "continue-results"));
const defaultCompareOriginalPath = path.relative(
process.cwd(),
path.join(dataPath, "reassessment-input.assessments.json")
Expand Down Expand Up @@ -299,6 +304,78 @@ program
);
});

// `continue` command: extends pre-recorded conversations with additional turns
// (up to each risk's conversationLength), then judges the full transcript.
// Parses and validates the CLI options here and delegates the actual work to
// continueCommand.
program
  .command("continue")
  .description(
    "extend pre-recorded conversations with additional turns (up to each risk's conversationLength), then judge the full transcript"
  )
  .argument(
    "[user-model]",
    "model to use for user message generation during the continuation",
    "deepseek-v3.2-temp-1.3"
  )
  .option(
    "--judges <models>",
    "comma-separated judge models",
    "gpt-5.2:high:limited"
  )
  .option(
    "-i, --input <path>",
    "input JSONL of recorded conversations ({id, modelId, scenario, prompt, messages})",
    defaultReassessInputPath
  )
  .option(
    "-o, --output <dir>",
    "output directory (one {modelId}.json per target)",
    defaultContinueOutputDir
  )
  .option(
    "--risk-ids <ids>",
    "comma-separated risk IDs to restrict the run to (defaults to all records in the input file)"
  )
  .option(
    "--target-models <ids>",
    "comma-separated target modelIds to restrict the run to (defaults to all modelIds in the input file)"
  )
  .option(
    "--limit-per-risk <count>",
    "maximum number of records per risk (deterministic by record id; fails fast if any requested risk has fewer records than requested)"
  )
  .action((userModel, opts) => {
    // No value parser is attached to --limit-per-risk, so it is validated here.
    // The whole token must be decimal digits: parseInt alone would silently
    // accept "5abc" (-> 5) or "1.5" (-> 1), contradicting the "positive integer"
    // contract stated in the error message below.
    let limitPerRisk: number | undefined;
    if (opts.limitPerRisk !== undefined) {
      const rawLimit = String(opts.limitPerRisk).trim();
      const parsedLimit = /^\d+$/.test(rawLimit)
        ? Number(rawLimit)
        : Number.NaN;
      // isSafeInteger also rejects NaN and digit strings too large to represent exactly.
      if (!Number.isSafeInteger(parsedLimit) || parsedLimit <= 0) {
        throw new Error(
          `--limit-per-risk must be a positive integer (got: ${opts.limitPerRisk})`
        );
      }
      limitPerRisk = parsedLimit;
    }

    return continueCommand(
      program,
      modelsJsonPath,
      opts.judges.split(",").map(s => s.trim()),
      userModel,
      opts.input,
      opts.output,
      {
        // Optional filters: undefined when the flag is omitted; blank entries
        // (e.g. from a trailing comma) are dropped.
        riskIds: opts.riskIds
          ?.split(",")
          .map(id => id.trim())
          .filter(id => id.length > 0),
        targetModels: opts.targetModels
          ?.split(",")
          .map(id => id.trim())
          .filter(id => id.length > 0),
        limitPerRisk,
      }
    );
  });

program
.command("compare-assessments")
.description(
Expand Down
Loading
Loading