Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -93,6 +93,9 @@ yarn kora run <target-model> [user-model]
| `--prompts <prompts>` | Comma-separated prompt variants to test (default: `default`) |
| `--risk-ids <ids>` | Comma-separated risk IDs to restrict the run to (default: all scenarios in the input file) |
| `--limit <count>` | Maximum number of test tasks to run — useful for smoke tests |
| `--concurrency <n>` | Max test tasks run in parallel (default: 10; use 1 for a single shared app account, e.g. `kora-app-*`) |
| `--reverse` | Process scenarios in reverse file order (last scenario first); useful for order-effect comparisons |
| `--cooldown <secs>` | Seconds to sleep between sequential test tasks; pair with `--concurrency 1` to avoid app rate-limiting (default: 0) |

When multiple judge models are specified, each judge independently evaluates every conversation. The final grade is the **median** across judges (on the ordered scale failing < adequate < exemplary), and the occurrence count is the **mean** (rounded). Per-judge results are stored in each test result for analysis.

Expand Down
29 changes: 29 additions & 0 deletions packages/cli/src/cli.ts
Original file line number Diff line number Diff line change
Expand Up @@ -209,6 +209,20 @@ program
"--limit <count>",
"maximum number of test tasks to run (useful for smoke tests)"
)
.option(
"--concurrency <n>",
"max test tasks run in parallel (default 10; use 1 when the target is a single shared app account, e.g. kora-app-*)",
"10"
)
.option(
"--reverse",
"process scenarios in reverse file order (last scenario first); useful for order-effect comparisons"
)
.option(
"--cooldown <seconds>",
"seconds to sleep between sequential test tasks; use with --concurrency 1 to avoid app rate-limiting (default 0)",
"0"
)
.action((targetModel, userModel, opts) => {
const limit =
opts.limit !== undefined ? parseInt(opts.limit, 10) : undefined;
Expand All @@ -217,6 +231,18 @@ program
`--limit must be a positive integer (got: ${opts.limit})`
);
}
const concurrency = parseInt(opts.concurrency, 10);
if (!Number.isFinite(concurrency) || concurrency <= 0) {
throw new Error(
`--concurrency must be a positive integer (got: ${opts.concurrency})`
);
}
const cooldownSeconds = parseInt(opts.cooldown, 10);
if (!Number.isFinite(cooldownSeconds) || cooldownSeconds < 0) {
throw new Error(
`--cooldown must be a non-negative integer (got: ${opts.cooldown})`
);
}

return runCommand(
program,
Expand All @@ -233,6 +259,9 @@ program
.map(id => id.trim())
.filter(id => id.length > 0),
limit,
concurrency,
reverse: opts.reverse === true,
cooldownMs: cooldownSeconds * 1000,
}
);
});
Expand Down
16 changes: 14 additions & 2 deletions packages/cli/src/commands/continueCommand.ts
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@ import {createGatewayModel} from "../models/gatewayModel.js";
import {Model} from "../models/model.js";
import {
buildContext,
BuiltContext,
resolveTargetGatewayModel,
} from "./shared/buildContext.js";
import {
Expand Down Expand Up @@ -264,20 +265,23 @@ export async function continueCommand(
// Not yet processed.
}

let built: BuiltContext | undefined;
let outcome: "completed" | "errored" = "errored";
try {
const context = await buildContext(
built = await buildContext(
judgeModels,
userModel,
task.input.modelId,
getTargetGateway(task.input.modelId),
task.input.scenario
);
const testResult = await kora.runTest(
context,
built.context,
task.input.scenario,
task.key,
task.input.messages
);
outcome = "completed";
await fs.writeFile(tempFile, JSON.stringify(testResult, null, 2));
progress.increment(true);
return [
Expand All @@ -295,6 +299,14 @@ export async function continueCommand(
);
progress.increment(false);
return [{kind: "failure"}];
} finally {
if (built) {
await built.dispose(outcome).catch(err => {
console.error(
`\nDispose failed for id=${task.input.id}: ${err instanceof Error ? err.message : err}`
);
});
}
}
}),
reduce(
Expand Down
135 changes: 105 additions & 30 deletions packages/cli/src/commands/runCommand.ts
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,13 @@ interface RunState {
export interface ScenarioFilters {
riskIds?: ReadonlySet<string>;
limit?: number;
/** Process scenarios last-first (buffers the matched task list, then
* reverses, then applies limit). Used for order-effect comparisons. */
reverse?: boolean;
}

function sleep(ms: number): Promise<void> {
return new Promise(resolve => setTimeout(resolve, ms));
}

function taskTempFileName(key: string): string {
Expand Down Expand Up @@ -65,6 +72,25 @@ export async function* scenariosToTestTasks(
prompts: readonly ScenarioPrompt[],
filters: ScenarioFilters
): AsyncGenerator<TestTask> {
if (filters.reverse) {
// Reverse requires the full matched list up front; the corpus is small
// (one risk = tens of scenarios) so buffering is cheap. Limit is applied
// AFTER reversing, i.e. it keeps the first N of the reversed order.
const tasks: TestTask[] = [];
for await (const scenario of readScenariosFromJsonl(filePath, filters)) {
for (const key of kora.mapScenarioToKeys(scenario, prompts)) {
tasks.push({scenario, key});
}
}
tasks.reverse();
const capped =
filters.limit !== undefined ? tasks.slice(0, filters.limit) : tasks;
for (const task of capped) {
yield task;
}
return;
}

let yielded = 0;
for await (const scenario of readScenariosFromJsonl(filePath, filters)) {
for (const key of kora.mapScenarioToKeys(scenario, prompts)) {
Expand Down Expand Up @@ -125,6 +151,16 @@ async function hasTempFiles(tempDir: string): Promise<boolean> {
export interface RunCommandOptions {
riskIds?: readonly string[];
limit?: number;
/** Max test tasks run in parallel. Defaults to 10. Set to 1 when the target
* is a single shared app account (kora-app-*) to avoid concurrent-session
* rate-limiting / bot-flagging. */
concurrency?: number;
/** Process scenarios last-first. See ScenarioFilters.reverse. */
reverse?: boolean;
/** Milliseconds to sleep between sequential freshly-executed test tasks
* (skipped before the first task and for graceful-restart cache hits).
* Pair with concurrency=1 to space out calls to a rate-limited app. */
cooldownMs?: number;
}

export async function runCommand(
Expand All @@ -150,13 +186,24 @@ export async function runCommand(
const filters: ScenarioFilters = {
riskIds: options.riskIds?.length ? new Set(options.riskIds) : undefined,
limit: options.limit,
reverse: options.reverse === true,
};
if (filters.riskIds) {
console.log(`Filtering to risk IDs: ${[...filters.riskIds].join(", ")}`);
}
if (filters.limit !== undefined) {
console.log(`Limiting to first ${filters.limit} test task(s).`);
}
if (filters.reverse) {
console.log("Processing scenarios in REVERSE order (last scenario first).");
}
const concurrency = options.concurrency ?? 10;
console.log(`Concurrency: ${concurrency} parallel test task(s).`);
const cooldownMs = options.cooldownMs ?? 0;
if (cooldownMs > 0) {
console.log(`Cooldown between sequential tasks: ${cooldownMs / 1000}s.`);
}
let freshStarted = 0;

const judgeModels: Record<string, Model> = Object.fromEntries(
judgeModelSlugs.map(slug => [
Expand Down Expand Up @@ -198,38 +245,66 @@ export async function runCommand(

const {failureCount, testCount, runResult} = await pipeline(
() => scenariosToTestTasks(scenariosFilePath, prompts, filters),
flatTransform(10, async (task: TestTask): Promise<TaskOutcome[]> => {
const tempFile = path.join(tempDir, taskTempFileName(task.key));

// Check if already processed (graceful restart).
try {
const content = await fs.readFile(tempFile, "utf-8");
progress.increment(true);
const testResult = v.parse(kora.testResultType, JSON.parse(content));
return [{kind: "success", testResult}];
} catch {
// Not yet processed.
}

const context = await buildContext(
judgeModels,
userModel,
targetModelSlug,
targetGatewayModel,
task.scenario
);
flatTransform(
concurrency,
async (task: TestTask): Promise<TaskOutcome[]> => {
const tempFile = path.join(tempDir, taskTempFileName(task.key));

// Check if already processed (graceful restart).
try {
const content = await fs.readFile(tempFile, "utf-8");
progress.increment(true);
const testResult = v.parse(kora.testResultType, JSON.parse(content));
return [{kind: "success", testResult}];
} catch {
// Not yet processed.
}

try {
const testResult = await kora.runTest(context, task.scenario, task.key);
await fs.writeFile(tempFile, JSON.stringify(testResult, null, 2));
progress.increment(true);
return [{kind: "success", testResult}];
} catch (error) {
console.error(`\nTest failed for key ${task.key}: ${error}`);
progress.increment(false);
return [{kind: "failure"}];
// Cooldown: space out fresh executions to avoid app rate-limiting.
// Skipped before the very first fresh task and (via the early return
// above) for graceful-restart cache hits, so there is no trailing or
// leading dead time. Race-free at concurrency=1, which is the only
// setting where cooldown is meaningful.
if (cooldownMs > 0 && freshStarted > 0) {
console.log(
`\nCooldown ${cooldownMs / 1000}s before next task (${task.key})…`
);
await sleep(cooldownMs);
}
freshStarted++;

const built = await buildContext(
judgeModels,
userModel,
targetModelSlug,
targetGatewayModel,
task.scenario
);

let outcome: "completed" | "errored" = "errored";
try {
const testResult = await kora.runTest(
built.context,
task.scenario,
task.key
);
outcome = "completed";
await fs.writeFile(tempFile, JSON.stringify(testResult, null, 2));
progress.increment(true);
return [{kind: "success", testResult}];
} catch (error) {
console.error(`\nTest failed for key ${task.key}: ${error}`);
progress.increment(false);
return [{kind: "failure"}];
} finally {
await built.dispose(outcome).catch(err => {
console.error(
`\nDispose failed for key ${task.key}: ${err instanceof Error ? err.message : err}`
);
});
}
}
}),
),
reduce(
(state: RunState, outcome: TaskOutcome): RunState => {
if (outcome.kind === "failure") {
Expand Down
34 changes: 29 additions & 5 deletions packages/cli/src/commands/shared/buildContext.ts
Original file line number Diff line number Diff line change
Expand Up @@ -3,14 +3,22 @@ import * as R from "remeda";
import {createCustomModel} from "../../models/customModel.js";
import {createGatewayModel} from "../../models/gatewayModel.js";
import {Model} from "../../models/model.js";
import {isWebRunnerSlug} from "../../models/webRunnerModel.js";

export interface BuiltContext {
context: TestContext;
/** Tear down the target model (e.g., release the web-runner browser
* session). Always safe to call; idempotent. */
dispose: (outcome: "completed" | "errored") => Promise<void>;
}

export async function buildContext(
judgeModels: Record<string, Model>,
userModel: Model,
targetModelSlug: string,
targetGatewayModel: Model | undefined,
scenario: Scenario
): Promise<TestContext> {
): Promise<BuiltContext> {
const targetModel = await (async () => {
if (targetGatewayModel) {
return targetGatewayModel;
Expand All @@ -19,7 +27,7 @@ export async function buildContext(
return createCustomModel(targetModelSlug, scenario);
})();

return {
const context: TestContext = {
getUserResponse: async request => ({
output: await userModel.getTextResponse(request),
}),
Expand All @@ -35,13 +43,29 @@ export async function buildContext(
})
),
};

return {
context,
async dispose(outcome) {
// Only the targetModel is expected to hold disposable resources today
// (e.g., the WebRunnerModel keeps a browser session). Gateway models
// are stateless and have no `dispose`.
if (targetModel.dispose) {
await targetModel.dispose(outcome);
}
},
};
}

export function resolveTargetGatewayModel(
modelsJsonPath: string,
targetModelSlug: string
): Model | undefined {
return targetModelSlug.startsWith("custom-")
? undefined
: createGatewayModel(modelsJsonPath, targetModelSlug);
if (
targetModelSlug.startsWith("custom-") ||
isWebRunnerSlug(targetModelSlug)
) {
return undefined;
}
return createGatewayModel(modelsJsonPath, targetModelSlug);
}
9 changes: 9 additions & 0 deletions packages/cli/src/models/customModel.ts
Original file line number Diff line number Diff line change
@@ -1,10 +1,19 @@
import {Scenario} from "@korabench/benchmark";
import {Model} from "./model.js";
import {createWebRunnerModel, isWebRunnerSlug} from "./webRunnerModel.js";

const DEFAULT_WEB_RUNNER_URL = "http://localhost:7100";

export async function createCustomModel(
modelSlug: string,
_scenario: Scenario
): Promise<Model> {
if (isWebRunnerSlug(modelSlug)) {
const webRunnerUrl = process.env.WEB_RUNNER_URL ?? DEFAULT_WEB_RUNNER_URL;
const apiKey = process.env.WEB_RUNNER_API_KEY;
return createWebRunnerModel({modelSlug, webRunnerUrl, apiKey});
}

return {
async getTextResponse() {
throw new Error(
Expand Down
7 changes: 7 additions & 0 deletions packages/cli/src/models/model.ts
Original file line number Diff line number Diff line change
Expand Up @@ -3,4 +3,11 @@ import {ModelRequest, TypedModelRequest} from "@korabench/core";
export interface Model {
getTextResponse(request: ModelRequest): Promise<string>;
getStructuredResponse<T>(request: TypedModelRequest<T>): Promise<T>;
/**
* Optional cleanup hook. Models backed by long-lived resources (browser
* sessions, Browserbase reservations) implement this so the CLI can release
* them in a try/finally around `kora.runTest`. Gateway-only Models leave
* this undefined.
*/
dispose?(outcome: "completed" | "errored"): Promise<void>;
}
Loading
Loading