From 73cafe2dac88e9a2815b099662b7d78202f496c7 Mon Sep 17 00:00:00 2001
From: Thibaut Fatus <tfatus@gmail.com>
Date: Wed, 29 Apr 2026 15:08:10 +0200
Subject: [PATCH 1/6] feat(cli): add WebRunnerModel for kora-app-* targets
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Wires kora-benchmark's `kora run` to drive real consumer-app sessions
via the korabench/apps web-runner HTTP service. For example, running:

  yarn kora run kora-app-gemini --limit 1

routes the assistant slot through createCustomModel → WebRunnerModel,
which POSTs each turn to the web-runner (default http://localhost:7100,
override via WEB_RUNNER_URL). User and judge models stay on the AI
Gateway path unchanged, so app scores share scenarios, child agent
(deepseek-v3.2), and judges with API-model runs.

- packages/cli/src/models/webRunnerModel.ts (new): Model impl with
  lazy session open, latest-user-message-only turns, BlockedAppError
  surfacing, and a `dispose(outcome)` hook.
- packages/cli/src/models/customModel.ts: routes "kora-app-*" slugs to
  createWebRunnerModel; non-matching slugs keep the existing stub.
- packages/cli/src/models/model.ts: Model gains optional `dispose?` so
  long-lived browser sessions can be torn down between tests.
- packages/cli/src/commands/shared/buildContext.ts: buildContext returns
  {context, dispose}; resolveTargetGatewayModel also recognises
  kora-app-* as custom (returns undefined → custom path).
- packages/cli/src/commands/runCommand.ts, continueCommand.ts:
  kora.runTest is wrapped in try/finally so dispose always fires.

Verified end-to-end on Gemini guest mode against the local web-runner:
3-turn conversation captured, three independent judges (gpt-5.2,
claude-sonnet-4.6, gemini-2.5-pro) graded the run, RunResult JSON
shape matches a normal gateway-only run.

Note: the `kora continue` command gets the same try/finally + dispose
treatment in this patch, since it shares the buildContext helper that
now lives in commands/shared/.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 packages/cli/src/commands/continueCommand.ts  |  25 ++-
 packages/cli/src/commands/runCommand.ts       |  16 +-
 .../cli/src/commands/shared/buildContext.ts   |  34 +++-
 packages/cli/src/models/customModel.ts        |   8 +
 packages/cli/src/models/model.ts              |   7 +
 packages/cli/src/models/webRunnerModel.ts     | 149 ++++++++++++++++++
 6 files changed, 224 insertions(+), 15 deletions(-)
 create mode 100644 packages/cli/src/models/webRunnerModel.ts

diff --git a/packages/cli/src/commands/continueCommand.ts b/packages/cli/src/commands/continueCommand.ts
index 025d9c5..5b4b081 100644
--- a/packages/cli/src/commands/continueCommand.ts
+++ b/packages/cli/src/commands/continueCommand.ts
@@ -264,20 +264,23 @@ export async function continueCommand(
         // Not yet processed.
       }
 
+      const built = await buildContext(
+        judgeModels,
+        userModel,
+        task.input.modelId,
+        getTargetGateway(task.input.modelId),
+        task.input.scenario
+      );
+
+      let outcome: "completed" | "errored" = "errored";
       try {
-        const context = await buildContext(
-          judgeModels,
-          userModel,
-          task.input.modelId,
-          getTargetGateway(task.input.modelId),
-          task.input.scenario
-        );
         const testResult = await kora.runTest(
-          context,
+          built.context,
           task.input.scenario,
           task.key,
           task.input.messages
         );
+        outcome = "completed";
         await fs.writeFile(tempFile, JSON.stringify(testResult, null, 2));
         progress.increment(true);
         return [
@@ -295,6 +298,12 @@ export async function continueCommand(
         );
         progress.increment(false);
         return [{kind: "failure"}];
+      } finally {
+        await built.dispose(outcome).catch(err => {
+          console.error(
+            `\nDispose failed for id=${task.input.id}: ${err instanceof Error ? err.message : err}`
+          );
+        });
       }
     }),
     reduce(
diff --git a/packages/cli/src/commands/runCommand.ts b/packages/cli/src/commands/runCommand.ts
index 7828ca1..aa21022 100644
--- a/packages/cli/src/commands/runCommand.ts
+++ b/packages/cli/src/commands/runCommand.ts
@@ -211,7 +211,7 @@ export async function runCommand(
         // Not yet processed.
       }
 
-      const context = await buildContext(
+      const built = await buildContext(
         judgeModels,
         userModel,
         targetModelSlug,
@@ -219,8 +219,14 @@ export async function runCommand(
         task.scenario
       );
 
+      let outcome: "completed" | "errored" = "errored";
       try {
-        const testResult = await kora.runTest(context, task.scenario, task.key);
+        const testResult = await kora.runTest(
+          built.context,
+          task.scenario,
+          task.key
+        );
+        outcome = "completed";
         await fs.writeFile(tempFile, JSON.stringify(testResult, null, 2));
         progress.increment(true);
         return [{kind: "success", testResult}];
@@ -228,6 +234,12 @@ export async function runCommand(
         console.error(`\nTest failed for key ${task.key}: ${error}`);
         progress.increment(false);
         return [{kind: "failure"}];
+      } finally {
+        await built.dispose(outcome).catch(err => {
+          console.error(
+            `\nDispose failed for key ${task.key}: ${err instanceof Error ? err.message : err}`
+          );
+        });
       }
     }),
     reduce(
diff --git a/packages/cli/src/commands/shared/buildContext.ts b/packages/cli/src/commands/shared/buildContext.ts
index 0501581..81bb2eb 100644
--- a/packages/cli/src/commands/shared/buildContext.ts
+++ b/packages/cli/src/commands/shared/buildContext.ts
@@ -3,6 +3,14 @@ import * as R from "remeda";
 import {createCustomModel} from "../../models/customModel.js";
 import {createGatewayModel} from "../../models/gatewayModel.js";
 import {Model} from "../../models/model.js";
+import {isWebRunnerSlug} from "../../models/webRunnerModel.js";
+
+export interface BuiltContext {
+  context: TestContext;
+  /** Tear down the target model (e.g., release the web-runner browser
+   * session). Always safe to call; idempotent. */
+  dispose: (outcome: "completed" | "errored") => Promise<void>;
+}
 
 export async function buildContext(
   judgeModels: Record<string, Model>,
@@ -10,7 +18,7 @@ export async function buildContext(
   targetModelSlug: string,
   targetGatewayModel: Model | undefined,
   scenario: Scenario
-): Promise<TestContext> {
+): Promise<BuiltContext> {
   const targetModel = await (async () => {
     if (targetGatewayModel) {
       return targetGatewayModel;
@@ -19,7 +27,7 @@ export async function buildContext(
     return createCustomModel(targetModelSlug, scenario);
   })();
 
-  return {
+  const context: TestContext = {
     getUserResponse: async request => ({
       output: await userModel.getTextResponse(request),
     }),
@@ -35,13 +43,29 @@ export async function buildContext(
       })
     ),
   };
+
+  return {
+    context,
+    async dispose(outcome) {
+      // Only the targetModel is expected to hold disposable resources today
+      // (e.g., the WebRunnerModel keeps a browser session). Gateway models
+      // are stateless and have no `dispose`.
+      if (targetModel.dispose) {
+        await targetModel.dispose(outcome);
+      }
+    },
+  };
 }
 
 export function resolveTargetGatewayModel(
   modelsJsonPath: string,
   targetModelSlug: string
 ): Model | undefined {
-  return targetModelSlug.startsWith("custom-")
-    ? undefined
-    : createGatewayModel(modelsJsonPath, targetModelSlug);
+  if (
+    targetModelSlug.startsWith("custom-") ||
+    isWebRunnerSlug(targetModelSlug)
+  ) {
+    return undefined;
+  }
+  return createGatewayModel(modelsJsonPath, targetModelSlug);
 }
diff --git a/packages/cli/src/models/customModel.ts b/packages/cli/src/models/customModel.ts
index c0258b9..be7d3ba 100644
--- a/packages/cli/src/models/customModel.ts
+++ b/packages/cli/src/models/customModel.ts
@@ -1,10 +1,18 @@
 import {Scenario} from "@korabench/benchmark";
 import {Model} from "./model.js";
+import {createWebRunnerModel, isWebRunnerSlug} from "./webRunnerModel.js";
+
+const DEFAULT_WEB_RUNNER_URL = "http://localhost:7100";
 
 export async function createCustomModel(
   modelSlug: string,
   _scenario: Scenario
 ): Promise<Model> {
+  if (isWebRunnerSlug(modelSlug)) {
+    const webRunnerUrl = process.env.WEB_RUNNER_URL ?? DEFAULT_WEB_RUNNER_URL;
+    return createWebRunnerModel({modelSlug, webRunnerUrl});
+  }
+
   return {
     async getTextResponse() {
       throw new Error(
diff --git a/packages/cli/src/models/model.ts b/packages/cli/src/models/model.ts
index 9299124..d3011c3 100644
--- a/packages/cli/src/models/model.ts
+++ b/packages/cli/src/models/model.ts
@@ -3,4 +3,11 @@ import {ModelRequest, TypedModelRequest} from "@korabench/core";
 export interface Model {
   getTextResponse(request: ModelRequest): Promise<string>;
   getStructuredResponse<T>(request: TypedModelRequest<T>): Promise<T>;
+  /**
+   * Optional cleanup hook. Models backed by long-lived resources (browser
+   * sessions, Browserbase reservations) implement this so the CLI can release
+   * them in a try/finally around `kora.runTest`. Gateway-only Models leave
+   * this undefined.
+   */
+  dispose?(outcome: "completed" | "errored"): Promise<void>;
 }
diff --git a/packages/cli/src/models/webRunnerModel.ts b/packages/cli/src/models/webRunnerModel.ts
new file mode 100644
index 0000000..dd40b9b
--- /dev/null
+++ b/packages/cli/src/models/webRunnerModel.ts
@@ -0,0 +1,149 @@
+import {ModelRequest, TypedModelRequest} from "@korabench/core";
+import {randomUUID} from "node:crypto";
+import {Model} from "./model.js";
+
+const KORA_APP_PREFIX = "kora-app-";
+
+export type BlockedReason =
+  | "captcha"
+  | "rate_limit"
+  | "login_required"
+  | "country_block"
+  | "account_suspended"
+  | "unknown_block";
+
+export class BlockedAppError extends Error {
+  constructor(readonly reason: BlockedReason) {
+    super(`App blocked: ${reason}`);
+    this.name = "BlockedAppError";
+  }
+}
+
+interface WebRunnerModelConfig {
+  modelSlug: string;
+  webRunnerUrl: string;
+  /** Optional identifiers passed to web-runner for evidence keying. If absent,
+   * fresh UUIDs are minted per Model instance (one Model per test). */
+  runId?: string;
+  testKey?: string;
+}
+
+function modelSlugToApp(slug: string): string {
+  if (!slug.startsWith(KORA_APP_PREFIX)) {
+    throw new Error(
+      `WebRunnerModel expected a slug starting with "${KORA_APP_PREFIX}"; got "${slug}"`
+    );
+  }
+  return slug.slice(KORA_APP_PREFIX.length);
+}
+
+function lastUserContent(messages: ModelRequest["messages"]): string {
+  for (let i = messages.length - 1; i >= 0; i--) {
+    const m = messages[i]!;
+    if (m.role === "user") return m.content;
+  }
+  throw new Error("No user message in request transcript.");
+}
+
+/**
+ * Model that drives a real browser session against an AI app via the
+ * `kora-apps` web-runner HTTP service. Implements kora-benchmark's `Model`
+ * interface so it slots into `buildContext` like any other target.
+ *
+ * Lifecycle: lazily opens a session keyed by (runId, testKey) on the first
+ * `getTextResponse` call. Subsequent calls within the same test send only the
+ * latest user turn — the live browser session keeps the conversation state
+ * itself. `dispose` closes the session and releases the underlying resources.
+ */
+export function createWebRunnerModel(config: WebRunnerModelConfig): Model {
+  const app = modelSlugToApp(config.modelSlug);
+  // kora-benchmark's TestContext does not thread runId/testKey through the
+  // assistant request, so we mint our own per-Model identifiers. Each test
+  // creates a fresh Model via createCustomModel, so each session is unique.
+  const runId = config.runId ?? randomUUID();
+  const testKey = config.testKey ?? randomUUID();
+  let sessionId: string | null = null;
+
+  async function ensureSession(): Promise<string> {
+    if (sessionId) return sessionId;
+    const r = await fetch(`${config.webRunnerUrl}/sessions`, {
+      method: "POST",
+      headers: {"content-type": "application/json"},
+      body: JSON.stringify({runId, testKey, app}),
+    });
+    if (!r.ok) {
+      throw new Error(
+        `web-runner POST /sessions failed: ${r.status} ${await r.text()}`
+      );
+    }
+    const data = (await r.json()) as {
+      sessionId?: string;
+      blockedReason?: BlockedReason;
+    };
+    if (data.blockedReason) throw new BlockedAppError(data.blockedReason);
+    if (!data.sessionId) throw new Error("web-runner did not return sessionId");
+    sessionId = data.sessionId;
+    return sessionId;
+  }
+
+  async function postTurn(text: string): Promise<string> {
+    if (!sessionId) throw new Error("Session not open.");
+    const r = await fetch(
+      `${config.webRunnerUrl}/sessions/${sessionId}/turn`,
+      {
+        method: "POST",
+        headers: {"content-type": "application/json"},
+        body: JSON.stringify({userMessage: text}),
+      }
+    );
+    if (!r.ok) {
+      throw new Error(
+        `web-runner POST /sessions/${sessionId}/turn failed: ${r.status} ${await r.text()}`
+      );
+    }
+    const data = (await r.json()) as {
+      assistantMessage?: string;
+      blockedReason?: BlockedReason;
+    };
+    if (data.blockedReason) throw new BlockedAppError(data.blockedReason);
+    if (typeof data.assistantMessage !== "string") {
+      throw new Error("web-runner did not return assistantMessage");
+    }
+    return data.assistantMessage;
+  }
+
+  return {
+    async getTextResponse(request: ModelRequest): Promise<string> {
+      await ensureSession();
+      return postTurn(lastUserContent(request.messages));
+    },
+
+    async getStructuredResponse<T>(_request: TypedModelRequest<T>): Promise<T> {
+      throw new Error(
+        `kora-app:* targets do not support structured output. Slug: ${config.modelSlug}`
+      );
+    },
+
+    async dispose(outcome) {
+      if (!sessionId) return;
+      const id = sessionId;
+      sessionId = null;
+      try {
+        await fetch(`${config.webRunnerUrl}/sessions/${id}`, {
+          method: "DELETE",
+          headers: {"content-type": "application/json"},
+          body: JSON.stringify({outcome}),
+        });
+      } catch (err) {
+        // Best-effort — log to stderr so it's visible without throwing.
+        console.error(
+          `web-runner DELETE /sessions/${id} failed: ${err instanceof Error ? err.message : err}`
+        );
+      }
+    },
+  };
+}
+
+export function isWebRunnerSlug(slug: string): boolean {
+  return slug.startsWith(KORA_APP_PREFIX);
+}

From 4b40ccd4a6c8c8c8a4d0708e0cb8ae784cd4b857 Mon Sep 17 00:00:00 2001
From: Thibaut Fatus <tfatus@gmail.com>
Date: Tue, 5 May 2026 17:16:33 +0200
Subject: [PATCH 2/6] feat(cli): WebRunnerModel sends bearer auth when
 configured

Lets the public-CLI dev harness target a deployed web-runner that
requires auth. Reads WEB_RUNNER_API_KEY from env (alongside the
existing WEB_RUNNER_URL), passes it into createWebRunnerModel, and
attaches Authorization: Bearer <key> on all 3 fetches. Anonymous
fallback (no header) preserves behavior against an unauthenticated
local web-runner.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 packages/cli/src/models/customModel.ts    |  3 ++-
 packages/cli/src/models/webRunnerModel.ts | 12 +++++++++---
 2 files changed, 11 insertions(+), 4 deletions(-)

diff --git a/packages/cli/src/models/customModel.ts b/packages/cli/src/models/customModel.ts
index be7d3ba..6c678a6 100644
--- a/packages/cli/src/models/customModel.ts
+++ b/packages/cli/src/models/customModel.ts
@@ -10,7 +10,8 @@ export async function createCustomModel(
 ): Promise<Model> {
   if (isWebRunnerSlug(modelSlug)) {
     const webRunnerUrl = process.env.WEB_RUNNER_URL ?? DEFAULT_WEB_RUNNER_URL;
-    return createWebRunnerModel({modelSlug, webRunnerUrl});
+    const apiKey = process.env.WEB_RUNNER_API_KEY;
+    return createWebRunnerModel({modelSlug, webRunnerUrl, apiKey});
   }
 
   return {
diff --git a/packages/cli/src/models/webRunnerModel.ts b/packages/cli/src/models/webRunnerModel.ts
index dd40b9b..c69b37d 100644
--- a/packages/cli/src/models/webRunnerModel.ts
+++ b/packages/cli/src/models/webRunnerModel.ts
@@ -22,6 +22,9 @@ export class BlockedAppError extends Error {
 interface WebRunnerModelConfig {
   modelSlug: string;
   webRunnerUrl: string;
+  /** Optional bearer token sent as `Authorization: Bearer <key>`. When omitted,
+   * no auth header is sent (suits an unauthenticated local web-runner). */
+  apiKey?: string;
   /** Optional identifiers passed to web-runner for evidence keying. If absent,
    * fresh UUIDs are minted per Model instance (one Model per test). */
   runId?: string;
@@ -64,11 +67,14 @@ export function createWebRunnerModel(config: WebRunnerModelConfig): Model {
   const testKey = config.testKey ?? randomUUID();
   let sessionId: string | null = null;
 
+  const headers: Record<string, string> = {"content-type": "application/json"};
+  if (config.apiKey) headers["authorization"] = `Bearer ${config.apiKey}`;
+
   async function ensureSession(): Promise<string> {
     if (sessionId) return sessionId;
     const r = await fetch(`${config.webRunnerUrl}/sessions`, {
       method: "POST",
-      headers: {"content-type": "application/json"},
+      headers,
       body: JSON.stringify({runId, testKey, app}),
     });
     if (!r.ok) {
@@ -92,7 +98,7 @@ export function createWebRunnerModel(config: WebRunnerModelConfig): Model {
       `${config.webRunnerUrl}/sessions/${sessionId}/turn`,
       {
         method: "POST",
-        headers: {"content-type": "application/json"},
+        headers,
         body: JSON.stringify({userMessage: text}),
       }
     );
@@ -131,7 +137,7 @@ export function createWebRunnerModel(config: WebRunnerModelConfig): Model {
       try {
         await fetch(`${config.webRunnerUrl}/sessions/${id}`, {
           method: "DELETE",
-          headers: {"content-type": "application/json"},
+          headers,
           body: JSON.stringify({outcome}),
         });
       } catch (err) {

From 43b9c91ee988042f8d09860e75e4fd087dd099be Mon Sep 17 00:00:00 2001
From: Thibaut Fatus <tfatus@gmail.com>
Date: Tue, 5 May 2026 17:29:12 +0200
Subject: [PATCH 3/6] fix(cli): preserve buildContext error catch in
 continueCommand

During the rebase onto main (which extracted buildContext to shared/),
buildContext was inadvertently moved OUTSIDE continueCommand's try
block. On main, construction errors during buildContext were caught
and recorded as a per-task failure; outside the try they would crash
the whole streaming pipeline instead.

Restore the original semantics: keep buildContext inside the try, and
guard the dispose() call in finally with a `built` non-undefined check
since it may not have been assigned if buildContext threw.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 packages/cli/src/commands/continueCommand.ts | 29 +++++++++++---------
 1 file changed, 16 insertions(+), 13 deletions(-)

diff --git a/packages/cli/src/commands/continueCommand.ts b/packages/cli/src/commands/continueCommand.ts
index 5b4b081..b514892 100644
--- a/packages/cli/src/commands/continueCommand.ts
+++ b/packages/cli/src/commands/continueCommand.ts
@@ -17,6 +17,7 @@ import {createGatewayModel} from "../models/gatewayModel.js";
 import {Model} from "../models/model.js";
 import {
   buildContext,
+  BuiltContext,
   resolveTargetGatewayModel,
 } from "./shared/buildContext.js";
 import {
@@ -264,16 +265,16 @@ export async function continueCommand(
         // Not yet processed.
       }
 
-      const built = await buildContext(
-        judgeModels,
-        userModel,
-        task.input.modelId,
-        getTargetGateway(task.input.modelId),
-        task.input.scenario
-      );
-
+      let built: BuiltContext | undefined;
       let outcome: "completed" | "errored" = "errored";
       try {
+        built = await buildContext(
+          judgeModels,
+          userModel,
+          task.input.modelId,
+          getTargetGateway(task.input.modelId),
+          task.input.scenario
+        );
         const testResult = await kora.runTest(
           built.context,
           task.input.scenario,
@@ -299,11 +300,13 @@ export async function continueCommand(
         progress.increment(false);
         return [{kind: "failure"}];
       } finally {
-        await built.dispose(outcome).catch(err => {
-          console.error(
-            `\nDispose failed for id=${task.input.id}: ${err instanceof Error ? err.message : err}`
-          );
-        });
+        if (built) {
+          await built.dispose(outcome).catch(err => {
+            console.error(
+              `\nDispose failed for id=${task.input.id}: ${err instanceof Error ? err.message : err}`
+            );
+          });
+        }
       }
     }),
     reduce(

From 4f20e8e062e83713699004bf4e36fc6cc89b0f75 Mon Sep 17 00:00:00 2001
From: Thibaut Fatus <tfatus@gmail.com>
Date: Mon, 18 May 2026 16:30:06 +0200
Subject: [PATCH 4/6] feat(cli): add --concurrency flag to `run` (default 10)

flatTransform was hard-coded to 10 parallel test tasks. For kora-app-*
targets backed by a single shared app account, 10 concurrent browser
sessions on one account triggers rate-limiting / bot-flagging. Expose
--concurrency so such runs can go serial (--concurrency 1). Default
behavior unchanged.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 packages/cli/src/cli.ts                 | 12 ++++++++++++
 packages/cli/src/commands/runCommand.ts |  8 +++++++-
 2 files changed, 19 insertions(+), 1 deletion(-)

diff --git a/packages/cli/src/cli.ts b/packages/cli/src/cli.ts
index ff15830..e2868bd 100644
--- a/packages/cli/src/cli.ts
+++ b/packages/cli/src/cli.ts
@@ -209,6 +209,11 @@ program
     "--limit <count>",
     "maximum number of test tasks to run (useful for smoke tests)"
   )
+  .option(
+    "--concurrency <n>",
+    "max test tasks run in parallel (default 10; use 1 when the target is a single shared app account, e.g. kora-app-*)",
+    "10"
+  )
   .action((targetModel, userModel, opts) => {
     const limit =
       opts.limit !== undefined ? parseInt(opts.limit, 10) : undefined;
@@ -217,6 +222,12 @@ program
         `--limit must be a positive integer (got: ${opts.limit})`
       );
     }
+    const concurrency = parseInt(opts.concurrency, 10);
+    if (!Number.isFinite(concurrency) || concurrency <= 0) {
+      throw new Error(
+        `--concurrency must be a positive integer (got: ${opts.concurrency})`
+      );
+    }
 
     return runCommand(
       program,
@@ -233,6 +244,7 @@ program
           .map(id => id.trim())
           .filter(id => id.length > 0),
         limit,
+        concurrency,
       }
     );
   });
diff --git a/packages/cli/src/commands/runCommand.ts b/packages/cli/src/commands/runCommand.ts
index aa21022..e3c185a 100644
--- a/packages/cli/src/commands/runCommand.ts
+++ b/packages/cli/src/commands/runCommand.ts
@@ -125,6 +125,10 @@ async function hasTempFiles(tempDir: string): Promise<boolean> {
 export interface RunCommandOptions {
   riskIds?: readonly string[];
   limit?: number;
+  /** Max test tasks run in parallel. Defaults to 10. Set to 1 when the target
+   * is a single shared app account (kora-app-*) to avoid concurrent-session
+   * rate-limiting / bot-flagging. */
+  concurrency?: number;
 }
 
 export async function runCommand(
@@ -157,6 +161,8 @@ export async function runCommand(
   if (filters.limit !== undefined) {
     console.log(`Limiting to first ${filters.limit} test task(s).`);
   }
+  const concurrency = options.concurrency ?? 10;
+  console.log(`Concurrency: ${concurrency} parallel test task(s).`);
 
   const judgeModels: Record<string, Model> = Object.fromEntries(
     judgeModelSlugs.map(slug => [
@@ -198,7 +204,7 @@ export async function runCommand(
 
   const {failureCount, testCount, runResult} = await pipeline(
     () => scenariosToTestTasks(scenariosFilePath, prompts, filters),
-    flatTransform(10, async (task: TestTask): Promise<TaskOutcome[]> => {
+    flatTransform(concurrency, async (task: TestTask): Promise<TaskOutcome[]> => {
       const tempFile = path.join(tempDir, taskTempFileName(task.key));
 
       // Check if already processed (graceful restart).

From 22b49fa0f047b1c630992ad3e410d2667fcb9e1f Mon Sep 17 00:00:00 2001
From: Thibaut Fatus <tfatus@gmail.com>
Date: Tue, 19 May 2026 13:40:44 +0200
Subject: [PATCH 5/6] feat(cli): add --reverse and --cooldown flags to `run`

--reverse processes scenarios last-first (buffers the matched task
list, reverses, then applies --limit) for order-effect comparisons.
--cooldown sleeps N seconds between sequential fresh test tasks;
pair with --concurrency 1 to avoid app rate-limiting. Cooldown is
skipped before the first fresh task and for graceful-restart cache
hits, so no leading/trailing dead time.

Also documents --concurrency/--reverse/--cooldown in README.
---
 README.md                               |  3 ++
 packages/cli/src/cli.ts                 | 17 ++++++++
 packages/cli/src/commands/runCommand.ts | 56 +++++++++++++++++++++++++
 3 files changed, 76 insertions(+)

diff --git a/README.md b/README.md
index 8db35d3..b6df91e 100644
--- a/README.md
+++ b/README.md
@@ -93,6 +93,9 @@ yarn kora run <target-model> [user-model]
 | `--prompts <prompts>` | Comma-separated prompt variants to test (default: `default`)                                                       |
 | `--risk-ids <ids>`    | Comma-separated risk IDs to restrict the run to (default: all scenarios in the input file)                         |
 | `--limit <count>`     | Maximum number of test tasks to run — useful for smoke tests                                                       |
+| `--concurrency <n>`   | Max test tasks run in parallel (default: 10; use 1 for a single shared app account, e.g. `kora-app-*`)             |
+| `--reverse`           | Process scenarios in reverse file order (last scenario first); useful for order-effect comparisons                 |
+| `--cooldown <secs>`   | Seconds to sleep between sequential test tasks; pair with `--concurrency 1` to avoid app rate-limiting (default: 0) |
 
 When multiple judge models are specified, each judge independently evaluates every conversation. The final grade is the **median** across judges (on the ordered scale failing < adequate < exemplary), and the occurrence count is the **mean** (rounded). Per-judge results are stored in each test result for analysis.
 
diff --git a/packages/cli/src/cli.ts b/packages/cli/src/cli.ts
index e2868bd..80b42c4 100644
--- a/packages/cli/src/cli.ts
+++ b/packages/cli/src/cli.ts
@@ -214,6 +214,15 @@ program
     "max test tasks run in parallel (default 10; use 1 when the target is a single shared app account, e.g. kora-app-*)",
     "10"
   )
+  .option(
+    "--reverse",
+    "process scenarios in reverse file order (last scenario first); useful for order-effect comparisons"
+  )
+  .option(
+    "--cooldown <seconds>",
+    "seconds to sleep between sequential test tasks; use with --concurrency 1 to avoid app rate-limiting (default 0)",
+    "0"
+  )
   .action((targetModel, userModel, opts) => {
     const limit =
       opts.limit !== undefined ? parseInt(opts.limit, 10) : undefined;
@@ -228,6 +237,12 @@ program
         `--concurrency must be a positive integer (got: ${opts.concurrency})`
       );
     }
+    const cooldownSeconds = parseInt(opts.cooldown, 10);
+    if (!Number.isFinite(cooldownSeconds) || cooldownSeconds < 0) {
+      throw new Error(
+        `--cooldown must be a non-negative integer (got: ${opts.cooldown})`
+      );
+    }
 
     return runCommand(
       program,
@@ -245,6 +260,8 @@ program
           .filter(id => id.length > 0),
         limit,
         concurrency,
+        reverse: opts.reverse === true,
+        cooldownMs: cooldownSeconds * 1000,
       }
     );
   });
diff --git a/packages/cli/src/commands/runCommand.ts b/packages/cli/src/commands/runCommand.ts
index e3c185a..ea4eea2 100644
--- a/packages/cli/src/commands/runCommand.ts
+++ b/packages/cli/src/commands/runCommand.ts
@@ -35,6 +35,13 @@ interface RunState {
 export interface ScenarioFilters {
   riskIds?: ReadonlySet<string>;
   limit?: number;
+  /** Process scenarios last-first (buffers the matched task list, then
+   * reverses, then applies limit). Used for order-effect comparisons. */
+  reverse?: boolean;
+}
+
+function sleep(ms: number): Promise<void> {
+  return new Promise(resolve => setTimeout(resolve, ms));
 }
 
 function taskTempFileName(key: string): string {
@@ -65,6 +72,25 @@ export async function* scenariosToTestTasks(
   prompts: readonly ScenarioPrompt[],
   filters: ScenarioFilters
 ): AsyncGenerator<TestTask> {
+  if (filters.reverse) {
+    // Reverse requires the full matched list up front; the corpus is small
+    // (one risk = tens of scenarios) so buffering is cheap. Limit is applied
+    // AFTER reversing, i.e. it keeps the first N of the reversed order.
+    const tasks: TestTask[] = [];
+    for await (const scenario of readScenariosFromJsonl(filePath, filters)) {
+      for (const key of kora.mapScenarioToKeys(scenario, prompts)) {
+        tasks.push({scenario, key});
+      }
+    }
+    tasks.reverse();
+    const capped =
+      filters.limit !== undefined ? tasks.slice(0, filters.limit) : tasks;
+    for (const task of capped) {
+      yield task;
+    }
+    return;
+  }
+
   let yielded = 0;
   for await (const scenario of readScenariosFromJsonl(filePath, filters)) {
     for (const key of kora.mapScenarioToKeys(scenario, prompts)) {
@@ -129,6 +155,12 @@ export interface RunCommandOptions {
    * is a single shared app account (kora-app-*) to avoid concurrent-session
    * rate-limiting / bot-flagging. */
   concurrency?: number;
+  /** Process scenarios last-first. See ScenarioFilters.reverse. */
+  reverse?: boolean;
+  /** Milliseconds to sleep between sequential freshly-executed test tasks
+   * (skipped before the first task and for graceful-restart cache hits).
+   * Pair with concurrency=1 to space out calls to a rate-limited app. */
+  cooldownMs?: number;
 }
 
 export async function runCommand(
@@ -154,6 +186,7 @@ export async function runCommand(
   const filters: ScenarioFilters = {
     riskIds: options.riskIds?.length ? new Set(options.riskIds) : undefined,
     limit: options.limit,
+    reverse: options.reverse === true,
   };
   if (filters.riskIds) {
     console.log(`Filtering to risk IDs: ${[...filters.riskIds].join(", ")}`);
@@ -161,8 +194,18 @@ export async function runCommand(
   if (filters.limit !== undefined) {
     console.log(`Limiting to first ${filters.limit} test task(s).`);
   }
+  if (filters.reverse) {
+    console.log("Processing scenarios in REVERSE order (last scenario first).");
+  }
   const concurrency = options.concurrency ?? 10;
   console.log(`Concurrency: ${concurrency} parallel test task(s).`);
+  const cooldownMs = options.cooldownMs ?? 0;
+  if (cooldownMs > 0) {
+    console.log(
+      `Cooldown between sequential tasks: ${cooldownMs / 1000}s.`
+    );
+  }
+  let freshStarted = 0;
 
   const judgeModels: Record<string, Model> = Object.fromEntries(
     judgeModelSlugs.map(slug => [
@@ -217,6 +260,19 @@ export async function runCommand(
         // Not yet processed.
       }
 
+      // Cooldown: space out fresh executions to avoid app rate-limiting.
+      // Skipped before the very first fresh task and (via the early return
+      // above) for graceful-restart cache hits, so there is no trailing or
+      // leading dead time. Race-free at concurrency=1, which is the only
+      // setting where cooldown is meaningful.
+      if (cooldownMs > 0 && freshStarted > 0) {
+        console.log(
+          `\nCooldown ${cooldownMs / 1000}s before next task (${task.key})…`
+        );
+        await sleep(cooldownMs);
+      }
+      freshStarted++;
+
       const built = await buildContext(
         judgeModels,
         userModel,

From af994ca0805d32b72feab7ca0a3b847cd72d79c0 Mon Sep 17 00:00:00 2001
From: Thibaut Fatus <tfatus@gmail.com>
Date: Tue, 19 May 2026 13:45:27 +0200
Subject: [PATCH 6/6] style: apply prettier to runCommand and webRunnerModel

---
 packages/cli/src/commands/runCommand.ts   | 111 +++++++++++-----------
 packages/cli/src/models/webRunnerModel.ts |  13 +--
 2 files changed, 61 insertions(+), 63 deletions(-)

diff --git a/packages/cli/src/commands/runCommand.ts b/packages/cli/src/commands/runCommand.ts
index ea4eea2..7fee5ac 100644
--- a/packages/cli/src/commands/runCommand.ts
+++ b/packages/cli/src/commands/runCommand.ts
@@ -201,9 +201,7 @@ export async function runCommand(
   console.log(`Concurrency: ${concurrency} parallel test task(s).`);
   const cooldownMs = options.cooldownMs ?? 0;
   if (cooldownMs > 0) {
-    console.log(
-      `Cooldown between sequential tasks: ${cooldownMs / 1000}s.`
-    );
+    console.log(`Cooldown between sequential tasks: ${cooldownMs / 1000}s.`);
   }
   let freshStarted = 0;
 
@@ -247,63 +245,66 @@ export async function runCommand(
 
   const {failureCount, testCount, runResult} = await pipeline(
     () => scenariosToTestTasks(scenariosFilePath, prompts, filters),
-    flatTransform(concurrency, async (task: TestTask): Promise<TaskOutcome[]> => {
-      const tempFile = path.join(tempDir, taskTempFileName(task.key));
-
-      // Check if already processed (graceful restart).
-      try {
-        const content = await fs.readFile(tempFile, "utf-8");
-        progress.increment(true);
-        const testResult = v.parse(kora.testResultType, JSON.parse(content));
-        return [{kind: "success", testResult}];
-      } catch {
-        // Not yet processed.
-      }
+    flatTransform(
+      concurrency,
+      async (task: TestTask): Promise<TaskOutcome[]> => {
+        const tempFile = path.join(tempDir, taskTempFileName(task.key));
+
+        // Check if already processed (graceful restart).
+        try {
+          const content = await fs.readFile(tempFile, "utf-8");
+          progress.increment(true);
+          const testResult = v.parse(kora.testResultType, JSON.parse(content));
+          return [{kind: "success", testResult}];
+        } catch {
+          // Not yet processed.
+        }
 
-      // Cooldown: space out fresh executions to avoid app rate-limiting.
-      // Skipped before the very first fresh task and (via the early return
-      // above) for graceful-restart cache hits, so there is no trailing or
-      // leading dead time. Race-free at concurrency=1, which is the only
-      // setting where cooldown is meaningful.
-      if (cooldownMs > 0 && freshStarted > 0) {
-        console.log(
-          `\nCooldown ${cooldownMs / 1000}s before next task (${task.key})…`
+        // Cooldown: space out fresh executions to avoid app rate-limiting.
+        // Skipped before the very first fresh task and (via the early return
+        // above) for graceful-restart cache hits, so there is no trailing or
+        // leading dead time. Race-free at concurrency=1, which is the only
+        // setting where cooldown is meaningful.
+        if (cooldownMs > 0 && freshStarted > 0) {
+          console.log(
+            `\nCooldown ${cooldownMs / 1000}s before next task (${task.key})…`
+          );
+          await sleep(cooldownMs);
+        }
+        freshStarted++;
+
+        const built = await buildContext(
+          judgeModels,
+          userModel,
+          targetModelSlug,
+          targetGatewayModel,
+          task.scenario
         );
-        await sleep(cooldownMs);
-      }
-      freshStarted++;
-
-      const built = await buildContext(
-        judgeModels,
-        userModel,
-        targetModelSlug,
-        targetGatewayModel,
-        task.scenario
-      );
 
-      let outcome: "completed" | "errored" = "errored";
-      try {
-        const testResult = await kora.runTest(
-          built.context,
-          task.scenario,
-          task.key
-        );
-        outcome = "completed";
-        await fs.writeFile(tempFile, JSON.stringify(testResult, null, 2));
-        progress.increment(true);
-        return [{kind: "success", testResult}];
-      } catch (error) {
-        console.error(`\nTest failed for key ${task.key}: ${error}`);
-        progress.increment(false);
-        return [{kind: "failure"}];
-      } finally {
-        await built.dispose(outcome).catch(err => {
-          console.error(
-            `\nDispose failed for key ${task.key}: ${err instanceof Error ? err.message : err}`
+        let outcome: "completed" | "errored" = "errored";
+        try {
+          const testResult = await kora.runTest(
+            built.context,
+            task.scenario,
+            task.key
           );
-        });
+          outcome = "completed";
+          await fs.writeFile(tempFile, JSON.stringify(testResult, null, 2));
+          progress.increment(true);
+          return [{kind: "success", testResult}];
+        } catch (error) {
+          console.error(`\nTest failed for key ${task.key}: ${error}`);
+          progress.increment(false);
+          return [{kind: "failure"}];
+        } finally {
+          await built.dispose(outcome).catch(err => {
+            console.error(
+              `\nDispose failed for key ${task.key}: ${err instanceof Error ? err.message : err}`
+            );
+          });
+        }
       }
-    }),
+    ),
     reduce(
       (state: RunState, outcome: TaskOutcome): RunState => {
         if (outcome.kind === "failure") {
diff --git a/packages/cli/src/models/webRunnerModel.ts b/packages/cli/src/models/webRunnerModel.ts
index c69b37d..50c7a5d 100644
--- a/packages/cli/src/models/webRunnerModel.ts
+++ b/packages/cli/src/models/webRunnerModel.ts
@@ -94,14 +94,11 @@ export function createWebRunnerModel(config: WebRunnerModelConfig): Model {
 
   async function postTurn(text: string): Promise<string> {
     if (!sessionId) throw new Error("Session not open.");
-    const r = await fetch(
-      `${config.webRunnerUrl}/sessions/${sessionId}/turn`,
-      {
-        method: "POST",
-        headers,
-        body: JSON.stringify({userMessage: text}),
-      }
-    );
+    const r = await fetch(`${config.webRunnerUrl}/sessions/${sessionId}/turn`, {
+      method: "POST",
+      headers,
+      body: JSON.stringify({userMessage: text}),
+    });
     if (!r.ok) {
       throw new Error(
         `web-runner POST /sessions/${sessionId}/turn failed: ${r.status} ${await r.text()}`