From fb2851e64bcf99574d6871bb97d681b9ed70981c Mon Sep 17 00:00:00 2001 From: Thibaut Fatus Date: Tue, 12 May 2026 13:05:27 +0200 Subject: [PATCH] [feat] optional trace hook on TestContext for runtime instrumentation MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Adds an optional `trace?: (event: TraceEvent) => void` field to `TestContext` so consumers can attach observability without taking a dependency on a specific logger. `kora.runTest` emits `user_message` and `assistant_response` events per turn, and `runJudges` emits per-`judge` and overall `judges` events. Existing consumers passing no `trace` are unaffected — the hook is fully optional and the package remains logger-free. --- packages/benchmark/src/benchmark.ts | 8 ++++++++ packages/benchmark/src/kora.ts | 27 ++++++++++++++++++++++++--- 2 files changed, 32 insertions(+), 3 deletions(-) diff --git a/packages/benchmark/src/benchmark.ts b/packages/benchmark/src/benchmark.ts index fc44da6..f46c602 100644 --- a/packages/benchmark/src/benchmark.ts +++ b/packages/benchmark/src/benchmark.ts @@ -29,11 +29,19 @@ export interface JudgeModel { ) => Promise>; } +export type TraceEvent = + | {phase: "user_message"; turn: number; durationMs: number} + | {phase: "assistant_response"; turn: number; durationMs: number} + | {phase: "judge"; slug: string; durationMs: number} + | {phase: "judges"; durationMs: number; judgeCount: number}; + export interface TestContext { getUserResponse: (request: ModelRequest) => Promise; getAssistantResponse: (request: ModelRequest) => Promise; /** Record of judge model slug → callable judge model. */ judgeModels: Record; + /** Optional observability hook. No-op when undefined. */ + trace?: (event: TraceEvent) => void; } export interface GenerationEvent { diff --git a/packages/benchmark/src/kora.ts b/packages/benchmark/src/kora.ts index f8ff70d..a367b89 100644 --- a/packages/benchmark/src/kora.ts +++ b/packages/benchmark/src/kora.ts @@ -13,7 +13,7 @@ import { PinnedDemographics, } from "./allocation/allocatePersonas.js"; import {makeRng, shuffleWith} from "./allocation/rng.js"; -import {Benchmark, JudgeModel} from "./benchmark.js"; +import {Benchmark, JudgeModel, TraceEvent} from "./benchmark.js"; import { generateFirstUserMessage, generateNextUserMessage, @@ -78,7 +78,8 @@ export async function runJudges( judgeModels: Record, scenario: Scenario, prompt: ScenarioPrompt, - messages: readonly ModelMessage[] + messages: readonly ModelMessage[], + trace?: (event: TraceEvent) => void ): Promise { const riskCategory = RiskCategory.find(scenario.seed.riskCategoryId); const risk = RiskCategory.findRisk(riskCategory, scenario.seed.riskId); @@ -94,10 +95,12 @@ export async function runJudges( ); const judgeSlugs = Object.keys(judgeModels); + const tJudges = Date.now(); const judgeAssessments = await Promise.all( judgeSlugs.map(async (slug): Promise => { const judge = judgeModels[slug]!; + const tJudge = Date.now(); const [safetyResult, mechanismResult] = await Promise.all([ judge.getResponse({ messages: [ @@ -114,6 +117,7 @@ export async function runJudges( outputType: MechanismAssessment.io, }), ]); + trace?.({phase: "judge", slug, durationMs: Date.now() - tJudge}); return { judgeModelSlug: slug, @@ -122,6 +126,11 @@ export async function runJudges( }; }) ); + trace?.({ + phase: "judges", + durationMs: Date.now() - tJudges, + judgeCount: judgeSlugs.length, + }); const assessment = aggregateTestAssessments( judgeAssessments.map(j => j.assessment) @@ -448,6 +457,7 @@ export const kora = Benchmark.new({ const startTurn = messages.length / 2; for (let i = startTurn; i < conversationLength; i++) { + const tUserMessage = Date.now(); const userMessage = await (() => { if (i === 0) { return scenario.firstUserMessage; @@ -455,6 +465,11 @@ export const kora = Benchmark.new({ return generateNextUserMessage(c, risk, scenario, messages); })(); + c.trace?.({ + phase: "user_message", + turn: i, + durationMs: Date.now() - tUserMessage, + }); // TODO: Maybe assert refusal of user model. @@ -463,6 +478,7 @@ export const kora = Benchmark.new({ content: userMessage, }); + const tAssistant = Date.now(); const modelMessage = await (async () => { const modelPrompt = conversationToNextMessagePrompt({ ageRange: promptAgeRange, @@ -479,6 +495,11 @@ export const kora = Benchmark.new({ }); return output; })(); + c.trace?.({ + phase: "assistant_response", + turn: i, + durationMs: Date.now() - tAssistant, + }); messages.push({ role: "assistant", @@ -486,7 +507,7 @@ export const kora = Benchmark.new({ }); } - return runJudges(c.judgeModels, scenario, prompt, messages); + return runJudges(c.judgeModels, scenario, prompt, messages, c.trace); }, mapTestResultToRunResult(result) { const {assessment, mechanismAssessment} = result;