diff --git a/.changeset/verifier-trajectory-events.md b/.changeset/verifier-trajectory-events.md new file mode 100644 index 0000000000..9dcb5c8192 --- /dev/null +++ b/.changeset/verifier-trajectory-events.md @@ -0,0 +1,5 @@ +--- +"@browserbasehq/stagehand": patch +--- + +Capture verifier trajectory evidence from v3 agent events for offline scoring. diff --git a/packages/core/lib/v3/agent/utils/captureAriaTreeProbe.ts b/packages/core/lib/v3/agent/utils/captureAriaTreeProbe.ts new file mode 100644 index 0000000000..8e3fcc050b --- /dev/null +++ b/packages/core/lib/v3/agent/utils/captureAriaTreeProbe.ts @@ -0,0 +1,75 @@ +/** + * captureAriaTreeProbe — capture a truncated accessibility tree of the active + * page for use as tier-2 evidence in the trajectory recorder. + * + * Shared by v3AgentHandler and v3CuaAgentHandler. Listener-gated by the + * callers so ordinary agent runs (no TrajectoryRecorder attached) don't pay + * the cost. + * + * The a11y tree is the same payload the agent's `ariaTree` tool sees, but + * captured by the harness (not the agent) so the verifier has independent + * textual ground truth for grounding non-visual claims — prices, names, + * dates, list contents — without OCR'ing screenshots. + * + * Budget: defaults to ~8000 tokens (32k chars). Per-step a11y captures + * across a ~30-step trajectory at that cap sum to ~240k tokens total, + * which the verifier handles via per-criterion top-K selection. The cap + * is configurable via VERIFIER_ARIATREE_TOKEN_BUDGET so consumers can + * trade RAM/disk for fidelity. Truncated content is marked explicitly so + * the verifier knows it was clipped. + */ +import type { V3 } from "../../v3.js"; + +const APPROX_CHARS_PER_TOKEN = 4; +const DEFAULT_TOKEN_BUDGET = 8_000; +const DEFAULT_TIMEOUT_MS = 5_000; + +interface CaptureAriaTreeOptions { + /** Soft cap on token count (chars/4 approximation). Default 8000. */ + tokenBudget?: number; + /** Hard timeout on the capture. Default 5s. */ + timeoutMs?: number; +} + +/** + * Returns the truncated a11y tree as a plain string, or undefined when + * capture fails. Never throws — a11y capture is best-effort tier-2 evidence, + * not a hard requirement, so failures are silently absorbed (the verifier + * surfaces this via evidence_insufficient). + */ +export async function captureAriaTreeProbe( + v3: V3, + opts: CaptureAriaTreeOptions = {}, +): Promise { + const envBudget = parseInt( + process.env.VERIFIER_ARIATREE_TOKEN_BUDGET ?? "", + 10, + ); + const tokenBudget = + opts.tokenBudget ?? + (Number.isFinite(envBudget) && envBudget > 0 + ? envBudget + : DEFAULT_TOKEN_BUDGET); + const timeoutMs = opts.timeoutMs ?? DEFAULT_TIMEOUT_MS; + const maxChars = tokenBudget * APPROX_CHARS_PER_TOKEN; + + try { + // v3.extract() without a schema returns { pageText } where pageText is the + // rendered accessibility tree — same path the agent's ariaTree tool uses. + const result = (await v3.extract({ timeout: timeoutMs })) as { + pageText?: string; + }; + const pageText = result?.pageText; + if (typeof pageText !== "string" || pageText.length === 0) return undefined; + + if (pageText.length > maxChars) { + return ( + pageText.slice(0, maxChars) + + `\n\n[CONTENT TRUNCATED at ~${tokenBudget} tokens — set VERIFIER_ARIATREE_TOKEN_BUDGET to raise]` + ); + } + return pageText; + } catch { + return undefined; + } +} diff --git a/packages/core/lib/v3/handlers/v3AgentHandler.ts b/packages/core/lib/v3/handlers/v3AgentHandler.ts index cff08c8a28..fc2761d902 100644 --- a/packages/core/lib/v3/handlers/v3AgentHandler.ts +++ b/packages/core/lib/v3/handlers/v3AgentHandler.ts @@ -41,6 +41,7 @@ import { AgentAbortError, } from "../types/public/sdkErrors.js"; import { handleDoneToolCall } from "../agent/utils/handleDoneToolCall.js"; +import { captureAriaTreeProbe } from "../agent/utils/captureAriaTreeProbe.js"; import { CaptchaSolver, CAPTCHA_SOLVED_MSG, @@ -248,6 +249,10 @@ export class V3AgentHandler { | GenerateTextOnStepFinishCallback | StreamTextOnStepFinishCallback, ) { + // Monotonic step counter scoped to this execute() call. Each tool call in + // the agent loop becomes one trajectory step. The counter feeds stepIndex + // on the bus events the TrajectoryRecorder subscribes to. + let stepCounter = 0; return async (event: StepResult) => { this.logger({ category: "agent", @@ -255,6 +260,11 @@ export class V3AgentHandler { level: 2, }); + const stepIndicesInTurn: number[] = []; + let lastFinalAnswer: + | { message: string; output?: Record } + | undefined; + if (event.toolCalls && event.toolCalls.length > 0) { for (let i = 0; i < event.toolCalls.length; i++) { const toolCall = event.toolCalls[i]; @@ -279,6 +289,13 @@ export class V3AgentHandler { ? `${allReasoning} ${doneReasoning}`.trim() : allReasoning || "Task completed successfully"; } + lastFinalAnswer = { + message: state.finalMessage, + output: + typeof args?.output === "object" && args?.output !== null + ? (args.output as Record) + : undefined, + }; } const mappedActions = mapToolResultToActions({ toolCallName: toolCall.toolName, @@ -292,8 +309,100 @@ export class V3AgentHandler { action.timestamp = Date.now(); state.actions.push(action); } + + // Emit step_finished_event per tool call. The TrajectoryRecorder + // builds one Trajectory.Step per emission. tier-1 evidence (the + // bytes the LLM consumed) is captured separately via an + // onStepFinish wrapper in the harness. + const stepIndex = stepCounter++; + stepIndicesInTurn.push(stepIndex); + const toolOk = + !toolResult || + (typeof toolResult === "object" && + !("error" in toolResult) && + !("isError" in toolResult && toolResult.isError)); + this.v3.bus.emit("agent_step_finished_event", { + stepIndex, + actionName: toolCall.toolName, + actionArgs: + typeof args === "object" && args !== null + ? (args as Record) + : {}, + reasoning: event.text ?? "", + toolOutput: { + ok: toolOk, + result: toolResult, + error: + toolResult && + typeof toolResult === "object" && + "error" in toolResult && + typeof (toolResult as { error?: unknown }).error === "string" + ? (toolResult as { error: string }).error + : undefined, + }, + finishedAt: new Date().toISOString(), + }); } state.currentPageUrl = (await this.v3.context.awaitActivePage()).url(); + + // Harness probe — take a single screenshot / a11y snapshot per AI SDK + // step and attach it to every tool call in that turn. The observation + // reflects the settled page state after the batch of tool calls; this + // is more faithful than dropping probe evidence for all but the last + // tool call, while still avoiding per-tool screenshot overhead. + const wantsScreenshotProbe = + this.v3.bus.listenerCount?.("agent_screenshot_taken_event") > 0; + const wantsStepObservation = + this.v3.bus.listenerCount?.("agent_step_observed_event") > 0; + if ( + stepIndicesInTurn.length > 0 && + (wantsScreenshotProbe || wantsStepObservation) + ) { + try { + const page = await this.v3.context.awaitActivePage(); + let screenshot: Buffer | undefined; + if (wantsScreenshotProbe) { + screenshot = await page.screenshot({ fullPage: false }); + } + let ariaTree: string | undefined; + if (wantsStepObservation) { + // Capture the a11y tree alongside the URL probe so the verifier + // can ground textual claims (prices, names, dates) without OCR. + // Best-effort: returns undefined on failure/timeout. + ariaTree = await captureAriaTreeProbe(this.v3); + } + for (const stepIndex of stepIndicesInTurn) { + if (screenshot) { + // DOM/hybrid: this post-step screenshot is a harness probe + // only. The agent's tier-1 evidence is the tool's return value + // captured separately in agent_step_finished_event. + this.v3.bus.emit("agent_screenshot_taken_event", { + stepIndex, + screenshot, + url: state.currentPageUrl, + evidenceRole: "probe", + }); + } + if (wantsStepObservation) { + this.v3.bus.emit("agent_step_observed_event", { + stepIndex, + url: state.currentPageUrl, + ariaTree, + }); + } + } + } catch (e) { + this.logger({ + category: "agent", + message: `Warning: harness probe failed: ${getErrorMessage(e)}`, + level: 1, + }); + } + } + } + + if (lastFinalAnswer) { + this.v3.bus.emit("agent_final_answer_event", lastFinalAnswer); } if (userCallback) { diff --git a/packages/core/lib/v3/handlers/v3CuaAgentHandler.ts b/packages/core/lib/v3/handlers/v3CuaAgentHandler.ts index af3a3dad87..bc1d6d5fd4 100644 --- a/packages/core/lib/v3/handlers/v3CuaAgentHandler.ts +++ b/packages/core/lib/v3/handlers/v3CuaAgentHandler.ts @@ -7,6 +7,7 @@ import { GoogleCUAClient } from "../agent/GoogleCUAClient.js"; import { OpenAICUAClient } from "../agent/OpenAICUAClient.js"; import { mapKeyToPlaywright } from "../agent/utils/cuaKeyMapping.js"; import { ensureXPath } from "../agent/utils/xpath.js"; +import { captureAriaTreeProbe } from "../agent/utils/captureAriaTreeProbe.js"; import { ActionExecutionResult, AgentAction, @@ -16,6 +17,7 @@ import { SafetyConfirmationHandler, } from "../types/public/agent.js"; import { LogLine } from "../types/public/logs.js"; +import type { AgentScreenshotTakenEvent } from "../types/public/busEvents.js"; import { type Action, V3FunctionName } from "../types/public/methods.js"; import { FlowLogger } from "../flowlogger/FlowLogger.js"; import { toTitleCase } from "../../utils.js"; @@ -37,6 +39,13 @@ export class V3CuaAgentHandler { private captchaSolver: CaptchaSolver | null = null; private captchaClickGuardRemaining = 0; private currentInstruction = ""; + // Monotonic step counter used by bus events. The CUA loop is internal to + // the agent client, so unlike v3AgentHandler we don't have per-tool-call + // step events; instead we tag every screenshot emission with an + // incrementing index. + private cuaStepCounter = 0; + private latestCuaScreenshot?: AgentScreenshotTakenEvent; + private latestCuaScreenshotConsumed = true; constructor( v3: V3, @@ -76,6 +85,17 @@ export class V3CuaAgentHandler { this.ensureNotClosed(); const page = await this.v3.context.awaitActivePage(); const screenshotBuffer = await page.screenshot({ fullPage: false }); + + // Emit bus event so TrajectoryRecorder can capture the screenshot. In + // CUA mode this is the same buffer the provider receives — i.e., it + // serves both as tier-1 evidence (what the model saw) and as a tier-2 + // probe. + try { + this.emitCuaScreenshot(screenshotBuffer, page.url()); + } catch { + // bus emit errors are non-fatal + } + return screenshotBuffer.toString("base64"); // base64 png }); @@ -120,6 +140,7 @@ export class V3CuaAgentHandler { (this.options.clientOptions?.waitBetweenActions as number) || defaultDelay; try { + let executionResult: ActionExecutionResult | undefined; // Try to inject cursor before each action if enabled if (this.highlightCursor) { try { @@ -133,7 +154,7 @@ export class V3CuaAgentHandler { // takes its own screenshot via screenshotProvider between API turns. const shouldLog = action.type !== "screenshot"; if (shouldLog) { - await FlowLogger.runWithLogging( + executionResult = await FlowLogger.runWithLogging( { eventType: `V3Cua${toTitleCase(action.type)}`, // e.g. "V3CuaClick" data: { @@ -145,10 +166,13 @@ export class V3CuaAgentHandler { [action], ); } else { - await this.executeAction(action); + executionResult = await this.executeAction(action); } action.timestamp = Date.now(); + if (shouldLog) { + await this.emitCuaActionStep(action, executionResult); + } await new Promise((r) => setTimeout(r, waitBetween)); } catch (error) { @@ -658,6 +682,15 @@ export class V3CuaAgentHandler { const screenshotBuffer = await page.screenshot({ fullPage: false }); const currentUrl = page.url(); + + // Mirror the screenshot to the bus — same buffer the CUA client + // received, so it serves as both tier-1 evidence and tier-2 probe. + try { + this.emitCuaScreenshot(screenshotBuffer, currentUrl); + } catch { + // non-fatal + } + return await this.agentClient.captureScreenshot({ base64Image: screenshotBuffer.toString("base64"), currentUrl, @@ -767,6 +800,129 @@ export class V3CuaAgentHandler { } } + /** + * Emit a pre-action CUA screenshot — the exact buffer the model received + * as input. Tier-1 evidence (agent-mirrored); the tier-2 probe is taken + * separately in emitCuaActionStep after the action runs, so the recorder + * can compare what the model saw against what the page actually showed + * once the keystrokes/clicks landed. + */ + private emitCuaScreenshot( + screenshot: Buffer, + url: string, + ): AgentScreenshotTakenEvent { + const event: AgentScreenshotTakenEvent = { + stepIndex: this.cuaStepCounter++, + screenshot, + url, + evidenceRole: "agent", + }; + this.latestCuaScreenshot = event; + this.latestCuaScreenshotConsumed = false; + this.v3.bus.emit("agent_screenshot_taken_event", event); + return event; + } + + private async emitCuaActionStep( + action: AgentAction, + result: ActionExecutionResult | undefined, + ): Promise { + let pageUrl = + typeof action.pageUrl === "string" + ? action.pageUrl + : this.latestCuaScreenshot?.url; + try { + pageUrl = (await this.v3.context.awaitActivePage()).url(); + } catch { + // Keep the best pre-action URL fallback. + } + let stepIndex: number; + + if (this.latestCuaScreenshot && !this.latestCuaScreenshotConsumed) { + stepIndex = this.latestCuaScreenshot.stepIndex; + this.latestCuaScreenshotConsumed = true; + } else if (this.latestCuaScreenshot) { + stepIndex = this.cuaStepCounter++; + this.v3.bus.emit("agent_screenshot_taken_event", { + ...this.latestCuaScreenshot, + stepIndex, + }); + } else { + stepIndex = this.cuaStepCounter++; + } + + const actionArgs = Object.fromEntries( + Object.entries(action).filter(([key]) => key !== "screenshot"), + ); + const reasoning = + typeof action.reasoning === "string" + ? action.reasoning + : typeof action.action === "string" + ? action.action + : ""; + + this.v3.bus.emit("agent_step_finished_event", { + stepIndex, + actionName: String(action.type), + actionArgs, + reasoning, + toolOutput: { + ok: result?.success !== false, + result: result ?? { success: true }, + error: result?.error, + }, + finishedAt: new Date().toISOString(), + }); + + // Post-action tier-2 probe. The pre-action screenshot from + // screenshotProvider is what the model SAW; this one shows what the + // page actually LOOKS LIKE after the action ran. Without this the + // verifier has no visual evidence that keystrokes/clicks landed, and + // has to trust the action history alone. + // + // Listener-gated to keep ordinary agent runs free of the extra + // screenshot cost — mirrors v3AgentHandler's post-step probe. + const wantsScreenshotProbe = + this.v3.bus.listenerCount?.("agent_screenshot_taken_event") > 0; + const wantsStepObservation = + this.v3.bus.listenerCount?.("agent_step_observed_event") > 0; + let probeUrl = pageUrl; + if (wantsScreenshotProbe || wantsStepObservation) { + try { + const page = await this.v3.context.awaitActivePage(); + probeUrl = page.url(); + if (wantsScreenshotProbe) { + const probeScreenshot = await page.screenshot({ fullPage: false }); + this.v3.bus.emit("agent_screenshot_taken_event", { + stepIndex, + screenshot: probeScreenshot, + url: probeUrl, + evidenceRole: "probe", + }); + } + } catch (e) { + this.logger({ + category: "agent", + message: `Warning: CUA post-action probe failed: ${ + e instanceof Error ? e.message : String(e) + }`, + level: 1, + }); + } + } + + if (probeUrl && wantsStepObservation) { + // Capture the a11y tree alongside the URL probe so the verifier can + // ground textual claims without OCR. Best-effort. + const ariaTree = await captureAriaTreeProbe(this.v3); + this.v3.bus.emit("agent_step_observed_event", { + stepIndex, + url: probeUrl, + ariaTree, + }); + } + } + private async injectCursor(): Promise { try { const page = await this.v3.context.awaitActivePage(); diff --git a/packages/core/lib/v3/index.ts b/packages/core/lib/v3/index.ts index 8e21fb0309..a5cbccf746 100644 --- a/packages/core/lib/v3/index.ts +++ b/packages/core/lib/v3/index.ts @@ -28,6 +28,8 @@ import { loadTrajectoryFromDisk, nextResultFilename, normalizeRubric, + shouldPersistTrajectory, + writeTrajectoryDir, } from "./verifier/index.js"; export { V3 } from "./v3.js"; @@ -93,6 +95,8 @@ export { loadTrajectoryFromDisk, nextResultFilename, normalizeRubric, + shouldPersistTrajectory, + writeTrajectoryDir, } from "./verifier/index.js"; export { tool } from "ai"; export { getAISDKLanguageModel } from "./llm/LLMProvider.js"; @@ -147,6 +151,8 @@ const StagehandDefault = { loadTrajectoryFromDisk, nextResultFilename, normalizeRubric, + shouldPersistTrajectory, + writeTrajectoryDir, tool, getAISDKLanguageModel, __internalCreateInMemoryAgentCacheHandle, diff --git a/packages/core/lib/v3/types/public/busEvents.ts b/packages/core/lib/v3/types/public/busEvents.ts new file mode 100644 index 0000000000..e2fa119499 --- /dev/null +++ b/packages/core/lib/v3/types/public/busEvents.ts @@ -0,0 +1,99 @@ +/** + * Bus event payloads emitted by V3 on `v3.bus`. + * + * The bus is an EventEmitter; these types document the payload shape per + * event name so consumers (TrajectoryRecorder in packages/evals, custom + * subscribers) can type their handlers. + * + * The verifier recorder consumes these events to assemble persisted + * trajectories without coupling to individual agent handlers. + */ + +/** + * Names of bus events the agent handlers emit. Use these constants to + * subscribe; the bus accepts arbitrary strings, but a centralized list helps + * catch typos at the call site. + */ +export const BUS_EVENTS = { + AGENT_SCREENSHOT_TAKEN: "agent_screenshot_taken_event", + AGENT_STEP_FINISHED: "agent_step_finished_event", + AGENT_STEP_OBSERVED: "agent_step_observed_event", + AGENT_FINAL_ANSWER: "agent_final_answer_event", +} as const; + +export type BusEventName = (typeof BUS_EVENTS)[keyof typeof BUS_EVENTS]; + +/** + * Payload for `agent_screenshot_taken_event`. The raw screenshot Buffer the + * harness took after a step's tool execution. + * + * Note: in CUA mode the same Buffer is also what the provider received; in + * DOM/hybrid mode it's an independent harness probe. The verifier treats them + * as different evidence tiers regardless. + */ +export interface AgentScreenshotTakenEvent { + /** Zero-based index of the step this screenshot corresponds to. */ + stepIndex: number; + /** PNG bytes from page.screenshot(). */ + screenshot: Buffer; + /** Page URL at the time of capture. */ + url: string; + /** + * Evidence role for this screenshot. + * + * DOM/hybrid post-tool screenshots are probe-only. CUA screenshots are also + * the exact image bytes sent to the provider, so they serve both as tier-1 + * agent evidence and tier-2 probe evidence. + */ + evidenceRole?: "probe" | "agent" | "agent_and_probe"; +} + +/** + * Payload for `agent_step_finished_event`. Emitted once per tool call within + * a step result. Carries the tool's reported outcome and a reference to the + * agent's textual reasoning for the step. + * + * Tier 1 evidence (the bytes the LLM consumed as the tool result) is captured + * separately by the harness via an AgentExecuteCallbacks.onStepFinish wrapper + * and is not part of this payload. + */ +export interface AgentStepFinishedEvent { + stepIndex: number; + /** Name of the tool that ran (e.g., "act", "extract", "click"). */ + actionName: string; + /** Arguments passed to the tool. */ + actionArgs: Record; + /** Agent's textual reasoning (event.text on the AI SDK StepResult). */ + reasoning: string; + /** Outcome of the tool execution as seen by the harness. */ + toolOutput: { + ok: boolean; + /** The tool's native return value. */ + result: unknown; + error?: string; + }; + /** ISO 8601 timestamp at which the step finished. */ + finishedAt: string; +} + +/** + * Payload for `agent_step_observed_event`. Emitted after the harness probe + * completes for a step. + */ +export interface AgentStepObservedEvent { + stepIndex: number; + /** Page URL after the step's tool execution. */ + url: string; + /** v1 — accessibility tree snapshot. */ + ariaTree?: string; + /** v1 — viewport scroll context. */ + scroll?: { top: number; pageHeight: number }; +} + +/** Payload for `agent_final_answer_event`. Emitted when the `done` tool resolves. */ +export interface AgentFinalAnswerEvent { + /** The agent's final summary message. */ + message: string; + /** Optional structured output if the agent's `output` schema was set. */ + output?: Record; +} diff --git a/packages/core/lib/v3/types/public/index.ts b/packages/core/lib/v3/types/public/index.ts index 9c5df08d01..9bf24eb271 100644 --- a/packages/core/lib/v3/types/public/index.ts +++ b/packages/core/lib/v3/types/public/index.ts @@ -1,4 +1,5 @@ export * from "./agent.js"; +export * from "./busEvents.js"; // Export api.ts under namespace to avoid conflicts with methods.ts types export * as Api from "./api.js"; // Also export BrowserbaseRegion directly for convenience diff --git a/packages/core/lib/v3/verifier/index.ts b/packages/core/lib/v3/verifier/index.ts index 4061533ab9..2b14cfb16a 100644 --- a/packages/core/lib/v3/verifier/index.ts +++ b/packages/core/lib/v3/verifier/index.ts @@ -25,4 +25,6 @@ export { loadTrajectoryFromDisk, nextResultFilename, normalizeRubric, + shouldPersistTrajectory, + writeTrajectoryDir, } from "./trajectory.js"; diff --git a/packages/core/lib/v3/verifier/trajectory.ts b/packages/core/lib/v3/verifier/trajectory.ts index a18f025c37..ec602d04d0 100644 --- a/packages/core/lib/v3/verifier/trajectory.ts +++ b/packages/core/lib/v3/verifier/trajectory.ts @@ -1,3 +1,5 @@ +import fs from "node:fs/promises"; +import path from "node:path"; import type { AgentEvidenceModality, ProbeEvidence, @@ -187,3 +189,138 @@ export async function loadTrajectoryFromDisk(dir: string): Promise { export function nextResultFilename(label?: string): string { return `result_${normalizeResultLabel(label)}.json`; } + +/** + * Default persistence policy: explicit override, then env, then "on unless CI". + */ +export function shouldPersistTrajectory( + override: boolean | undefined, +): boolean { + if (override !== undefined) return override; + const env = process.env.VERIFIER_PERSIST_TRAJECTORIES?.toLowerCase(); + if (env === "1" || env === "true") return true; + if (env === "0" || env === "false") return false; + return !process.env.CI; +} + +/** + * Write the on-disk trajectory layout under `dir`: + * + * / + * ├── task_data.json + * ├── trajectory.json (screenshots referenced by path) + * ├── screenshots/ + * │ ├── probe/.png + * │ └── agent/[_M].png + * ├── times.json + * ├── scores/ (empty; populated separately) + * └── core.log + * + * Image bytes are externalized to PNG files; the in-memory Trajectory is left + * untouched so callers can keep using it after persistence. + */ +export async function writeTrajectoryDir( + dir: string, + trajectory: Trajectory, +): Promise { + await fs.mkdir(dir, { recursive: true }); + await fs.mkdir(path.join(dir, "screenshots", "probe"), { recursive: true }); + await fs.mkdir(path.join(dir, "screenshots", "agent"), { recursive: true }); + + const serializableSteps: unknown[] = []; + for (const step of trajectory.steps) { + const probe: ProbeEvidence = { ...step.probeEvidence }; + if (probe.screenshot) { + const relPath = `screenshots/probe/${step.index + 1}.png`; + await fs.writeFile(path.join(dir, relPath), probe.screenshot); + probe.screenshotPath = relPath; + delete probe.screenshot; + } + + const imageModalities = step.agentEvidence.modalities.filter( + (m) => m.type === "image", + ); + const multipleImages = imageModalities.length > 1; + let imageSeq = 0; + const modalities: unknown[] = []; + for (const m of step.agentEvidence.modalities) { + if (m.type !== "image") { + modalities.push(m); + continue; + } + const suffix = multipleImages ? `_${imageSeq}` : ""; + const relPath = `screenshots/agent/${step.index + 1}${suffix}.png`; + await fs.writeFile(path.join(dir, relPath), m.bytes); + modalities.push({ + type: "image", + imagePath: relPath, + mediaType: m.mediaType, + }); + imageSeq += 1; + } + serializableSteps.push({ + ...step, + probeEvidence: probe, + agentEvidence: { modalities }, + }); + } + + // Image modalities carry imagePath instead of raw bytes on disk; cast + // through unknown rather than widen Trajectory's type contract. + const serialized = { + ...trajectory, + steps: serializableSteps, + } as unknown; + + await fs.writeFile( + path.join(dir, "trajectory.json"), + JSON.stringify(serialized, null, 2), + ); + + await fs.writeFile( + path.join(dir, "task_data.json"), + JSON.stringify( + { + task: trajectory.task, + status: trajectory.status, + finalAnswer: trajectory.finalAnswer ?? null, + }, + null, + 2, + ), + ); + + await fs.writeFile( + path.join(dir, "times.json"), + JSON.stringify( + { + timing: trajectory.timing, + usage: trajectory.usage, + stepCount: trajectory.steps.length, + }, + null, + 2, + ), + ); + + await fs.mkdir(path.join(dir, "scores"), { recursive: true }); + await fs.writeFile(path.join(dir, "core.log"), coreLog(trajectory)); +} + +function coreLog(trajectory: Trajectory): string { + return ( + trajectory.steps + .map((step) => + JSON.stringify({ + step: step.index, + action: step.actionName, + url: step.probeEvidence.url ?? null, + ok: step.toolOutput.ok, + reasoning: step.reasoning || undefined, + startedAt: step.startedAt, + finishedAt: step.finishedAt, + }), + ) + .join("\n") + "\n" + ); +} diff --git a/packages/core/tests/unit/agent-temperature.test.ts b/packages/core/tests/unit/agent-temperature.test.ts index 45184a9888..8f12b4a7e6 100644 --- a/packages/core/tests/unit/agent-temperature.test.ts +++ b/packages/core/tests/unit/agent-temperature.test.ts @@ -125,6 +125,12 @@ function createV3() { context: { awaitActivePage: vi.fn(async () => page), }, + bus: { + emit: vi.fn(), + on: vi.fn(), + off: vi.fn(), + listenerCount: vi.fn(() => 0), + }, isCaptchaAutoSolveEnabled: false, browserbaseApiKey: undefined, logger: vi.fn(), diff --git a/packages/core/tests/unit/public-api/export-surface.test.ts b/packages/core/tests/unit/public-api/export-surface.test.ts index e73cde4178..fe4003f138 100644 --- a/packages/core/tests/unit/public-api/export-surface.test.ts +++ b/packages/core/tests/unit/public-api/export-surface.test.ts @@ -21,6 +21,7 @@ const publicApiShape = { AISdkClient: Stagehand.AISdkClient, Api: Stagehand.Api, AVAILABLE_CUA_MODELS: Stagehand.AVAILABLE_CUA_MODELS, + BUS_EVENTS: Stagehand.BUS_EVENTS, AgentProvider: Stagehand.AgentProvider, AnnotatedScreenshotText: Stagehand.AnnotatedScreenshotText, ConsoleMessage: Stagehand.ConsoleMessage, @@ -50,12 +51,14 @@ const publicApiShape = { normalizeRubric: Stagehand.normalizeRubric, pageTextSchema: Stagehand.pageTextSchema, providerEnvVarMap: Stagehand.providerEnvVarMap, + shouldPersistTrajectory: Stagehand.shouldPersistTrajectory, toGeminiSchema: Stagehand.toGeminiSchema, toJsonSchema: Stagehand.toJsonSchema, tool: Stagehand.tool, transformSchema: Stagehand.transformSchema, trimTrailingTextNode: Stagehand.trimTrailingTextNode, validateZodSchema: Stagehand.validateZodSchema, + writeTrajectoryDir: Stagehand.writeTrajectoryDir, ...publicErrorTypes, } as const; diff --git a/packages/evals/framework/trajectoryRecorder.ts b/packages/evals/framework/trajectoryRecorder.ts new file mode 100644 index 0000000000..3cf1c17621 --- /dev/null +++ b/packages/evals/framework/trajectoryRecorder.ts @@ -0,0 +1,331 @@ +import fs from "node:fs/promises"; +import path from "node:path"; +import { + shouldPersistTrajectory, + writeTrajectoryDir, +} from "@browserbasehq/stagehand"; +import type { + AgentEvidence, + AgentFinalAnswerEvent, + AgentScreenshotTakenEvent, + AgentStepFinishedEvent, + AgentStepObservedEvent, + ProbeEvidence, + TaskSpec, + Trajectory, + TrajectoryStatus, + TrajectoryStep, + TrajectoryUsage, + EvaluationResult, + V3, +} from "@browserbasehq/stagehand"; + +interface PartialStep { + index: number; + actionName: string; + actionArgs: Record; + reasoning: string; + agentEvidence: AgentEvidence; + probeEvidence: ProbeEvidence; + toolOutput: { ok: boolean; result: unknown; error?: string }; + finishedAt: string; +} + +export interface TrajectoryRecorderOptions { + v3: V3; + taskSpec: TaskSpec; + /** + * Root directory under which trajectory dirs are written. Each task run + * gets a subdirectory named by runId/task.id. + * Defaults to `/.trajectories`. + */ + outputRoot?: string; + /** Run identifier (e.g., ISO timestamp + env). Defaults to a fresh timestamp. */ + runId?: string; + /** + * Override the env-gated persistence default. `true` always persists, + * `false` never does, `undefined` defers to VERIFIER_PERSIST_TRAJECTORIES. + */ + persist?: boolean; +} + +export interface TrajectoryFinishOptions { + status: TrajectoryStatus; + finalAnswer?: string; + usage?: Partial; +} + +const ZERO_USAGE: TrajectoryUsage = { + input_tokens: 0, + output_tokens: 0, +}; + +export class TrajectoryRecorder { + private readonly v3: V3; + private readonly taskSpec: TaskSpec; + private readonly runId: string; + private readonly outputDir: string; + private readonly persistEnabled: boolean; + + // Events can arrive out-of-order across step indices; same-step events all + // fire in one microtask. + private readonly partialSteps = new Map>(); + private readonly observationByStep = new Map< + number, + AgentStepObservedEvent + >(); + private readonly screenshotsByStep = new Map< + number, + AgentScreenshotTakenEvent + >(); + private finalAnswerEvent?: AgentFinalAnswerEvent; + private startedAt = ""; + private endedAt = ""; + private listenersAttached = false; + + // Bound handlers so attach/detach refer to the same references. + private readonly onScreenshot = (e: AgentScreenshotTakenEvent) => { + this.screenshotsByStep.set(e.stepIndex, e); + const partial = this.ensurePartial(e.stepIndex); + + // Default to probe when the emit site doesn't tag a role: matches + // v3AgentHandler's post-step screenshot. For CUA the pre-action shot is + // NOT a probe — emitCuaActionStep fills that role post-action. + const role = e.evidenceRole ?? "probe"; + + if (role === "probe" || role === "agent_and_probe") { + const probe: ProbeEvidence = { ...(partial.probeEvidence ?? {}) }; + probe.screenshot = e.screenshot; + probe.url = e.url; + partial.probeEvidence = probe; + } else if (!partial.probeEvidence?.url) { + // Capture URL even for tier-1-only events; a later post-action URL + // can still overwrite it. + partial.probeEvidence = { + ...(partial.probeEvidence ?? {}), + url: e.url, + }; + } + + if (role === "agent" || role === "agent_and_probe") { + partial.agentEvidence = mergeAgentEvidence(partial.agentEvidence, { + modalities: [ + { type: "image", bytes: e.screenshot, mediaType: "image/png" }, + ], + }); + } + }; + private readonly onStepFinished = (e: AgentStepFinishedEvent) => { + const partial = this.ensurePartial(e.stepIndex); + partial.actionName = e.actionName; + partial.actionArgs = e.actionArgs; + partial.reasoning = e.reasoning; + partial.toolOutput = e.toolOutput; + partial.finishedAt = e.finishedAt; + partial.agentEvidence = mergeAgentEvidence( + partial.agentEvidence, + buildAgentEvidence(e), + ); + }; + private readonly onStepObserved = (e: AgentStepObservedEvent) => { + this.observationByStep.set(e.stepIndex, e); + const partial = this.ensurePartial(e.stepIndex); + const probe: ProbeEvidence = { ...(partial.probeEvidence ?? {}) }; + probe.url = e.url; + if (e.ariaTree !== undefined) probe.ariaTree = e.ariaTree; + if (e.scroll !== undefined) probe.scroll = e.scroll; + partial.probeEvidence = probe; + }; + private readonly onFinalAnswer = (e: AgentFinalAnswerEvent) => { + this.finalAnswerEvent = e; + }; + + constructor(opts: TrajectoryRecorderOptions) { + this.v3 = opts.v3; + this.taskSpec = opts.taskSpec; + this.runId = opts.runId ?? new Date().toISOString().replace(/[:.]/g, "-"); + const root = opts.outputRoot ?? path.join(process.cwd(), ".trajectories"); + this.outputDir = path.join(root, this.runId, opts.taskSpec.id); + this.persistEnabled = shouldPersistTrajectory(opts.persist); + } + + /** Subscribe to bus events. Call once before agent.execute(). */ + start(): void { + if (this.listenersAttached) return; + this.startedAt = new Date().toISOString(); + this.v3.bus.on("agent_screenshot_taken_event", this.onScreenshot); + this.v3.bus.on("agent_step_finished_event", this.onStepFinished); + this.v3.bus.on("agent_step_observed_event", this.onStepObserved); + this.v3.bus.on("agent_final_answer_event", this.onFinalAnswer); + this.listenersAttached = true; + } + + /** + * Detach listeners, assemble the Trajectory, and (if persistence is on) + * write the on-disk layout. Idempotent. + */ + async finish(opts: TrajectoryFinishOptions): Promise { + this.detach(); + this.endedAt = new Date().toISOString(); + + const steps = this.assembleSteps(); + const trajectory: Trajectory = { + task: this.taskSpec, + steps, + finalAnswer: opts.finalAnswer ?? this.finalAnswerEvent?.message, + status: opts.status, + usage: { ...ZERO_USAGE, ...(opts.usage ?? {}) }, + timing: { startedAt: this.startedAt, endedAt: this.endedAt }, + }; + + if (this.persistEnabled) { + await writeTrajectoryDir(this.outputDir, trajectory); + } + + return trajectory; + } + + /** Throw away in-memory state without writing to disk. Used on early abort. */ + cancel(): void { + this.detach(); + this.partialSteps.clear(); + this.observationByStep.clear(); + this.screenshotsByStep.clear(); + this.finalAnswerEvent = undefined; + } + + /** Where the trajectory dir lives (whether or not it was persisted). */ + get directory(): string { + return this.outputDir; + } + + /** Whether this recorder wrote the trajectory directory on finish(). */ + get persisted(): boolean { + return this.persistEnabled; + } + + /** + * Persist evaluator result next to the trajectory. No-op when trajectory + * persistence is disabled. + */ + async persistResult( + result: EvaluationResult, + filename = "result.json", + ): Promise { + if (!this.persistEnabled) return; + + const scoresDir = path.join(this.outputDir, "scores"); + await fs.mkdir(scoresDir, { recursive: true }); + await fs.writeFile( + path.join(scoresDir, filename), + JSON.stringify(result, null, 2), + ); + + const taskDataPath = path.join(this.outputDir, "task_data.json"); + let taskData: Record = {}; + try { + taskData = JSON.parse(await fs.readFile(taskDataPath, "utf8")) as Record< + string, + unknown + >; + } catch { + taskData = { task: this.taskSpec }; + } + await fs.writeFile( + taskDataPath, + JSON.stringify({ ...taskData, result }, null, 2), + ); + } + + private detach(): void { + if (!this.listenersAttached) return; + this.v3.bus.off("agent_screenshot_taken_event", this.onScreenshot); + this.v3.bus.off("agent_step_finished_event", this.onStepFinished); + this.v3.bus.off("agent_step_observed_event", this.onStepObserved); + this.v3.bus.off("agent_final_answer_event", this.onFinalAnswer); + this.listenersAttached = false; + } + + private ensurePartial(stepIndex: number): Partial { + let p = this.partialSteps.get(stepIndex); + if (!p) { + p = { index: stepIndex }; + this.partialSteps.set(stepIndex, p); + } + return p; + } + + private assembleSteps(): TrajectoryStep[] { + const out: TrajectoryStep[] = []; + const indices = [...this.partialSteps.keys()].sort((a, b) => a - b); + for (const i of indices) { + const p = this.partialSteps.get(i)!; + if ( + p.actionName === undefined || + p.toolOutput === undefined || + p.finishedAt === undefined + ) { + // CUA emits screenshot-only entries between actions; skip them here + // and let writeTrajectoryDir record them via the probe channel. + continue; + } + out.push({ + index: i, + actionName: p.actionName, + actionArgs: p.actionArgs ?? {}, + reasoning: p.reasoning ?? "", + agentEvidence: p.agentEvidence ?? { modalities: [] }, + probeEvidence: p.probeEvidence ?? {}, + toolOutput: p.toolOutput, + startedAt: this.startedAt, + finishedAt: p.finishedAt, + }); + } + return out; + } +} + +function mergeAgentEvidence( + ...parts: Array +): AgentEvidence { + return { + modalities: parts.flatMap((p) => p?.modalities ?? []), + }; +} + +function buildAgentEvidence(e: AgentStepFinishedEvent): AgentEvidence { + const modalities: AgentEvidence["modalities"] = []; + if (e.reasoning) { + modalities.push({ type: "text", content: e.reasoning }); + } + const result = e.toolOutput.result; + if (result === undefined || result === null) { + return { modalities }; + } + if (typeof result === "string") { + modalities.push({ type: "text", content: result }); + } else if (Buffer.isBuffer(result)) { + modalities.push({ + type: "image", + bytes: result, + mediaType: "image/png", + }); + } else if (typeof result === "object") { + // Vision tools embed a screenshotBase64 alongside the JSON result; lift + // it to its own image modality so the verifier sees both. + const r = result as { screenshotBase64?: string } & Record; + if (typeof r.screenshotBase64 === "string") { + try { + modalities.push({ + type: "image", + bytes: Buffer.from(r.screenshotBase64, "base64"), + mediaType: "image/png", + }); + } catch { + // Malformed base64; skip the image and keep the JSON modality. + } + } + modalities.push({ type: "json", content: result }); + } + return { modalities }; +} diff --git a/packages/evals/tests/framework/trajectoryRecorder.test.ts b/packages/evals/tests/framework/trajectoryRecorder.test.ts new file mode 100644 index 0000000000..5c5268e66a --- /dev/null +++ b/packages/evals/tests/framework/trajectoryRecorder.test.ts @@ -0,0 +1,197 @@ +import { EventEmitter } from "node:events"; +import fs from "node:fs/promises"; +import os from "node:os"; +import path from "node:path"; + +import { afterEach, describe, expect, it } from "vitest"; +import type { TaskSpec, V3 } from "@browserbasehq/stagehand"; + +import { TrajectoryRecorder } from "../../framework/trajectoryRecorder.js"; + +const tempDirs: string[] = []; + +afterEach(async () => { + while (tempDirs.length > 0) { + const dir = tempDirs.pop(); + if (dir) await fs.rm(dir, { recursive: true, force: true }); + } +}); + +function makeTempDir(): Promise { + return fs + .mkdtemp(path.join(os.tmpdir(), "trajectory-recorder-")) + .then((dir) => { + tempDirs.push(dir); + return dir; + }); +} + +function makeV3(bus = new EventEmitter()): V3 { + return { bus } as unknown as V3; +} + +function makeTaskSpec(): TaskSpec { + return { + id: "recorder-task", + instruction: "Compare economy and business fares.", + initUrl: "https://example.com", + precomputedRubric: { + items: [ + { + criterion: "Report fare delta", + description: "Report the difference between two fares.", + maxPoints: 1, + }, + ], + }, + }; +} + +describe("TrajectoryRecorder", () => { + it("assembles trajectory evidence from bus events", async () => { + const bus = new EventEmitter(); + const recorder = new TrajectoryRecorder({ + v3: makeV3(bus), + taskSpec: makeTaskSpec(), + persist: false, + }); + const screenshot = Buffer.from("screen-1"); + + recorder.start(); + bus.emit("agent_screenshot_taken_event", { + stepIndex: 0, + screenshot, + url: "https://example.com/search", + evidenceRole: "agent_and_probe", + }); + bus.emit("agent_step_finished_event", { + stepIndex: 0, + actionName: "extract", + actionArgs: { instruction: "Read fares" }, + reasoning: "Read visible fare cells.", + toolOutput: { + ok: true, + result: { economy: "$100", business: "$250" }, + }, + finishedAt: new Date(0).toISOString(), + }); + bus.emit("agent_step_observed_event", { + stepIndex: 0, + url: "https://example.com/search", + ariaTree: "RootWebArea\nStaticText: Economy $100", + }); + bus.emit("agent_final_answer_event", { + message: "Business is $150 more than economy.", + }); + + const trajectory = await recorder.finish({ + status: "complete", + usage: { input_tokens: 10, output_tokens: 5 }, + }); + + expect(trajectory.steps).toHaveLength(1); + expect(trajectory.steps[0]).toMatchObject({ + index: 0, + actionName: "extract", + actionArgs: { instruction: "Read fares" }, + reasoning: "Read visible fare cells.", + toolOutput: { + ok: true, + result: { economy: "$100", business: "$250" }, + }, + probeEvidence: { + url: "https://example.com/search", + ariaTree: "RootWebArea\nStaticText: Economy $100", + }, + }); + expect(trajectory.steps[0].probeEvidence.screenshot).toEqual(screenshot); + expect(trajectory.steps[0].agentEvidence.modalities).toEqual( + expect.arrayContaining([ + { type: "image", bytes: screenshot, mediaType: "image/png" }, + { type: "text", content: "Read visible fare cells." }, + { type: "json", content: { economy: "$100", business: "$250" } }, + ]), + ); + expect(trajectory.finalAnswer).toBe("Business is $150 more than economy."); + }); + + it("persists trajectory files and evaluator results", async () => { + const outputRoot = await makeTempDir(); + const bus = new EventEmitter(); + const recorder = new TrajectoryRecorder({ + v3: makeV3(bus), + taskSpec: makeTaskSpec(), + outputRoot, + runId: "run-1", + persist: true, + }); + const screenshot = Buffer.from("screen-1"); + + recorder.start(); + bus.emit("agent_screenshot_taken_event", { + stepIndex: 0, + screenshot, + url: "https://example.com/search", + evidenceRole: "agent_and_probe", + }); + bus.emit("agent_step_finished_event", { + stepIndex: 0, + actionName: "act", + actionArgs: { instruction: "Search fares" }, + reasoning: "Search for fares.", + toolOutput: { ok: true, result: "done" }, + finishedAt: new Date(0).toISOString(), + }); + bus.emit("agent_step_observed_event", { + stepIndex: 0, + url: "https://example.com/search", + }); + + await recorder.finish({ status: "complete" }); + await recorder.persistResult({ + outcomeSuccess: true, + explanation: "The task was completed.", + }); + + const taskDir = path.join(outputRoot, "run-1", "recorder-task"); + await expect(fs.readdir(taskDir)).resolves.toEqual( + expect.arrayContaining([ + "core.log", + "scores", + "screenshots", + "task_data.json", + "times.json", + "trajectory.json", + ]), + ); + await expect( + fs.readFile(path.join(taskDir, "screenshots", "probe", "1.png")), + ).resolves.toEqual(screenshot); + await expect( + fs.readFile(path.join(taskDir, "screenshots", "agent", "1.png")), + ).resolves.toEqual(screenshot); + await expect( + fs.readFile(path.join(taskDir, "scores", "result.json"), "utf8"), + ).resolves.toContain('"outcomeSuccess": true'); + + const trajectory = JSON.parse( + await fs.readFile(path.join(taskDir, "trajectory.json"), "utf8"), + ); + expect(trajectory.steps[0].probeEvidence.screenshotPath).toBe( + "screenshots/probe/1.png", + ); + expect(trajectory.steps[0].agentEvidence.modalities).toContainEqual({ + type: "image", + imagePath: "screenshots/agent/1.png", + mediaType: "image/png", + }); + + const taskData = JSON.parse( + await fs.readFile(path.join(taskDir, "task_data.json"), "utf8"), + ); + expect(taskData.result).toMatchObject({ + outcomeSuccess: true, + explanation: "The task was completed.", + }); + }); +});