Skip to content
5 changes: 5 additions & 0 deletions .changeset/verifier-trajectory-events.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
---
"@browserbasehq/stagehand": patch
---

Capture verifier trajectory evidence from v3 agent events for offline scoring.
75 changes: 75 additions & 0 deletions packages/core/lib/v3/agent/utils/captureAriaTreeProbe.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,75 @@
/**
* captureAriaTreeProbe — capture a truncated accessibility tree of the active
* page for use as tier-2 evidence in the trajectory recorder.
*
* Shared by v3AgentHandler and v3CuaAgentHandler. Listener-gated by the
* callers so ordinary agent runs (no TrajectoryRecorder attached) don't pay
* the cost.
*
* The a11y tree is the same payload the agent's `ariaTree` tool sees, but
* captured by the harness (not the agent) so the verifier has independent
* textual ground truth for grounding non-visual claims — prices, names,
* dates, list contents — without OCR'ing screenshots.
*
* Budget: defaults to ~8000 tokens (32k chars). Per-step a11y captures
* across a ~30-step trajectory at that cap sum to ~240k tokens total,
* which the verifier handles via per-criterion top-K selection. The cap
* is configurable via VERIFIER_ARIATREE_TOKEN_BUDGET so consumers can
* trade RAM/disk for fidelity. Truncated content is marked explicitly so
* the verifier knows it was clipped.
*/
import type { V3 } from "../../v3.js";

/** Rough chars-per-token ratio used to turn a token budget into a char cap. */
const APPROX_CHARS_PER_TOKEN = 4;
/** Token budget applied when neither the caller nor the environment sets one. */
const DEFAULT_TOKEN_BUDGET = 8_000;
/** Capture timeout (ms) applied when the caller does not set one. */
const DEFAULT_TIMEOUT_MS = 5_000;
/** Environment variable that overrides the default token budget. */
const TOKEN_BUDGET_ENV_VAR = "VERIFIER_ARIATREE_TOKEN_BUDGET";

interface CaptureAriaTreeOptions {
  /**
   * Soft cap on token count (chars/4 approximation). Default 8000.
   * Values that are not a positive number (NaN, 0, negatives) are ignored
   * and the environment / default budget is used instead.
   */
  tokenBudget?: number;
  /** Hard timeout on the capture. Default 5s. */
  timeoutMs?: number;
}

/** True when `value` is a number that can serve as a budget (positive, not NaN). */
function isUsableBudget(value: unknown): value is number {
  return typeof value === "number" && value > 0;
}

/** Reads the token budget override from the environment, if present and valid. */
function readEnvTokenBudget(): number | undefined {
  // Guard against runtimes without a Node-style `process` global so that the
  // probe keeps its "never throws" contract there too.
  if (typeof process === "undefined" || !process.env) return undefined;
  const parsed = parseInt(process.env[TOKEN_BUDGET_ENV_VAR] ?? "", 10);
  return isUsableBudget(parsed) ? parsed : undefined;
}

/**
 * Returns the first `maxChars` UTF-16 code units of `text`, backing off by one
 * when the cut would land between the two halves of a surrogate pair (which
 * would leave a lone, invalid high surrogate at the end of the output).
 * Callers must pass `1 <= maxChars < text.length`.
 */
function truncateAtCodePointBoundary(text: string, maxChars: number): string {
  const lastUnit = text.charCodeAt(maxChars - 1);
  const endsOnHighSurrogate = lastUnit >= 0xd800 && lastUnit <= 0xdbff;
  return text.slice(0, endsOnHighSurrogate ? maxChars - 1 : maxChars);
}

/**
 * Captures the active page's accessibility tree as a plain string, clipped to
 * a token budget.
 *
 * Budget precedence: `opts.tokenBudget` (when a positive number), then the
 * VERIFIER_ARIATREE_TOKEN_BUDGET environment variable (when it parses to a
 * positive integer), then the built-in default of 8000 tokens. Clipped output
 * ends with an explicit truncation marker so consumers know it was cut.
 *
 * @param v3 - Instance whose `extract()` supplies the page text.
 * @param opts - Optional budget and timeout overrides.
 * @returns The (possibly truncated) tree text, or `undefined` when the capture
 *   fails or yields no text. Never throws — a11y capture is best-effort tier-2
 *   evidence, not a hard requirement, so failures are deliberately absorbed
 *   (the verifier surfaces this via evidence_insufficient).
 */
export async function captureAriaTreeProbe(
  v3: V3,
  opts: CaptureAriaTreeOptions = {},
): Promise<string | undefined> {
  try {
    const requestedBudget = opts?.tokenBudget;
    const tokenBudget = isUsableBudget(requestedBudget)
      ? requestedBudget
      : (readEnvTokenBudget() ?? DEFAULT_TOKEN_BUDGET);
    const timeoutMs = opts?.timeoutMs ?? DEFAULT_TIMEOUT_MS;
    // Always keep at least one character so a tiny fractional budget cannot
    // degrade into a zero-length slice.
    const maxChars = Math.max(
      1,
      Math.floor(tokenBudget * APPROX_CHARS_PER_TOKEN),
    );

    // v3.extract() without a schema returns { pageText } where pageText is the
    // rendered accessibility tree — same path the agent's ariaTree tool uses.
    const result = (await v3.extract({ timeout: timeoutMs })) as {
      pageText?: string;
    };
    const pageText = result?.pageText;
    if (typeof pageText !== "string" || pageText.length === 0) return undefined;

    if (pageText.length <= maxChars) return pageText;

    return (
      truncateAtCodePointBoundary(pageText, maxChars) +
      `\n\n[CONTENT TRUNCATED at ~${tokenBudget} tokens — set VERIFIER_ARIATREE_TOKEN_BUDGET to raise]`
    );
  } catch {
    // Deliberate best-effort: any failure (timeout, closed page, bad input)
    // is reported to the caller as "no evidence" rather than an exception.
    return undefined;
  }
}
109 changes: 109 additions & 0 deletions packages/core/lib/v3/handlers/v3AgentHandler.ts
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,7 @@ import {
AgentAbortError,
} from "../types/public/sdkErrors.js";
import { handleDoneToolCall } from "../agent/utils/handleDoneToolCall.js";
import { captureAriaTreeProbe } from "../agent/utils/captureAriaTreeProbe.js";
import {
CaptchaSolver,
CAPTCHA_SOLVED_MSG,
Expand Down Expand Up @@ -248,13 +249,22 @@ export class V3AgentHandler {
| GenerateTextOnStepFinishCallback<ToolSet>
| StreamTextOnStepFinishCallback<ToolSet>,
) {
// Monotonic step counter scoped to this execute() call. Each tool call in
// the agent loop becomes one trajectory step. The counter feeds stepIndex
// on the bus events the TrajectoryRecorder subscribes to.
let stepCounter = 0;
return async (event: StepResult<ToolSet>) => {
this.logger({
category: "agent",
message: `Step finished: ${event.finishReason}`,
level: 2,
});

const stepIndicesInTurn: number[] = [];
let lastFinalAnswer:
| { message: string; output?: Record<string, unknown> }
| undefined;

if (event.toolCalls && event.toolCalls.length > 0) {
for (let i = 0; i < event.toolCalls.length; i++) {
const toolCall = event.toolCalls[i];
Expand All @@ -279,6 +289,13 @@ export class V3AgentHandler {
? `${allReasoning} ${doneReasoning}`.trim()
: allReasoning || "Task completed successfully";
}
lastFinalAnswer = {
message: state.finalMessage,
output:
typeof args?.output === "object" && args?.output !== null
? (args.output as Record<string, unknown>)
: undefined,
};
}
const mappedActions = mapToolResultToActions({
toolCallName: toolCall.toolName,
Expand All @@ -292,8 +309,100 @@ export class V3AgentHandler {
action.timestamp = Date.now();
state.actions.push(action);
}

// Emit step_finished_event per tool call. The TrajectoryRecorder
// builds one Trajectory.Step per emission. tier-1 evidence (the
// bytes the LLM consumed) is captured separately via an
// onStepFinish wrapper in the harness.
const stepIndex = stepCounter++;
stepIndicesInTurn.push(stepIndex);
const toolOk =
!toolResult ||
(typeof toolResult === "object" &&
!("error" in toolResult) &&
!("isError" in toolResult && toolResult.isError));
this.v3.bus.emit("agent_step_finished_event", {
stepIndex,
actionName: toolCall.toolName,
actionArgs:
typeof args === "object" && args !== null
? (args as Record<string, unknown>)
: {},
reasoning: event.text ?? "",
toolOutput: {
ok: toolOk,
result: toolResult,
error:
toolResult &&
typeof toolResult === "object" &&
"error" in toolResult &&
typeof (toolResult as { error?: unknown }).error === "string"
? (toolResult as { error: string }).error
: undefined,
},
finishedAt: new Date().toISOString(),
});
}
state.currentPageUrl = (await this.v3.context.awaitActivePage()).url();

// Harness probe — take a single screenshot / a11y snapshot per AI SDK
// step and attach it to every tool call in that turn. The observation
// reflects the settled page state after the batch of tool calls; this
// is more faithful than dropping probe evidence for all but the last
// tool call, while still avoiding per-tool screenshot overhead.
const wantsScreenshotProbe =
this.v3.bus.listenerCount?.("agent_screenshot_taken_event") > 0;
const wantsStepObservation =
this.v3.bus.listenerCount?.("agent_step_observed_event") > 0;
if (
stepIndicesInTurn.length > 0 &&
(wantsScreenshotProbe || wantsStepObservation)
) {
try {
const page = await this.v3.context.awaitActivePage();
let screenshot: Buffer | undefined;
if (wantsScreenshotProbe) {
screenshot = await page.screenshot({ fullPage: false });
}
let ariaTree: string | undefined;
if (wantsStepObservation) {
// Capture the a11y tree alongside the URL probe so the verifier
// can ground textual claims (prices, names, dates) without OCR.
// Best-effort: returns undefined on failure/timeout.
ariaTree = await captureAriaTreeProbe(this.v3);
}
for (const stepIndex of stepIndicesInTurn) {
if (screenshot) {
// DOM/hybrid: this post-step screenshot is a harness probe
// only. The agent's tier-1 evidence is the tool's return value
// captured separately in agent_step_finished_event.
this.v3.bus.emit("agent_screenshot_taken_event", {
stepIndex,
screenshot,
url: state.currentPageUrl,
evidenceRole: "probe",
});
}
if (wantsStepObservation) {
this.v3.bus.emit("agent_step_observed_event", {
stepIndex,
url: state.currentPageUrl,
ariaTree,
});
}
}
} catch (e) {
this.logger({
category: "agent",
message: `Warning: harness probe failed: ${getErrorMessage(e)}`,
level: 1,
});
}
}
}

if (lastFinalAnswer) {
this.v3.bus.emit("agent_final_answer_event", lastFinalAnswer);
}

if (userCallback) {
Expand Down
Loading
Loading