browserbase · miguelg719 · May 15, 2026 · May 15, 2026 · May 15, 2026 · May 15, 2026
diff --git a/.changeset/verifier-trajectory-events.md b/.changeset/verifier-trajectory-events.md
@@ -0,0 +1,5 @@
+---
+"@browserbasehq/stagehand": patch
+---
+
+Capture verifier trajectory evidence from v3 agent events for offline scoring.
diff --git a/packages/core/lib/v3/agent/utils/captureAriaTreeProbe.ts b/packages/core/lib/v3/agent/utils/captureAriaTreeProbe.ts
@@ -0,0 +1,75 @@
+/**
+ * captureAriaTreeProbe — capture a truncated accessibility tree of the active
+ * page for use as tier-2 evidence in the trajectory recorder.
+ *
+ * Shared by v3AgentHandler and v3CuaAgentHandler. Listener-gated by the
+ * callers so ordinary agent runs (no TrajectoryRecorder attached) don't pay
+ * the cost.
+ *
+ * The a11y tree is the same payload the agent's `ariaTree` tool sees, but
+ * captured by the harness (not the agent) so the verifier has independent
+ * textual ground truth for grounding non-visual claims — prices, names,
+ * dates, list contents — without OCR'ing screenshots.
+ *
+ * Budget: defaults to ~8000 tokens (32k chars). Per-step a11y captures
+ * across a ~30-step trajectory at that cap sum to ~240k tokens total,
+ * which the verifier handles via per-criterion top-K selection. The cap
+ * is configurable via VERIFIER_ARIATREE_TOKEN_BUDGET so consumers can
+ * trade RAM/disk for fidelity. Truncated content is marked explicitly so
+ * the verifier knows it was clipped.
+ */
+import type { V3 } from "../../v3.js";
+
+const APPROX_CHARS_PER_TOKEN = 4;
+const DEFAULT_TOKEN_BUDGET = 8_000;
+const DEFAULT_TIMEOUT_MS = 5_000;
+
+interface CaptureAriaTreeOptions {
+  /** Soft cap on token count (chars/4 approximation). Default 8000. */
+  tokenBudget?: number;
+  /** Hard timeout on the capture. Default 5s. */
+  timeoutMs?: number;
+}
+
+/**
+ * Returns the a11y tree from `v3.extract()` as a plain string clipped to the
+ * token budget, or undefined when capture throws or yields no text. Never
+ * throws — a11y capture is best-effort tier-2 evidence, not a hard
+ * requirement, so failures are silently absorbed (the verifier surfaces this
+ * via evidence_insufficient). Budget precedence: `opts.tokenBudget`, then
+ * VERIFIER_ARIATREE_TOKEN_BUDGET, then the default; invalid values are skipped.
+ */
+export async function captureAriaTreeProbe(
+  v3: V3,
+  opts: CaptureAriaTreeOptions = {},
+): Promise<string | undefined> {
+  // Validate the caller option like the env override: a negative or NaN
+  // budget would otherwise make slice() clip the wrong span.
+  const isUsable = (n: number | undefined): n is number =>
+    n !== undefined && Number.isFinite(n) && n > 0;
+  const envRaw = process.env.VERIFIER_ARIATREE_TOKEN_BUDGET ?? "";
+  const candidates = [opts.tokenBudget, parseInt(envRaw, 10)];
+  const tokenBudget = candidates.find(isUsable) ?? DEFAULT_TOKEN_BUDGET;
+  const timeoutMs = opts.timeoutMs ?? DEFAULT_TIMEOUT_MS;
+  const maxChars = tokenBudget * APPROX_CHARS_PER_TOKEN;
+
+  try {
+    // v3.extract() without a schema returns { pageText } where pageText is the
+    // rendered accessibility tree — same path the agent's ariaTree tool uses.
+    const result = (await v3.extract({ timeout: timeoutMs })) as {
+      pageText?: string;
+    };
+    const pageText = result?.pageText;
+    if (typeof pageText !== "string" || pageText.length === 0) return undefined;
+
+    if (pageText.length > maxChars) {
+      return (
+        pageText.slice(0, maxChars) +
+        `\n\n[CONTENT TRUNCATED at ~${tokenBudget} tokens — set VERIFIER_ARIATREE_TOKEN_BUDGET to raise]`
+      );
+    }
+    return pageText;
+  } catch {
+    return undefined;
+  }
+}
diff --git a/packages/core/lib/v3/handlers/v3AgentHandler.ts b/packages/core/lib/v3/handlers/v3AgentHandler.ts
@@ -41,6 +41,7 @@ import {
   AgentAbortError,
 } from "../types/public/sdkErrors.js";
 import { handleDoneToolCall } from "../agent/utils/handleDoneToolCall.js";
+import { captureAriaTreeProbe } from "../agent/utils/captureAriaTreeProbe.js";
 import {
   CaptchaSolver,
   CAPTCHA_SOLVED_MSG,
@@ -248,13 +249,22 @@ export class V3AgentHandler {
       | GenerateTextOnStepFinishCallback<ToolSet>
       | StreamTextOnStepFinishCallback<ToolSet>,
   ) {
+    // Monotonic step counter scoped to this execute() call. Each tool call in
+    // the agent loop becomes one trajectory step. The counter feeds stepIndex
+    // on the bus events the TrajectoryRecorder subscribes to.
+    let stepCounter = 0;
     return async (event: StepResult<ToolSet>) => {
       this.logger({
         category: "agent",
         message: `Step finished: ${event.finishReason}`,
         level: 2,
       });
 
+      const stepIndicesInTurn: number[] = [];
+      let lastFinalAnswer:
+        | { message: string; output?: Record<string, unknown> }
+        | undefined;
+
       if (event.toolCalls && event.toolCalls.length > 0) {
         for (let i = 0; i < event.toolCalls.length; i++) {
           const toolCall = event.toolCalls[i];
@@ -279,6 +289,13 @@ export class V3AgentHandler {
                 ? `${allReasoning} ${doneReasoning}`.trim()
                 : allReasoning || "Task completed successfully";
             }
+            lastFinalAnswer = {
+              message: state.finalMessage,
+              output:
+                typeof args?.output === "object" && args?.output !== null
+                  ? (args.output as Record<string, unknown>)
+                  : undefined,
+            };
           }
           const mappedActions = mapToolResultToActions({
             toolCallName: toolCall.toolName,
@@ -292,8 +309,100 @@ export class V3AgentHandler {
             action.timestamp = Date.now();
             state.actions.push(action);
           }
+
+          // Emit agent_step_finished_event once per tool call. The
+          // TrajectoryRecorder builds one Trajectory.Step per emission. Tier-1
+          // evidence (the bytes the LLM consumed) is captured separately via
+          // an onStepFinish wrapper in the harness.
+          const stepIndex = stepCounter++;
+          stepIndicesInTurn.push(stepIndex);
+          const toolOk =
+            !toolResult ||
+            (typeof toolResult === "object" &&
+              !("error" in toolResult) &&
+              !("isError" in toolResult && toolResult.isError));
+          this.v3.bus.emit("agent_step_finished_event", {
+            stepIndex,
+            actionName: toolCall.toolName,
+            actionArgs:
+              typeof args === "object" && args !== null
+                ? (args as Record<string, unknown>)
+                : {},
+            reasoning: event.text ?? "",
+            toolOutput: {
+              ok: toolOk,
+              result: toolResult,
+              error:
+                toolResult &&
+                typeof toolResult === "object" &&
+                "error" in toolResult &&
+                typeof (toolResult as { error?: unknown }).error === "string"
+                  ? (toolResult as { error: string }).error
+                  : undefined,
+            },
+            finishedAt: new Date().toISOString(),
+          });
         }
         state.currentPageUrl = (await this.v3.context.awaitActivePage()).url();
+
+        // Harness probe — take a single screenshot / a11y snapshot per AI SDK
+        // step and attach it to every tool call in that turn. The observation
+        // reflects the settled page state after the batch of tool calls; this
+        // is more faithful than dropping probe evidence for all but the last
+        // tool call, while still avoiding per-tool screenshot overhead.
+        const wantsScreenshotProbe =
+          this.v3.bus.listenerCount?.("agent_screenshot_taken_event") > 0;
+        const wantsStepObservation =
+          this.v3.bus.listenerCount?.("agent_step_observed_event") > 0;
+        if (
+          stepIndicesInTurn.length > 0 &&
+          (wantsScreenshotProbe || wantsStepObservation)
+        ) {
+          try {
+            const page = await this.v3.context.awaitActivePage();
+            let screenshot: Buffer | undefined;
+            if (wantsScreenshotProbe) {
+              screenshot = await page.screenshot({ fullPage: false });
+            }
+            let ariaTree: string | undefined;
+            if (wantsStepObservation) {
+              // Capture the a11y tree alongside the URL probe so the verifier
+              // can ground textual claims (prices, names, dates) without OCR.
+              // Best-effort: returns undefined on failure/timeout.
+              ariaTree = await captureAriaTreeProbe(this.v3);
+            }
+            for (const stepIndex of stepIndicesInTurn) {
+              if (screenshot) {
+                // DOM/hybrid: this post-step screenshot is a harness probe
+                // only. The agent's tier-1 evidence is the tool's return value
+                // captured separately in agent_step_finished_event.
+                this.v3.bus.emit("agent_screenshot_taken_event", {
+                  stepIndex,
+                  screenshot,
+                  url: state.currentPageUrl,
+                  evidenceRole: "probe",
+                });
+              }
+              if (wantsStepObservation) {
+                this.v3.bus.emit("agent_step_observed_event", {
+                  stepIndex,
+                  url: state.currentPageUrl,
+                  ariaTree,
+                });
+              }
+            }
+          } catch (e) {
+            this.logger({
+              category: "agent",
+              message: `Warning: harness probe failed: ${getErrorMessage(e)}`,
+              level: 1,
+            });
+          }
+        }
+      }
+
+      if (lastFinalAnswer) {
+        this.v3.bus.emit("agent_final_answer_event", lastFinalAnswer);
       }
 
       if (userCallback) {