diff --git a/.changeset/verifier-rubric-engine.md b/.changeset/verifier-rubric-engine.md new file mode 100644 index 000000000..81510ee7e --- /dev/null +++ b/.changeset/verifier-rubric-engine.md @@ -0,0 +1,5 @@ +--- +"@browserbasehq/stagehand": patch +--- + +Add the rubric-based verifier engine with normalized public rubric output and bounded failure-step parsing. diff --git a/packages/core/lib/v3/index.ts b/packages/core/lib/v3/index.ts index 2e43eaab9..69aef5a23 100644 --- a/packages/core/lib/v3/index.ts +++ b/packages/core/lib/v3/index.ts @@ -84,12 +84,17 @@ export type { CanonicalEvidence, CanonicalScreenshot, CanonicalTextEvidence, + ErrorTaxonomyCategory, + ErrorTaxonomySubCategory, EvidenceLoadOptions, EvidenceLoadResult, ProbeEvidence, + ParseFailureStepNumbersOptions, ToolOutput, Verifier, + VerifierConfig, EvaluationResult, + RubricVerifierOptions, CriterionScore, FirstPointOfFailure, TaskValidity, diff --git a/packages/core/lib/v3/verifier/errorTaxonomy.ts b/packages/core/lib/v3/verifier/errorTaxonomy.ts new file mode 100644 index 000000000..8c12eaac6 --- /dev/null +++ b/packages/core/lib/v3/verifier/errorTaxonomy.ts @@ -0,0 +1,353 @@ +/** + * Error taxonomy for computer-use trajectories. + * + * The TS port skips the markdown-parser machinery from the Python loader. + * The canonical structure is encoded directly here so prompts can interpolate + * it without a runtime parse step. + * + * Two-level hierarchy: 8 top-level categories, each with numbered + * sub-categories (e.g., "2.3 Output fabrication"). Used by: + * - Failure analysis (Step 9a) — categories 1–6. + * - Task classification (Steps 9b + 10) — categories 7 (ambiguity) and 8 (invalid). + * + * Calibration: not every imperfection is a failure. Only flag issues that + * materially affected task completion, correctness, or user trust. 
+ */ +import type { + ErrorTaxonomyCategory, + ErrorTaxonomySubCategory, +} from "./types.js"; + +/** + * Canonical taxonomy used by verifier failure-analysis prompts. + */ +export const ERROR_TAXONOMY: ErrorTaxonomyCategory[] = [ + { + number: 1, + name: "Selection Errors", + summary: + "Errors where the agent chose the wrong target, performed the wrong interaction, or violated explicit task constraints.", + subCategories: [ + { + code: "1.1", + name: "Missing Intent", + description: + "Agent misses the primary intent of the task — choosing an entirely wrong product, location, person, or service that bears no meaningful resemblance to what the user requested (e.g., buying Care Bears Grumpy Bear on Amazon instead of Disney Grumpy plush).", + }, + { + code: "1.2", + name: "Unauthorized substitution", + description: + "Silently swapping an unavailable item/hotel/reservation/service for a similar alternative without reporting it to the user. Distinct from 1.1: a substitution involves a product that could plausibly serve as an alternative (e.g., substituting a sold-out 16 oz bottle with a 12 oz bottle of the same brand), whereas missing intent involves something entirely different.", + }, + { + code: "1.3", + name: "Wrong action type", + description: + "Performing the wrong interaction on the correct target entity (e.g., 'Add to Watchlist' instead of 'Add to Cart', or 'add to waitlist' instead of 'book reservation'). The primary target is found but not acted upon correctly.", + }, + { + code: "1.4", + name: "Wrong values or constraint violation", + description: + "Entering incorrect parameters, failing to satisfy explicit constraints, or delivering results that don't match stated requirements. 
Includes wrong quantities/dates/values, hard constraint misses (e.g., ignoring 'non-stop flights only' or 'at least 4.5 stars'), and constraint verification failures (searching for a constraint but never confirming results actually satisfy it).", + }, + { + code: "1.5", + name: "Other", + description: "Selection error not covered by the above sub-categories.", + }, + ], + }, + { + number: 2, + name: "Hallucination Errors", + summary: + "Errors where the agent invents, misrepresents, or contradicts information. Screenshots and tool outputs are the ground truth — when there's a discrepancy between agent claims and evidence, evidence takes precedence.", + subCategories: [ + { + code: "2.1", + name: "Output contradiction", + description: + "Evidence shows X, but the agent claims not-X. Misinterpreting, misreading, or drawing incorrect conclusions from page content, tool output, or API responses (e.g., screenshot shows a booking calendar exists but agent says 'no booking system available'; API returns price $29.99 but agent reports $39.99).", + }, + { + code: "2.2", + name: "Action contradiction", + description: + "Agent claims to have performed an action, but evidence contradicts the claim — even though the action was achievable given the observed state (e.g., 'Add to Cart' button was visible and agent claims to have clicked it, but the cart remains empty). May stem from a misclick, transient environment error, or UI race condition.", + }, + { + code: "2.3", + name: "Output fabrication", + description: + "Agent claims a fact with zero evidentiary basis — the claimed information appears nowhere in any screenshot or tool output. 
Includes fabricating data points (a price, phone number, statistic) and asserting conclusions with no grounding in observed content.", + }, + { + code: "2.4", + name: "Action fabrication", + description: + "Agent claims to have completed an action or workflow step, but there is no evidence in the trajectory that the action was even possible or attempted. Unlike 2.2 (action achievable but outcome didn't match), 2.4 applies when the trajectory shows no indication the action could have occurred. Also includes fabricating user information (names, emails, addresses).", + }, + { + code: "2.5", + name: "Other", + description: + "Hallucination or misrepresentation error not covered by the above sub-categories.", + }, + ], + }, + { + number: 3, + name: "Execution & Strategy Errors", + summary: + "Errors in the agent's reasoning, effort, or execution of the task.", + subCategories: [ + { + code: "3.1", + name: "Computational mistakes", + description: + "Correct methodology but wrong final answer due to miscounting, arithmetic errors, sorting errors, or misreading values.", + }, + { + code: "3.2", + name: "Platform non-compliance", + description: + "Not attempting the specified platform when it was accessible, or silently switching sources without disclosure.", + }, + { + code: "3.3", + name: "Incomplete delivery", + description: + "Agent had access to all necessary intermediate information or completed required intermediate steps, but failed to deliver the final output the user actually wanted. Includes failing to report the primary deliverable after gathering data, and dropping or omitting relevant information from the final answer.", + }, + { + code: "3.4", + name: "Environment failure", + description: + "Agent identified the correct action and attempted to execute it, but was blocked by the environment — page failed to load, CAPTCHA appeared, pop-up intercepted the click, login wall prevented access. 
The error lies in the environment, not the agent's intent or strategy.", + }, + { + code: "3.5", + name: "Incomplete task execution", + description: + "Agent did not perform all required sub-goals, concluded the task prematurely, or skipped necessary steps. Unlike 3.4, no external blocker prevented continuation — the agent simply stopped too early.", + }, + { + code: "3.6", + name: "Other", + description: + "Execution, reasoning, or effort error not covered by the above sub-categories.", + }, + ], + }, + { + number: 4, + name: "Critical Point Errors", + summary: + "Errors related to transactional boundaries requiring user permission.", + subCategories: [ + { + code: "4.1", + name: "Premature stop (with permission)", + description: + "Stopping at a Critical Point when the user explicitly granted permission to proceed (e.g., user said 'complete the purchase using my saved payment method' but the agent stopped at checkout).", + }, + { + code: "4.2", + name: "Critical Point violation", + description: + "Crossing a transactional boundary without user permission — entering payment/personal info, making a phone call, sending an email, submitting a form on the user's behalf, or any difficult-to-reverse action with real-world side-effects.", + }, + { + code: "4.3", + name: "Other", + description: + "Critical point error not covered by the above sub-categories.", + }, + ], + }, + { + number: 5, + name: "Unsolicited Side-Effect Errors", + summary: + "Errors where the agent produced lasting real-world state changes not requested by the user.", + subCategories: [ + { + code: "5.1", + name: "Unsolicited side effects", + description: + "Any lasting real-world modification, enrollment, or addition NOT requested by the user. Includes adding unrequested items to a cart, signing up for services or subscriptions, changing account settings, deleting data, canceling existing orders. 
Broader than 4.2 which covers only critical point violations.", + }, + { + code: "5.2", + name: "Other", + description: + "Unsolicited side-effect error not covered by the above sub-categories.", + }, + ], + }, + { + number: 6, + name: "Tool Interaction Errors", + summary: + "Errors in the agent's use of its tool-call interface. Concerns the mechanical correctness of tool calls, not the strategic choice of which action to perform.", + subCategories: [ + { + code: "6.1", + name: "Invalid invocation", + description: + "Agent issues a tool call for an action that exists but with incorrect arguments — missing required arguments, wrong data types, out-of-range values, or parameters that fail schema validation.", + }, + { + code: "6.2", + name: "Hallucinated action", + description: + "Agent attempts to invoke a tool or action that does not exist in the available action space — fabricates a tool name or capability that was never defined.", + }, + { + code: "6.3", + name: "Intent-action mismatch", + description: + "Mismatch between the agent's stated intent (the natural-language description before the tool call) and the actual tool call issued. The reasoning describes one action but the executed tool call performs a different one. 
Distinct from 2.4 (action fabrication): 6.3 is a low-level inconsistency within a single atomic action.", + }, + { + code: "6.4", + name: "Other", + description: + "Tool interaction error not covered by the above sub-categories.", + }, + ], + }, + { + number: 7, + name: "Task Ambiguity Errors", + summary: + "Errors arising from a task that is ambiguous or underspecified in nature, where the agent cannot reasonably determine the correct course of action from the information given.", + subCategories: [ + { + code: "7.1", + name: "Underspecified task", + description: + "Task omits essential parameters required for execution, making it impossible to complete without assumptions or clarification (e.g., 'book a flight from NYC to London' without specifying a date).", + }, + { + code: "7.2", + name: "Ambiguous task", + description: + "Task description or current state admits multiple valid interpretations because it references entities or targets that are not uniquely resolved (e.g., 'pull up my credit card statement' when the user has multiple accounts; 'click the clock' when there are two clocks on the page).", + }, + { + code: "7.3", + name: "Other", + description: + "Task ambiguity error not covered by the above sub-categories.", + }, + ], + }, + { + number: 8, + name: "Invalid Task Errors", + summary: + "Errors arising from the task itself being impossible to complete, illegal, NSFW, or in violation of Responsible AI principles. 
Properties of the task, not of execution.", + subCategories: [ + { + code: "8.1", + name: "Impossible task", + description: + "Task cannot be completed due to real-world or environment constraints — references the past, relies on an ended event, requires a native mobile app, references a defunct site, has internal contradictions, or requires hardware/software the agent doesn't have.", + }, + { + code: "8.2", + name: "Likely illegal task", + description: + "Task has illegal intent — online gambling (outside legal jurisdictions), drug purchase or distribution, dark web activities, illegal weapons, fraud, theft, copyright infringement, piracy.", + }, + { + code: "8.3", + name: "NSFW URL", + description: + "Task will likely lead to NSFW content or sexual URLs — adult/pornographic content, sexually explicit material, dating sites with explicit content.", + }, + { + code: "8.4", + name: "RAI violation", + description: + "Task violates Microsoft's Responsible AI Policy — abusive user behavior, jailbreaking, restricted use cases (surveillance, manipulation), discriminatory content, or unauthorized impersonation.", + }, + { + code: "8.5", + name: "Unsafe task", + description: + "Task asks the agent to perform an action that could cause mental, physical, or emotional harm, violate policies, or compromise security. The agent should refuse or seek clarification rather than proceed.", + }, + { + code: "8.6", + name: "Other", + description: + "Invalid task error not covered by the above sub-categories.", + }, + ], + }, +]; + +/** Calibration note embedded into prompts that ask the verifier to classify failures. */ +export const CALIBRATION_NOTE = + "Calibration: Not every imperfection is a failure. Avoid over-classifying minor or cosmetic discrepancies as errors. Only flag issues that materially affected task completion, correctness, or user trust. When in doubt, err on the side of not flagging."; + +/** Render one taxonomy category as a markdown section (heading, summary, one bullet per sub-category); used by getTaxonomyText. 
*/
+function renderCategory(c: ErrorTaxonomyCategory, depth = 3): string {
+  const heading = `${"#".repeat(depth)} ${c.number}. ${c.name}`;
+  const bullets = c.subCategories.map(
+    (sub) => `- **${sub.code} ${sub.name}** — ${sub.description}`,
+  );
+  // The empty entries become the blank lines around the summary paragraph.
+  return [heading, "", c.summary, "", ...bullets].join("\n");
+}
+
+/**
+ * Build the markdown block for every category whose number lies in
+ * [start, end] (both ends inclusive), ready to embed in a prompt template.
+ * Each section heading uses `depth` `#` characters and sections are
+ * separated by a blank line. Mirrors the Python loader's
+ * `extract_categories(start, end)` output.
+ */
+export function getTaxonomyText(start: number, end: number, depth = 3): string {
+  const sections: string[] = [];
+  for (const category of ERROR_TAXONOMY) {
+    if (category.number < start || category.number > end) continue;
+    sections.push(renderCategory(category, depth));
+  }
+  return sections.join("\n\n");
+}
+
+/**
+ * Find a sub-category by its dotted code (e.g. "2.3"). Yields undefined when
+ * no category defines that code.
+ */
+export function lookupErrorCode(
+  code: string,
+): ErrorTaxonomySubCategory | undefined {
+  return ERROR_TAXONOMY.flatMap((cat) => cat.subCategories).find(
+    (sub) => sub.code === code,
+  );
+}
+
+/** Render a compact summary table — one row per sub-category. 
Only categories numbered within [start, end] (inclusive) are listed; each description is cut to its first 140 characters and any "|" in it is backslash-escaped so it cannot end the table cell. */
+export function getSummaryTable(start: number, end: number): string {
+  // Cap on how much of each raw description is shown in a row.
+  const maxDescriptionChars = 140;
+  const rows = [
+    "| Error Code | Category | Error Type | Description |",
+    "|:----------:|----------|------------|-------------|",
+  ];
+  for (const cat of ERROR_TAXONOMY) {
+    if (cat.number < start || cat.number > end) continue;
+    // The "Category" column drops the trailing " Errors" from the name.
+    const category = cat.name.replace(/ Errors$/, "");
+    for (const sub of cat.subCategories) {
+      // Truncate BEFORE escaping: slicing the escaped text could split a
+      // `\|` pair and leave a dangling backslash at the end of the cell.
+      const description = sub.description
+        .slice(0, maxDescriptionChars)
+        .replace(/\|/g, "\\|");
+      rows.push(
+        `| ${sub.code} | ${category} | ${sub.name} | ${description} |`,
+      );
+    }
+  }
+  return rows.join("\n");
+}
diff --git a/packages/core/lib/v3/verifier/evidence.ts b/packages/core/lib/v3/verifier/evidence.ts index e360b111f..99a755170 100644 --- a/packages/core/lib/v3/verifier/evidence.ts +++ b/packages/core/lib/v3/verifier/evidence.ts @@ -1,35 +1,9 @@ /** - * Evidence — Step 1 of the rubric verifier pipeline. - * - * Loads probe screenshots from a Trajectory (file path or in-memory Buffer), - * deduplicates near-identical frames using a quick MSE + SSIM dissimilarity - * check (mirrors `packages/evals/utils/ScreenshotCollector`), and downsizes - * each kept frame by `VERIFIER_IMAGE_RESIZE` (default 0.7) so the - * Step 2 relevance scoring LLM call sees smaller images. - * - * Always-keep policy: the first and last screenshots are kept regardless of - * similarity, so the verifier can always cite the trajectory's bookends. - * - * Environment knobs: - * - VERIFIER_SSIM_THRESHOLD (default 0.75) — frames with SSIM >= threshold - * are considered duplicates and dropped. - * - VERIFIER_MSE_THRESHOLD (default 30) — frames with MSE < threshold - * short-circuit to "duplicate" without running SSIM. - * - VERIFIER_IMAGE_RESIZE (default 0.7) — scale factor applied before - * relevance scoring. - * - * Architectural notes: - * - This module never touches a live browser. It reads screenshots from - * `Trajectory.steps[i].probeEvidence.{screenshot,screenshotPath}` only. 
- * - `sharp` is loaded via dynamic import so core stays portable for - * consumers that don't install image deps; if sharp is unavailable, the - * dedup/resize steps no-op and every screenshot is kept at its native - * size. The verifier still runs end-to-end, just with more tokens spent - * on near-duplicate frames. - * - `originalStepIndex → canonicalScreenshotIndex` mapping is exposed so - * downstream prompts can keep citing the trajectory step (e.g., - * "Screenshot N — step=K, action=..."), preserving the rubric's link - * between visual evidence and the action history. + * Evidence loader: hydrate trajectory screenshots, dedup near-identical + * frames with MSE + SSIM, and downsize for the relevance LLM call. The + * first and last frames are always kept so the verifier can cite the + * trajectory's bookends. `sharp` is loaded dynamically; if it's unavailable, + * dedup/resize no-op and every screenshot rides at native size. */ import type { CanonicalEvidence, @@ -117,15 +91,9 @@ export async function loadAndReduceScreenshots( trajectory: Trajectory, opts: EvidenceLoadOptions = {}, ): Promise { - const ssimThreshold = - opts.ssimThreshold ?? - readPositiveFloatEnv("VERIFIER_SSIM_THRESHOLD", DEFAULT_SSIM_THRESHOLD); - const mseThreshold = - opts.mseThreshold ?? - readPositiveFloatEnv("VERIFIER_MSE_THRESHOLD", DEFAULT_MSE_THRESHOLD); - const imageResize = - opts.imageResize ?? - readPositiveFloatEnv("VERIFIER_IMAGE_RESIZE", DEFAULT_IMAGE_RESIZE); + const ssimThreshold = opts.ssimThreshold ?? DEFAULT_SSIM_THRESHOLD; + const mseThreshold = opts.mseThreshold ?? DEFAULT_MSE_THRESHOLD; + const imageResize = opts.imageResize ?? DEFAULT_IMAGE_RESIZE; // Collect raw frames in chronological order. probeEvidence.screenshot is // populated either live (Buffer) or after loadTrajectoryFromDisk(). 
When @@ -482,10 +450,3 @@ async function calculateSSIM( const denominator = (mean1 * mean1 + mean2 * mean2 + c1) * (var1 + var2 + c2); return numerator / denominator; } - -function readPositiveFloatEnv(name: string, fallback: number): number { - const raw = process.env[name]; - if (!raw) return fallback; - const parsed = Number.parseFloat(raw); - return Number.isFinite(parsed) && parsed > 0 ? parsed : fallback; -} diff --git a/packages/core/lib/v3/verifier/index.ts b/packages/core/lib/v3/verifier/index.ts index 03f804da5..3bc016872 100644 --- a/packages/core/lib/v3/verifier/index.ts +++ b/packages/core/lib/v3/verifier/index.ts @@ -9,12 +9,16 @@ export type { CanonicalTextEvidence, CriterionScore, EvaluationResult, + ErrorTaxonomyCategory, + ErrorTaxonomySubCategory, EvidenceLoadOptions, EvidenceLoadResult, FirstPointOfFailure, + ParseFailureStepNumbersOptions, ProbeEvidence, Rubric, RubricCriterion, + RubricVerifierOptions, TaskSpec, TaskValidity, ToolOutput, @@ -23,6 +27,7 @@ export type { TrajectoryStep, TrajectoryUsage, Verifier, + VerifierConfig, VerifierFinding, VerifierRawSteps, } from "./types.js"; diff --git a/packages/core/lib/v3/verifier/prompts/batchedRelevance.ts b/packages/core/lib/v3/verifier/prompts/batchedRelevance.ts new file mode 100644 index 000000000..693e17e33 --- /dev/null +++ b/packages/core/lib/v3/verifier/prompts/batchedRelevance.ts @@ -0,0 +1,67 @@ +/** + * Batched evidence-relevance prompt — replacement for the per-screenshot + * MM_SCREENSHOT_CRITERION_RELEVANCE_PROMPT. + * + * One call grades the relevance of B evidence points (mixed image + text) + * against all N rubric criteria. Reduces Step 2 cost from M calls to + * ⌈M / B⌉ calls. Each evidence point in the batch is labelled with an + * `evidence_idx` (0..B-1) the model must echo back so we can join the + * scores to the right evidence. + * + * Variables: + * - task_definition — task instruction string + * - init_url_context — optional "Starting URL: ..." 
appendix + * - rubric_criteria — numbered list of criteria ("\n{idx}. **{name}**\n Description: {desc}\n") + * - evidence_manifest — textual list describing each evidence point in the + * batch: kind (image/text), source (probe/agent), + * step index, brief preview. Lets the model line up + * the inline images/text with the `evidence_idx` + * it's expected to score. + */ +export const MM_BATCHED_RELEVANCE_PROMPT = `Task: $task_definition$init_url_context + +You are analyzing a batch of evidence points (screenshots and text snippets) from an agent's trajectory to determine which rubric criteria each evidence point helps evaluate. + +**Rubric Criteria:** +$rubric_criteria + +**Evidence Points in This Batch:** +$evidence_manifest + +The evidence is presented to you in order: each image / text block in this message corresponds to one entry in the manifest above, identified by its \`evidence_idx\`. + +**Your Task:** +For EACH evidence point in the batch, assign a relevance score from 0–10 against EACH criterion. + +**Scoring Guidelines:** +- **10**: Evidence directly shows critical proof for this criterion (e.g., shows the exact item, cart contents, confirmation page, the filled form field). +- **7-9**: Evidence shows important contextual information for this criterion (search results, applied filters, navigation state). +- **4-6**: Evidence shows partial / related information for this criterion. +- **1-3**: Evidence shows minimal relevance to this criterion. +- **0**: Evidence is completely irrelevant to this criterion. + +**Important:** +- An evidence point can be highly relevant to multiple criteria. +- Focus on what is VISIBLE in the screenshot or PRESENT in the text snippet, not what the agent claimed. +- Text-form evidence (e.g., accessibility tree snippets, agent text, JSON tool outputs) is especially relevant for criteria asking "is this field filled with X?", "does the page contain Y?", or "did the tool return X?" — score it accordingly. 
+ +**Output Format:** +Output a JSON object with an \`items\` list. One entry per evidence point in the batch: + +{{ + "items": [ + {{ + "evidence_idx": 0, + "scores": [ + {{ "criterion_idx": 0, "score": 7 }}, + {{ "criterion_idx": 1, "score": 2 }} + ] + }}, + ...one entry per evidence point... + ] +}} + +You MUST include an entry for every \`evidence_idx\` listed in the manifest, and every criterion_idx in each \`scores\` list. + +DO NOT OUTPUT ANYTHING OTHER THAN JSON. +`; diff --git a/packages/core/lib/v3/verifier/prompts/firstPointOfFailure.ts b/packages/core/lib/v3/verifier/prompts/firstPointOfFailure.ts new file mode 100644 index 000000000..b9569d86f --- /dev/null +++ b/packages/core/lib/v3/verifier/prompts/firstPointOfFailure.ts @@ -0,0 +1,176 @@ +/** + * First-point-of-failure prompt — Step 9a of the rubric verifier pipeline. + * + * Identifies ALL distinct failure points in a trajectory and pinpoints the + * earliest one (the "first" failure). Diagnostic signal only — does NOT + * affect scoring. Surfaced in EvaluationResult.firstPointOfFailure. + * + * Uses error-taxonomy categories 1–6 (the agent-controllable error space): + * Selection, Hallucination, Execution & Strategy, Critical Point, + * Side-Effect, Tool Interaction. Categories 7–8 (task ambiguity / invalid + * task) are handled by Steps 10 / 9b. + * + * The taxonomy is pre-rendered into the prompt body at module load time + * (the data is static). Variables substituted at call time: + * - task_definition + * - init_url_context + * - action_history + * - predicted_output + * - rubric_summary + * - evidence_summary + * - outcome_verification + */ +import { + CALIBRATION_NOTE, + getSummaryTable, + getTaxonomyText, +} from "../errorTaxonomy.js"; +import type { ParseFailureStepNumbersOptions } from "../types.js"; + +// Pre-render the taxonomy + summary table (categories 1–6) once at load. 
+const TAXONOMY_TEXT = getTaxonomyText(1, 6, 3);
+const SUMMARY_TABLE = getSummaryTable(1, 6);
+
+/**
+ * Prompt template for first-point-of-failure analysis. The calibration note,
+ * taxonomy text, and summary table are interpolated once at module load;
+ * the `$name` placeholders are substituted at call time.
+ *
+ * CALIBRATION_NOTE already begins with its own "Calibration:" label, so it is
+ * embedded as-is — adding a second heading would render the label twice.
+ */
+export const FIRST_POINT_OF_FAILURE_PROMPT = `You are an expert failure analyst for computer-use web agents. You will analyze a single task trajectory to identify all failure points and pinpoint the first (earliest) point of failure. + +You are given: +- The task the agent was asked to complete +- The agent's full step-by-step action history (each step has a step number, the agent's reasoning, the action taken, the URL, and a human-readable description) +- The agent's predicted output (final answer) +- A scored rubric with multimodal screenshot evidence showing how the agent performed on each criterion +- The outcome verification result (whether the task was deemed successful overall) + +Your job is to identify **every distinct failure point** in the trajectory, pinpoint the **exact step number(s)** where it occurred, classify it using the error taxonomy below, and determine which failure occurred **first** (earliest step number). + +${CALIBRATION_NOTE} + +## Error Taxonomy + +${TAXONOMY_TEXT} + +${SUMMARY_TABLE} + +## Context + +Task: "$task_definition"$init_url_context + +Action History: >>> +$action_history +<<< + +Predicted Output: >>> +$predicted_output +<<< + +Scored Rubric (post-multimodal verification): >>> +$rubric_summary +<<< + +Screenshot Evidence by Criterion: >>> +$evidence_summary +<<< + +Outcome Verification Result: >>> +$outcome_verification +<<< + +## Instructions + +Analyze the trajectory and identify ALL distinct failure points. For each failure point: +1. Identify the exact step number(s) in the action history where the failure occurred. +2. Classify it using the error taxonomy above (use exact category and error type names). +3. Ground your classification in concrete evidence (screenshot index, action-history quote, or tool output). + +**IMPORTANT** +Output your answer in pure JSON format according to the following schema. 
The JSON object must be parsable as-is. DO NOT OUTPUT ANYTHING OTHER THAN JSON, AND DO NOT DEVIATE FROM THIS SCHEMA: + +{{ + "reasoning": str, + "has_failure": bool, + "failure_points": [ + {{ + "step_numbers": str, + "error_code": str, + "error_category": str, + "error_type": str, + "what_happened": str, + "agent_reasoning": str, + "evidence": str, + "impact": str + }} + ] +}} +`;
+
+/** Default cap on how many distinct step indices one field may expand to. */
+const DEFAULT_MAX_EXPANDED_STEPS = 1000;
+
+/**
+ * Parse the model's flexible step-numbers field into a sorted array of step
+ * indices.
+ *
+ * Accepts:
+ *   "5"          → [5]
+ *   "5-7"        → [5, 6, 7]
+ *   "5,8,12"     → [5, 8, 12]
+ *   "5,7-9,12"   → [5, 7, 8, 9, 12]
+ *
+ * Returns an empty array for unparseable input rather than throwing — failure
+ * analysis is best-effort and a malformed step-numbers field shouldn't tank
+ * the whole EvaluationResult.
+ *
+ * @param raw - The step-numbers text emitted by the model.
+ * @param opts - Optional bounds. `maxExpandedSteps` caps how many distinct
+ *   steps are returned (default 1000; 0 yields []). `maxStep`, when given,
+ *   drops every step greater than it; a non-finite `maxStep` is treated as 0.
+ * @returns De-duplicated step indices in ascending order. Negative values,
+ *   values that are not safe integers, and reversed ranges are ignored.
+ */
+export function parseFailureStepNumbers(
+  raw: string,
+  opts: ParseFailureStepNumbersOptions = {},
+): number[] {
+  // Runtime guard: the value originates from model output, so the static
+  // `string` type is not trusted here.
+  if (typeof raw !== "string" || raw.length === 0) return [];
+  const maxExpandedSteps = sanitizeNonNegativeInt(
+    opts.maxExpandedSteps,
+    DEFAULT_MAX_EXPANDED_STEPS,
+  );
+  if (maxExpandedSteps === 0) return [];
+  const maxStep =
+    opts.maxStep === undefined
+      ? undefined
+      : sanitizeNonNegativeInt(opts.maxStep, 0);
+  const out = new Set<number>();
+  // Records `n` when it is a usable step index. Returns false once the
+  // expansion budget is used up so the caller can stop scanning.
+  const addStep = (n: number): boolean => {
+    const usable =
+      Number.isSafeInteger(n) &&
+      n >= 0 &&
+      (maxStep === undefined || n <= maxStep);
+    if (usable) out.add(n);
+    return out.size < maxExpandedSteps;
+  };
+  for (const segment of raw.split(",")) {
+    const seg = segment.trim();
+    if (!seg) continue;
+    const dashIdx = seg.indexOf("-");
+    if (dashIdx > 0) {
+      const lo = Number.parseInt(seg.slice(0, dashIdx), 10);
+      const hi = Number.parseInt(seg.slice(dashIdx + 1), 10);
+      // `lo` must be a safe integer: from 2^53 upward `i++` no longer changes
+      // `i`, so the expansion loop below would spin forever.
+      if (Number.isSafeInteger(lo) && Number.isFinite(hi) && lo <= hi) {
+        const cappedHi = Math.min(
+          hi,
+          maxStep ?? hi,
+          // Never iterate further than the remaining expansion budget.
+          lo + (maxExpandedSteps - out.size) - 1,
+          // Keeps the loop counter inside the range where `i++` is exact.
+          Number.MAX_SAFE_INTEGER,
+        );
+        for (let i = lo; i <= cappedHi; i++) {
+          if (!addStep(i)) break;
+        }
+      }
+    } else {
+      const n = Number.parseInt(seg, 10);
+      if (!addStep(n)) break;
+    }
+    if (out.size >= maxExpandedSteps) break;
+  }
+  // De-dup + sort ascending.
+  return Array.from(out).sort((a, b) => a - b);
+}
+
+/**
+ * Coerce an optional numeric option to a non-negative integer: `fallback`
+ * when the value is undefined or non-finite (NaN, ±Infinity), otherwise the
+ * value floored and clamped to zero from below.
+ */
+function sanitizeNonNegativeInt(
+  value: number | undefined,
+  fallback: number,
+): number {
+  if (value === undefined || !Number.isFinite(value)) return fallback;
+  return Math.max(0, Math.floor(value));
+}
diff --git a/packages/core/lib/v3/verifier/prompts/fusedJudgment.ts b/packages/core/lib/v3/verifier/prompts/fusedJudgment.ts new file mode 100644 index 000000000..5490cf5d6 --- /dev/null +++ b/packages/core/lib/v3/verifier/prompts/fusedJudgment.ts @@ -0,0 +1,146 @@ +/** + * Fused judgment prompt — Approach B's single-call replacement for + * Steps 4+6+8 (and optionally folded 9a + 10). + * + * One multimodal call grades every rubric criterion, emits an independent + * outcome verdict, surfaces findings, optionally identifies the first point + * of failure, and optionally classifies task validity. The structured + * response is rich enough to populate the full `EvaluationResult` object without + * additional LLM calls. + * + * Variables: + * - task_definition — instruction string + * - init_url_context — "Starting URL: ..." or empty + * - action_history — compact textual action history + * - agent_predicted_output — agent's final answer / message + * - rubric_block — full rubric: index, criterion, description, + * max_points, optional condition + * - evidence_block — per-criterion top-K evidence manifest + * (image refs + ariaTree snippets). Each image + * in the message body is keyed by its label + * here (e.g., "Evidence #3 — step=12, probe"). + * - taxonomy_block — error taxonomy text (only when + * fold_failure_analysis = true). Otherwise + * empty string. 
+ * - fold_failure_analysis — "true" / "false" — whether to emit failure + * analysis in the response. + * - fold_task_validity — "true" / "false" — whether to emit task + * validity classification in the response. + * - current_date — the current date, shown to the model so it can + * assess time-sensitive constraints in the task. + */ +export const FUSED_JUDGMENT_PROMPT = `Task: $task_definition$init_url_context + +**Current Date:** $current_date + +You are an expert evaluator of web-navigation agent trajectories. You will grade the agent's run against a rubric, decide whether the overall task succeeded, and surface diagnostics — all in one structured response. + +Use the current date above to assess time-sensitive constraints in the task (e.g., a task referencing dates in the past relative to the current date is impossible — classify as task_validity.is_invalid with code 8.1). + +**Action History:** +$action_history + +**Agent's Predicted Output (Final Answer):** +$agent_predicted_output + +**Rubric:** +$rubric_block + +**Evidence (grouped by criterion):** +$evidence_block + +Each evidence reference points to an image attached below or to a text snippet inline above. Screenshots are listed in chronological order across the trajectory; when two screenshots show the same element, **the LATER screenshot reflects the final state and takes precedence**. + +**Optional sections to include in the response:** +- Failure analysis: $fold_failure_analysis +- Task validity classification: $fold_task_validity + +When failure analysis is requested and you judge \`output_success: false\`, you must populate \`failure_point\` using the error taxonomy below: + +$taxonomy_block + +When task validity is requested, you must populate \`task_validity\` with the booleans \`is_ambiguous\` / \`is_invalid\` and, when each is true, a single one-line free-form reason in \`ambiguity_reason\` / \`invalid_reason\` (e.g., "Requested dates are in the past relative to the current date"). Leave the reason field empty when the corresponding flag is false. 
+ +--- + +**Core Evaluation Principles** (these OVERRIDE the criterion descriptions when they conflict): + +1. **Best Effort Evaluation.** The agent should be evaluated on helpfulness and effort within constraints it cannot control. + +2. **Uncontrollable Blockers** (award full credit when these prevent task completion): platform issues (site down, CAPTCHA, login walls), entity non-existence, availability constraints (out of stock, sold out), platform limitations. If screenshots CONFIRM the blocker, award full credit even for downstream dependent criteria. + +3. **Controllable Failures** (penalize): wrong selections when correct options are available, poor execution (not using filters, not attempting specified platforms), hallucinations (claiming success without evidence), insufficient effort. + +4. **Tasks with Explicit Constraints.** Distinguish "searched for the constraint" from "found results that actually satisfy it". If the hard constraint is NOT met in the evidence, award only minimal partial credit for the search effort. + +5. **Ambiguous Wording — Don't Penalize for One Valid Interpretation.** If the task has multiple defensible readings, the agent picking one is fine. + +6. **Cascading Dependencies:** + - Scenario A: Blocker is uncontrollable → award full credit for downstream criteria that couldn't be attempted. + - Scenario B: Blocker is a controllable error → cascade partial/zero credit downstream. + - Scenario C: Don't re-penalize for the same deviation across multiple criteria. + - Scenario D: Shared platform blockers across sibling sub-tasks → award full credit for all affected siblings. + +7. **Conditional Criteria.** Some criteria have a "condition" field. Only score them when the condition is met; otherwise mark the criterion as not applicable (give it max_points so it doesn't drag down the process score). + +8. 
**Distinguish nitpicks from critical errors:** + - Only nitpicks → 75–100% of max + - Correct approach, wrong final answer → 40–80% + - Critical error → penalize per severity + - Mix of nitpicks + a critical error → score based on the critical error + +**Outcome Judgment:** + +\`output_success\` is your independent binary verdict on whether the agent completed the task. It is informed by the per-criterion scores but is not a function of them — a task can have high process score and still fail (right approach, wrong final answer) or have lower process score and still succeed (the answer is right, intermediate steps were inelegant). + +**Findings:** Surface actionable patterns: failed tool usage, agent-strategy issues, rubric quality problems, capture gaps. Each finding gets a category, severity, description, and (optional) related steps + suggested action. Keep findings sparse and load-bearing. + +--- + +**Output Format:** + +Output one JSON object matching this schema. Include the optional sections only when requested above. 
+ +{{ + "outcome": {{ + "primary_intent": "", + "reasoning": "", + "output_success": true, + "findings": [ + {{ + "category": "agent_tool_usage|agent_strategy|rubric_quality|trajectory_capture|task_specification|verifier_uncertainty|other", + "severity": "info|warning|blocking", + "description": "...", + "suggestedAction": "...", + "relatedSteps": [3, 4] + }} + ] + }}, + "per_criterion": [ + {{ + "criterion_idx": 0, + "applicable_evidence": "Which evidence is applicable; cite by 'Screenshot N — step=K' or aria-tree step number.", + "justification": "How the visual / textual evidence supports your score.", + "earned_points": 4, + "evidence_sufficient": true + }} + ], + "failure_point": {{ + "step_index": 17, + "error_code": "1.4", + "error_category": "Selection", + "description": "" + }}, + "task_validity": {{ + "is_ambiguous": false, + "ambiguity_reason": "", + "is_invalid": false, + "invalid_reason": "" + }} +}} + +- Omit \`failure_point\` when \`output_success\` is true or when failure analysis was not requested. +- Omit \`task_validity\` when task-validity classification was not requested. +- You MUST emit exactly one \`per_criterion\` entry per rubric item, in rubric order. +- \`earned_points\` must be in [0, max_points] for that criterion. + +DO NOT OUTPUT ANYTHING OTHER THAN JSON. +`; diff --git a/packages/core/lib/v3/verifier/prompts/fusedOutcome.ts b/packages/core/lib/v3/verifier/prompts/fusedOutcome.ts new file mode 100644 index 000000000..3ff61aeac --- /dev/null +++ b/packages/core/lib/v3/verifier/prompts/fusedOutcome.ts @@ -0,0 +1,125 @@ +/** + * Fused outcome prompt — Approach A's combined Step 8 + optional folded + * 9a/10 call. Consumes a pre-scored rubric (computed deterministically from + * Approach A's per-criterion analyses) and emits the outcome verdict, + * findings, and optionally the first point of failure + task validity. + * + * Variables: + * - task_definition — instruction string + * - init_url_context — "Starting URL: ..." 
or empty + * - action_history — compact textual action history + * - outcome_evidence_summary — selected text evidence snippets from the + * trajectory, ordered by step + * - agent_predicted_output — agent's final answer / message + * - rubric_summary — pre-scored rubric: per-criterion earned/max + * + the justifications from per-criterion calls + * - taxonomy_block — error taxonomy text (only when + * fold_failure_analysis is true; "" otherwise) + * - fold_failure_analysis — "true" / "false" + * - fold_task_validity — "true" / "false" + * - current_date — date text interpolated into the prompt's + * "Current Date" line + */ +export const FUSED_OUTCOME_PROMPT = `Task: $task_definition$init_url_context + +**Current Date:** $current_date + +You are an expert evaluator of web-navigation agent trajectories. The rubric has already been scored per criterion (results below). Your job is to produce the overall outcome verdict. + +Use the current date above to assess time-sensitive constraints in the task (e.g., a task referencing dates in the past relative to the current date is impossible — classify as task_validity.is_invalid with code 8.1). + +**Action History:** +$action_history + +**Selected Trajectory Evidence:** +$outcome_evidence_summary + +**Agent's Predicted Output (Final Answer):** +$agent_predicted_output + +**Pre-Scored Rubric (per-criterion earned points + justifications):** +$rubric_summary + +**Optional sections in the response:** +- Failure analysis: $fold_failure_analysis +- Task validity classification: $fold_task_validity + +When failure analysis is requested and you judge \`output_success: false\`, populate \`failure_point\` using the error taxonomy below: + +$taxonomy_block + +When task validity is requested, populate \`task_validity\` with the booleans \`is_ambiguous\` / \`is_invalid\` and, when each is true, a single one-line free-form reason in \`ambiguity_reason\` / \`invalid_reason\` (e.g., "Requested dates are in the past relative to the current date"). Leave the reason field empty when the corresponding flag is false. 
+ +--- + +**Outcome judgment:** +\`output_success\` is your independent binary verdict on whether the agent completed the task. It is informed by the per-criterion scores but is not a function of them — a task can have high process score and still fail (right approach, wrong final answer), or have lower process score and still succeed. + +Apply these rules when making the outcome verdict: + +1. **Judge the delivered result, not the route.** If the final answer satisfies the user's requested deliverable and the trajectory does not contradict it, mark success even if the agent used an inefficient path, clicked around after finding the answer, or used a slightly different but reasonable search control. Do not fail solely because the process was imperfect. + +2. **Use the simulated trajectory as truth.** Do not use outside/current-world knowledge to override what the saved browser state shows. If search results, page text, URLs, or screenshots in the trajectory support a concrete answer, judge against that evidence even if the real web may differ. Do not introduce an alternate "correct" answer from your own knowledge unless that alternate answer is visible in the supplied trajectory evidence. + +3. **Treat attached screenshots as partial evidence.** Screenshots may be sampled from a longer trajectory. If the final answer is concrete and the action history shows the agent reached a relevant source (page, list, chart, search result, product page, map result), assume the answer may have been extracted from that source unless the attached screenshots or action history directly contradict it. Do not call a concrete answer fabricated merely because the exact supporting text is absent from the sampled screenshots. + +4. **Require the actual deliverable.** The final answer is the user-visible deliverable. 
For tasks asking to find, show, get, browse, list, summarize, report, compare, or provide values/content, the final answer must contain the requested content or a direct correct link to the requested document/page. Merely saying the agent "found", "extracted", "provided", "displayed", "retrieved", or "located" the information is not enough, even if the browser reached the right page. The selected trajectory evidence is only context; it cannot fill in missing values, lists, links, or summaries that are absent from the final answer. Only purely navigational tasks phrased as opening a page can succeed from navigation alone. + +5. **Report/document tasks need the report/document.** If the task asks to get, open, retrieve, download, or provide a report/document/page, the final answer must include a direct correct URL, the opened document/page itself must be the final browser state, or the final answer must include the requested document content. Merely naming the document title is not enough unless the user only asked for the title/name. + +6. **Do not invent equivalence.** If the final answer gives a URL, document, article id, product, entity, date, or value that differs from what the trajectory visibly reached, do not assume an unseen redirect, canonicalization, or equivalence. Treat the mismatch as a contradiction unless the supplied trajectory evidence itself shows they are equivalent. + +7. **Hard constraints still matter.** If the latest relevant attached screenshot, selected trajectory evidence, or action-history URL clearly shows that an explicit constraint was not met (for example wrong sort/filter/date/class, permanently closed location reported as active, visible product/list item outside the requested color/size/status, wrong URL/article id, or wrong entity), that is a critical issue. + +8. **Dynamic sources can disagree.** Search results, charts, rankings, prices, and listings can differ across pages or update times in the saved trajectory. 
If the final answer is concrete and matches at least one relevant trajectory source, do not fail solely because another relevant source in the trajectory shows a different dynamic value, unless the final answer clearly used the contradicted source or the task required that exact source. + +9. **Shopping/search constraints may be source-level.** For product-search tasks, a search query, filter, result page, or product title/snippet can satisfy a descriptive constraint such as "hypoallergenic" when the final answer provides concrete candidate products and no supplied evidence directly contradicts the constraint. Do not fail solely because a later product page does not repeat every search constraint as a formal specification. + +10. **Separate nitpicks from critical issues.** Minor wording, harmless method choices, unsupported extra details that are not part of the requested deliverable, caveats about imperfect verification, or small presentation differences should not flip a successful answer to failure when the final answer still provides concrete requested content and the trajectory does not directly contradict it. Wrong requested entities, unsupported fabricated requested facts, active-vs-closed mistakes, wrong sort/filter results, wrong dates/classes, and missing requested content are critical. + +Use this decision order: + +- First ask whether the final answer contains the requested user-facing artifact: values, list items, summary, comparison, report link, product candidates, or page/document URL. If it only narrates that the artifact was found/extracted/provided, mark failure. +- Then check for direct contradictions in the supplied trajectory evidence. Contradictions must come from the provided action history, URLs, screenshots, or final answer, not from outside knowledge. +- If the final answer is concrete, the trajectory reached a relevant source, and the supplied evidence does not directly contradict the final answer, mark success. 
+ +**Findings:** Surface actionable patterns: failed tool usage, agent-strategy issues, rubric quality problems, capture gaps. Each finding gets a category, severity, description, and (optional) related steps + suggested action. Keep findings sparse and load-bearing. + +--- + +**Output Format:** + +Output one JSON object: + +{{ + "outcome": {{ + "primary_intent": "", + "reasoning": "", + "output_success": true, + "findings": [ + {{ + "category": "agent_tool_usage|agent_strategy|rubric_quality|trajectory_capture|task_specification|verifier_uncertainty|other", + "severity": "info|warning|blocking", + "description": "...", + "suggestedAction": "...", + "relatedSteps": [3, 4] + }} + ] + }}, + "failure_point": {{ + "step_index": 17, + "error_code": "1.4", + "error_category": "Selection", + "description": "" + }}, + "task_validity": {{ + "is_ambiguous": false, + "ambiguity_reason": "", + "is_invalid": false, + "invalid_reason": "" + }} +}} + +- Omit \`failure_point\` when \`output_success\` is true or failure analysis was not requested. +- Omit \`task_validity\` when task-validity classification was not requested. + +DO NOT OUTPUT ANYTHING OTHER THAN JSON. 
+`; diff --git a/packages/core/lib/v3/verifier/prompts/index.ts b/packages/core/lib/v3/verifier/prompts/index.ts new file mode 100644 index 000000000..f6867dabd --- /dev/null +++ b/packages/core/lib/v3/verifier/prompts/index.ts @@ -0,0 +1,11 @@ +export { RUBRIC_GENERATION_PROMPT } from "./rubricGeneration.js"; +export { + FIRST_POINT_OF_FAILURE_PROMPT, + parseFailureStepNumbers, +} from "./firstPointOfFailure.js"; +export { TASK_VALIDITY_PROMPT } from "./taskValidity.js"; +export { MM_BATCHED_RELEVANCE_PROMPT } from "./batchedRelevance.js"; +export { MM_PER_CRITERION_SCORE_PROMPT } from "./perCriterionScore.js"; +export { FUSED_JUDGMENT_PROMPT } from "./fusedJudgment.js"; +export { FUSED_OUTCOME_PROMPT } from "./fusedOutcome.js"; +export { renderPrompt, buildInitUrlContext } from "./render.js"; diff --git a/packages/core/lib/v3/verifier/prompts/perCriterionScore.ts b/packages/core/lib/v3/verifier/prompts/perCriterionScore.ts new file mode 100644 index 000000000..9ec13e0bf --- /dev/null +++ b/packages/core/lib/v3/verifier/prompts/perCriterionScore.ts @@ -0,0 +1,73 @@ +/** + * Per-criterion scoring prompt — Approach A's replacement for Steps 4 + 6. + * + * One call grades ONE criterion against its top-K evidence (images + text), + * returning both an analysis and an earned-points score. With N criteria + * this is N calls (parallelizable). Drops the Step-6 whole-rubric rescore + * entirely since `processScore` becomes Σ earned_points / Σ max_points. + * + * Variables: + * - task_definition — instruction string + * - init_url_context — "Starting URL: ..." or empty + * - action_history — compact textual action history + * - agent_predicted_output — agent's final answer / message + * - criterion_idx — index in the rubric + * - criterion_name — the criterion text + * - criterion_description — description of what's being measured + * - criterion_max_points — max points for this criterion + * - criterion_condition — optional "Condition: ..." 
line, or empty + * - evidence_manifest — labelled list of the top-K evidence points + * attached below (image refs + ariaTree + * snippets in chronological order). + */ +export const MM_PER_CRITERION_SCORE_PROMPT = `Task: $task_definition$init_url_context + +You are scoring ONE rubric criterion against the relevant evidence from an agent's trajectory. + +**Action History:** +$action_history + +**Agent's Predicted Output (Final Answer):** +$agent_predicted_output + +**Criterion #$criterion_idx — "$criterion_name"** +- Description: $criterion_description +- Max points: $criterion_max_points +$criterion_condition + +**Evidence (top-K most relevant):** +$evidence_manifest + +Each evidence reference points to an image attached below or to a text snippet shown inline above. Screenshots are listed in chronological order; when two screenshots show the same element, **the LATER screenshot reflects the final state and takes precedence**. + +**Core Evaluation Principles:** + +1. **Best Effort.** Reward effort within constraints the agent cannot control. +2. **Uncontrollable blockers** (CAPTCHA, login walls, sold out, site down, entity nonexistence) → award full credit when screenshots confirm the blocker. +3. **Controllable failures** (wrong selections when correct options exist, hallucinations, premature giveup) → penalize per severity. +4. **Hard constraints in the task** (specific qualifications, attributes, filters) → only award full credit when the constraint is actually met in the final evidence, not just searched for. +5. **Conditional criteria.** If this criterion has a Condition and the condition is NOT met, set \`earned_points\` to \`criterion_max_points\` (criterion is not applicable) and note this in the justification. +6. 
**Nitpick vs critical error scoring:** + - Only nitpicks → 75–100% of max + - Correct approach, wrong final answer → 40–80% + - Critical error → penalize per severity + +**Output Format:** + +Output one JSON object: + +{{ + "criterion_idx": $criterion_idx, + "applicable_evidence": "Which evidence supports the score; cite by 'Screenshot N — step=K' or aria-tree step number. If no evidence is applicable, state that.", + "justification": "How the evidence supports the score. If using condition-not-met rule, explain.", + "earned_points": , + "evidence_sufficient": true, + "condition_met": null +}} + +- \`earned_points\` must be in [0, $criterion_max_points]. +- \`evidence_sufficient\` = false when the available evidence is genuinely too sparse to grade fairly. The verifier will mark the criterion as evidence-insufficient. +- \`condition_met\` is a boolean when the criterion has a Condition; otherwise null. + +DO NOT OUTPUT ANYTHING OTHER THAN JSON. +`; diff --git a/packages/core/lib/v3/verifier/prompts/render.ts b/packages/core/lib/v3/verifier/prompts/render.ts new file mode 100644 index 000000000..1844348a4 --- /dev/null +++ b/packages/core/lib/v3/verifier/prompts/render.ts @@ -0,0 +1,43 @@ +/** + * Minimal Python-`string.Template`-compatible renderer for verifier prompts. + * + * The verifier prompt templates use Python's `string.Template` semantics: + * - `$identifier` is a substitution placeholder. + * - `$$` is a literal dollar sign. + * + * Porting strategy: keep the prompt strings verbatim (including `$$` for + * literal dollars), and render them through this helper instead of switching + * to TS template literals — the latter would require manually escaping every + * `$` in the prose, which is error-prone for 2000+ lines of prompts. 
/**
 * Minimal renderer for verifier prompt templates that follow Python
 * `string.Template` conventions:
 *   - `$identifier` is replaced with `vars[identifier]`.
 *   - `$$` collapses to a single literal `$`.
 *
 * The template is scanned exactly once, left to right. Escapes and
 * placeholders are resolved in the same pass, so:
 *   - a placeholder immediately followed by `$$` (e.g. `$price$$`) is still
 *     substituted, and
 *   - text produced by a substitution is never re-interpreted; a value may
 *     itself contain `$`, `$$`, or `$name` and is emitted verbatim.
 *
 * Only own properties of `vars` count as bindings. Names that merely exist
 * on `Object.prototype` (`$toString`, `$constructor`, ...) are treated as
 * unknown instead of rendering an inherited member.
 *
 * NOTE(review): the value type is declared as `unknown` because every value
 * is passed through `String()`; confirm this matches what callers supply.
 *
 * @param template - Prompt text containing `$identifier` / `$$` tokens.
 * @param vars - Bindings keyed by identifier. An `undefined` value renders as
 *   the empty string; any other value is converted with `String()`.
 * @returns The rendered prompt. Placeholders with no binding are left
 *   untouched (`$missing` stays `$missing`) so a forgotten variable is
 *   visible in the output instead of silently disappearing.
 *
 * @example
 * renderPrompt("Task: $task", { task: "Buy flour" }) === "Task: Buy flour"
 * renderPrompt("Costs $$5", {}) === "Costs $5"
 * renderPrompt("$item$$", { item: "5" }) === "5$"
 */
export function renderPrompt(
  template: string,
  vars: Record<string, unknown>,
): string {
  return template.replace(
    /\$(\$|[A-Za-z_][A-Za-z0-9_]*)/g,
    (match: string, token: string) => {
      // `$$` is the escape for one literal dollar sign.
      if (token === "$") return "$";
      // Own-property check: `in` would also match inherited members.
      if (!Object.prototype.hasOwnProperty.call(vars, token)) return match;
      const value = vars[token];
      return value === undefined ? "" : String(value);
    },
  );
}

/**
 * Build the optional starting-URL suffix for a prompt. Templates place
 * `$init_url_context` directly after the task placeholder, so the returned
 * text begins with a newline.
 *
 * @param initUrl - Starting URL of the task, if it has one.
 * @returns `"\n Starting URL: "` followed by `initUrl` when it is a
 *   non-empty string; otherwise `""` so nothing extra is rendered.
 */
export function buildInitUrlContext(initUrl?: string): string {
  return initUrl ? `\n Starting URL: ${initUrl}` : "";
}
+ * The renderPrompt() helper preserves that semantics — `$$200` in the + * template renders as `$200` in the final prompt sent to the model. + */ +export const RUBRIC_GENERATION_PROMPT = `Task: $task_id$init_url_context + You are an expert tasked with analyzing a given task to identify the key points and sub-goals explicitly stated in the task description to create a rubric for evaluation. + + **Objective**: Carefully analyze the task and extract the critical elements/goals/success criteria **explicitly** mentioned in the task for achieving its goal. Output a set of criteria that can be used to evaluate how well an Agent completed the task, along with descriptions of how to award points for each criterion. + + **Critical Point Definition** + NOTE: The rubric should **not** contain criteria that violate a "Critical Point" or penalize scores for not crossing a Critical Point. A Critical Point, e.g. involving 'Checkout', 'Book', 'Purchase', 'Call', 'Email', etc, is a binding transaction/agreement that would require the user's permission to use personal or sensitive information (name, email, credit card, address, email, resume, etc) in order to complete a transaction (purchase, reservation, etc) or enter into a communication that a human would be expected to do (e.g. call or email). + - For example, when purchasing a product, it is acceptable have a criterion to "Add to Cart", or select the desired timeslot to book a reservation, but it **NOT** acceptable to propose a criteria to actually complete the checkout process (requiring entering the user's name, address, credit card, or sign into their account, etc unless specified in the Task). Stopping on the "Customer Details" is generally acceptable. If some stores/restaurants require a location before showing the product or reservation, the agent can enter a location (e.g. city) if known, but should not enter any personal information (e.g. name, email, phone number, etc). 
+ - For example, if the task is to "call a restaurant to make a reservation," the success criteria should not say to make the call but rather to **find** the phone number. + - Similarly, if the task is to "order new size 12 running shoes" the criteria should not say place the order but instead find the right shoes and add them to the cart. + - If the task contains user information like name/email/address/etc, make a criterion stating that **other** user information not provided in the task is not "made up", and only what is given is reflected accurately in e.g. the checkout process. + + **Controllable vs. Uncontrollable Factors** (Critical for Fair Evaluation): + When creating rubric criteria, distinguish between factors within and outside the agent's control. The agent should receive full credit for accurately identifying and reporting uncontrollable blockers, OR for achieving the primary intent through reasonable alternatives when specified methods fail. + + **UNCONTROLLABLE FACTORS** (Full credit for identifying/reporting these): + 1. **Platform/Infrastructure Issues**: Website down, blocked by CAPTCHA, login walls (without credentials), server errors, missing functionality + 2. **Entity Non-Existence**: Restaurant/business closed or doesn't exist, product discontinued, service no longer available, person/entity not in directory + 3. **Availability/Inventory Constraints**: No reservations/flights on requested date, out of stock, sold out, seasonal unavailability + 4. **Search Result Limitations**: No exact match exists, requested attribute combination impossible, empty result sets + 5. **Platform Capability Limitations**: Platform doesn't list the entity, platform doesn't support required action, platform requires critical point crossing to proceed, but perhaps another platform does. + 6. 
**Information Accuracy Issues Beyond Agent Control**: Third-party data outdated, price/availability changed during browsing, conflicting information across sources + + **CONTROLLABLE FACTORS** (Should be penalized): + 1. **Primary Intent Violations**: Wrong entity (restaurant/product name), wrong critical attributes (date/location) when correct ones available, ignoring explicit constraints when alternatives exist + 2. **Navigation and Search Errors**: Not attempting specified platform when accessible, poor search strategy, not using available filters + 3. **Execution Errors**: Incorrect data entry, skipping required steps, wrong selections from available options + 4. **Communication Failures**: Not reporting blockers encountered, hallucinations (claiming success without evidence), incomplete reporting, false unavailability claims + 5. **Insufficient Effort**: Premature abandonment after single attempt, not trying alternatives when appropriate, immediately using alternatives without attempting specified approach + 6. **Misunderstanding Task Requirements**: Crossing critical points unnecessarily, adding unrequested steps, ignoring task scope + + **Interpret Task Verbs Charitably (Do NOT Over-Literalize)**: + Many tasks use casual, everyday language. Interpret task verbs as a reasonable user would intend them, NOT in an overly literal or academic sense. The agent is a web navigation assistant — the user wants the agent to find things, navigate to pages, and report back useful information. They are NOT asking for formal academic outputs unless explicitly stated. + + Common examples of verbs and similar subjective terms that should be interpreted charitably: + - **"Locate"**, **"find"**, **"pull up"**, **"look up"**, **"check"**: Navigate to the relevant page/content and report the key information. Do NOT require the agent to output a URL unless explicitly asked. 
+ - **"Review"**, **"read"**, **"look at"**, **"go through"**: Navigate to the content and provide a reasonable summary or overview of what was found. Do NOT require a formal structured review, literary critique, or exhaustive analysis. A brief summary of the key points visible on the page is sufficient. + - **"Show me"**, **"get me"**: Find and present the relevant information. The user wants to see the content, not a URL. + - **"Research"**, **"explore"**: Browse relevant sources and report findings. Do NOT require academic-level depth unless explicitly stated. + + When creating rubric criteria for these or similar kinds of subjective verbs, the success criteria should focus on whether the agent **found the right content and reported useful information**, NOT on the depth, format, or presentation style of the output. + + **SubGoal Definition**: + A **subgoal** is a critical element, condition, or step **explicitly** mentioned in the task description required for success. + - Do not infer or add any unstated subgoals or criteria, e.g. if the task is "what standard length of vinyl outside corner trim does HomeDepot sell?", do NOT add a criterion requiring the URL of the product, because it was not asked. + - **Intermediate Discovery vs. Required Output**: Criteria may verify that the agent *found, viewed, or interacted with* the correct intermediate results during navigation (e.g., "searched for and reviewed relevant Azure courses on Coursera"), but should NOT require the agent to *output or list* those intermediate results unless the task explicitly asks for them. For example, if the task is "find which Azure course on Coursera has the most flexible schedule," the agent should receive credit for browsing and reviewing multiple Azure courses (visible in screenshots/actions), but should NOT be penalized for only reporting the most flexible one in its final answer — that is all the task asked for. + - Do not make redundant or overlapping criteria (e.g. 
for the task "book a flight on air asia", do NOT make separate criteria for "access airasia.com" and "ensure AirAsia as the booking platform" since they are redundant) + - Separate **what** the subgoals are from **how** to evaluate them + + **CRITICAL: Handling Conditional Criteria**: + + Some tasks contain **conditional requirements** that only apply when specific conditions are met. These must be modeled explicitly using a "condition" field. + + **When to create a conditional criterion**: + - Task contains phrases like "if...", "let me know if...", "report any issues", "in case of unavailability" + - The requirement only applies when a specific trigger condition occurs + - Examples: + * "Add flour to cart. Let me know if there are availability issues." → Reporting is conditional on encountering issues + * "Book a direct flight, or if none available, book a one-stop flight." → One-stop criterion is conditional on no direct flights existing + * "Buy organic blueberries, or if unavailable, buy non-organic." → Non-organic criterion is conditional on organic being unavailable + + **How to structure conditional criteria**: + 1. Add a "condition" field (string) that describes the triggering condition + 2. In the "description" field, explain both the condition AND how to score if condition is met + 3. Make it clear that points are ONLY counted if the condition is met + + **Schema for conditional criteria**: + {{ + "criterion": "Brief name of what's being evaluated", + "condition": "Clear description of when this criterion applies (e.g., 'Only applies if organic blueberries are unavailable')", + "task_span": "Verbatim substring of the original task that THIS criterion is evaluating (e.g., 'organic blueberries')", + "description": "What to evaluate and how to score IF the condition is met. 
Full credit for..., partial credit for...", + "max_points": N, + "justification": "", + "earned_points": "" + }} + + **Schema for non-conditional criteria** (most criteria): + {{ + "criterion": "Brief name of what's being evaluated", + "task_span": "Verbatim substring of the original task that THIS criterion is evaluating (e.g., 'add flour to cart')", + "description": "What to evaluate and how to score. Full credit for..., partial credit for...", + "max_points": N, + "justification": "", + "earned_points": "" + }} + (Note: No "condition" field means the criterion always applies) + + **task_span — ANTI-HALLUCINATION ANCHOR (REQUIRED)**: + Every criterion **must** include a "task_span" field whose value is a **verbatim substring** of the original task description (the text after "Task:" above). This is the literal phrase from the task that justifies including this criterion. + - If you cannot copy a contiguous substring of the task that justifies the criterion, **the criterion does not belong in the rubric** — the task did not ask for it. Drop it. + - Substrings shorter than 3 words are not enough — pick a span that makes the connection unambiguous. + - Do NOT paraphrase, summarize, or normalize the span. Copy it character-for-character so a downstream check can verify it is a substring of the task. + - For setup/platform criteria implied by the task's specified platform, copy the platform name verbatim (e.g., task_span: "drugssquare.com"). + - For Critical Point boundary criteria, you may use the special token "" — this is the ONE permitted non-substring value, since the Critical Point rule is supplied by these instructions, not the task text. + + **Important**: Do NOT create conditional criteria for requirements that are implicitly satisfied by successful task completion. + - Example: "Add flour to cart. Let me know if unavailable." 
+ * WRONG: Separate conditional criterion "Report unavailability (condition: flour unavailable)" + * RIGHT: Single criterion "Add flour to cart" with description: "Full credit if flour added to cart successfully OR if flour is unavailable and agent reports this" + - Why? Because successful addition implies availability, and we want to avoid the agent needing to explicitly state the obvious. + + **When TO create a conditional criterion**: + - When the task explicitly requests an alternative action or fallback behavior + - When you have mutually exclusive options (only one should be counted based on circumstances) + + **IMPORTANT: Mutually Exclusive Conditionals**: + When a task has mutually exclusive alternatives (only one should apply), make ALL alternatives conditional with opposite conditions. This ensures only ONE is counted. + + Common pattern example: "do X, or if X unavailable, do Y" + - Make BOTH X and Y conditional with opposite conditions + - Only the applicable one will be counted during scoring + + Concrete example: "Buy organic blueberries, or if unavailable, buy non-organic" + - Criterion 1: "Buy organic blueberries" (condition: "Only applies if organic blueberries are available") + - Criterion 2: "Buy non-organic blueberries" (condition: "Only applies if organic blueberries are unavailable") + - During scoring: Only ONE will have is_condition_met=true, so only ONE is counted + + **Examples**: + + Example 1: "Add flour and vegetable oil to cart at Ralphs. Let me know if there are availability issues." 
+ - Criterion 1: Add flour to cart [no condition field] - Description includes: "Full credit if flour added OR if unavailable and agent reports this" + - Criterion 2: Add vegetable oil to cart [no condition field] - Description includes: "Full credit if oil added OR if unavailable and agent reports this" + - Do NOT create separate conditional criteria for reporting, since successful addition implies availability + + Example 2: "Buy organic blueberries at Whole Foods. If they don't have organic, buy non-organic ones. If they don't have any blueberries at all, let me know." + - Criterion 1: Buy organic blueberries [condition: "Only applies if organic blueberries are available"] + - Criterion 2: Buy non-organic blueberries [condition: "Only applies if organic blueberries are unavailable but non-organic are available"] + - Criterion 3: Report complete unavailability [condition: "Only applies if neither organic nor non-organic blueberries are available"] + - Note: Exactly ONE of these three criteria will have is_condition_met=true during scoring + + **Ensure Criterion Disjointness (Avoid Double-Penalty Structures)**: + Make criteria as disjoint and non-overlapping as possible to avoid penalizing the same mistake multiple times. + + **Key Principle**: If criterion A penalizes for not using platform/method X, then other criteria should evaluate task completion aspects (finding entities, making selections, progressing workflows) **independently** of whether platform/method X was used. 
+ + **Pattern to AVOID** (Double-penalty): + - Criterion 1: "Identify a Mexican restaurant on gayot.com" + - Criterion 2: "Reach reservation interface on gayot.com" + - Problem: Both penalize for not using gayot.com → agent loses points twice for same mistake + + **Pattern to FOLLOW** (Disjoint): + - Criterion 1: "Attempt gayot.com as the specified platform" (evaluates platform usage) + - Criterion 2: "Identify a Mexican restaurant in Chicago's Northside" (evaluates entity identification, independent of platform) + - Criterion 3: "Reach a reservation booking interface for the restaurant" (evaluates workflow progress, independent of platform) + - Result: If gayot.com fails, agent only loses points on Criterion 1, not on 2 and 3 + + **Do Not Create Duplicate Criteria**: + Do NOT create multiple criteria that penalize the same mistake. Duplicate criteria create "double jeopardy" — the agent is unfairly penalized twice for a single error, which distorts scores. If two candidate criteria would both deduct points for the same failure, either combine them into a single criterion or restructure them so each evaluates a truly independent aspect. + + **Examples of duplicates to avoid**: + - "Progress booking flow up to Critical Point" + "Respect Critical Point and avoid personal info" + → Instead, create ONE criterion: "Progress booking flow up to (but not beyond) the Critical Point, stopping before entering personal/payment information" + - "Add product to cart" + "Do not complete checkout" + → Instead, create ONE criterion: "Add product to cart and stop before entering personal/payment details" + - "Add the liquid ingredients to the Target cart" + "Ensure only liquid ingredients are added to the cart" + → Instead, create ONE criterion: "Add only the liquid ingredients to the Target cart" — the action and the constraint evaluate the same thing: whether the correct items (and only the correct items) were added. Adding a wrong item should be penalized once, not twice. 
+ - "Identify suitable online purchase options for each listed spice/seasoning" + "Limit purchasing scope to spices/seasonings only" + → Instead, create ONE criterion: "Identify suitable online purchase options for only the listed spices/seasonings" — the scope constraint is already implicit in the action: if the agent correctly identifies options for each listed spice, it has necessarily limited scope to spices. + + **Test for duplication**: Ask yourself: "If the agent makes mistake X, would it lose points in multiple criteria?" If YES, either combine those criteria into one or restructure them so each criterion evaluates a genuinely independent aspect of the task. + + **Decompose List-Style Tasks into Per-Item Criteria**: + When a task contains a "laundry list" of items to perform the same action on (e.g., "add ground beef, onion, garlic, black beans, corn, tomato sauce, chili powder, cumin, cheddar cheese, and cornbread mix to my cart"), create a **separate criterion for each item** rather than a single criterion for all items combined. + + **When to apply this rule**: + - The task lists 3 or more items/entities that all require the same type of action (e.g., add to cart, search for, book, find, look up, etc.) + - The items are independently actionable (success/failure on one item does not inherently depend on another) + + **Why**: Each item may have independent failure modes (out of stock, not found, wrong product selected, different availability). A single combined criterion cannot fairly award partial credit when some items succeed and others fail. Separate criteria allow precise, per-item scoring. + + **How**: Create one criterion per item, each with its own max_points and description including item-specific partial credit guidance (e.g., out of stock, wrong variant selected, not found after reasonable search). 
+ + **Example**: Task: "Add ground beef, onion, garlic, black beans, and corn to my cart on Stop & Shop" + - WRONG: Single criterion "Add all grocery items to cart" [10 points] + - RIGHT: Separate criteria: + * "Add ground beef to cart" [2 points] — Full credit if added, or if unavailable and agent reports this. Partial credit for wrong cut/variant when correct one is available. + * "Add onion to cart" [2 points] — Full credit if added, or if unavailable and agent reports this. + * "Add garlic to cart" [2 points] — Full credit if added, or if unavailable and agent reports this. + * "Add black beans to cart" [2 points] — Full credit if added, or if unavailable and agent reports this. + * "Add corn to cart" [2 points] — Full credit if added, or if unavailable and agent reports this. + + **Note**: You may still have additional criteria for platform access (e.g., "Access the Stop & Shop website") or Critical Point boundaries, separate from the per-item criteria. + + **Partial Credit Guidance** (to be incorporated into the "description" field of each criterion): + Each criterion's "description" field must specify how to handle both successful completion AND common failure modes caused by external factors. Use the framework below: + + **1. Primary Intent vs. Secondary Constraints**: + - **Primary intent**: The core objective that defines task success (e.g., "book reservation at Restaurant X on date Z") + - **Secondary constraints**: Preferred methods/platforms (e.g., "using platform Y") + - Award **FULL credit** if primary intent is achieved through reasonable alternatives when secondary constraints are impossible due to uncontrollable factors + - Award **PARTIAL credit** if secondary constraints are ignored without attempting them first, but primary intent is still achieved + - Award **NO credit** if primary intent is violated (e.g., booking wrong restaurant name or wrong date when correct ones are available) + + **2. 
Entity Non-Existence Scenarios**: + - If a specified entity (restaurant, product, business, service) no longer exists or cannot be found after reasonable search, award **FULL credit** for clearly reporting this finding + - If an entity doesn't exist, award **FULL credit** for either: (a) reporting non-existence OR (b) identifying a reasonable alternative that matches the primary intent + - Examples: Restaurant closed permanently, product discontinued, airline no longer operates that route, person not in directory + + **3. Availability and Inventory Constraints**: + - For time-sensitive requests (dates, times, reservation slots), award **FULL credit** for accurately reporting unavailability when the requested option genuinely doesn't exist + - Award **FULL credit** for either: (a) clearly stating unavailability OR (b) selecting the best available alternative that matches primary intent + - Examples: No direct flights on requested date, restaurant fully booked, product out of stock, hotel no vacancy + + **4. Platform Capability Limitations**: + - If a specified platform cannot support the required action (e.g., platform doesn't list the entity, platform blocks access, platform requires critical point crossing), award **FULL credit** for either: (a) reporting this limitation OR (b) achieving the goal through an alternative platform + - Do NOT penalize for using alternative platforms when the specified platform is genuinely incapable or inaccessible + - Example: Platform Y doesn't have Restaurant X listed → full credit for booking Restaurant X on Platform Z instead + + **5. 
Search Results and Filtering Constraints**: + - When a task involves multiple filtering criteria but no result satisfies them all, award **FULL credit** if the agent: (a) identifies the best available option matching the **primary intent**, OR (b) states that no exact match exists, OR (c) both + - For subjective tasks (e.g., "find the best new sushi restaurant"), award **FULL credit** for consulting authoritative sources and making reasonable selections + - Example: Task requests "100% cotton Power Rangers hoodie" but only polyester hoodie exists → full credit for recommending the hoodie OR stating no exact match + + **6. Reasonable Effort Requirements**: + - Award **FULL credit** only if the agent demonstrates reasonable effort before resorting to alternatives (attempting the specified approach, encountering genuine blocker, THEN reporting/finding alternative) + - Award **PARTIAL credit** if agent immediately defaults to alternatives without attempting the specified approach when it was accessible + - Award **NO credit** for premature abandonment without reasonable attempt + + **Instructions**: + 1. Read the task description carefully. + 2. Identify and extract **subgoals** directly stated in the task, and describe how to evaluate each subgoal, including how to award partial credit for common failure modes or external dependencies outside the agent's control. + 3. Output a minimal rubric to grade how well an Agent completed the subgoals. You will format your output as a rubric with the following elements/fields + + The rubric must be: + 1. Formatted as json dictionary of a (possibly nested) list of "items" + 2. 
Each Item in the rubric must contain the following fields IN ORDER: + - For CONDITIONAL criteria: ["criterion", "condition", "task_span", "description", "max_points", "justification", "earned_points"] + - For NON-CONDITIONAL criteria: ["criterion", "task_span", "description", "max_points", "justification", "earned_points"] + - The "condition" field should ONLY be present for conditional criteria (criteria that only apply when specific conditions are met) + - The "task_span" field is REQUIRED on every criterion (verbatim substring of the task, or "" for Critical Point boundary criteria only) + 3. Choose the "max_points" judiciously to account for possible failure modes that could earn partial credit: goals that would have more failure modes deserve higher max_points. + 4. The "description" should explain *what* goal the criteria is evaluating and *how* partial credit could be awarded to fairly penalize the agent's mistakes while accounting for external dependencies outside the agent's control. + 5. For conditional criteria, the "condition" field must clearly state when the criterion applies (e.g., "Only applies if organic blueberries are unavailable") + 6. Leave the "earned_points" and "justification" fields **empty** (since this rubric isn't being evaluated right now). + 7. Do not make criteria for formatting/style unless stated explicitly in the Task. + 8. Keep the rubric simple, following ONLY the main keypoints the task required. Do not overcomplicate the criteria or include optional items that were not explicitly mentioned. + + **ANTI-PATTERN — Over-Specification / Hallucinated Sub-Goals**: + A common failure mode is inventing criteria that the task never asked for. If a criterion's task_span isn't a substring of the original task description (and it isn't the Critical Point boundary), the criterion is over-specifying. + + Anti-example task: "Find the citation page for the City of Vancouver." 
+ - WRONG criterion: "Display the citation number, fine amount, and payment due date" — the task did not ask for any of these specific fields. There is no "citation number" or "fine amount" in the task text. + * task_span: → drop. + - WRONG criterion: "Provide instructions for paying the citation online" — the task did not ask for payment instructions. + * task_span: → drop. + - RIGHT criterion: "Locate the citation page on the City of Vancouver site" [3 points] + * task_span: "citation page for the City of Vancouver" + * Description: Full credit if agent navigates to the citation/ticket page on the official cityofvancouver site, or if no such page exists and agent reports this. Partial credit for finding an unrelated city services page. + + Anti-example task: "Search arXiv for the most recent paper on retrieval-augmented generation." + - WRONG criterion: "Output the abstract and author list of the paper" — task didn't ask for abstract or author list. The agent just needs to identify and report the paper. + * task_span: → drop. + - RIGHT criterion: "Identify the most recent arXiv paper on retrieval-augmented generation" [4 points] + * task_span: "most recent paper on retrieval-augmented generation" + + **The test**: For each criterion you draft, ask: "Can I copy a contiguous substring of the task that this criterion is evaluating?" If no, the criterion is over-specifying and must be dropped. + + ================== + Example 1 (high level): "Book a reservation at a Mexican restaurant in the Northside of Chicago on 09/29/2025 for 2:45 PM at gayot.com." 
+ High level rubric: + - Start on gayot.com as the specified platform [2 points] //Full credit if: (a) agent attempts gayot.com but it's blocked/unavailable and clearly reports this, OR (b) agent successfully completes the booking through an alternative platform after determining gayot.com cannot complete the booking + - Identify a Mexican restaurant in Chicago's Northside [3 points] //Full credit for correct identification. Partial credit if restaurant location is ambiguous or not clearly in Northside. No credit if wrong cuisine type when Mexican options exist + - Reach a reservation booking interface for the restaurant [2 points] //Full credit for ANY reservation booking interface (decoupled from platform criterion to avoid double-penalizing). Full credit if restaurant doesn't offer online booking and agent reports this + - Attempt date/time selection for 09/29/2025 at 2:45 PM [3 points] //Full credit if: (a) correct date/time selected, OR (b) that date/time is unavailable and agent clearly indicates this, OR (c) agent selects closest available alternative and explains why. No credit if wrong date selected when correct date is available + - Respect Critical Point boundaries (no personal info or final booking) [2 points] //Full credit for stopping before entering personal information. Deduct if agent unnecessarily crosses critical point + + Takeaway: Make criteria as disjoint and non-overlapping as possible, e.g. do NOT make "Reach the reservation interface starting from gayot.com" a criterion bc this overlaps with the first criterion. Each criterion should clearly specify full credit scenarios for both success AND uncontrollable failure modes. + + ================== + Example 2 (detailed): Help purchase 'Fosbait: Lanthanum Carbonate 500mg Tablets' for kidney diseases on drugssquare.com. 
+ + Example Output Rubric: + {{ + "items": [ + {{ + "criterion": "Access drugssquare.com website", + "task_span": "drugssquare.com", + "description": "Successfully navigate to or access the drugssquare.com website as specified in the task. Full credit if: (a) site is successfully accessed, OR (b) site is inaccessible (blocked, down, error) and agent clearly indicates this and either stops or attempts alternative source. Partial credit if agent immediately uses alternative without attempting drugssquare.com first.", + "max_points": 2, + "justification": "", + "earned_points": "" + }}, + {{ + "criterion": "Locate the specific product", + "task_span": "Fosbait: Lanthanum Carbonate 500mg Tablets", + "description": "Find and identify the exact product 'Fosbait: Lanthanum Carbonate 500mg Tablets' on the website. Full credit if: (a) exact product found, OR (b) product is discontinued/out of stock/not found after reasonable search and agent clearly reports this, OR (c) agent identifies closest alternative (different dosage of same medication) when exact product unavailable and explains the substitution. Partial credit for selecting different dosage when the intended 500mg dosage IS available on the site. No credit for selecting entirely different medication when Fosbait is available.", + "max_points": 3, + "justification": "", + "earned_points": "" + }}, + {{ + "criterion": "Facilitate the purchase process until a Critical Point", + "task_span": "", + "description": "Help complete the purchase of the identified product up to but not crossing the Critical Point. Full credit if: (a) product is successfully added to cart and agent stops before entering personal/payment information, OR (b) the website's purchase flow is broken/unavailable and agent reports this, OR (c) website requires account creation or payment info before allowing add-to-cart and agent reports this limitation. 
Partial credit if agent proceeds partially but makes errors in the purchase flow (wrong quantity, wrong shipping option, etc.). No credit if agent either skips the cart entirely OR unnecessarily enters personal information when not required.", + "max_points": 3, + "justification": "", + "earned_points": "" + }} + ] + }} + Please output an answer in pure JSON format according to the following schema. The JSON object must be parsable as-is. DO NOT OUTPUT ANYTHING OTHER THAN JSON, AND DO NOT DEVIATE FROM THE ABOVE SCHEMA:`; diff --git a/packages/core/lib/v3/verifier/prompts/taskValidity.ts b/packages/core/lib/v3/verifier/prompts/taskValidity.ts new file mode 100644 index 000000000..693aa2bb0 --- /dev/null +++ b/packages/core/lib/v3/verifier/prompts/taskValidity.ts @@ -0,0 +1,116 @@ +/** + * Task validity classification prompt — Step 10 of the rubric verifier + * pipeline. + * + * Task-only verification — classifies the task itself (without trajectory + * context) along two axes: + * - Ambiguity (taxonomy category 7) — underspecified or admits multiple + * valid interpretations + * - Validity (taxonomy category 8) — impossible / illegal / NSFW / RAI + * + * Diagnostic signal only. Populates EvaluationResult.taskValidity. + * + * Variables substituted at call time: + * - task_definition + * - url + * - apps + * - date + */ +import { getTaxonomyText } from "../errorTaxonomy.js"; + +// Axis definitions used by the task-level validity prompt. +const AXIS1_AMBIGUITY_DEFINITION = `A task is **ambiguous** when it is missing critical information that would prevent a reasonable agent from completing it correctly, or when the description admits multiple valid interpretations that cannot be resolved from context alone. + +A task is **not ambiguous** if a competent agent could reasonably infer a single correct course of action from the task description and the given URL/app, even if minor details (like output formatting) are underspecified. 
+ +**NOTE:** For tasks that provide a URL, website, or app, this context is considered part of the specification and may resolve otherwise ambiguous parameters.`; + +const AXIS2_INVALID_DEFINITION = `A task is **invalid** when it cannot be completed due to inherent constraints, or when it is illegal, NSFW, or violates Responsible AI principles. These are properties of the task itself, not of the agent's execution. + +A task is **not invalid** if a competent agent could, in principle, accomplish the goal using standard computer interactions in its environment, even if the task is complex, time-consuming, or requires many steps. + +Do NOT confuse impossibility with ambiguity. An ambiguous task (missing details) may still be possible if the agent can make reasonable assumptions.`; + +// Sub-category bullets from the error taxonomy (categories 7 and 8). +const CAT7_BULLETS = getTaxonomyText(7, 7, 4); +const CAT8_BULLETS = getTaxonomyText(8, 8, 4); + +export const TASK_VALIDITY_PROMPT = `You are an expert failure analyst for computer-use web agents. + +You will analyze a single task **before execution** to identify whether it suffers from task-level issues that would prevent or compromise successful completion. Specifically, you will classify the task along two axes drawn from a standardized error taxonomy: + +1. **Task Ambiguity (Category 7)** — Is the task ambiguous or underspecified? +2. **Invalid Task (Category 8)** — Is the task impossible, illegal, NSFW, or a Responsible AI violation? + +A computer-use agent can interact with a full desktop environment: web browsers, desktop applications, file systems, terminals, and any GUI-based software. + +--- + +## Axis 1 — Task Ambiguity (Category 7) + +${AXIS1_AMBIGUITY_DEFINITION} + +${CAT7_BULLETS} + +**EXAMPLES OF NON-AMBIGUOUS TASKS** +1. Task: "Find the cheapest round-trip flight from San Francisco to Tokyo departing on June 15 and returning June 22, 2026." 
URL: "https://www.google.com/flights" — origin, destination, dates, trip type, and optimization criterion are all specified. +2. Task: "Add a 16oz bag of Starbucks Pike Place ground coffee to the cart." URL: "https://www.amazon.com/" — the product, size, and action are specific enough. + +**EXAMPLES OF AMBIGUOUS TASKS** +1. Task: "Browse apartments with at least 2 bedrooms…" URL: "https://craigslist.org/" — Craigslist requires a city; this is underspecified (7.1). +2. Task: "Book a flight from NYC to London." URL: "https://www.google.com/flights" — travel dates not specified (7.1). +3. Task: "Highlight over 'the' in file.docx" — open to multiple interpretations when multiple instances exist (7.2). + +--- + +## Axis 2 — Invalid Task (Category 8) + +${AXIS2_INVALID_DEFINITION} + +${CAT8_BULLETS} + +**EXAMPLES OF VALID TASKS** +1. "Find the cheapest round-trip flight from San Francisco to Tokyo." on Google Flights — site supports flight search. +2. "Submit a job application for a software engineering role." on careers.google.com — site supports it. + +**EXAMPLES OF INVALID TASKS** +1. "Log into my personal bank account…" — agent has no credentials (8.1 impossible). +2. "Book a hotel room on LinkedIn." — LinkedIn is professional networking, not a travel site (8.1). +3. "Download and launch GTA 6." — non-existent entity at time of release (8.1). +4. Drug/piracy/prostitution-related tasks → 8.2 (illegal). +5. NSFW-redirecting tasks → 8.3. +6. Fraud, harassment, surveillance, discriminatory targeting → 8.2 / 8.4 (illegal + RAI). + +--- + +## Context + +Task: $task_definition + +URL: $url + +Applications: $apps + +Current Date: $date + +## Instructions + +Analyze the task across both axes. For each axis, provide reasoning and a classification. Be precise: only flag genuine issues that would materially affect task completion. + +**Guiding principles:** +- A task that is merely difficult, tedious, or multi-step is NOT impossible. 
+- A task that has minor formatting ambiguity is NOT ambiguous. +- The current date is provided to help assess time-sensitive impossibility (e.g., expired events, future releases). + +**IMPORTANT** +Output your answer in pure JSON format according to the following schema. The JSON object must be parsable as-is. DO NOT OUTPUT ANYTHING OTHER THAN JSON, AND DO NOT DEVIATE FROM THIS SCHEMA: + +{{ + "reasoning_is_ambiguous": str, + "is_ambiguous": bool, + "ambiguity_codes": [str], + "reasoning_is_invalid": str, + "is_invalid": bool, + "invalid_task_codes": [str] +}} +`; diff --git a/packages/core/lib/v3/verifier/rubricVerifier.ts b/packages/core/lib/v3/verifier/rubricVerifier.ts new file mode 100644 index 000000000..4fdc28768 --- /dev/null +++ b/packages/core/lib/v3/verifier/rubricVerifier.ts @@ -0,0 +1,1996 @@ +import { z } from "zod"; + +import type { LLMClient, LLMResponse } from "../llm/LLMClient.js"; +import type { LLMParsedResponse } from "../../inference.js"; +import type { LogLine } from "../types/public/logs.js"; + +import type { + CanonicalEvidence, + CanonicalScreenshot, + CanonicalTextEvidence, + CriterionScore, + EvaluationResult, + EvidenceLoadResult, + Rubric, + RubricVerifierOptions, + TaskSpec, + Trajectory, + Verifier, + VerifierConfig, +} from "./types.js"; +import { normalizeRubric } from "./trajectory.js"; +import { + FIRST_POINT_OF_FAILURE_PROMPT, + FUSED_JUDGMENT_PROMPT, + FUSED_OUTCOME_PROMPT, + MM_BATCHED_RELEVANCE_PROMPT, + MM_PER_CRITERION_SCORE_PROMPT, + RUBRIC_GENERATION_PROMPT, + TASK_VALIDITY_PROMPT, + buildInitUrlContext, + parseFailureStepNumbers, + renderPrompt, +} from "./prompts/index.js"; +import { + collectCanonicalEvidence, + isImageEvidence, + isTextEvidence, +} from "./evidence.js"; +import { getTaxonomyText } from "./errorTaxonomy.js"; + +const RubricItemSchema = z.object({ + criterion: z.string(), + description: z.string(), + max_points: z.number(), + condition: z.string().optional(), + task_span: z.string().optional(), + 
justification: z.string().optional(), + earned_points: z.union([z.number(), z.string()]).optional(), +}); + +/** Rubric payload: an `items` array of rubric criteria. */ const RubricSchema = z.object({ + items: z.array(RubricItemSchema), +}); + +/** One diagnostic finding; an unrecognized `category` or `severity` falls back to "other" / "info" via `.catch` instead of failing the parse. */ const FindingSchema = z.object({ + category: z + .enum([ + "agent_tool_usage", + "agent_strategy", + "rubric_quality", + "trajectory_capture", + "task_specification", + "verifier_uncertainty", + "other", + ]) + .catch("other"), + severity: z.enum(["info", "warning", "blocking"]).catch("info"), + description: z.string(), + suggestedAction: z.string().optional(), + relatedSteps: z.array(z.number()).optional(), +}); + +/** Outcome verdict: primary intent, reasoning and success flag; `findings` defaults to [] when omitted. */ const OutcomeSchema = z.object({ + primary_intent: z.string(), + reasoning: z.string(), + output_success: z.boolean(), + findings: z.array(FindingSchema).optional().default([]), +}); + +/** NOTE(review): field-for-field identical to OutcomeSchema; confirm whether both declarations are needed. */ const FusedOutcomeSchema = z.object({ + primary_intent: z.string(), + reasoning: z.string(), + output_success: z.boolean(), + findings: z.array(FindingSchema).optional().default([]), +}); + +/** Score for one criterion; `criterion_idx` and `earned_points` go through z.coerce, so numeric strings are accepted. */ const FusedPerCriterionSchema = z.object({ + criterion_idx: z.coerce.number().int().min(0), + applicable_evidence: z.string().optional().default(""), + justification: z.string().optional().default(""), + earned_points: z.coerce.number(), + evidence_sufficient: z.boolean().optional().default(true), + condition_met: z.boolean().nullable().optional(), +}); + +/** Single failure point carried by the fused responses; `step_index` is coerced to an integer. */ const FusedFailurePointSchema = z.object({ + step_index: z.coerce.number().int(), + error_code: z.string(), + error_category: z.string(), + description: z.string(), +}); + +/** Task-level ambiguity / invalidity flags; the reason strings default to "". */ const FusedTaskValiditySchema = z.object({ + is_ambiguous: z.boolean(), + ambiguity_reason: z.string().optional().default(""), + is_invalid: z.boolean(), + invalid_reason: z.string().optional().default(""), +}); + +/** Full fused response: outcome plus per-criterion scores, with optional failure point and task validity. */ const FusedJudgmentResponseSchema = z.object({ + outcome: FusedOutcomeSchema, + per_criterion: z.array(FusedPerCriterionSchema), + failure_point: FusedFailurePointSchema.optional(), + task_validity: FusedTaskValiditySchema.optional(), +}); + +/** Outcome-only response:
no per_criterion field, just outcome + diagnostics. */ +const FusedOutcomeResponseSchema = z.object({ + outcome: FusedOutcomeSchema, + failure_point: FusedFailurePointSchema.optional(), + task_validity: FusedTaskValiditySchema.optional(), +}); + +/** Relevance of one evidence item: an integer score from 0 to 10 per criterion index. */ const BatchedRelevanceItemSchema = z.object({ + evidence_idx: z.coerce.number().int().min(0), + scores: z.array( + z.object({ + criterion_idx: z.coerce.number().int().min(0), + score: z.coerce.number().int().min(0).max(10), + }), + ), +}); +/** Batched relevance response: one entry per evidence item. */ const BatchedRelevanceResponseSchema = z.object({ + items: z.array(BatchedRelevanceItemSchema), +}); + +/** NOTE(review): field-for-field identical to FusedPerCriterionSchema; confirm whether both declarations are needed. */ const PerCriterionScoreResponseSchema = z.object({ + criterion_idx: z.coerce.number().int().min(0), + applicable_evidence: z.string().optional().default(""), + justification: z.string().optional().default(""), + earned_points: z.coerce.number(), + evidence_sufficient: z.boolean().optional().default(true), + condition_met: z.boolean().nullable().optional(), +}); + +/** Same field names as the JSON schema requested at the end of TASK_VALIDITY_PROMPT; both code arrays default to []. */ const TaskValiditySchema = z.object({ + reasoning_is_ambiguous: z.string(), + is_ambiguous: z.boolean(), + ambiguity_codes: z.array(z.string()).default([]), + reasoning_is_invalid: z.string(), + is_invalid: z.boolean(), + invalid_task_codes: z.array(z.string()).default([]), +}); + +/** One failure point from failure analysis; `step_numbers` is kept as a raw string here (presumably decoded later by parseFailureStepNumbers; confirm against the caller). */ const FailurePointSchema = z.object({ + step_numbers: z.string(), + error_code: z.string(), + error_category: z.string(), + error_type: z.string(), + what_happened: z.string(), + agent_reasoning: z.string(), + evidence: z.string(), + impact: z.string(), +}); + +/** Failure-analysis response; `failure_points` defaults to [] when omitted. */ const FailureAnalysisSchema = z.object({ + reasoning: z.string(), + has_failure: z.boolean(), + failure_points: z.array(FailurePointSchema).default([]), +}); + +/** Logger that discards every line. */ const noopLogger: (line: LogLine) => void = () => {}; +const APPROX_CHARS_PER_TOKEN = 4; +/** The DEFAULT_* values below are the fallbacks resolveVerifierConfig uses when neither an override nor an env var supplies a value. */ const DEFAULT_ACTION_HISTORY_TOKEN_BUDGET = 2_000; +const DEFAULT_EVIDENCE_TOKEN_BUDGET = 3_000; +const DEFAULT_OUTCOME_EVIDENCE_TOKEN_BUDGET = 4_000; +const DEFAULT_OUTCOME_IMAGE_LIMIT = 3; +const DEFAULT_MAX_PARALLEL = 8; +const DEFAULT_TOP_K
= 5; +const DEFAULT_RELEVANCE_BATCH_SIZE = 4; +const OUTCOME_EVIDENCE_MAX_STEPS = 14; +const OUTCOME_EVIDENCE_STEP_CHARS = 900; +type VerifierApproach = VerifierConfig["approach"]; +type OptionalStepsMode = VerifierConfig["optionalSteps"]; +const DEFAULT_APPROACH: VerifierApproach = "b"; +const DEFAULT_OPTIONAL_STEPS_MODE: OptionalStepsMode = "folded"; + +const NO_TRUNC = Number.MAX_SAFE_INTEGER; + +function readPositiveIntEnv( + env: NodeJS.ProcessEnv, + name: string, + fallback: number, +): number { + const raw = env[name]; + if (!raw) return fallback; + const parsed = Number.parseInt(raw, 10); + return Number.isFinite(parsed) && parsed > 0 ? parsed : fallback; +} + +function readChars( + env: NodeJS.ProcessEnv, + name: string, + fallback: number, + disabled: boolean, +): number { + if (disabled) return NO_TRUNC; + return readPositiveIntEnv(env, name, fallback); +} + +function readApproach(env: NodeJS.ProcessEnv): VerifierApproach { + const raw = env.VERIFIER_APPROACH; + if (raw === "a" || raw === "b" || raw === "outcome-only") return raw; + return DEFAULT_APPROACH; +} + +function readOptionalsMode(env: NodeJS.ProcessEnv): OptionalStepsMode { + const raw = env.VERIFIER_OPTIONAL_STEPS; + if (raw === "folded" || raw === "separate" || raw === "skip") return raw; + return DEFAULT_OPTIONAL_STEPS_MODE; +} + +/** + * Resolve every verifier knob from env (+ optional overrides) into a frozen + * VerifierConfig. Called once by RubricVerifier's constructor; per-call + * overrides flow through verify()'s third arg. + * + * The master switch VERIFIER_DISABLE_TRUNCATION=1 lifts every per-section + * limit to MAX_SAFE_INTEGER — useful on high-context models where + * evidence-bound truncation is the bottleneck, not the token budget. + */ +export function resolveVerifierConfig( + env: NodeJS.ProcessEnv = process.env, + overrides: Partial = {}, +): VerifierConfig { + const truncDisabled = + overrides.truncation?.disabled ?? 
env.VERIFIER_DISABLE_TRUNCATION === "1"; + return { + approach: overrides.approach ?? readApproach(env), + optionalSteps: overrides.optionalSteps ?? readOptionalsMode(env), + topK: overrides.topK ?? readPositiveIntEnv(env, "VERIFIER_TOP_K", DEFAULT_TOP_K), + relevanceBatchSize: + overrides.relevanceBatchSize ?? + readPositiveIntEnv( + env, + "VERIFIER_RELEVANCE_BATCH_SIZE", + DEFAULT_RELEVANCE_BATCH_SIZE, + ), + outcomeMaxImages: + overrides.outcomeMaxImages ?? + readPositiveIntEnv( + env, + "VERIFIER_OUTCOME_MAX_IMAGES", + DEFAULT_OUTCOME_IMAGE_LIMIT, + ), + maxParallel: + overrides.maxParallel ?? + readPositiveIntEnv(env, "VERIFIER_MAX_PARALLEL", DEFAULT_MAX_PARALLEL), + evidenceTokenBudget: + overrides.evidenceTokenBudget ?? + readPositiveIntEnv( + env, + "VERIFIER_EVIDENCE_TOKEN_BUDGET", + DEFAULT_EVIDENCE_TOKEN_BUDGET, + ), + outcomeEvidenceTokenBudget: + overrides.outcomeEvidenceTokenBudget ?? + readPositiveIntEnv( + env, + "VERIFIER_OUTCOME_EVIDENCE_TOKEN_BUDGET", + DEFAULT_OUTCOME_EVIDENCE_TOKEN_BUDGET, + ), + actionHistoryTokenBudget: + overrides.actionHistoryTokenBudget ?? + readPositiveIntEnv( + env, + "VERIFIER_ACTION_HISTORY_TOKEN_BUDGET", + DEFAULT_ACTION_HISTORY_TOKEN_BUDGET, + ), + truncation: { + disabled: truncDisabled, + evidenceTextPreview: + overrides.truncation?.evidenceTextPreview ?? + readChars(env, "VERIFIER_EVIDENCE_TEXT_PREVIEW_CHARS", 200, truncDisabled), + groupedEvidenceText: + overrides.truncation?.groupedEvidenceText ?? + readChars(env, "VERIFIER_GROUPED_EVIDENCE_TEXT_CHARS", 600, truncDisabled), + buildEvidenceText: + overrides.truncation?.buildEvidenceText ?? + readChars(env, "VERIFIER_BUILD_EVIDENCE_TEXT_CHARS", 160, truncDisabled), + buildEvidenceAria: + overrides.truncation?.buildEvidenceAria ?? + readChars(env, "VERIFIER_BUILD_EVIDENCE_ARIA_CHARS", 1200, truncDisabled), + actionHistoryReasoning: + overrides.truncation?.actionHistoryReasoning ?? 
/** Shallow copy of `source` without the keys whose value is `undefined`. */
function withoutUndefined<T extends object>(source: T): Partial<T> {
  const result: Partial<T> = {};
  for (const key of Object.keys(source) as Array<keyof T>) {
    const value = source[key];
    if (value !== undefined) result[key] = value;
  }
  return result;
}

/**
 * Layer per-call overrides on top of an already-resolved config.
 *
 * Top-level keys are replaced wholesale; `truncation` is merged field by
 * field. Keys that are present but `undefined` are ignored: a plain object
 * spread would copy them over the resolved value and leave e.g. `topK`
 * undefined for the rest of the pipeline.
 *
 * Note: this only merges values. Setting `truncation.disabled` here flips the
 * flag but does not re-resolve the individual character limits.
 *
 * @param base - Fully resolved config; never mutated.
 * @param overrides - Optional per-call overrides.
 * @returns `base` itself when there are no overrides, otherwise a new object.
 */
function mergeConfig(
  base: VerifierConfig,
  overrides?: Partial<VerifierConfig>,
): VerifierConfig {
  if (!overrides) return base;
  const { truncation: truncationOverrides, ...topLevelOverrides } = overrides;
  return {
    ...base,
    ...withoutUndefined(topLevelOverrides),
    truncation: {
      ...base.truncation,
      ...withoutUndefined(truncationOverrides ?? {}),
    },
  };
}
/**
 * Convert the fused judgment's per-criterion entries into `CriterionScore`s,
 * one per rubric item and in rubric order.
 *
 * Entries are matched to rubric items by `criterion_idx`; when the same index
 * appears more than once the last entry wins. A rubric item with no matching
 * entry gets `earnedPoints: null` and is flagged evidence-insufficient.
 * Earned points are clamped into `[0, maxPoints]`.
 */
function mapFusedPerCriterionToScores(
  rubric: Rubric,
  perCriterion: z.infer<typeof FusedJudgmentResponseSchema>["per_criterion"],
): CriterionScore[] {
  const entryByCriterion = new Map(
    perCriterion.map((entry) => [entry.criterion_idx, entry] as const),
  );

  const scores: CriterionScore[] = [];
  rubric.items.forEach((item, position) => {
    const match = entryByCriterion.get(position);
    if (match === undefined) {
      scores.push({
        criterion: item.criterion,
        maxPoints: item.maxPoints,
        earnedPoints: null,
        explanation: "Verifier did not return a score for this criterion.",
        evidenceInsufficient: true,
      });
      return;
    }
    const cappedAtMax = Math.min(item.maxPoints, match.earned_points);
    scores.push({
      criterion: item.criterion,
      maxPoints: item.maxPoints,
      earnedPoints: Math.max(0, cappedAtMax),
      explanation: match.justification,
      evidenceInsufficient: match.evidence_sufficient === false,
    });
  });
  return scores;
}
"…" : ""}"`; +} + +function textEvidenceLabel(point: CanonicalTextEvidence): string { + switch (point.source) { + case "probe-aria": + return "ariaTree"; + case "agent-text": + return "agent text"; + case "agent-json": + return "agent JSON"; + case "tool-output": + return "tool output"; + } +} + +function renderEvidenceManifest( + points: CanonicalEvidence[], + previewChars: number, +): string { + if (points.length === 0) return "(no evidence captured)"; + return points + .map( + (p) => + `- evidence_idx=${p.canonicalIndex}: ${evidencePreview(p, previewChars)}`, + ) + .join("\n"); +} + +function renderGroupedEvidenceForApproach( + rubric: Rubric, + evidence: CanonicalEvidence[], + groupedTopK: Map, + textLimit: number, +): string { + if (evidence.length === 0) return "(no evidence captured)"; + const byIdx = new Map(); + for (const e of evidence) byIdx.set(e.canonicalIndex, e); + + const sections: string[] = []; + for (let cIdx = 0; cIdx < rubric.items.length; cIdx++) { + const c = rubric.items[cIdx]; + const topK = groupedTopK.get(cIdx) ?? []; + if (topK.length === 0) { + sections.push( + `### Criterion ${cIdx}: ${c.criterion}\n(no evidence scored highly enough — rely on action history)`, + ); + continue; + } + const body = topK + .map((eIdx) => { + const p = byIdx.get(eIdx); + if (!p) return null; + if (isImageEvidence(p)) { + return `- Evidence #${eIdx} — image @ step=${p.originalStepIndex}`; + } + const text = p.content.replace(/\s+/g, " ").slice(0, textLimit); + return `- Evidence #${eIdx} — ${textEvidenceLabel(p)} @ step=${p.originalStepIndex}: "${text}${p.content.length > textLimit ? 
"…" : ""}"`; + }) + .filter((x): x is string => x !== null) + .join("\n"); + sections.push(`### Criterion ${cIdx}: ${c.criterion}\n${body}`); + } + return sections.join("\n\n"); +} + +export class RubricVerifier implements Verifier { + private readonly getClient: () => LLMClient; + private readonly getRubricGenClient: () => LLMClient; + private readonly logger: (line: LogLine) => void; + private readonly baseConfig: VerifierConfig; + + constructor(opts: RubricVerifierOptions) { + this.getClient = opts.getClient; + this.getRubricGenClient = opts.getRubricGenClient ?? opts.getClient; + this.logger = opts.logger ?? noopLogger; + this.baseConfig = resolveVerifierConfig(process.env, opts.config); + } + + /** Resolved verifier knobs the constructor saw, frozen at construction. */ + get config(): VerifierConfig { + return this.baseConfig; + } + + async verify( + trajectory: Trajectory, + taskSpec: TaskSpec, + overrides?: Partial, + ): Promise { + const config = mergeConfig(this.baseConfig, overrides); + const hasTrajectorySignal = + trajectory.steps.length > 0 || Boolean(trajectory.finalAnswer?.trim()); + if (!hasTrajectorySignal) { + return this.emptyTrajectoryResult( + normalizeRubric(taskSpec.precomputedRubric), + ); + } + + const { approach, optionalSteps: optionalsMode } = config; + + if (approach === "outcome-only") { + return this.verifyOutcomeOnly(trajectory, taskSpec, config); + } + + let rubric: Rubric | undefined = normalizeRubric( + taskSpec.precomputedRubric, + ); + const rubricSource = rubric ? "precomputed" : "generated"; + if (!rubric) { + rubric = await this.generateRubric(taskSpec); + } + + // Empty-evidence trajectories fall back gracefully — the chosen approach + // degrades to an action-history-only judgment downstream. 
+ const { evidence, loaded } = await collectCanonicalEvidence(trajectory); + + const relevanceScores = await this.scoreRelevanceBatched({ + taskSpec, + rubric, + evidence, + config, + }); + + const groupedTopK = groupTopKByCriterion({ + numCriteria: rubric.items.length, + relevanceScores, + topK: config.topK, + }); + + let perCriterion: CriterionScore[]; + let fusedOutcome: z.infer | undefined; + let foldedFailurePoint: z.infer | undefined; + let foldedTaskValidity: z.infer | undefined; + + if (approach === "b") { + const fused = await this.fusedJudgment({ + trajectory, + taskSpec, + rubric, + evidence, + groupedTopK, + foldFailure: optionalsMode === "folded", + foldValidity: optionalsMode === "folded", + config, + }); + perCriterion = mapFusedPerCriterionToScores(rubric, fused.per_criterion); + fusedOutcome = fused.outcome; + foldedFailurePoint = fused.failure_point; + foldedTaskValidity = fused.task_validity; + } else { + // Approach a: per-criterion analysis returns earned_points directly; + // no separate whole-rubric rescore. + perCriterion = await this.scorePerCriterion({ + trajectory, + taskSpec, + rubric, + evidence, + groupedTopK, + config, + }); + + const outcome = await this.verifyOutcomeFused({ + trajectory, + taskSpec, + rubric, + perCriterion, + evidence, + foldFailure: optionalsMode === "folded", + foldValidity: optionalsMode === "folded", + config, + }); + fusedOutcome = outcome.outcome; + foldedFailurePoint = outcome.failure_point; + foldedTaskValidity = outcome.task_validity; + } + + // ── Process score (deterministic from earned_points) ────────────────── + const totals = perCriterion.reduce( + (acc, c) => ({ + earned: acc.earned + (c.earnedPoints ?? 0), + max: acc.max + c.maxPoints, + }), + { earned: 0, max: 0 }, + ); + const processScore = totals.max > 0 ? 
totals.earned / totals.max : 0; + + const evidenceInsufficient = perCriterion + .filter((c) => c.evidenceInsufficient) + .map((c) => c.criterion); + + const findings = (fusedOutcome?.findings ?? []).map((f) => ({ + ...f, + category: f.category ?? ("other" as const), + severity: f.severity ?? ("info" as const), + })); + + // ── Optional steps: folded, separate, or skipped ────────────────────── + let firstPointOfFailure: EvaluationResult["firstPointOfFailure"]; + if (foldedFailurePoint && !fusedOutcome?.output_success) { + firstPointOfFailure = { + stepIndex: foldedFailurePoint.step_index, + errorCode: foldedFailurePoint.error_code, + category: foldedFailurePoint.error_category, + description: foldedFailurePoint.description, + }; + } else if ( + optionalsMode === "separate" && + fusedOutcome && + !fusedOutcome.output_success + ) { + firstPointOfFailure = await this.analyzeFailures({ + trajectory, + taskSpec, + rubric, + perCriterion, + outcome: { + output_success: fusedOutcome.output_success, + primary_intent: fusedOutcome.primary_intent, + reasoning: fusedOutcome.reasoning, + findings: fusedOutcome.findings ?? [], + }, + config, + }).catch((): EvaluationResult["firstPointOfFailure"] => undefined); + } + + let taskValidity: EvaluationResult["taskValidity"]; + if (foldedTaskValidity) { + taskValidity = { + isAmbiguous: foldedTaskValidity.is_ambiguous, + isInvalid: foldedTaskValidity.is_invalid, + ambiguityReason: + foldedTaskValidity.is_ambiguous && foldedTaskValidity.ambiguity_reason + ? foldedTaskValidity.ambiguity_reason + : undefined, + invalidReason: + foldedTaskValidity.is_invalid && foldedTaskValidity.invalid_reason + ? 
foldedTaskValidity.invalid_reason + : undefined, + }; + } else if (optionalsMode === "separate") { + taskValidity = await this.classifyTaskValidity(taskSpec).catch( + (): EvaluationResult["taskValidity"] => ({ + isAmbiguous: false, + isInvalid: false, + }), + ); + } else { + taskValidity = { isAmbiguous: false, isInvalid: false }; + } + + return { + outcomeSuccess: fusedOutcome?.output_success ?? false, + processScore, + perCriterion, + taskValidity, + evidenceInsufficient, + findings: findings.length > 0 ? findings : undefined, + firstPointOfFailure, + rawSteps: { + primaryIntent: fusedOutcome?.primary_intent, + reasoning: fusedOutcome?.reasoning, + rubricSource, + approach, + optionalsMode, + totalEarned: totals.earned, + totalMax: totals.max, + evidenceImages: evidence.filter(isImageEvidence).length, + evidenceTexts: evidence.filter(isTextEvidence).length, + evidenceOriginalScreenshots: loaded.originalCount, + }, + }; + } + + private emptyTrajectoryResult(rubric?: Rubric): EvaluationResult { + const items = rubric?.items ?? []; + return { + outcomeSuccess: false, + explanation: + "No trajectory steps or final answer were captured; skipped verifier LLM calls.", + processScore: 0, + perCriterion: items.map((c) => ({ + criterion: c.criterion, + maxPoints: c.maxPoints, + earnedPoints: 0, + explanation: + "No trajectory steps or final answer were captured; skipped verifier LLM calls.", + evidenceInsufficient: true, + })), + taskValidity: { isAmbiguous: false, isInvalid: false }, + evidenceInsufficient: items.map((c) => c.criterion), + rawSteps: { + reason: "empty-trajectory", + rubricSource: rubric ? "precomputed" : "none", + }, + }; + } + + private async verifyOutcomeOnly( + trajectory: Trajectory, + taskSpec: TaskSpec, + config: VerifierConfig, + ): Promise { + const foldFailure = config.optionalSteps === "folded"; + const foldValidity = config.optionalSteps === "folded"; + const taxonomyBlock = foldFailure + ? 
`\n${getTaxonomyText(1, 6, 4)}\n${getTaxonomyText(7, 8, 4)}\n` + : ""; + + const prompt = renderPrompt(FUSED_OUTCOME_PROMPT, { + task_definition: taskSpec.instruction, + init_url_context: buildInitUrlContext(taskSpec.initUrl), + action_history: this.formatActionHistory(trajectory, config), + outcome_evidence_summary: this.buildOutcomeEvidenceSummary( + trajectory, + taskSpec, + config, + ), + agent_predicted_output: + trajectory.finalAnswer ?? "(no final answer recorded)", + rubric_summary: + "(no rubric - outcome-only mode; judge success from the task, action history, final answer, and attached screenshots)", + taxonomy_block: taxonomyBlock, + fold_failure_analysis: foldFailure ? "true" : "false", + fold_task_validity: foldValidity ? "true" : "false", + current_date: currentDateForTrajectory(trajectory), + }); + + const images = selectRecentImages(trajectory, config.outcomeMaxImages); + const messageContent: Array< + | { type: "text"; text: string } + | { type: "image_url"; image_url: { url: string } } + > = [{ type: "text", text: prompt }]; + + for (const img of images) { + messageContent.push({ + type: "image_url", + image_url: { + url: `data:${img.mediaType};base64,${img.bytes.toString("base64")}`, + }, + }); + } + + let fused: z.infer; + try { + const client = this.getClient(); + const response = await client.createChatCompletion< + LLMParsedResponse + >({ + logger: this.logger, + options: { + messages: [ + { + role: "system", + content: + "You are an expert evaluator of web-navigation agent trajectories. 
Output only valid JSON conforming to the schema in the user message.", + }, + { role: "user", content: messageContent }, + ], + response_model: { + name: "FusedOutcome", + schema: FusedOutcomeResponseSchema, + }, + }, + }); + fused = response.data as unknown as z.infer< + typeof FusedOutcomeResponseSchema + >; + } catch { + fused = { + outcome: { + primary_intent: taskSpec.instruction, + reasoning: + "Outcome-only LLM call failed; defaulting to output_success=false.", + output_success: false, + findings: [ + { + category: "verifier_uncertainty" as const, + severity: "warning" as const, + description: + "The outcome-only verification call did not return a parseable response.", + }, + ], + }, + }; + } + + const outcomeSuccess = fused.outcome.output_success; + const findings = (fused.outcome.findings ?? []).map((f) => ({ + ...f, + category: f.category ?? ("other" as const), + severity: f.severity ?? ("info" as const), + })); + + let firstPointOfFailure: EvaluationResult["firstPointOfFailure"]; + if (fused.failure_point && !outcomeSuccess) { + firstPointOfFailure = { + stepIndex: fused.failure_point.step_index, + errorCode: fused.failure_point.error_code, + category: fused.failure_point.error_category, + description: fused.failure_point.description, + }; + } + + const taskValidity: EvaluationResult["taskValidity"] = fused.task_validity + ? { + isAmbiguous: fused.task_validity.is_ambiguous, + isInvalid: fused.task_validity.is_invalid, + ambiguityReason: + fused.task_validity.is_ambiguous && + fused.task_validity.ambiguity_reason + ? fused.task_validity.ambiguity_reason + : undefined, + invalidReason: + fused.task_validity.is_invalid && fused.task_validity.invalid_reason + ? fused.task_validity.invalid_reason + : undefined, + } + : { isAmbiguous: false, isInvalid: false }; + + return { + outcomeSuccess, + explanation: fused.outcome.reasoning, + taskValidity, + findings: findings.length > 0 ? 
findings : undefined, + firstPointOfFailure, + rawSteps: { + primaryIntent: fused.outcome.primary_intent, + reasoning: fused.outcome.reasoning, + approach: "outcome-only", + optionalsMode: config.optionalSteps, + screenshotsAttached: images.length, + }, + }; + } + + /** + * Score every (evidence, criterion) pair with one batched call per chunk, + * to avoid a per-(criterion, frame) fan-out. Failed batches contribute + * all-zeros scores so the downstream top-K still produces valid groups. + */ + private async scoreRelevanceBatched(args: { + taskSpec: TaskSpec; + rubric: Rubric; + evidence: CanonicalEvidence[]; + config: VerifierConfig; + }): Promise>> { + const { taskSpec, rubric, evidence, config } = args; + const out = new Map>(); + if (evidence.length === 0) return out; + + const numCriteria = rubric.items.length; + const rubricCriteriaText = rubric.items + .map( + (c, i) => + `\n${i}. **${c.criterion}**\n Description: ${c.description}\n`, + ) + .join(""); + + const batchSize = Math.max(1, config.relevanceBatchSize); + + const batches: CanonicalEvidence[][] = []; + for (let i = 0; i < evidence.length; i += batchSize) { + batches.push(evidence.slice(i, i + batchSize)); + } + + const limit = pLimit(config.maxParallel); + + const tasks = batches.map((batch) => + limit(async () => { + const manifest = renderEvidenceManifest( + batch, + config.truncation.evidenceTextPreview, + ); + const prompt = renderPrompt(MM_BATCHED_RELEVANCE_PROMPT, { + task_definition: taskSpec.instruction, + init_url_context: buildInitUrlContext(taskSpec.initUrl), + rubric_criteria: rubricCriteriaText, + evidence_manifest: manifest, + }); + + const messageContent: Array< + | { type: "text"; text: string } + | { + type: "image_url"; + image_url: { url: string }; + } + > = [{ type: "text", text: prompt }]; + + for (const ev of batch) { + if (isImageEvidence(ev)) { + messageContent.push({ + type: "image_url", + image_url: { + url: `data:${ev.mediaType};base64,${ev.bytes.toString("base64")}`, + }, 
+ }); + } else { + messageContent.push({ + type: "text", + text: `\n[evidence_idx=${ev.canonicalIndex} — ${textEvidenceLabel(ev)} at step ${ev.originalStepIndex}]\n${ev.content}\n`, + }); + } + } + + try { + const client = this.getClient(); + const response = await client.createChatCompletion< + LLMParsedResponse + >({ + logger: this.logger, + options: { + messages: [ + { + role: "system", + content: + "You are scoring how relevant each evidence point in a batch is to each rubric criterion. Output only valid JSON conforming to the schema in the user message.", + }, + { role: "user", content: messageContent }, + ], + response_model: { + name: "BatchedRelevance", + schema: BatchedRelevanceResponseSchema, + }, + }, + }); + const data = response.data as unknown as z.infer< + typeof BatchedRelevanceResponseSchema + >; + for (const item of data.items) { + const scoreMap = new Map(); + for (const s of item.scores) { + if (s.criterion_idx >= 0 && s.criterion_idx < numCriteria) { + scoreMap.set(s.criterion_idx, s.score); + } + } + for (let i = 0; i < numCriteria; i++) { + if (!scoreMap.has(i)) scoreMap.set(i, 0); + } + out.set(item.evidence_idx, scoreMap); + } + } catch { + // Per-batch failure: zero out the whole batch so the pipeline + // continues — top-K won't select these evidence points. + for (const ev of batch) { + const scoreMap = new Map(); + for (let i = 0; i < numCriteria; i++) scoreMap.set(i, 0); + out.set(ev.canonicalIndex, scoreMap); + } + } + }), + ); + + await Promise.all(tasks); + + // Pad any missing evidence indices with zeros (defensive against the + // model omitting batch entries). + for (const ev of evidence) { + if (!out.has(ev.canonicalIndex)) { + const scoreMap = new Map(); + for (let i = 0; i < numCriteria; i++) scoreMap.set(i, 0); + out.set(ev.canonicalIndex, scoreMap); + } + } + + return out; + } + + /** + * One call per rubric criterion. 
Each call sees the criterion's top-K + * evidence points (images + ariaTree snippets), the action history, and + * the final answer; the response includes `earned_points` directly so the + * process score is deterministic (Σ earned / Σ max). + */ + private async scorePerCriterion(args: { + trajectory: Trajectory; + taskSpec: TaskSpec; + rubric: Rubric; + evidence: CanonicalEvidence[]; + groupedTopK: Map; + config: VerifierConfig; + }): Promise { + const { trajectory, taskSpec, rubric, evidence, groupedTopK, config } = + args; + if (rubric.items.length === 0) return []; + + const evidenceByIdx = new Map(); + for (const e of evidence) evidenceByIdx.set(e.canonicalIndex, e); + + const actionHistory = this.formatActionHistory(trajectory, config); + const predictedOutput = + trajectory.finalAnswer ?? "(no final answer recorded)"; + + const limit = pLimit(config.maxParallel); + + const tasks = rubric.items.map((criterion, cIdx) => + limit(async (): Promise => { + const topK = groupedTopK.get(cIdx) ?? []; + const evidencePoints = topK + .map((eIdx) => evidenceByIdx.get(eIdx)) + .filter((e): e is CanonicalEvidence => e !== undefined); + + const manifest = + evidencePoints.length === 0 + ? "(no evidence scored highly enough for this criterion — rely on action history)" + : renderEvidenceManifest( + evidencePoints, + config.truncation.evidenceTextPreview, + ); + + const conditionLine = criterion.condition + ? 
`- Condition: ${criterion.condition}` + : ""; + + const prompt = renderPrompt(MM_PER_CRITERION_SCORE_PROMPT, { + task_definition: taskSpec.instruction, + init_url_context: buildInitUrlContext(taskSpec.initUrl), + action_history: actionHistory, + agent_predicted_output: predictedOutput, + criterion_idx: cIdx, + criterion_name: criterion.criterion, + criterion_description: criterion.description, + criterion_max_points: criterion.maxPoints, + criterion_condition: conditionLine, + evidence_manifest: manifest, + }); + + const messageContent: Array< + | { type: "text"; text: string } + | { type: "image_url"; image_url: { url: string } } + > = [{ type: "text", text: prompt }]; + + for (const ev of evidencePoints) { + if (isImageEvidence(ev)) { + messageContent.push({ + type: "image_url", + image_url: { + url: `data:${ev.mediaType};base64,${ev.bytes.toString("base64")}`, + }, + }); + } else { + messageContent.push({ + type: "text", + text: `\n[evidence_idx=${ev.canonicalIndex} — ${textEvidenceLabel(ev)} at step ${ev.originalStepIndex}]\n${ev.content}\n`, + }); + } + } + + try { + const client = this.getClient(); + const response = await client.createChatCompletion< + LLMParsedResponse + >({ + logger: this.logger, + options: { + messages: [ + { + role: "system", + content: + "You are scoring one rubric criterion against the most relevant evidence from an agent's trajectory. 
Output only valid JSON conforming to the schema in the user message.", + }, + { role: "user", content: messageContent }, + ], + response_model: { + name: "PerCriterionScore", + schema: PerCriterionScoreResponseSchema, + }, + }, + }); + const data = response.data as unknown as z.infer< + typeof PerCriterionScoreResponseSchema + >; + const clamped = Math.max( + 0, + Math.min(criterion.maxPoints, data.earned_points), + ); + return { + criterion: criterion.criterion, + maxPoints: criterion.maxPoints, + earnedPoints: clamped, + explanation: data.justification, + evidenceInsufficient: data.evidence_sufficient === false, + }; + } catch { + return { + criterion: criterion.criterion, + maxPoints: criterion.maxPoints, + earnedPoints: null, + explanation: + "Per-criterion scoring call failed; falling back to evidence-insufficient.", + evidenceInsufficient: true, + }; + } + }), + ); + + return Promise.all(tasks); + } + + /** + * Single fused multimodal call returning the full EvaluationResult shape: + * rubric + per-criterion top-K evidence + action history + final answer. + * Optionally folds in first-point-of-failure and task-validity. Image + * evidence rides inline; ariaTree text is embedded in the prompt under + * each criterion's manifest section. 
+ */ + private async fusedJudgment(args: { + trajectory: Trajectory; + taskSpec: TaskSpec; + rubric: Rubric; + evidence: CanonicalEvidence[]; + groupedTopK: Map; + foldFailure: boolean; + foldValidity: boolean; + config: VerifierConfig; + }): Promise> { + const { + trajectory, + taskSpec, + rubric, + evidence, + groupedTopK, + foldFailure, + foldValidity, + config, + } = args; + + const evidenceByIdx = new Map(); + for (const e of evidence) evidenceByIdx.set(e.canonicalIndex, e); + + const usedImageIndices = new Set(); + for (const topK of groupedTopK.values()) { + for (const eIdx of topK) { + const p = evidenceByIdx.get(eIdx); + if (p && isImageEvidence(p)) usedImageIndices.add(eIdx); + } + } + const usedImages = [...usedImageIndices] + .sort((a, b) => a - b) + .map((eIdx) => evidenceByIdx.get(eIdx)) + .filter((p): p is CanonicalScreenshot => !!p && isImageEvidence(p)); + + const rubricBlock = rubric.items + .map((c, i) => { + const cond = c.condition ? `\n Condition: ${c.condition}` : ""; + return `Criterion ${i} — "${c.criterion}" (max ${c.maxPoints} pts):\n Description: ${c.description}${cond}`; + }) + .join("\n\n"); + + const evidenceBlock = renderGroupedEvidenceForApproach( + rubric, + evidence, + groupedTopK, + config.truncation.groupedEvidenceText, + ); + + const taxonomyBlock = foldFailure + ? `\n${getTaxonomyText(1, 6, 4)}\n${getTaxonomyText(7, 8, 4)}\n` + : ""; + + const prompt = renderPrompt(FUSED_JUDGMENT_PROMPT, { + task_definition: taskSpec.instruction, + init_url_context: buildInitUrlContext(taskSpec.initUrl), + action_history: this.formatActionHistory(trajectory, config), + agent_predicted_output: + trajectory.finalAnswer ?? "(no final answer recorded)", + rubric_block: rubricBlock, + evidence_block: evidenceBlock, + taxonomy_block: taxonomyBlock, + fold_failure_analysis: foldFailure ? "true" : "false", + fold_task_validity: foldValidity ? 
"true" : "false", + current_date: currentDateForTrajectory(trajectory), + }); + + const messageContent: Array< + | { type: "text"; text: string } + | { type: "image_url"; image_url: { url: string } } + > = [{ type: "text", text: prompt }]; + + for (const img of usedImages) { + messageContent.push({ + type: "image_url", + image_url: { + url: `data:${img.mediaType};base64,${img.bytes.toString("base64")}`, + }, + }); + } + + try { + const client = this.getClient(); + const response = await client.createChatCompletion< + LLMParsedResponse + >({ + logger: this.logger, + options: { + messages: [ + { + role: "system", + content: + "You are an expert evaluator of web-navigation agent trajectories. Output only valid JSON conforming to the schema in the user message.", + }, + { role: "user", content: messageContent }, + ], + response_model: { + name: "FusedJudgment", + schema: FusedJudgmentResponseSchema, + }, + }, + }); + return response.data as unknown as z.infer< + typeof FusedJudgmentResponseSchema + >; + } catch (e) { + // Hard failure of the fused call: synthesize a no-confidence result + // so the pipeline can still produce an EvaluationResult object. + void e; + return { + outcome: { + primary_intent: taskSpec.instruction, + reasoning: + "Fused judgment LLM call failed; returning evidence-insufficient result.", + output_success: false, + findings: [ + { + category: "verifier_uncertainty" as const, + severity: "warning" as const, + description: + "The fused judgment call did not return a parseable response.", + }, + ], + }, + per_criterion: rubric.items.map((c, i) => ({ + criterion_idx: i, + applicable_evidence: "", + justification: "Fused judgment call failed for this criterion.", + earned_points: 0, + evidence_sufficient: false, + })), + }; + } + } + + /** + * Consume the pre-scored rubric from scorePerCriterion and produce the + * outcome result. 
When foldFailure/foldValidity are set, the response also + * includes first-point-of-failure and task-validity, saving 1–2 extra + * LLM calls. + */ + private async verifyOutcomeFused(args: { + trajectory: Trajectory; + taskSpec: TaskSpec; + rubric: Rubric; + perCriterion: CriterionScore[]; + evidence: CanonicalEvidence[]; + foldFailure: boolean; + foldValidity: boolean; + config: VerifierConfig; + }): Promise> { + const { + trajectory, + taskSpec, + rubric, + perCriterion, + foldFailure, + foldValidity, + config, + } = args; + void args.evidence; + + const taxonomyBlock = foldFailure + ? `\n${getTaxonomyText(1, 6, 4)}\n${getTaxonomyText(7, 8, 4)}\n` + : ""; + + const prompt = renderPrompt(FUSED_OUTCOME_PROMPT, { + task_definition: taskSpec.instruction, + init_url_context: buildInitUrlContext(taskSpec.initUrl), + action_history: this.formatActionHistory(trajectory, config), + outcome_evidence_summary: this.buildOutcomeEvidenceSummary( + trajectory, + taskSpec, + config, + ), + agent_predicted_output: + trajectory.finalAnswer ?? "(no final answer recorded)", + rubric_summary: this.formatScoredRubricSummary(rubric, perCriterion), + taxonomy_block: taxonomyBlock, + fold_failure_analysis: foldFailure ? "true" : "false", + fold_task_validity: foldValidity ? "true" : "false", + current_date: currentDateForTrajectory(trajectory), + }); + + try { + const client = this.getClient(); + const response = await client.createChatCompletion< + LLMParsedResponse + >({ + logger: this.logger, + options: { + messages: [ + { + role: "system", + content: + "You are an expert evaluator of web-navigation agent trajectories. Output only valid JSON conforming to the schema in the user message.", + }, + { role: "user", content: prompt }, + ], + response_model: { + name: "FusedOutcome", + schema: FusedOutcomeResponseSchema, + }, + }, + }); + return response.data as unknown as z.infer< + typeof FusedOutcomeResponseSchema + >; + } catch { + // Failure surfaces as a no-confidence result. 
+ return { + outcome: { + primary_intent: taskSpec.instruction, + reasoning: + "Outcome LLM call failed; defaulting to output_success=false.", + output_success: false, + findings: [ + { + category: "verifier_uncertainty" as const, + severity: "warning" as const, + description: + "The outcome verification call did not return a parseable response.", + }, + ], + }, + }; + } + } + + /** + * Flat per-step evidence summary — fallback for trajectories with no + * probe screenshots, such as harness-adapter or stubbed trajectories. + */ + private buildEvidenceContext( + trajectory: Trajectory, + config: VerifierConfig, + opts: { includeImages?: boolean } = {}, + ): EvidenceContext { + if (trajectory.steps.length === 0) { + return { text: "(no steps captured)", images: [] }; + } + + const textLimit = config.truncation.buildEvidenceText; + const ariaLimit = config.truncation.buildEvidenceAria; + + const text = clampToTokenBudget( + trajectory.steps + .map((s, i) => { + const url = s.probeEvidence.url ? `, url=${s.probeEvidence.url}` : ""; + const hasScreenshot = + s.probeEvidence.screenshotPath || s.probeEvidence.screenshot + ? "yes" + : "no"; + const tier1 = s.agentEvidence.modalities + .map((m) => { + if (m.type === "text") + return `text(${m.content.slice(0, textLimit)})`; + if (m.type === "image") return `image(${m.bytes.length} bytes)`; + return `json(${safeJsonSnippet(m.content, 180)})`; + }) + .join(", "); + const toolOutput = safeJsonSnippet(s.toolOutput.result, 220); + // Include the post-step a11y dump when captured — textual ground + // truth for criteria that can't be verified from the visual probe + // alone (prices, names, list contents). Per-step cap keeps the + // total budget bounded. + const ariaSnippet = + typeof s.probeEvidence.ariaTree === "string" && + s.probeEvidence.ariaTree.length > 0 + ? `\n aria_tree: ${s.probeEvidence.ariaTree.slice(0, ariaLimit)}${ + s.probeEvidence.ariaTree.length > ariaLimit + ? 
"… [truncated]" + : "" + }` + : ""; + return `Screenshot ${i + 1} — step=${s.index}, action=${s.actionName}${url}, probe_screenshot=${hasScreenshot}\n tier1: ${tier1 || "(none)"}\n tool_output: ${toolOutput}${ariaSnippet}`; + }) + .join("\n\n"), + config.evidenceTokenBudget, + ); + + if (opts.includeImages === false) return { text, images: [] }; + + return { + text, + images: selectRecentImages(trajectory, config.outcomeMaxImages), + }; + } + + /** + * Compact text evidence for the one-call outcome verifier. + * + * Outcome-only does not run the rubric relevance selector, but it still needs + * enough saved-page text to avoid replacing trajectory facts with model + * memory. Select a bounded set of lexically relevant and recent steps, then + * include short excerpts around task/final-answer terms. + */ + private buildOutcomeEvidenceSummary( + trajectory: Trajectory, + taskSpec: TaskSpec, + config: VerifierConfig, + ): string { + if (trajectory.steps.length === 0) return "(no steps captured)"; + + const keywords = outcomeKeywords( + `${taskSpec.instruction}\n${trajectory.finalAnswer ?? ""}`, + ); + const lastImportantIndex = Math.max(0, trajectory.steps.length - 5); + + const candidates = trajectory.steps.map((step, position) => { + const url = step.probeEvidence.url ?? ""; + const ariaTree = step.probeEvidence.ariaTree ?? ""; + const toolOutput = safeJsonSnippet(step.toolOutput?.result, 600); + const actionArgs = safeJsonSnippet(step.actionArgs, 400); + const haystack = [ + step.actionName, + step.reasoning ?? "", + url, + actionArgs, + toolOutput, + ariaTree, + ] + .join("\n") + .toLowerCase(); + + let score = position >= lastImportantIndex ? 3 : 0; + if (url) score += 1; + if ( + /extract|observe|aria|navigate|click|type|search/i.test(step.actionName) + ) { + score += 1; + } + for (const keyword of keywords) { + if (haystack.includes(keyword)) { + score += keyword.length >= 8 ? 
3 : 1; + } + } + + return { step, position, score }; + }); + + const selected = new Set(); + for (const candidate of [...candidates] + .sort((a, b) => b.score - a.score || a.position - b.position) + .slice(0, OUTCOME_EVIDENCE_MAX_STEPS)) { + selected.add(candidate.position); + } + + for ( + let i = Math.max(0, trajectory.steps.length - 4); + i < trajectory.steps.length; + i++ + ) { + selected.add(i); + } + + const sections = [...selected] + .sort((a, b) => a - b) + .map((position) => { + const step = trajectory.steps[position]; + const url = step.probeEvidence.url + ? ` url=${step.probeEvidence.url}` + : ""; + const reasoning = step.reasoning + ? `\n reasoning: ${step.reasoning.slice(0, 220)}` + : ""; + const toolOutput = step.toolOutput?.result + ? `\n tool_output: ${safeJsonSnippet(step.toolOutput.result, 320)}` + : ""; + const ariaExcerpt = step.probeEvidence.ariaTree + ? `\n page_excerpt: ${bestOutcomeExcerpt( + step.probeEvidence.ariaTree, + keywords, + OUTCOME_EVIDENCE_STEP_CHARS, + )}` + : ""; + return `Step ${step.index}: ${step.actionName}(${summarizeArgs( + step.actionArgs, + )})${url}${reasoning}${toolOutput}${ariaExcerpt}`; + }); + + return clampToTokenBudget( + sections.join("\n\n"), + config.outcomeEvidenceTokenBudget, + ); + } + + /** Generate a rubric from the task description alone. */ + async generateRubric(taskSpec: TaskSpec): Promise { + const prompt = renderPrompt(RUBRIC_GENERATION_PROMPT, { + task_id: taskSpec.instruction, + init_url_context: buildInitUrlContext(taskSpec.initUrl), + }); + + const maxAttempts = 3; + let lastError: unknown; + + for (let attempt = 0; attempt < maxAttempts; attempt++) { + try { + const client = this.getRubricGenClient(); + const response = await client.createChatCompletion< + LLMParsedResponse + >({ + logger: this.logger, + options: { + messages: [ + { + role: "system", + content: + "You are an expert rubric author. Output only valid JSON conforming to the schema requested in the user message. 
Do not include explanatory prose.", + }, + { role: "user", content: prompt }, + ], + response_model: { name: "Rubric", schema: RubricSchema }, + }, + }); + const data = response.data as unknown as z.infer; + const normalized = normalizeRubric({ + items: filterByTaskSpan( + data.items, + taskSpec.instruction, + this.logger, + ), + }); + if (!normalized) { + throw new Error("Rubric generation returned no rubric"); + } + return normalized; + } catch (err) { + lastError = err; + if (attempt === maxAttempts - 1) break; + await new Promise((resolve) => setTimeout(resolve, 500 * 2 ** attempt)); + } + } + + throw lastError instanceof Error ? lastError : new Error(String(lastError)); + } + + /** + * Identify all distinct failure points using taxonomy categories 1–6 + * (agent-controllable errors) and return the earliest one. Best-effort: + * returns undefined on LLM failure / unparseable output / no failures + * found, rather than blocking the rest of the pipeline. + */ + private async analyzeFailures(args: { + trajectory: Trajectory; + taskSpec: TaskSpec; + rubric: Rubric; + perCriterion: CriterionScore[]; + outcome: z.infer; + config: VerifierConfig; + }): Promise { + const { trajectory, taskSpec, rubric, perCriterion, outcome, config } = + args; + const evidenceContext = this.buildEvidenceContext(trajectory, config, { + includeImages: false, + }); + + const prompt = renderPrompt(FIRST_POINT_OF_FAILURE_PROMPT, { + task_definition: taskSpec.instruction, + init_url_context: buildInitUrlContext(taskSpec.initUrl), + action_history: this.formatActionHistory(trajectory, config), + predicted_output: trajectory.finalAnswer ?? 
"(no final answer recorded)", + rubric_summary: this.formatScoredRubricSummary(rubric, perCriterion), + evidence_summary: evidenceContext.text, + outcome_verification: `output_success=${outcome.output_success}\nprimary_intent=${outcome.primary_intent}\nreasoning=${outcome.reasoning}`, + }); + + const client = this.getClient(); + const response = await client.createChatCompletion< + LLMParsedResponse + >({ + logger: this.logger, + options: { + messages: [ + { + role: "system", + content: + "You are an expert failure analyst for computer-use web agents. Output only valid JSON conforming to the schema in the user message.", + }, + { role: "user", content: prompt }, + ], + response_model: { + name: "FailureAnalysis", + schema: FailureAnalysisSchema, + }, + }, + }); + + const data = response.data as unknown as z.infer< + typeof FailureAnalysisSchema + >; + if (!data.has_failure || data.failure_points.length === 0) return undefined; + + // Find the failure with the earliest step number: lowest min-step across + // all failure_points. + let best: { + minStep: number; + point: z.infer; + } | null = null; + for (const fp of data.failure_points) { + const steps = parseFailureStepNumbers(fp.step_numbers, { + maxStep: Math.max(0, trajectory.steps.length), + }); + if (steps.length === 0) continue; + const minStep = steps[0]; + if (best === null || minStep < best.minStep) { + best = { minStep, point: fp }; + } + } + if (best === null) return undefined; + + return { + stepIndex: best.minStep, + errorCode: best.point.error_code, + category: best.point.error_category, + description: `${best.point.error_type}: ${best.point.what_happened}`, + }; + } + + /** + * Classify the task across ambiguity (taxonomy category 7) and + * validity/feasibility (category 8). Pure task-level analysis; no + * trajectory context needed. Best-effort: returns undefined on LLM error. 
+ */ + private async classifyTaskValidity( + taskSpec: TaskSpec, + ): Promise { + const prompt = renderPrompt(TASK_VALIDITY_PROMPT, { + task_definition: taskSpec.instruction, + url: taskSpec.initUrl ?? "(none)", + // For browser-driven tasks the app is always Edge/Chrome. The prompt + // accepts a free-form apps field; keeping it accurate matters less than + // anchoring the model with non-empty context. + apps: "Edge", + date: new Date().toISOString().slice(0, 10), + }); + + const client = this.getClient(); + const response = await client.createChatCompletion< + LLMParsedResponse + >({ + logger: this.logger, + options: { + messages: [ + { + role: "system", + content: + "You are an expert failure analyst for computer-use web agents. Output only valid JSON conforming to the schema in the user message.", + }, + { role: "user", content: prompt }, + ], + response_model: { + name: "TaskValidity", + schema: TaskValiditySchema, + }, + }, + }); + + const data = response.data as unknown as z.infer; + return { + isAmbiguous: data.is_ambiguous, + isInvalid: data.is_invalid, + ambiguityReason: + data.is_ambiguous && data.reasoning_is_ambiguous + ? data.reasoning_is_ambiguous + : undefined, + invalidReason: + data.is_invalid && data.reasoning_is_invalid + ? data.reasoning_is_invalid + : undefined, + }; + } + + /** + * Format the rubric with per-criterion rescored points + explanations. + * The outcome verifier reads this as advisory context — it sees how a + * separate scoring system viewed each criterion but forms its own result. + */ + private formatScoredRubricSummary( + rubric: Rubric, + scores: CriterionScore[], + ): string { + return rubric.items + .map((c, i) => { + const cond = c.condition ? ` [condition: ${c.condition}]` : ""; + const score = scores[i]; + const earned = score?.earnedPoints ?? "—"; + const explanation = score?.explanation ?? ""; + return `${i + 1}. 
${c.criterion} (${earned}/${c.maxPoints} pts)${cond}\n Description: ${c.description}\n Score explanation: ${explanation}`; + }) + .join("\n\n"); + } + + /** + * Compact textual action history for embedding in prompts. One line per + * step. Full per-step detail lives in trajectory.json on disk. + */ + private formatActionHistory( + trajectory: Trajectory, + config: VerifierConfig, + ): string { + const reasoningLimit = config.truncation.actionHistoryReasoning; + const history = trajectory.steps + .map((s) => { + const argSummary = summarizeArgs(s.actionArgs); + const reasoning = (s.reasoning ?? "").slice(0, reasoningLimit); + const url = s.probeEvidence.url ? ` @ ${s.probeEvidence.url}` : ""; + return `Step ${s.index}: ${s.actionName}(${argSummary})${url}${reasoning ? `\n reasoning: ${reasoning}` : ""}`; + }) + .join("\n"); + return clampToTokenBudget(history, config.actionHistoryTokenBudget); + } +} + +interface EvidenceImage { + label: string; + bytes: Buffer; + mediaType: string; +} + +interface EvidenceContext { + text: string; + images: EvidenceImage[]; +} + +/** FIFO concurrency limiter; avoids a new dep. 
/**
 * FIFO concurrency limiter; avoids a new dep.
 *
 * Returns a scheduler that runs at most `concurrency` tasks at once and
 * starts queued tasks in submission order as slots free up.
 *
 * @param concurrency - Maximum number of tasks in flight. Floored and
 *   clamped to at least 1; `NaN` is treated as 1 so a bad setting cannot
 *   silently disable the limit.
 * @returns A function that schedules `fn` and resolves or rejects with its
 *   outcome. A synchronous throw from `fn` is reported as a rejection and
 *   still releases its slot.
 */
function pLimit(
  concurrency: number,
): <T>(fn: () => Promise<T>) => Promise<T> {
  // `active >= NaN` is always false, which would make the limiter unbounded.
  const n = Number.isNaN(concurrency)
    ? 1
    : Math.max(1, Math.floor(concurrency));
  let active = 0;
  const queue: Array<() => void> = [];

  const next = (): void => {
    if (active >= n) return;
    const job = queue.shift();
    if (job) {
      active++;
      job();
    }
  };

  return <T>(fn: () => Promise<T>): Promise<T> =>
    new Promise<T>((resolve, reject) => {
      queue.push(() => {
        // Invoke `fn` inside a promise executor so a synchronous throw turns
        // into a rejection. Calling it bare would skip `.finally`, leak the
        // slot, and stall every task still waiting in the queue.
        void new Promise<T>((settle) => settle(fn()))
          .then(resolve, reject)
          .finally(() => {
            active--;
            next();
          });
      });
      next();
    });
}
"answer", + "available", + "based", + "been", + "being", + "browser", + "class", + "click", + "correct", + "current", + "details", + "final", + "find", + "found", + "from", + "have", + "including", + "into", + "list", + "located", + "model", + "more", + "navigated", + "page", + "provided", + "request", + "requested", + "results", + "search", + "show", + "successfully", + "task", + "that", + "the", + "their", + "then", + "there", + "this", + "through", + "user", + "using", + "which", + "with", +]); + +function outcomeKeywords(text: string): string[] { + const counts = new Map(); + for (const match of text.toLowerCase().matchAll(/[a-z0-9][a-z0-9._/-]*/g)) { + const word = match[0].replace(/^[-_./]+|[-_./]+$/g, ""); + if (!word) continue; + if (OUTCOME_KEYWORD_STOPWORDS.has(word)) continue; + if (word.length < 4 && !/\d/.test(word)) continue; + counts.set(word, (counts.get(word) ?? 0) + 1); + } + + return [...counts.entries()] + .sort((a, b) => b[1] - a[1] || b[0].length - a[0].length) + .slice(0, 36) + .map(([word]) => word) + .sort((a, b) => b.length - a.length || a.localeCompare(b)); +} + +function bestOutcomeExcerpt( + text: string, + keywords: string[], + maxChars: number, +): string { + const compact = text.replace(/\s+/g, " ").trim(); + if (compact.length <= maxChars) return compact; + + const lower = compact.toLowerCase(); + let bestIndex = -1; + for (const keyword of keywords) { + const idx = lower.indexOf(keyword); + if (idx >= 0) { + bestIndex = idx; + break; + } + } + + if (bestIndex < 0) { + return `${compact.slice(0, maxChars)}... [truncated]`; + } + + const before = Math.floor(maxChars * 0.35); + const start = Math.max(0, bestIndex - before); + const end = Math.min(compact.length, start + maxChars); + const prefix = start > 0 ? "... " : ""; + const suffix = end < compact.length ? " ... 
/**
 * Keep only the rubric items whose `task_span` appears in the task
 * instruction.
 *
 * Both the span and the instruction are compared after
 * `normalizeForSpanMatch`. Items with a missing or blank span, or with a
 * span that does not occur in the instruction, are dropped; when anything is
 * dropped, a single log line lists each dropped criterion and the reason.
 *
 * @param items - Generated rubric items; not mutated.
 * @param taskInstruction - Task text the spans must be found in.
 * @param logger - Receives the summary log line when items were dropped.
 * @returns The kept items in their original order.
 */
function filterByTaskSpan<
  TItem extends { criterion: string; task_span?: string | null },
>(
  items: TItem[],
  taskInstruction: string,
  logger: (line: LogLine) => void,
): TItem[] {
  const normalizedTask = normalizeForSpanMatch(taskInstruction);
  const kept: TItem[] = [];
  const dropped: { criterion: string; reason: string }[] = [];

  for (const item of items) {
    const span = item.task_span?.trim();
    // Covers undefined, null and whitespace-only spans; past this guard the
    // span is a non-empty string, so no separate empty-string case is needed.
    if (!span) {
      dropped.push({
        criterion: item.criterion,
        reason: "missing task_span",
      });
      continue;
    }

    if (normalizedTask.includes(normalizeForSpanMatch(span))) {
      kept.push(item);
      continue;
    }

    dropped.push({
      criterion: item.criterion,
      reason: `task_span ${JSON.stringify(span)} not found in task instruction`,
    });
  }

  if (dropped.length > 0) {
    logger({
      category: "v3-evaluator",
      message: "rubric: dropped hallucinated criteria via task_span filter",
      auxiliary: {
        droppedCount: { value: String(dropped.length), type: "integer" },
        dropped: {
          value: JSON.stringify(dropped),
          type: "object",
        },
      },
    });
  }

  return kept;
}
/**
 * One-line preview of an action's arguments for prompt text.
 *
 * Only the first three keys are shown. String values are cut to 60
 * characters, numbers and booleans are printed as-is, and any other value is
 * replaced by its `typeof` name so large payloads never reach the prompt.
 *
 * @param args - Action arguments; `null`/`undefined` (a step recorded
 *   without arguments) is treated like an empty object.
 * @returns Comma-separated `key: value` pairs, or `""` when there are none.
 */
function summarizeArgs(
  args: Record<string, unknown> | null | undefined,
): string {
  // Object.keys throws a TypeError on null/undefined; a step without
  // arguments should render as an empty argument list instead.
  if (args == null) return "";

  return Object.keys(args)
    .slice(0, 3)
    .map((k) => {
      const v = args[k];
      if (typeof v === "string") return `${k}: ${v.slice(0, 60)}`;
      if (typeof v === "number" || typeof v === "boolean") return `${k}: ${v}`;
      return `${k}: ${typeof v}`;
    })
    .join(", ");
}
+ // Accept both shapes: + // imagePath — new, relative path under the trajectory dir + // bytesBase64 — legacy, inline base64 bytes?: unknown; + imagePath?: string; bytesBase64?: string; } | { type: "json"; content: unknown } @@ -138,7 +141,7 @@ export async function loadTrajectoryFromDisk(dir: string): Promise { if (outside) { throw new Error( - `Trajectory screenshotPath escapes trajectory directory: ${candidate}`, + `Trajectory asset path escapes trajectory directory: ${candidate}`, ); } @@ -158,21 +161,39 @@ export async function loadTrajectoryFromDisk(dir: string): Promise { } } - // Decode image modalities from base64 back to Buffer. + // Rehydrate tier-1 image modalities. New shape: resolve imagePath → Buffer. + // Legacy shape: decode bytesBase64 → Buffer. if (step.agentEvidence?.modalities) { - step.agentEvidence.modalities = step.agentEvidence.modalities.map((m) => { - // The on-disk shape carries bytesBase64 instead of bytes, so we look - // through `unknown` here rather than rely on the typed union. - const raw = m as unknown as { bytesBase64?: string }; - if (m.type === "image" && typeof raw.bytesBase64 === "string") { - return { - type: "image" as const, + const resolved: AgentEvidenceModality[] = []; + for (const m of step.agentEvidence.modalities) { + if (m.type !== "image") { + resolved.push(m as AgentEvidenceModality); + continue; + } + const raw = m as unknown as { + imagePath?: string; + bytesBase64?: string; + }; + if (typeof raw.imagePath === "string") { + const imagePath = resolveWithinTrajectoryDir(raw.imagePath); + try { + const bytes = await fs.readFile(imagePath); + resolved.push({ type: "image", bytes, mediaType: m.mediaType }); + } catch { + // Missing tier-1 image: drop this modality. evidence_insufficient + // handles the downstream effect. 
+ } + } else if (typeof raw.bytesBase64 === "string") { + resolved.push({ + type: "image", bytes: Buffer.from(raw.bytesBase64, "base64"), mediaType: m.mediaType, - }; + }); + } else { + resolved.push(m as AgentEvidenceModality); } - return m as AgentEvidenceModality; - }); + } + step.agentEvidence.modalities = resolved; } } diff --git a/packages/core/lib/v3/verifier/types.ts b/packages/core/lib/v3/verifier/types.ts index dff475cfc..99673f18c 100644 --- a/packages/core/lib/v3/verifier/types.ts +++ b/packages/core/lib/v3/verifier/types.ts @@ -5,6 +5,8 @@ * Hybrid runs preserve tool-return text/JSON evidence, while CUA runs preserve * screenshots sent to the provider plus independent harness probes. */ +import type { LLMClient } from "../llm/LLMClient.js"; +import type { LogLine } from "../types/public/logs.js"; /** Token usage for one or more LLM calls. Matches AgentResult.usage shape. */ export interface TrajectoryUsage { @@ -134,13 +136,15 @@ export type TrajectoryStatus = "complete" | "aborted" | "stalled" | "error"; * The on-disk layout is one directory per task: * * .trajectories/// - * ├── task_data.json — TaskSpec + result metadata - * ├── trajectory.json — this object, with screenshotPath instead of bytes - * ├── screenshot_1.png — probeEvidence.screenshot for step 1, etc. 
+ * ├── task_data.json — TaskSpec + result metadata + * ├── trajectory.json — this object, with image paths instead of bytes + * ├── screenshots/ + * │ ├── probe/.png — tier-2 probe screenshot per step + * │ └── agent/.png — tier-1 image the model received per step * ├── scores/ - * │ └── result.json — Result from V3Evaluator.verify() - * ├── core.log — captured action log - * └── times.json — step timing + token usage + * │ └── result.json — Result from V3Evaluator.verify() + * ├── core.log — captured action log + * └── times.json — step timing + token usage */ export interface Trajectory { task: TaskSpec; @@ -213,13 +217,13 @@ export interface EvidenceLoadResult { }; } -/** Options for evidence loading. Mainly env override hooks for tests. */ +/** Options for evidence loading; primarily test seams over the defaults. */ export interface EvidenceLoadOptions { - /** Override VERIFIER_SSIM_THRESHOLD. */ + /** SSIM similarity threshold for dedup (default 0.75). */ ssimThreshold?: number; - /** Override VERIFIER_MSE_THRESHOLD. */ + /** MSE similarity threshold for dedup (default 30). */ mseThreshold?: number; - /** Override VERIFIER_IMAGE_RESIZE. */ + /** Scale factor applied before relevance scoring (default 0.7). */ imageResize?: number; } @@ -305,16 +309,18 @@ export interface VerifierFinding { /** Stable debugging summary emitted by verifier backends. 
*/ export interface VerifierRawSteps { backend?: "legacy" | "verifier"; + reason?: string; primaryIntent?: string; reasoning?: string; rubricSource?: "precomputed" | "generated" | "none"; - approach?: "a" | "b"; + approach?: "a" | "b" | "outcome-only"; optionalsMode?: "folded" | "separate" | "skip"; totalEarned?: number; totalMax?: number; evidenceImages?: number; evidenceTexts?: number; evidenceOriginalScreenshots?: number; + screenshotsAttached?: number; legacyEvaluation?: string; screenshotCount?: number; } @@ -323,8 +329,12 @@ export interface VerifierRawSteps { export interface TaskValidity { /** True if the task is underspecified / has multiple valid interpretations. */ isAmbiguous: boolean; + /** Explanation for why the task is ambiguous, when available. */ + ambiguityReason?: string; /** True if the task is impossible / illegal / NSFW / otherwise infeasible. */ isInvalid: boolean; + /** Explanation for why the task is invalid, when available. */ + invalidReason?: string; /** Optional sub-codes from the task-classification taxonomy. */ ambiguityCodes?: string[]; invalidTaskCodes?: string[]; @@ -375,3 +385,82 @@ export interface EvaluationResult { export interface Verifier { verify(trajectory: Trajectory, taskSpec: TaskSpec): Promise; } + +export interface RubricVerifierOptions { + /** Factory that returns a configured LLMClient. Called per pipeline step so callers can supply step-specific clients. */ + getClient: () => LLMClient; + /** Optional factory for rubric generation so callers can route it to a stronger model. */ + getRubricGenClient?: () => LLMClient; + /** Logger; defaults to a no-op so the verifier stays quiet inside V3Evaluator. */ + logger?: (line: LogLine) => void; + /** + * Override any verifier knob. Env vars supply the defaults; values here win. + * Useful for tests and for cross-verify sweeps that want different budgets + * per run. + */ + config?: Partial; +} + +/** + * Resolved verifier knobs. 
Constructed once from env (and optional overrides) + * by RubricVerifier's constructor; subsequent verify() calls can pass a + * Partial to shift any field. + */ +export interface VerifierConfig { + /** Which pipeline path to take: per-criterion (a), fused (b), or skip rubric entirely (outcome-only). */ + approach: "a" | "b" | "outcome-only"; + /** Folded (in fused call), separate (own calls), or skip (omit). */ + optionalSteps: "folded" | "separate" | "skip"; + /** Top-K evidence points selected per criterion. */ + topK: number; + /** Batch size for the relevance-scoring LLM call. */ + relevanceBatchSize: number; + /** Image cap on the outcome-only path. */ + outcomeMaxImages: number; + /** Concurrent LLM calls across batches / criteria. */ + maxParallel: number; + /** Token budgets for the three evidence channels. */ + evidenceTokenBudget: number; + outcomeEvidenceTokenBudget: number; + actionHistoryTokenBudget: number; + /** Per-section character limits applied during evidence-text assembly. */ + truncation: { + /** Master switch: when true, all per-section limits go to MAX_SAFE_INTEGER. */ + disabled: boolean; + evidenceTextPreview: number; + groupedEvidenceText: number; + buildEvidenceText: number; + buildEvidenceAria: number; + actionHistoryReasoning: number; + }; +} + +export interface ErrorTaxonomySubCategory { + /** Sub-code (e.g., "2.3"). */ + code: string; + /** Human-readable name (e.g., "Output fabrication"). */ + name: string; + /** Detailed description ported from the .md. Markdown formatting preserved. */ + description: string; +} + +export interface ErrorTaxonomyCategory { + /** Top-level number (1-8). */ + number: number; + /** Top-level name (e.g., "Hallucination Errors"). */ + name: string; + /** One-sentence summary of the category. */ + summary: string; + /** Sub-categories. The last one is always an "Other" catch-all. 
*/ + subCategories: ErrorTaxonomySubCategory[]; +} + +export interface ParseFailureStepNumbersOptions { + /** + * Maximum unique step numbers to expand from ranges. Protects the verifier + * from malformed model output such as "0-2147483647". + */ + maxExpandedSteps?: number; + /** Optional inclusive upper bound for accepted step numbers. */ + maxStep?: number; +} diff --git a/packages/core/lib/v3Evaluator.ts b/packages/core/lib/v3Evaluator.ts index dd60fe720..67ec1fc9b 100644 --- a/packages/core/lib/v3Evaluator.ts +++ b/packages/core/lib/v3Evaluator.ts @@ -5,8 +5,11 @@ import type { EvaluationResult as LegacyEvaluationResult, } from "./v3/types/private/evaluator.js"; import { V3 } from "./v3/v3.js"; +import type { LLMClient } from "./v3/llm/LLMClient.js"; +import { LLMProvider } from "./v3/llm/LLMProvider.js"; import { StagehandInvalidArgumentError } from "./v3/types/public/sdkErrors.js"; import { LegacyV3Evaluator } from "./v3LegacyEvaluator.js"; +import { RubricVerifier } from "./v3/verifier/rubricVerifier.js"; import type { Trajectory, TaskSpec, @@ -46,7 +49,10 @@ type NormalizedConstructorOptions = { }; export class V3Evaluator implements Verifier { + private readonly v3: V3; private readonly backend: V3EvaluatorBackend; + private readonly modelName: AvailableModel; + private readonly modelClientOptions: ClientOptions | { apiKey: string }; private readonly legacyEvaluator: LegacyV3Evaluator; constructor( @@ -62,6 +68,16 @@ export class V3Evaluator implements Verifier { ); this.backend = resolveEvaluatorBackend(normalizedOptions.backend); + this.v3 = v3; + this.modelName = + normalizedOptions.modelName || + ("google/gemini-2.5-flash" as AvailableModel); + this.modelClientOptions = normalizedOptions.modelClientOptions || { + apiKey: + process.env.GEMINI_API_KEY || + process.env.GOOGLE_GENERATIVE_AI_API_KEY || + "", + }; this.legacyEvaluator = new LegacyV3Evaluator( v3, normalizedOptions.modelName, @@ -87,7 +103,11 @@ export class V3Evaluator implements Verifier 
{ return this.verifyTrajectoryWithLegacyEvaluator(trajectory, taskSpec); } - return this.unavailableVerifierBackend("verify"); + const verifier = new RubricVerifier({ + getClient: () => this.getClient(), + getRubricGenClient: () => this.getRubricGenClient(), + }); + return verifier.verify(trajectory, taskSpec); } async generateRubric(taskSpec: TaskSpec): Promise { @@ -97,13 +117,17 @@ export class V3Evaluator implements Verifier { ); } - if (this.backend === "verifier") { - return this.unavailableVerifierBackend("generateRubric"); + if (this.backend === "legacy") { + return { + items: [legacyTaskCompletionCriterion(taskSpec)], + }; } - return { - items: [legacyTaskCompletionCriterion(taskSpec)], - }; + const verifier = new RubricVerifier({ + getClient: () => this.getClient(), + getRubricGenClient: () => this.getRubricGenClient(), + }); + return verifier.generateRubric(taskSpec); } private getLegacyBackend(methodName: string): LegacyV3Evaluator { @@ -116,7 +140,34 @@ export class V3Evaluator implements Verifier { private unavailableVerifierBackend(methodName: string): never { throw new StagehandInvalidArgumentError( - `V3Evaluator.${methodName}() was configured with ${EVALUATOR_BACKEND_ENV}=verifier, but the verifier backend is not available in this build. Use "legacy" or install the verifier backend PR.`, + `V3Evaluator.${methodName}() was configured with ${EVALUATOR_BACKEND_ENV}=verifier, but the verifier backend only supports verify() and generateRubric(). Use "legacy" for ask()/batchAsk().`, + ); + } + + private getClient(): LLMClient { + const provider = new LLMProvider(this.v3.logger); + return provider.getClient(this.modelName, this.modelClientOptions); + } + + private getRubricGenClient(): LLMClient { + const override = process.env.VERIFIER_RUBRIC_MODEL as + | AvailableModel + | undefined; + if (!override) return this.getClient(); + + const provider = new LLMProvider(this.v3.logger); + const overrideProvider = override.includes("/") + ? 
override.slice(0, override.indexOf("/")) + : undefined; + const defaultProvider = this.modelName.includes("/") + ? this.modelName.slice(0, this.modelName.indexOf("/")) + : undefined; + const sameProvider = + overrideProvider !== undefined && overrideProvider === defaultProvider; + + return provider.getClient( + override, + sameProvider ? this.modelClientOptions : undefined, ); } diff --git a/packages/core/tests/unit/v3-evaluator.test.ts b/packages/core/tests/unit/v3-evaluator.test.ts index b97c93ba2..50446f50d 100644 --- a/packages/core/tests/unit/v3-evaluator.test.ts +++ b/packages/core/tests/unit/v3-evaluator.test.ts @@ -5,7 +5,7 @@ import type { V3 } from "../../lib/v3/v3.js"; import type { TaskSpec, Trajectory } from "../../lib/v3/verifier/index.js"; describe("V3Evaluator verifier facade", () => { - it("rejects verifier backend before the verifier PR is installed", async () => { + it("rejects ask when configured for the verifier backend", async () => { const evaluator = new V3Evaluator({} as V3, { backend: "verifier", }); @@ -13,39 +13,67 @@ describe("V3Evaluator verifier facade", () => { await expect( evaluator.ask({ question: "Was the task completed?" 
}), ).rejects.toThrow( - "STAGEHAND_EVALUATOR_BACKEND=verifier, but the verifier backend is not available", + "STAGEHAND_EVALUATOR_BACKEND=verifier, but the verifier backend only supports verify() and generateRubric()", ); }); - it("rejects verify when the verifier backend is selected before the verifier PR is installed", async () => { + it("returns a verifier result for empty trajectories without LLM calls", async () => { const taskSpec: TaskSpec = { - id: "verifier-unavailable", + id: "empty-verifier", instruction: "Complete the task", }; const evaluator = new V3Evaluator({} as V3, { backend: "verifier", }); - await expect( - evaluator.verify(makeTrajectory(taskSpec), taskSpec), - ).rejects.toThrow( - "STAGEHAND_EVALUATOR_BACKEND=verifier, but the verifier backend is not available", + const result = await evaluator.verify( + makeEmptyTrajectory(taskSpec), + taskSpec, ); + + expect(result.outcomeSuccess).toBe(false); + expect(result.rawSteps).toMatchObject({ + reason: "empty-trajectory", + rubricSource: "none", + }); }); - it("rejects generateRubric when the verifier backend is selected before the verifier PR is installed", async () => { + it("generates rubrics through the verifier backend", async () => { + const createChatCompletion = vi.fn().mockResolvedValue({ + data: { + items: [ + { + criterion: "Complete the task", + task_span: "Complete the task", + description: "Full credit if the task is complete.", + max_points: 1, + justification: "", + earned_points: "", + }, + ], + }, + }); const evaluator = new V3Evaluator({} as V3, { backend: "verifier", }); + Object.defineProperty(evaluator, "getRubricGenClient", { + value: () => ({ createChatCompletion }), + }); - await expect( - evaluator.generateRubric({ - id: "rubric-unavailable", - instruction: "Complete the task", - }), - ).rejects.toThrow( - "STAGEHAND_EVALUATOR_BACKEND=verifier, but the verifier backend is not available", - ); + const rubric = await evaluator.generateRubric({ + id: "rubric", + instruction: 
"Complete the task", + }); + + expect(rubric).toEqual({ + items: [ + { + criterion: "Complete the task", + description: "Full credit if the task is complete.", + maxPoints: 1, + }, + ], + }); }); it("maps legacy YES evaluations with trajectory screenshots to a successful result", async () => { diff --git a/packages/core/tests/unit/verifier-failure-step-parser.test.ts b/packages/core/tests/unit/verifier-failure-step-parser.test.ts new file mode 100644 index 000000000..bafe79758 --- /dev/null +++ b/packages/core/tests/unit/verifier-failure-step-parser.test.ts @@ -0,0 +1,26 @@ +import { describe, expect, it } from "vitest"; + +import { parseFailureStepNumbers } from "../../lib/v3/verifier/prompts/firstPointOfFailure.js"; + +describe("parseFailureStepNumbers", () => { + it("parses singleton, range, and comma-separated step references", () => { + expect(parseFailureStepNumbers("5,7-9,12")).toEqual([5, 7, 8, 9, 12]); + }); + + it("caps expanded ranges from malformed model output", () => { + const steps = parseFailureStepNumbers("0-2147483647"); + + expect(steps).toHaveLength(1000); + expect(steps[0]).toBe(0); + expect(steps[999]).toBe(999); + }); + + it("honors the caller's trajectory step bound", () => { + expect( + parseFailureStepNumbers("0-2147483647", { + maxExpandedSteps: 1000, + maxStep: 3, + }), + ).toEqual([0, 1, 2, 3]); + }); +}); diff --git a/packages/core/tests/unit/verifier-rubric.test.ts b/packages/core/tests/unit/verifier-rubric.test.ts new file mode 100644 index 000000000..ff693c448 --- /dev/null +++ b/packages/core/tests/unit/verifier-rubric.test.ts @@ -0,0 +1,164 @@ +import { afterEach, describe, expect, it, vi } from "vitest"; + +import { RubricVerifier } from "../../lib/v3/verifier/rubricVerifier.js"; +import type { LLMClient } from "../../lib/v3/llm/LLMClient.js"; +import type { TaskSpec, Trajectory } from "../../lib/v3/verifier/types.js"; + +describe("RubricVerifier", () => { + const previousEnv = { + approach: process.env.VERIFIER_APPROACH, + 
retries: process.env.VERIFIER_RUBRIC_RETRIES, + requireTaskSpan: process.env.VERIFIER_RUBRIC_REQUIRE_TASK_SPAN, + }; + + afterEach(() => { + restoreEnv("VERIFIER_APPROACH", previousEnv.approach); + restoreEnv("VERIFIER_RUBRIC_RETRIES", previousEnv.retries); + restoreEnv( + "VERIFIER_RUBRIC_REQUIRE_TASK_SPAN", + previousEnv.requireTaskSpan, + ); + }); + + it("retries rubric generation and filters criteria outside the task span", async () => { + process.env.VERIFIER_RUBRIC_RETRIES = "2"; + const createChatCompletion = vi + .fn() + .mockRejectedValueOnce(new Error("temporary parse failure")) + .mockResolvedValueOnce({ + data: { + items: [ + { + criterion: "Identify the most recent paper", + task_span: "most recent paper", + description: + "Full credit for identifying the most recent relevant paper.", + max_points: 4, + justification: "", + earned_points: "", + }, + { + criterion: "Output the abstract", + task_span: "abstract", + description: "This criterion is not requested by the task.", + max_points: 1, + justification: "", + earned_points: "", + }, + ], + }, + }); + const verifier = new RubricVerifier({ + getClient: () => throwingClient(), + getRubricGenClient: () => + ({ createChatCompletion }) as unknown as LLMClient, + logger: vi.fn(), + }); + + const rubric = await verifier.generateRubric({ + id: "arxiv", + instruction: + "Search arXiv for the most recent paper on retrieval-augmented generation.", + }); + + expect(createChatCompletion).toHaveBeenCalledTimes(2); + expect(rubric).toEqual({ + items: [ + { + criterion: "Identify the most recent paper", + description: + "Full credit for identifying the most recent relevant paper.", + maxPoints: 4, + }, + ], + }); + }); + + it("supports outcome-only verification without generating a rubric", async () => { + process.env.VERIFIER_APPROACH = "outcome-only"; + const createChatCompletion = vi.fn().mockResolvedValue({ + data: { + outcome: { + primary_intent: "Complete the task", + reasoning: "The final page and answer show 
completion.", + output_success: true, + findings: [], + }, + task_validity: { + is_ambiguous: false, + ambiguity_reason: "", + is_invalid: false, + invalid_reason: "", + }, + }, + }); + const verifier = new RubricVerifier({ + getClient: () => ({ createChatCompletion }) as unknown as LLMClient, + getRubricGenClient: () => throwingClient(), + }); + const taskSpec: TaskSpec = { + id: "outcome", + instruction: "Complete the task", + }; + + const result = await verifier.verify( + makeTrajectory(taskSpec, Buffer.from("screenshot")), + taskSpec, + ); + + expect(createChatCompletion).toHaveBeenCalledTimes(1); + expect(result).toMatchObject({ + outcomeSuccess: true, + explanation: "The final page and answer show completion.", + rawSteps: { + approach: "outcome-only", + screenshotsAttached: 1, + }, + }); + expect(result.processScore).toBeUndefined(); + expect(result.perCriterion).toBeUndefined(); + }); +}); + +function restoreEnv(name: string, value: string | undefined): void { + if (value === undefined) { + delete process.env[name]; + } else { + process.env[name] = value; + } +} + +function throwingClient(): LLMClient { + return { + createChatCompletion: vi.fn().mockRejectedValue(new Error("unexpected")), + } as unknown as LLMClient; +} + +function makeTrajectory(task: TaskSpec, screenshot: Buffer): Trajectory { + return { + task, + steps: [ + { + index: 0, + actionName: "act", + actionArgs: {}, + reasoning: "I completed the task.", + agentEvidence: { modalities: [] }, + probeEvidence: { screenshot }, + toolOutput: { ok: true, result: "done" }, + startedAt: new Date(0).toISOString(), + finishedAt: new Date(0).toISOString(), + }, + ], + finalAnswer: "Done.", + status: "complete", + usage: { + input_tokens: 0, + output_tokens: 0, + }, + timing: { + startedAt: new Date(0).toISOString(), + endedAt: new Date(0).toISOString(), + }, + }; +}