-
Notifications
You must be signed in to change notification settings - Fork 1.5k
feat(verifier): add verifier evaluator shell and types #2130
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Merged
miguelg719
merged 14 commits into
miguelgonzalez/verifier-01-evaluator-compat
from
miguelgonzalez/verifier-02-backend-routing
May 20, 2026
Merged
Changes from all commits
Commits
Show all changes
14 commits
Select commit
Hold shift + click to select a range
6e46909
feat(verifier): add verifier evaluator shell
miguelg719 363f7cd
fix(verifier): normalize public rubric naming
miguelg719 de0feb6
style(verifier): format rubric normalizer
miguelg719 b3bad21
chore(verifier): remove upstream verifier references
miguelg719 cf45b5a
docs(verifier): remove rollout comments from public types
miguelg719 b670666
refactor(verifier): consolidate public types
miguelg719 94d136d
refactor(verifier): remove rollout stub reason
miguelg719 6277411
refactor(verifier): remove proxy type barrels
miguelg719 0166b7e
fix(verifier): keep rubric earned points numeric
miguelg719 444f0da
fix(verifier): constrain trajectory screenshot paths
miguelg719 e698937
test(verifier): cover trajectory normalization boundaries
miguelg719 6443275
test(verifier): cover evaluator facade helpers
miguelg719 36ed362
fix(verifier): clean public result API
miguelg719 50cdf0a
docs(verifier): align evaluator changeset wording
miguelg719 File filter
Filter by extension
Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
There are no files selected for viewing
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,5 @@ | ||
| --- | ||
| "@browserbasehq/stagehand": patch | ||
| --- | ||
|
|
||
| Add verifier trajectory, rubric, and evaluation-result types with normalized public naming. |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,28 @@ | ||
| /** | ||
| * Public re-exports for the verifier subsystem. | ||
| */ | ||
| export type { | ||
| AgentEvidence, | ||
| AgentEvidenceModality, | ||
| CriterionScore, | ||
| EvaluationResult, | ||
| FirstPointOfFailure, | ||
| ProbeEvidence, | ||
| Rubric, | ||
| RubricCriterion, | ||
| TaskSpec, | ||
| TaskValidity, | ||
| ToolOutput, | ||
| Trajectory, | ||
| TrajectoryStatus, | ||
| TrajectoryStep, | ||
| TrajectoryUsage, | ||
| Verifier, | ||
| VerifierFinding, | ||
| VerifierRawSteps, | ||
| } from "./types.js"; | ||
| export { | ||
| loadTrajectoryFromDisk, | ||
| nextResultFilename, | ||
| normalizeRubric, | ||
| } from "./trajectory.js"; |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,189 @@ | ||
| import type { | ||
| AgentEvidenceModality, | ||
| ProbeEvidence, | ||
| Rubric, | ||
| Trajectory, | ||
| TrajectoryStep, | ||
| } from "./types.js"; | ||
|
|
||
| type RawRubricCriterion = { | ||
| criterion: unknown; | ||
| description: unknown; | ||
| max_points?: unknown; | ||
| maxPoints?: unknown; | ||
| condition?: unknown; | ||
| }; | ||
|
|
||
| type RawRubric = { | ||
| items?: unknown; | ||
| }; | ||
|
|
||
| /** | ||
| * Convert dataset or generated rubric JSON into the public Stagehand shape. | ||
| * Snake-case dataset fields are accepted here so serialized quirks do not leak | ||
| * into the canonical rubric type. | ||
| */ | ||
| export function normalizeRubric(rubric: unknown): Rubric | undefined { | ||
| if (rubric == null) return undefined; | ||
| if (typeof rubric !== "object") { | ||
| throw new TypeError("Rubric must be an object"); | ||
| } | ||
|
|
||
| const rawRubric = rubric as RawRubric; | ||
| if (!Array.isArray(rawRubric.items)) { | ||
| throw new TypeError("Rubric is missing an items array"); | ||
| } | ||
|
|
||
| return { | ||
| items: rawRubric.items.map((item) => { | ||
| const criterion = normalizeRequiredString(item.criterion, "criterion"); | ||
| const description = normalizeRequiredString( | ||
| item.description, | ||
| "description", | ||
| ); | ||
| const maxPoints = normalizeMaxPoints(item); | ||
|
|
||
| if (typeof maxPoints !== "number" || !Number.isFinite(maxPoints)) { | ||
| throw new TypeError( | ||
| `Rubric criterion "${criterion}" is missing a numeric maxPoints value`, | ||
| ); | ||
| } | ||
|
|
||
| return { | ||
| criterion, | ||
| description, | ||
| maxPoints, | ||
| ...(typeof item.condition === "string" && { | ||
| condition: item.condition, | ||
| }), | ||
| }; | ||
| }), | ||
| }; | ||
| } | ||
|
|
||
| function normalizeRequiredString(value: unknown, fieldName: string): string { | ||
| if (typeof value === "string" && value.length) { | ||
| return value; | ||
| } | ||
|
|
||
| throw new TypeError(`Rubric criterion is missing a ${fieldName} value`); | ||
| } | ||
|
|
||
| function normalizeMaxPoints(item: RawRubricCriterion): unknown { | ||
| return item.maxPoints ?? item.max_points; | ||
| } | ||
|
|
||
| function normalizeResultLabel(label?: string): string { | ||
| return (label ?? `rescore-${new Date().toISOString()}`).replace( | ||
| /[^A-Za-z0-9._-]/g, | ||
| "_", | ||
| ); | ||
| } | ||
|
|
||
| // ───────────────────────────────────────────────────────────────────────────── | ||
| // On-disk loader | ||
| // ───────────────────────────────────────────────────────────────────────────── | ||
|
|
||
| /** | ||
| * Hydrate a Trajectory from the on-disk directory layout written by | ||
| * TrajectoryRecorder.persist(). Used by the offline re-scoring CLI (`bench | ||
| * verify`) and by any consumer that wants to feed a saved trajectory back | ||
| * into V3Evaluator.verify() without running an agent. | ||
| * | ||
| * Reverses the recorder's serialization tweaks: | ||
| * - `probeEvidence.screenshotPath` → read file into `probeEvidence.screenshot`. | ||
| * - Image modalities in `agentEvidence.modalities` carry `bytesBase64` on | ||
| * disk (human-readable JSON) instead of raw Buffer; we decode back. | ||
| * | ||
| * @param dir absolute or cwd-relative path to a `<run-id>/<task-id>/` directory. | ||
| */ | ||
| export async function loadTrajectoryFromDisk(dir: string): Promise<Trajectory> { | ||
| const fs = await import("node:fs/promises"); | ||
| const path = await import("node:path"); | ||
| const trajectoryDir = path.resolve(dir); | ||
|
|
||
| const trajectoryPath = path.join(trajectoryDir, "trajectory.json"); | ||
| const raw = await fs.readFile(trajectoryPath, "utf8"); | ||
| const parsed = JSON.parse(raw) as Trajectory & { | ||
| steps: Array< | ||
| TrajectoryStep & { | ||
| agentEvidence: { | ||
| modalities: Array< | ||
| | { type: "text"; content: string } | ||
| | { | ||
| type: "image"; | ||
| mediaType: string; | ||
| // On-disk form (recorder writes base64); accept either to | ||
| // tolerate hand-edited fixtures. | ||
| bytes?: unknown; | ||
| bytesBase64?: string; | ||
| } | ||
| | { type: "json"; content: unknown } | ||
| >; | ||
| }; | ||
| probeEvidence: ProbeEvidence; | ||
| } | ||
| >; | ||
| }; | ||
|
|
||
| const resolveWithinTrajectoryDir = (candidate: string): string => { | ||
| const resolved = path.resolve(trajectoryDir, candidate); | ||
| const relative = path.relative(trajectoryDir, resolved); | ||
| const outside = | ||
| relative === ".." || | ||
| relative.startsWith(`..${path.sep}`) || | ||
| path.isAbsolute(relative); | ||
|
|
||
| if (outside) { | ||
| throw new Error( | ||
| `Trajectory screenshotPath escapes trajectory directory: ${candidate}`, | ||
| ); | ||
| } | ||
|
|
||
| return resolved; | ||
| }; | ||
|
|
||
| for (const step of parsed.steps) { | ||
| // Rehydrate tier-2 probe screenshot from its on-disk file reference. | ||
| const probe = step.probeEvidence; | ||
| if (probe?.screenshotPath && !probe.screenshot) { | ||
| const resolved = resolveWithinTrajectoryDir(probe.screenshotPath); | ||
| try { | ||
| probe.screenshot = await fs.readFile(resolved); | ||
| } catch { | ||
| // Missing screenshot file: leave probe.screenshot unset. The verifier's | ||
| // evidence_insufficient path will handle it. | ||
| } | ||
| } | ||
|
|
||
| // Decode image modalities from base64 back to Buffer. | ||
| if (step.agentEvidence?.modalities) { | ||
| step.agentEvidence.modalities = step.agentEvidence.modalities.map((m) => { | ||
| // The on-disk shape carries bytesBase64 instead of bytes, so we look | ||
| // through `unknown` here rather than rely on the typed union. | ||
| const raw = m as unknown as { bytesBase64?: string }; | ||
| if (m.type === "image" && typeof raw.bytesBase64 === "string") { | ||
| return { | ||
| type: "image" as const, | ||
| bytes: Buffer.from(raw.bytesBase64, "base64"), | ||
| mediaType: m.mediaType, | ||
| }; | ||
| } | ||
| return m as AgentEvidenceModality; | ||
| }); | ||
| } | ||
| } | ||
|
|
||
| return parsed; | ||
| } | ||
|
|
||
| /** | ||
| * Build a `result*.json` filename for persisted evaluator output. | ||
| * | ||
| * Convention: the live run writes `result.json`; offline re-score attempts use | ||
| * a label-based name (e.g., `result_rescore-2026-05-11.json`) so they coexist | ||
| * without collisions and remain easy to diff. | ||
| */ | ||
| export function nextResultFilename(label?: string): string { | ||
| return `result_${normalizeResultLabel(label)}.json`; | ||
| } |
Oops, something went wrong.
Oops, something went wrong.
Add this suggestion to a batch that can be applied as a single commit.
This suggestion is invalid because no changes were made to the code.
Suggestions cannot be applied while the pull request is closed.
Suggestions cannot be applied while viewing a subset of changes.
Only one suggestion per line can be applied in a batch.
Add this suggestion to a batch that can be applied as a single commit.
Applying suggestions on deleted lines is not supported.
You must change the existing code in this line in order to create a valid suggestion.
Outdated suggestions cannot be applied.
This suggestion has been applied or marked resolved.
Suggestions cannot be applied from pending reviews.
Suggestions cannot be applied on multi-line comments.
Suggestions cannot be applied while the pull request is queued to merge.
Suggestion cannot be applied right now. Please check back later.
Uh oh!
There was an error while loading. Please reload this page.