From d024dcd6fd34386cad070fc1111c1f21b6d20b28 Mon Sep 17 00:00:00 2001 From: Christopher Tso Date: Fri, 13 Mar 2026 09:00:09 +0000 Subject: [PATCH 01/10] feat(core): add built-in assertion factories for Eval() API --- packages/core/src/evaluation/assertions.ts | 101 +++++++++++++++ .../core/test/evaluation/assertions.test.ts | 120 ++++++++++++++++++ 2 files changed, 221 insertions(+) create mode 100644 packages/core/src/evaluation/assertions.ts create mode 100644 packages/core/test/evaluation/assertions.test.ts diff --git a/packages/core/src/evaluation/assertions.ts b/packages/core/src/evaluation/assertions.ts new file mode 100644 index 00000000..d7f3803b --- /dev/null +++ b/packages/core/src/evaluation/assertions.ts @@ -0,0 +1,101 @@ +/** + * Built-in assertion factories for the Eval() API. + * + * Each factory returns an AssertFn — a plain function that takes + * { input, output, expectedOutput, criteria, metadata } and returns + * { name, score }. These wrap the same logic as the built-in evaluator + * types but are usable as inline functions in the assert array. + */ + +/** Context passed to inline assertion functions */ +export interface AssertContext { + readonly input: string; + readonly output: string; + readonly expectedOutput?: string; + readonly criteria?: string; + readonly metadata?: Record; +} + +/** Result from an inline assertion function */ +export interface AssertResult { + readonly name: string; + readonly score: number; + readonly metadata?: Record; +} + +/** Inline assertion function signature */ +export type AssertFn = (ctx: AssertContext) => AssertResult | Promise; + +/** Checks if output contains the given substring. */ +export function Contains(value: string): AssertFn { + return ({ output }) => ({ + name: 'contains', + score: output.includes(value) ? 1.0 : 0.0, + }); +} + +/** Case-insensitive contains check. */ +export function IContains(value: string): AssertFn { + const lower = value.toLowerCase(); + return ({ output }) => ({ + name: 'icontains', + score: output.toLowerCase().includes(lower) ? 1.0 : 0.0, + }); +} + +/** Checks if output contains ALL of the given substrings. */ +export function ContainsAll(values: readonly string[]): AssertFn { + return ({ output }) => ({ + name: 'contains-all', + score: values.every((v) => output.includes(v)) ? 1.0 : 0.0, + }); +} + +/** Checks if output contains ANY of the given substrings. */ +export function ContainsAny(values: readonly string[]): AssertFn { + return ({ output }) => ({ + name: 'contains-any', + score: values.some((v) => output.includes(v)) ? 1.0 : 0.0, + }); +} + +/** Checks if trimmed output exactly equals trimmed expectedOutput. */ +export const ExactMatch: AssertFn = ({ output, expectedOutput }) => ({ + name: 'exact-match', + score: expectedOutput !== undefined && output.trim() === expectedOutput.trim() ? 1.0 : 0.0, +}); + +/** Checks if trimmed output starts with the given value. */ +export function StartsWith(value: string): AssertFn { + return ({ output }) => ({ + name: 'starts-with', + score: output.trim().startsWith(value.trim()) ? 1.0 : 0.0, + }); +} + +/** Checks if trimmed output ends with the given value. */ +export function EndsWith(value: string): AssertFn { + return ({ output }) => ({ + name: 'ends-with', + score: output.trim().endsWith(value.trim()) ? 1.0 : 0.0, + }); +} + +/** Checks if output matches the given regex pattern. */ +export function Regex(pattern: string, flags?: string): AssertFn { + const re = new RegExp(pattern, flags); + return ({ output }) => ({ + name: 'regex', + score: re.test(output) ? 1.0 : 0.0, + }); +} + +/** Checks if output is valid JSON. */ +export const IsJson: AssertFn = ({ output }) => { + try { + JSON.parse(output); + return { name: 'is-json', score: 1.0 }; + } catch { + return { name: 'is-json', score: 0.0 }; + } +}; diff --git a/packages/core/test/evaluation/assertions.test.ts b/packages/core/test/evaluation/assertions.test.ts new file mode 100644 index 00000000..42bb1bd4 --- /dev/null +++ b/packages/core/test/evaluation/assertions.test.ts @@ -0,0 +1,120 @@ +import { describe, expect, it } from 'vitest'; +import { + Contains, + ContainsAll, + ContainsAny, + EndsWith, + ExactMatch, + IContains, + IsJson, + Regex, + StartsWith, +} from '../../src/evaluation/assertions.js'; + +const ctx = (output: string, expectedOutput?: string) => ({ + input: 'test-input', + output, + expectedOutput, +}); + +describe('Contains', () => { + it('scores 1.0 when output contains value', () => { + const fn = Contains('hello'); + const result = fn(ctx('hello world')); + expect(result.score).toBe(1.0); + expect(result.name).toBe('contains'); + }); + + it('scores 0.0 when output does not contain value', () => { + const fn = Contains('goodbye'); + const result = fn(ctx('hello world')); + expect(result.score).toBe(0.0); + }); +}); + +describe('IContains', () => { + it('scores 1.0 case-insensitively', () => { + const fn = IContains('HELLO'); + const result = fn(ctx('hello world')); + expect(result.score).toBe(1.0); + }); +}); + +describe('ContainsAll', () => { + it('scores 1.0 when all values present', () => { + const fn = ContainsAll(['hello', 'world']); + expect(fn(ctx('hello world')).score).toBe(1.0); + }); + + it('scores 0.0 when any value missing', () => { + const fn = ContainsAll(['hello', 'goodbye']); + expect(fn(ctx('hello world')).score).toBe(0.0); + }); +}); + +describe('ContainsAny', () => { + it('scores 1.0 when any value present', () => { + const fn = ContainsAny(['goodbye', 'world']); + expect(fn(ctx('hello world')).score).toBe(1.0); + }); + + it('scores 0.0 when no values present', () => { + const fn = ContainsAny(['goodbye', 'farewell']); + expect(fn(ctx('hello world')).score).toBe(0.0); + }); +}); + +describe('ExactMatch', () => { + it('scores 1.0 on exact match (trimmed)', () => { + expect(ExactMatch(ctx('hello', 'hello')).score).toBe(1.0); + expect(ExactMatch(ctx(' hello ', 'hello')).score).toBe(1.0); + }); + + it('scores 0.0 on mismatch', () => { + expect(ExactMatch(ctx('hello', 'world')).score).toBe(0.0); + }); + + it('scores 0.0 when no expectedOutput', () => { + expect(ExactMatch(ctx('hello')).score).toBe(0.0); + }); +}); + +describe('StartsWith', () => { + it('scores 1.0 when output starts with value', () => { + expect(StartsWith('hello')(ctx('hello world')).score).toBe(1.0); + }); + + it('scores 0.0 when it does not', () => { + expect(StartsWith('world')(ctx('hello world')).score).toBe(0.0); + }); +}); + +describe('EndsWith', () => { + it('scores 1.0 when output ends with value', () => { + expect(EndsWith('world')(ctx('hello world')).score).toBe(1.0); + }); +}); + +describe('Regex', () => { + it('scores 1.0 on match', () => { + expect(Regex('\\d+')(ctx('abc 123 def')).score).toBe(1.0); + }); + + it('scores 0.0 on no match', () => { + expect(Regex('^\\d+$')(ctx('abc')).score).toBe(0.0); + }); + + it('supports flags', () => { + expect(Regex('HELLO', 'i')(ctx('hello')).score).toBe(1.0); + }); +}); + +describe('IsJson', () => { + it('scores 1.0 for valid JSON', () => { + expect(IsJson(ctx('{"a": 1}')).score).toBe(1.0); + }); + + it('scores 0.0 for invalid JSON', () => { + expect(IsJson(ctx('not json')).score).toBe(0.0); + }); +}); From c4cc915be09ee4bf1163b4c10ce4bf630ba44cff Mon Sep 17 00:00:00 2001 From: Christopher Tso Date: Fri, 13 Mar 2026 09:00:56 +0000 Subject: [PATCH 02/10] feat(core): add function provider for Eval() task functions --- .../evaluation/providers/function-provider.ts | 27 ++++++++++++ .../providers/function-provider.test.ts | 43 +++++++++++++++++++ 2 files changed, 70 insertions(+) create mode 100644 packages/core/src/evaluation/providers/function-provider.ts create mode 100644 packages/core/test/evaluation/providers/function-provider.test.ts diff --git a/packages/core/src/evaluation/providers/function-provider.ts b/packages/core/src/evaluation/providers/function-provider.ts new file mode 100644 index 00000000..27bd5b37 --- /dev/null +++ b/packages/core/src/evaluation/providers/function-provider.ts @@ -0,0 +1,27 @@ +import type { Provider, ProviderKind, ProviderRequest, ProviderResponse } from './types.js'; + +/** + * Wraps a user-provided task function as a Provider. + * Used by Eval() when `task` is specified instead of `target`. + */ +export function createFunctionProvider( + taskFn: (input: string) => string | Promise, +): Provider { + return { + id: 'function-provider', + kind: 'mock' as ProviderKind, + targetName: 'custom-task', + async invoke(request: ProviderRequest): Promise { + const startTime = new Date().toISOString(); + const start = Date.now(); + const result = await taskFn(request.question); + const endTime = new Date().toISOString(); + return { + output: [{ role: 'assistant', content: result }], + durationMs: Date.now() - start, + startTime, + endTime, + }; + }, + }; +} diff --git a/packages/core/test/evaluation/providers/function-provider.test.ts b/packages/core/test/evaluation/providers/function-provider.test.ts new file mode 100644 index 00000000..71a5cae7 --- /dev/null +++ b/packages/core/test/evaluation/providers/function-provider.test.ts @@ -0,0 +1,43 @@ +import { describe, expect, it } from 'vitest'; +import { createFunctionProvider } from '../../../src/evaluation/providers/function-provider.js'; + +describe('createFunctionProvider', () => { + it('wraps a sync task function as a Provider', async () => { + const provider = createFunctionProvider((input) => `Echo: ${input}`); + + expect(provider.id).toBe('function-provider'); + expect(provider.targetName).toBe('custom-task'); + + const response = await provider.invoke({ question: 'hello' }); + expect(response.output).toHaveLength(1); + expect(response.output![0].role).toBe('assistant'); + expect(response.output![0].content).toBe('Echo: hello'); + }); + + it('wraps an async task function', async () => { + const provider = createFunctionProvider(async (input) => { + return `Async: ${input}`; + }); + + const response = await provider.invoke({ question: 'world' }); + expect(response.output![0].content).toBe('Async: world'); + }); + + it('measures duration', async () => { + const provider = createFunctionProvider(async () => { + await new Promise((r) => setTimeout(r, 50)); + return 'done'; + }); + + const response = await provider.invoke({ question: 'test' }); + expect(response.durationMs).toBeGreaterThanOrEqual(40); + }); + + it('propagates errors from the task function', async () => { + const provider = createFunctionProvider(() => { + throw new Error('task failed'); + }); + + await expect(provider.invoke({ question: 'test' })).rejects.toThrow('task failed'); + }); +}); From ffeadcf11ffe24c02b3749dcc9b7c0101fc71fd4 Mon Sep 17 00:00:00 2001 From: Christopher Tso Date: Fri, 13 Mar 2026 09:03:42 +0000 Subject: [PATCH 03/10] feat(core): add Eval() function with registry and execution --- packages/core/src/evaluation/eval-api.ts | 262 ++++++++++++++++++ .../core/test/evaluation/eval-api.test.ts | 153 ++++++++++ 2 files changed, 415 insertions(+) create mode 100644 packages/core/src/evaluation/eval-api.ts create mode 100644 packages/core/test/evaluation/eval-api.test.ts diff --git a/packages/core/src/evaluation/eval-api.ts b/packages/core/src/evaluation/eval-api.ts new file mode 100644 index 00000000..85d8e82b --- /dev/null +++ b/packages/core/src/evaluation/eval-api.ts @@ -0,0 +1,262 @@ +/** + * Declarative Eval() API for single-file TypeScript evaluations. + * + * Eval() registers an eval in a global registry and returns a Promise + * with results. Serves both CLI discovery (module-level, no await) and + * programmatic use (await for results). + * + * @example Single-file eval with built-in target + * ```typescript + * import { Eval, Contains } from '@agentv/core'; + * + * Eval('my-eval', { + * data: [{ input: 'What is 2+2?', expectedOutput: '4' }], + * target: { provider: 'claude_agent' }, + * assert: [Contains('4')], + * }); + * ``` + * + * @module + */ + +import path from 'node:path'; +import type { TargetDefinition } from './providers/types.js'; +import type { EvalRunResult, EvalSummary, EvalAssertionInput } from './evaluate.js'; +import type { AssertFn, AssertContext, AssertResult } from './assertions.js'; +import type { EvalTest, EvaluatorConfig, EvaluationResult } from './types.js'; + +// ─── Types ─────────────────────────────────────────────────────────── + +/** Data item — aligns with YAML test fields (camelCase) */ +export interface EvalDataItem { + readonly id?: string; + readonly input: string | readonly { role: string; content: string }[]; + readonly expectedOutput?: string; + readonly criteria?: string; + readonly metadata?: Record; +} + +/** Assert entry: inline function, built-in factory result, or assertion config */ +export type AssertEntry = AssertFn | EvalAssertionInput; + +/** Configuration for Eval() */ +export interface EvalOptions { + readonly data: + | readonly EvalDataItem[] + | (() => readonly EvalDataItem[] | Promise); + readonly target?: TargetDefinition; + readonly task?: (input: string) => string | Promise; + readonly assert: readonly AssertEntry[]; + readonly metadata?: Record; + readonly workers?: number; + readonly agentTimeoutMs?: number; +} + +// ─── Registry ──────────────────────────────────────────────────────── + +interface RegisteredEval { + readonly name: string; + readonly options: EvalOptions; + readonly promise: Promise; +} + +const evalRegistry = new Map(); + +/** Get all registered evals (used by CLI to discover evals in .ts files). */ +export function getRegisteredEvals(): ReadonlyMap { + return evalRegistry; +} + +/** Clear the registry (used in tests). */ +export function clearEvalRegistry(): void { + evalRegistry.clear(); +} + +// ─── Eval() ────────────────────────────────────────────────────────── + +/** + * Declare and run an evaluation. + * + * Registers the eval in a global registry (for CLI discovery) and + * returns a Promise with results (for programmatic use). + */ +export function Eval(name: string, options: EvalOptions): Promise { + // Validate + if (evalRegistry.has(name)) { + throw new Error(`Eval "${name}" already registered`); + } + if (options.task && options.target) { + throw new Error('Cannot specify both "task" and "target" — use one or the other.'); + } + if (!options.task && !options.target) { + throw new Error('Must specify either "task" or "target".'); + } + + // Create the execution promise + const promise = runEval(name, options); + + evalRegistry.set(name, { name, options, promise }); + return promise; +} + +// ─── Internal execution ────────────────────────────────────────────── + +async function runEval(name: string, options: EvalOptions): Promise { + // Lazy imports to avoid circular dependencies and keep module lightweight + const { runEvaluation } = await import('./orchestrator.js'); + const { resolveTargetDefinition } = await import('./providers/targets.js'); + const { findGitRoot } = await import('./file-utils.js'); + const { createFunctionProvider } = await import('./providers/function-provider.js'); + + // Resolve data + const data = typeof options.data === 'function' ? await options.data() : options.data; + + // Partition assert entries into inline functions and config objects + const configAsserts: EvalAssertionInput[] = []; + + for (let i = 0; i < options.assert.length; i++) { + const entry = options.assert[i]; + if (typeof entry === 'function') { + // Register inline function and create a placeholder assertion config + configAsserts.push({ + type: 'inline-assert', + name: `inline-assert-${i}`, + }); + } else { + configAsserts.push(entry as EvalAssertionInput); + } + } + + // Store inline functions for the inline-assert evaluator to pick up + const inlineFns = options.assert + .filter((entry): entry is AssertFn => typeof entry === 'function'); + setInlineAssertFns(inlineFns); + + // Resolve target + const targetDef: TargetDefinition = options.target ?? { name: 'custom-task', provider: 'mock' }; + const resolvedTarget = resolveTargetDefinition(targetDef); + + // Build providerFactory for task functions + let providerFactory: ((target: any) => any) | undefined; + if (options.task) { + const taskProvider = createFunctionProvider(options.task); + providerFactory = () => taskProvider; + } + + // Convert EvalAssertionInput[] to EvaluatorConfig[] + const evaluatorConfigs = configAsserts.map((a, i) => { + const { type: rawType, ...rest } = a; + return { + ...rest, + name: a.name ?? `${rawType}_${i}`, + type: rawType.replace(/_/g, '-'), + } as unknown as EvaluatorConfig; + }); + + // Convert data items to EvalTest[] + const evalCases: EvalTest[] = data.map((item, i) => { + const input = + typeof item.input === 'string' + ? ([{ role: 'user' as const, content: item.input }] as EvalTest['input']) + : (item.input as unknown as EvalTest['input']); + + const question = + typeof item.input === 'string' + ? item.input + : ((item.input.find((m) => m.role === 'user')?.content as string) ?? ''); + + const expectedOutput = item.expectedOutput + ? ([{ role: 'assistant' as const, content: item.expectedOutput }] as EvalTest['expected_output']) + : []; + + return { + id: item.id ? `${name}/${item.id}` : `${name}/${i}`, + criteria: item.criteria ?? '', + question: String(question), + input, + input_segments: [], + expected_output: expectedOutput, + reference_answer: item.expectedOutput, + guideline_paths: [], + guideline_patterns: [], + file_paths: [], + evaluators: evaluatorConfigs.length > 0 ? evaluatorConfigs : undefined, + metadata: item.metadata, + }; + }); + + const startTime = Date.now(); + const repoRoot = (await findGitRoot(process.cwd())) ?? process.cwd(); + const testFilePath = path.join(process.cwd(), '__eval_api__.yaml'); + + const collectedResults: EvaluationResult[] = []; + + await runEvaluation({ + testFilePath, + repoRoot, + target: resolvedTarget, + maxRetries: 2, + agentTimeoutMs: options.agentTimeoutMs, + maxConcurrency: options.workers ?? 3, + evalCases, + ...(providerFactory ? { providerFactory } : {}), + onResult: async (result) => { + collectedResults.push(result); + }, + }); + + // Clear inline functions after execution + setInlineAssertFns([]); + + const durationMs = Date.now() - startTime; + return { + results: collectedResults, + summary: computeSummary(collectedResults, durationMs), + }; +} + +// ─── Summary computation ───────────────────────────────────────────── + +function computeSummary(results: readonly EvaluationResult[], durationMs: number): EvalSummary { + const total = results.length; + let passed = 0; + let failed = 0; + let borderline = 0; + let scoreSum = 0; + + for (const r of results) { + scoreSum += r.score; + if (r.score >= 0.8) { + passed++; + } else if (r.score < 0.5) { + failed++; + } else { + borderline++; + } + } + + return { + total, + passed, + failed, + borderline, + durationMs, + meanScore: total > 0 ? scoreSum / total : 0, + }; +} + +// ─── Inline assert function storage ────────────────────────────────── + +let _inlineAssertFns: AssertFn[] = []; + +export function setInlineAssertFns(fns: AssertFn[]): void { + _inlineAssertFns = fns; +} + +export function getInlineAssertFns(): AssertFn[] { + return _inlineAssertFns; +} + +// Re-export types +export type { AssertContext, AssertResult, AssertFn } from './assertions.js'; +export type { EvalRunResult, EvalSummary } from './evaluate.js'; diff --git a/packages/core/test/evaluation/eval-api.test.ts b/packages/core/test/evaluation/eval-api.test.ts new file mode 100644 index 00000000..3c89881a --- /dev/null +++ b/packages/core/test/evaluation/eval-api.test.ts @@ -0,0 +1,153 @@ +import { afterEach, describe, expect, it } from 'vitest'; +import { + Eval, + clearEvalRegistry, + getRegisteredEvals, + type EvalDataItem, +} from '../../src/evaluation/eval-api.js'; +import { Contains } from '../../src/evaluation/assertions.js'; + +afterEach(() => { + clearEvalRegistry(); +}); + +describe('Eval() registration', () => { + it('registers an eval in the global registry', () => { + Eval('test-eval', { + data: [{ input: 'hello', expectedOutput: 'world' }], + target: { name: 'default', provider: 'mock', response: 'world' }, + assert: [Contains('world')], + }); + + const registry = getRegisteredEvals(); + expect(registry.size).toBe(1); + expect(registry.has('test-eval')).toBe(true); + }); + + it('throws on duplicate eval names', () => { + Eval('dup', { + data: [{ input: 'a' }], + target: { name: 'default', provider: 'mock' }, + assert: [Contains('a')], + }); + + expect(() => { + Eval('dup', { + data: [{ input: 'b' }], + target: { name: 'default', provider: 'mock' }, + assert: [Contains('b')], + }); + }).toThrow('Eval "dup" already registered'); + }); + + it('throws when both task and target are provided', () => { + expect(() => { + Eval('bad', { + data: [{ input: 'a' }], + target: { name: 'default', provider: 'mock' }, + task: async (input) => input, + assert: [Contains('a')], + }); + }).toThrow('Cannot specify both "task" and "target"'); + }); + + it('throws when neither task nor target is provided', () => { + expect(() => { + Eval('bad', { + data: [{ input: 'a' }], + assert: [Contains('a')], + }); + }).toThrow('Must specify either "task" or "target"'); + }); +}); + +describe('Eval() execution with mock target', () => { + it('returns results when awaited', async () => { + const result = await Eval('exec-test', { + data: [ + { id: 'case-1', input: 'hello', expectedOutput: 'world' }, + ], + target: { name: 'default', provider: 'mock', response: 'world' }, + assert: [Contains('world')], + }); + + expect(result.results).toHaveLength(1); + expect(result.summary.total).toBe(1); + expect(result.summary.passed).toBe(1); + }); + + it('auto-generates test IDs from eval name and index', async () => { + const result = await Eval('auto-id', { + data: [ + { input: 'first' }, + { input: 'second' }, + ], + target: { name: 'default', provider: 'mock', response: 'first second' }, + assert: [Contains('first')], + }); + + expect(result.results[0].testId).toBe('auto-id/0'); + expect(result.results[1].testId).toBe('auto-id/1'); + }); + + it('uses explicit data item IDs in test IDs', async () => { + const result = await Eval('explicit-id', { + data: [ + { id: 'my-case', input: 'hello' }, + ], + target: { name: 'default', provider: 'mock', response: 'hello' }, + assert: [Contains('hello')], + }); + + expect(result.results[0].testId).toBe('explicit-id/my-case'); + }); + + it('supports inline assertion functions', async () => { + const result = await Eval('inline-fn', { + data: [{ input: 'test', expectedOutput: 'test' }], + target: { name: 'default', provider: 'mock', response: 'test' }, + assert: [ + ({ output, expectedOutput }) => ({ + name: 'custom', + score: output === expectedOutput ? 1.0 : 0.0, + }), + ], + }); + + expect(result.summary.passed).toBe(1); + }); + + it('supports async data factory', async () => { + const result = await Eval('async-data', { + data: async () => [{ input: 'hello' }], + target: { name: 'default', provider: 'mock', response: 'hello' }, + assert: [Contains('hello')], + }); + + expect(result.results).toHaveLength(1); + }); + + it('supports task function instead of target', async () => { + const result = await Eval('task-fn', { + data: [{ input: 'hello' }], + task: async (input) => `Echo: ${input}`, + assert: [Contains('Echo: hello')], + }); + + expect(result.summary.passed).toBe(1); + }); + + it('supports mixing inline functions and assertion configs', async () => { + const result = await Eval('mixed', { + data: [{ input: 'hello world' }], + target: { name: 'default', provider: 'mock', response: 'hello world' }, + assert: [ + Contains('hello'), + ({ output }) => ({ name: 'has-world', score: output.includes('world') ? 1.0 : 0.0 }), + { type: 'contains', value: 'world' }, + ], + }); + + expect(result.summary.passed).toBe(1); + }); +}); From 8fc604386dfbaf205c5de93f014d8b40ff75d2c5 Mon Sep 17 00:00:00 2001 From: Christopher Tso Date: Fri, 13 Mar 2026 09:12:17 +0000 Subject: [PATCH 04/10] feat(core): add inline-assert evaluator type for Eval() API --- packages/core/src/evaluation/eval-api.ts | 56 +++++++------ .../evaluation/evaluators/inline-assert.ts | 38 +++++++++ .../evaluation/registry/builtin-evaluators.ts | 21 ++++- .../evaluators/inline-assert.test.ts | 84 +++++++++++++++++++ 4 files changed, 173 insertions(+), 26 deletions(-) create mode 100644 packages/core/src/evaluation/evaluators/inline-assert.ts create mode 100644 packages/core/test/evaluation/evaluators/inline-assert.test.ts diff --git a/packages/core/src/evaluation/eval-api.ts b/packages/core/src/evaluation/eval-api.ts index 85d8e82b..bbba49a4 100644 --- a/packages/core/src/evaluation/eval-api.ts +++ b/packages/core/src/evaluation/eval-api.ts @@ -99,6 +99,14 @@ export function Eval(name: string, options: EvalOptions): Promise return promise; } +// ─── Inline function storage via symbol ────────────────────────────── + +/** + * Symbol used to attach inline AssertFn references to EvaluatorConfig objects. + * This avoids global state and race conditions between concurrent Eval() calls. + */ +export const INLINE_ASSERT_FN = Symbol.for('agentv.inline-assert-fn'); + // ─── Internal execution ────────────────────────────────────────────── async function runEval(name: string, options: EvalOptions): Promise { @@ -112,26 +120,30 @@ async function runEval(name: string, options: EvalOptions): Promise typeof entry === 'function'); - setInlineAssertFns(inlineFns); - // Resolve target const targetDef: TargetDefinition = options.target ?? { name: 'custom-task', provider: 'mock' }; const resolvedTarget = resolveTargetDefinition(targetDef); @@ -143,16 +155,6 @@ async function runEval(name: string, options: EvalOptions): Promise taskProvider; } - // Convert EvalAssertionInput[] to EvaluatorConfig[] - const evaluatorConfigs = configAsserts.map((a, i) => { - const { type: rawType, ...rest } = a; - return { - ...rest, - name: a.name ?? `${rawType}_${i}`, - type: rawType.replace(/_/g, '-'), - } as unknown as EvaluatorConfig; - }); - // Convert data items to EvalTest[] const evalCases: EvalTest[] = data.map((item, i) => { const input = @@ -169,12 +171,19 @@ async function runEval(name: string, options: EvalOptions): Promise m.role === 'user' && typeof m.content === 'string') + .map((m) => ({ type: 'text' as const, value: m.content })); + return { id: item.id ? `${name}/${item.id}` : `${name}/${i}`, criteria: item.criteria ?? '', question: String(question), input, - input_segments: [], + input_segments: inputSegments, expected_output: expectedOutput, reference_answer: item.expectedOutput, guideline_paths: [], @@ -205,9 +214,6 @@ async function runEval(name: string, options: EvalOptions): Promise { + const result = await this.fn({ + input: context.evalCase.question, + output: context.candidate, + expectedOutput: context.evalCase.reference_answer, + criteria: context.evalCase.criteria, + metadata: context.evalCase.metadata, + }); + + const score = Math.max(0, Math.min(1, result.score)); + + return { + score, + verdict: scoreToVerdict(score), + hits: score >= 0.8 ? [result.name] : [], + misses: score < 0.5 ? [result.name] : [], + expectedAspectCount: 1, + reasoning: undefined, + details: result.metadata ? (result.metadata as any) : undefined, + }; + } +} diff --git a/packages/core/src/evaluation/registry/builtin-evaluators.ts b/packages/core/src/evaluation/registry/builtin-evaluators.ts index e992acce..517af233 100644 --- a/packages/core/src/evaluation/registry/builtin-evaluators.ts +++ b/packages/core/src/evaluation/registry/builtin-evaluators.ts @@ -30,6 +30,8 @@ import { runRegexAssertion, runStartsWithAssertion, } from '../evaluators.js'; +import { InlineAssertEvaluator } from '../evaluators/inline-assert.js'; +import { INLINE_ASSERT_FN, getInlineAssertFns } from '../eval-api.js'; import { resolveCustomPrompt } from '../evaluators/prompt-resolution.js'; import type { Provider } from '../providers/types.js'; import type { ToolTrajectoryEvaluatorConfig } from '../trace.js'; @@ -421,7 +423,24 @@ export function createBuiltinRegistry(): EvaluatorRegistry { .register('ends-with', endsWithFactory) .register('regex', regexFactory) .register('is-json', isJsonFactory) - .register('equals', equalsFactory); + .register('equals', equalsFactory) + .register('inline-assert', (config, _context) => { + // Prefer the function attached directly to the config via symbol (concurrent-safe) + const symbolFn = (config as any)[INLINE_ASSERT_FN] as + | import('../assertions.js').AssertFn + | undefined; + if (symbolFn) { + return new InlineAssertEvaluator(symbolFn, config.name ?? 'inline-assert'); + } + // Fallback: legacy global storage + const fns = getInlineAssertFns(); + const index = Number.parseInt(config.name?.replace('inline-assert-', '') ?? '0', 10); + const fn = fns[index]; + if (!fn) { + throw new Error(`No inline assert function found at index ${index}`); + } + return new InlineAssertEvaluator(fn, config.name ?? `inline-assert-${index}`); + }); return registry; } diff --git a/packages/core/test/evaluation/evaluators/inline-assert.test.ts b/packages/core/test/evaluation/evaluators/inline-assert.test.ts new file mode 100644 index 00000000..757b6c07 --- /dev/null +++ b/packages/core/test/evaluation/evaluators/inline-assert.test.ts @@ -0,0 +1,84 @@ +import { describe, expect, it } from 'vitest'; +import { InlineAssertEvaluator } from '../../../src/evaluation/evaluators/inline-assert.js'; +import type { AssertFn } from '../../../src/evaluation/assertions.js'; + +describe('InlineAssertEvaluator', () => { + it('runs an inline assert function and returns EvaluationScore', async () => { + const fn: AssertFn = ({ output }) => ({ + name: 'test-assert', + score: output.includes('hello') ? 1.0 : 0.0, + }); + + const evaluator = new InlineAssertEvaluator(fn, 'test-assert'); + + const score = await evaluator.evaluate({ + evalCase: { + id: 'test', + question: 'greet me', + criteria: '', + input: [{ role: 'user', content: 'greet me' }], + input_segments: [], + expected_output: [], + guideline_paths: [], + file_paths: [], + reference_answer: 'hello world', + }, + candidate: 'hello world', + } as any); + + expect(score.score).toBe(1.0); + expect(score.verdict).toBe('pass'); + }); + + it('handles async assert functions', async () => { + const fn: AssertFn = async ({ output }) => ({ + name: 'async-assert', + score: output.length > 0 ? 1.0 : 0.0, + }); + + const evaluator = new InlineAssertEvaluator(fn, 'async-assert'); + + const score = await evaluator.evaluate({ + evalCase: { + id: 'test', + question: 'test', + criteria: '', + input: [{ role: 'user', content: 'test' }], + input_segments: [], + expected_output: [], + guideline_paths: [], + file_paths: [], + }, + candidate: 'some output', + } as any); + + expect(score.score).toBe(1.0); + }); + + it('returns fail verdict for score 0', async () => { + const fn: AssertFn = () => ({ + name: 'always-fail', + score: 0.0, + }); + + const evaluator = new InlineAssertEvaluator(fn, 'always-fail'); + + const score = await evaluator.evaluate({ + evalCase: { + id: 'test', + question: 'test', + criteria: '', + input: [{ role: 'user', content: 'test' }], + input_segments: [], + expected_output: [], + guideline_paths: [], + file_paths: [], + }, + candidate: 'output', + } as any); + + expect(score.score).toBe(0.0); + expect(score.verdict).toBe('fail'); + expect(score.misses).toContain('always-fail'); + }); +}); From cf65eaf3e06ee3e0ea0632ce82c246363fee913c Mon Sep 17 00:00:00 2001 From: Christopher Tso Date: Fri, 13 Mar 2026 09:13:10 +0000 Subject: [PATCH 05/10] feat(core): export Eval() API and assertion factories from @agentv/core --- packages/core/src/index.ts | 24 ++++++++++++++++++++++++ 1 file changed, 24 insertions(+) diff --git a/packages/core/src/index.ts b/packages/core/src/index.ts index 0fdb7496..469877a2 100644 --- a/packages/core/src/index.ts +++ b/packages/core/src/index.ts @@ -13,6 +13,30 @@ export { type EvalRunResult, type EvalSummary, } from './evaluation/evaluate.js'; + +// Eval() API — single-file TypeScript evaluations +export { + Eval, + getRegisteredEvals, + clearEvalRegistry, + type EvalDataItem, + type EvalOptions, + type AssertEntry, +} from './evaluation/eval-api.js'; +export { + Contains, + IContains, + ContainsAll, + ContainsAny, + ExactMatch, + StartsWith, + EndsWith, + Regex, + IsJson, + type AssertFn, + type AssertContext, + type AssertResult, +} from './evaluation/assertions.js'; export { defineConfig, loadTsConfig, From 648f2dfdb00f63673e71175bc555b3fc74d57180 Mon Sep 17 00:00:00 2001 From: Christopher Tso Date: Fri, 13 Mar 2026 09:17:39 +0000 Subject: [PATCH 06/10] feat(cli): support TypeScript eval files in agentv eval command Detect .ts/.js/.mts/.mjs files and route them through the Eval() API instead of the YAML pipeline. Imports the file, discovers registered evals, awaits their promises, and collects results for summary output. --- apps/cli/src/commands/eval/run-eval.ts | 73 ++++++++++++++++++++++++-- 1 file changed, 69 insertions(+), 4 deletions(-) diff --git a/apps/cli/src/commands/eval/run-eval.ts b/apps/cli/src/commands/eval/run-eval.ts index aee3a5ff..7aa21afe 100644 --- a/apps/cli/src/commands/eval/run-eval.ts +++ b/apps/cli/src/commands/eval/run-eval.ts @@ -800,10 +800,75 @@ export async function runEvalCommand(input: RunEvalCommandInput): Promise console.log(`Trace file: ${path.resolve(options.traceFile)}`); } + const allResults: EvaluationResult[] = []; + + // ── TypeScript eval file support ────────────────────────────────── + // Detect .ts/.js files and handle them via the Eval() API instead of the YAML pipeline + const tsEvalFiles = resolvedTestFiles.filter( + (f) => f.endsWith('.ts') || f.endsWith('.js') || f.endsWith('.mts') || f.endsWith('.mjs'), + ); + const yamlEvalFiles = resolvedTestFiles.filter( + (f) => !tsEvalFiles.includes(f), + ); + + if (tsEvalFiles.length > 0) { + const { getRegisteredEvals, clearEvalRegistry } = await import('@agentv/core'); + + for (const tsFile of tsEvalFiles) { + await ensureFileExists(tsFile, 'TypeScript eval file'); + clearEvalRegistry(); + + // Import the TS file — Eval() calls register during import + await import(pathToFileURL(tsFile).href); + + const registered = getRegisteredEvals(); + if (registered.size === 0) { + throw new Error(`No Eval() calls found in ${tsFile}`); + } + + // Wait for all registered eval promises and collect results + for (const [evalName, entry] of registered) { + try { + const evalResult = await entry.promise; + // Print summary per eval + const passCount = evalResult.summary.passed; + const totalCount = evalResult.summary.total; + const meanScore = evalResult.summary.meanScore.toFixed(2); + console.log( + ` ${evalName}: ${passCount}/${totalCount} passed (mean score: ${meanScore})`, + ); + + // Collect results for overall summary + allResults.push(...evalResult.results); + } catch (err) { + console.error( + ` ${evalName}: ERROR — ${err instanceof Error ? err.message : String(err)}`, + ); + } + } + + clearEvalRegistry(); + } + + // If there are no YAML files, write results and return + if (yamlEvalFiles.length === 0) { + for (const result of allResults) { + await outputWriter.append(result); + } + await outputWriter.close().catch(() => undefined); + const summary = calculateEvaluationSummary(allResults); + console.log(formatEvaluationSummary(summary)); + if (allResults.length > 0) { + console.log(`\nResults written to: ${outputPath}`); + } + return; + } + } + + // ── YAML eval file pipeline (existing) ───────────────────────────── // Determine cache state after loading file metadata (need YAML config) // We defer cache creation until after file metadata is loaded const evaluationRunner = await resolveEvaluationRunner(); - const allResults: EvaluationResult[] = []; const seenEvalCases = new Set(); const displayIdTracker = createDisplayIdTracker(); @@ -811,7 +876,7 @@ export async function runEvalCommand(input: RunEvalCommandInput): Promise const totalWorkers = options.workers ?? DEFAULT_WORKERS; const fileConcurrency = Math.min( Math.max(1, totalWorkers), - Math.max(1, resolvedTestFiles.length), + Math.max(1, yamlEvalFiles.length), ); const perFileWorkers = options.workers ? Math.max(1, Math.floor(totalWorkers / fileConcurrency)) @@ -833,7 +898,7 @@ export async function runEvalCommand(input: RunEvalCommandInput): Promise readonly failOnError?: FailOnError; } >(); - for (const testFilePath of resolvedTestFiles) { + for (const testFilePath of yamlEvalFiles) { const meta = await prepareFileMetadata({ testFilePath, repoRoot, @@ -936,7 +1001,7 @@ export async function runEvalCommand(input: RunEvalCommandInput): Promise } try { - await runWithLimit(resolvedTestFiles, fileConcurrency, async (testFilePath) => { + await runWithLimit(yamlEvalFiles, fileConcurrency, async (testFilePath) => { const targetPrep = fileMetadata.get(testFilePath); if (!targetPrep) { throw new Error(`Missing metadata for ${testFilePath}`); From 998b7beb5bfa30f343670abd84d77fc09eeffa02 Mon Sep 17 00:00:00 2001 From: Christopher Tso Date: Fri, 13 Mar 2026 09:24:25 +0000 Subject: [PATCH 07/10] feat(cli): accept TypeScript and JavaScript files in eval path resolution Extend resolveEvalPaths to recognize .ts, .js, .mts, and .mjs extensions alongside YAML and JSONL, enabling `agentv eval path/to/eval.ts`. --- apps/cli/src/commands/eval/shared.ts | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/apps/cli/src/commands/eval/shared.ts b/apps/cli/src/commands/eval/shared.ts index 586d2f6f..653ada24 100644 --- a/apps/cli/src/commands/eval/shared.ts +++ b/apps/cli/src/commands/eval/shared.ts @@ -3,6 +3,9 @@ import { access, stat } from 'node:fs/promises'; import path from 'node:path'; import fg from 'fast-glob'; +/** Supported eval file extensions: YAML, JSONL, and TypeScript/JavaScript */ +const EVAL_FILE_RE = /\.(ya?ml|jsonl|ts|js|mts|mjs)$/i; + export async function resolveEvalPaths(evalPaths: string[], cwd: string): Promise { const normalizedInputs = evalPaths.map((value) => value?.trim()).filter((value) => value); if (normalizedInputs.length === 0) { @@ -19,7 +22,7 @@ export async function resolveEvalPaths(evalPaths: string[], cwd: string): Promis : path.resolve(cwd, pattern); try { const stats = await stat(candidatePath); - if (stats.isFile() && /\.(ya?ml|jsonl)$/i.test(candidatePath)) { + if (stats.isFile() && EVAL_FILE_RE.test(candidatePath)) { results.add(candidatePath); continue; } @@ -37,13 +40,13 @@ export async function resolveEvalPaths(evalPaths: string[], cwd: string): Promis followSymbolicLinks: true, }); - const yamlMatches = matches.filter((filePath) => /\.(ya?ml|jsonl)$/i.test(filePath)); - if (yamlMatches.length === 0) { + const evalMatches = matches.filter((filePath) => EVAL_FILE_RE.test(filePath)); + if (evalMatches.length === 0) { unmatched.push(pattern); continue; } - for (const filePath of yamlMatches) { + for (const filePath of evalMatches) { results.add(path.normalize(filePath)); } } @@ -52,7 +55,7 @@ export async function resolveEvalPaths(evalPaths: string[], cwd: string): Promis throw new Error( `No eval files matched: ${unmatched.join( ', ', - )}. Provide YAML or JSONL paths or globs (e.g., "evals/**/*.yaml", "evals/**/*.jsonl").`, + )}. Provide YAML, JSONL, or TypeScript paths or globs (e.g., "evals/**/*.yaml", "evals/**/*.eval.ts").`, ); } From 1bdf58cb62eacff6a60766a594ae58beac043507 Mon Sep 17 00:00:00 2001 From: Christopher Tso Date: Fri, 13 Mar 2026 09:24:32 +0000 Subject: [PATCH 08/10] chore: fix lint errors in Eval() API implementation Address biome lint issues: import ordering, noExplicitAny suppressions, non-null assertions replaced with optional chaining, proper JsonObject typing, and formatting fixes. --- apps/cli/src/commands/eval/run-eval.ts | 9 ++----- packages/core/src/evaluation/eval-api.ts | 26 +++++++++++-------- .../evaluation/evaluators/inline-assert.ts | 5 ++-- .../evaluation/registry/builtin-evaluators.ts | 3 ++- .../core/test/evaluation/eval-api.test.ts | 17 ++++-------- .../evaluators/inline-assert.test.ts | 5 +++- .../providers/function-provider.test.ts | 6 ++--- 7 files changed, 34 insertions(+), 37 deletions(-) diff --git a/apps/cli/src/commands/eval/run-eval.ts b/apps/cli/src/commands/eval/run-eval.ts index 7aa21afe..247cf806 100644 --- a/apps/cli/src/commands/eval/run-eval.ts +++ b/apps/cli/src/commands/eval/run-eval.ts @@ -807,9 +807,7 @@ export async function runEvalCommand(input: RunEvalCommandInput): Promise const tsEvalFiles = resolvedTestFiles.filter( (f) => f.endsWith('.ts') || f.endsWith('.js') || f.endsWith('.mts') || f.endsWith('.mjs'), ); - const yamlEvalFiles = resolvedTestFiles.filter( - (f) => !tsEvalFiles.includes(f), - ); + const yamlEvalFiles = resolvedTestFiles.filter((f) => !tsEvalFiles.includes(f)); if (tsEvalFiles.length > 0) { const { getRegisteredEvals, clearEvalRegistry } = await import('@agentv/core'); @@ -874,10 +872,7 @@ export async function runEvalCommand(input: RunEvalCommandInput): Promise // Derive file-level concurrency from worker count (global) when provided const totalWorkers = options.workers ?? DEFAULT_WORKERS; - const fileConcurrency = Math.min( - Math.max(1, totalWorkers), - Math.max(1, yamlEvalFiles.length), - ); + const fileConcurrency = Math.min(Math.max(1, totalWorkers), Math.max(1, yamlEvalFiles.length)); const perFileWorkers = options.workers ? Math.max(1, Math.floor(totalWorkers / fileConcurrency)) : undefined; diff --git a/packages/core/src/evaluation/eval-api.ts b/packages/core/src/evaluation/eval-api.ts index bbba49a4..2dc4560b 100644 --- a/packages/core/src/evaluation/eval-api.ts +++ b/packages/core/src/evaluation/eval-api.ts @@ -20,10 +20,11 @@ */ import path from 'node:path'; -import type { TargetDefinition } from './providers/types.js'; -import type { EvalRunResult, EvalSummary, EvalAssertionInput } from './evaluate.js'; -import type { AssertFn, AssertContext, AssertResult } from './assertions.js'; -import type { EvalTest, EvaluatorConfig, EvaluationResult } from './types.js'; +import type { AssertContext, AssertFn, AssertResult } from './assertions.js'; +import type { EvalAssertionInput, EvalRunResult, EvalSummary } from './evaluate.js'; +import type { ResolvedTarget } from './providers/targets.js'; +import type { Provider, TargetDefinition } from './providers/types.js'; +import type { EvalTest, EvaluationResult, EvaluatorConfig } from './types.js'; // ─── Types ─────────────────────────────────────────────────────────── @@ -149,7 +150,7 @@ async function runEval(name: string, options: EvalOptions): Promise any) | undefined; + let providerFactory: ((target: ResolvedTarget) => Provider) | undefined; if (options.task) { const taskProvider = createFunctionProvider(options.task); providerFactory = () => taskProvider; @@ -168,15 +169,18 @@ async function runEval(name: string, options: EvalOptions): Promise m.role === 'user')?.content as string) ?? ''); const expectedOutput = item.expectedOutput - ? ([{ role: 'assistant' as const, content: item.expectedOutput }] as EvalTest['expected_output']) + ? ([ + { role: 'assistant' as const, content: item.expectedOutput }, + ] as EvalTest['expected_output']) : []; // Build input_segments so buildPromptInputs can extract the question - const inputSegments = typeof item.input === 'string' - ? [{ type: 'text' as const, value: item.input }] - : (item.input as readonly { role: string; content: string }[]) - .filter((m) => m.role === 'user' && typeof m.content === 'string') - .map((m) => ({ type: 'text' as const, value: m.content })); + const inputSegments = + typeof item.input === 'string' + ? [{ type: 'text' as const, value: item.input }] + : (item.input as readonly { role: string; content: string }[]) + .filter((m) => m.role === 'user' && typeof m.content === 'string') + .map((m) => ({ type: 'text' as const, value: m.content })); return { id: item.id ? `${name}/${item.id}` : `${name}/${i}`, diff --git a/packages/core/src/evaluation/evaluators/inline-assert.ts b/packages/core/src/evaluation/evaluators/inline-assert.ts index 1a5b5352..8963e74a 100644 --- a/packages/core/src/evaluation/evaluators/inline-assert.ts +++ b/packages/core/src/evaluation/evaluators/inline-assert.ts @@ -1,6 +1,7 @@ import type { AssertFn } from '../assertions.js'; -import type { EvaluationContext, EvaluationScore, Evaluator } from './types.js'; +import type { JsonObject } from '../types.js'; import { scoreToVerdict } from './scoring.js'; +import type { EvaluationContext, EvaluationScore, Evaluator } from './types.js'; /** * Evaluator that wraps an inline AssertFn and runs it in-process. @@ -32,7 +33,7 @@ export class InlineAssertEvaluator implements Evaluator { misses: score < 0.5 ? [result.name] : [], expectedAspectCount: 1, reasoning: undefined, - details: result.metadata ? (result.metadata as any) : undefined, + details: result.metadata ? (result.metadata as JsonObject) : undefined, }; } } diff --git a/packages/core/src/evaluation/registry/builtin-evaluators.ts b/packages/core/src/evaluation/registry/builtin-evaluators.ts index 517af233..de5586f0 100644 --- a/packages/core/src/evaluation/registry/builtin-evaluators.ts +++ b/packages/core/src/evaluation/registry/builtin-evaluators.ts @@ -7,6 +7,7 @@ */ import { readFileSync } from 'node:fs'; +import { INLINE_ASSERT_FN, getInlineAssertFns } from '../eval-api.js'; import { AgentJudgeEvaluator, CodeEvaluator, @@ -31,7 +32,6 @@ import { runStartsWithAssertion, } from '../evaluators.js'; import { InlineAssertEvaluator } from '../evaluators/inline-assert.js'; -import { INLINE_ASSERT_FN, getInlineAssertFns } from '../eval-api.js'; import { resolveCustomPrompt } from '../evaluators/prompt-resolution.js'; import type { Provider } from '../providers/types.js'; import type { ToolTrajectoryEvaluatorConfig } from '../trace.js'; @@ -426,6 +426,7 @@ export function createBuiltinRegistry(): EvaluatorRegistry { .register('equals', equalsFactory) .register('inline-assert', (config, _context) => { // Prefer the function attached directly to the config via symbol (concurrent-safe) + // biome-ignore lint/suspicious/noExplicitAny: symbol key access requires dynamic cast const symbolFn = (config as any)[INLINE_ASSERT_FN] as | import('../assertions.js').AssertFn | undefined; diff --git a/packages/core/test/evaluation/eval-api.test.ts b/packages/core/test/evaluation/eval-api.test.ts index 3c89881a..fc15ddde 100644 --- a/packages/core/test/evaluation/eval-api.test.ts +++ b/packages/core/test/evaluation/eval-api.test.ts @@ -1,11 +1,11 @@ import { afterEach, describe, expect, it } from 'vitest'; +import { Contains } from '../../src/evaluation/assertions.js'; import { Eval, + type EvalDataItem, clearEvalRegistry, getRegisteredEvals, - type EvalDataItem, } from '../../src/evaluation/eval-api.js'; -import { Contains } from '../../src/evaluation/assertions.js'; afterEach(() => { clearEvalRegistry(); @@ -64,9 +64,7 @@ describe('Eval() registration', () => { describe('Eval() execution with mock target', () => { it('returns results when awaited', async () => { const result = await Eval('exec-test', { - data: [ - { id: 'case-1', input: 'hello', expectedOutput: 'world' }, - ], + data: [{ id: 'case-1', input: 'hello', expectedOutput: 'world' }], target: { name: 'default', provider: 'mock', response: 'world' }, assert: [Contains('world')], }); @@ -78,10 +76,7 @@ describe('Eval() execution with mock target', () => { it('auto-generates test IDs from eval name and index', async () => { const result = await Eval('auto-id', { - data: [ - { input: 'first' }, - { input: 'second' }, - ], + data: [{ input: 'first' }, { input: 'second' }], target: { name: 'default', provider: 'mock', response: 'first second' }, assert: [Contains('first')], }); @@ -92,9 +87,7 @@ describe('Eval() execution with mock target', () => { it('uses explicit data item IDs in test IDs', async () => { const result = await Eval('explicit-id', { - data: [ - { id: 'my-case', input: 'hello' }, - ], + data: [{ id: 'my-case', input: 'hello' }], target: { name: 'default', provider: 'mock', response: 'hello' }, assert: [Contains('hello')], }); diff --git a/packages/core/test/evaluation/evaluators/inline-assert.test.ts b/packages/core/test/evaluation/evaluators/inline-assert.test.ts index 757b6c07..504e1b00 100644 --- a/packages/core/test/evaluation/evaluators/inline-assert.test.ts +++ b/packages/core/test/evaluation/evaluators/inline-assert.test.ts @@ -1,6 +1,6 @@ import { describe, expect, it } from 'vitest'; -import { InlineAssertEvaluator } from '../../../src/evaluation/evaluators/inline-assert.js'; import type { AssertFn } from '../../../src/evaluation/assertions.js'; +import { InlineAssertEvaluator } from '../../../src/evaluation/evaluators/inline-assert.js'; describe('InlineAssertEvaluator', () => { it('runs an inline assert function and returns EvaluationScore', async () => { @@ -24,6 +24,7 @@ describe('InlineAssertEvaluator', () => { reference_answer: 'hello world', }, candidate: 'hello world', + // biome-ignore lint/suspicious/noExplicitAny: partial context for unit testing } as any); expect(score.score).toBe(1.0); @@ -50,6 +51,7 @@ describe('InlineAssertEvaluator', () => { file_paths: [], }, candidate: 'some output', + // biome-ignore lint/suspicious/noExplicitAny: partial context for unit testing } as any); expect(score.score).toBe(1.0); @@ -75,6 +77,7 @@ describe('InlineAssertEvaluator', () => { file_paths: [], }, candidate: 'output', + // biome-ignore lint/suspicious/noExplicitAny: partial context for unit testing } as any); expect(score.score).toBe(0.0); diff --git a/packages/core/test/evaluation/providers/function-provider.test.ts b/packages/core/test/evaluation/providers/function-provider.test.ts index 71a5cae7..f4116e38 100644 --- a/packages/core/test/evaluation/providers/function-provider.test.ts +++ b/packages/core/test/evaluation/providers/function-provider.test.ts @@ -10,8 +10,8 @@ describe('createFunctionProvider', () => { const response = await provider.invoke({ question: 'hello' }); expect(response.output).toHaveLength(1); - expect(response.output![0].role).toBe('assistant'); - expect(response.output![0].content).toBe('Echo: hello'); + expect(response.output?.[0].role).toBe('assistant'); + expect(response.output?.[0].content).toBe('Echo: hello'); }); it('wraps an async task function', async () => { @@ -20,7 +20,7 @@ describe('createFunctionProvider', () => { }); const response = await provider.invoke({ question: 'world' }); - expect(response.output![0].content).toBe('Async: world'); + expect(response.output?.[0].content).toBe('Async: world'); }); it('measures duration', async () => { From 14e6c107019aa78c95b3bcf3c34dfaabfab4bb43 Mon Sep 17 00:00:00 2001 From: Christopher Tso Date: Fri, 13 Mar 2026 09:24:38 +0000 Subject: [PATCH 09/10] feat: add TypeScript Eval() API example Demonstrates single-file eval with mock target, built-in Contains() assertion, and inline assertion function. --- .../features/sdk-eval-api/evals/basic.eval.ts | 26 +++++++++++++++++++ 1 file changed, 26 insertions(+) create mode 100644 examples/features/sdk-eval-api/evals/basic.eval.ts diff --git a/examples/features/sdk-eval-api/evals/basic.eval.ts b/examples/features/sdk-eval-api/evals/basic.eval.ts new file mode 100644 index 00000000..fd5082a1 --- /dev/null +++ b/examples/features/sdk-eval-api/evals/basic.eval.ts @@ -0,0 +1,26 @@ +import { Contains, Eval } from '@agentv/core'; + +Eval('sdk-example/basic', { + data: [ + { + id: 'greeting', + input: 'Say hello', + expectedOutput: 'hello', + criteria: 'Response should contain a greeting', + }, + { + id: 'math', + input: 'What is 2+2?', + expectedOutput: '4', + criteria: 'Response should contain the correct answer', + }, + ], + target: { name: 'default', provider: 'mock', response: 'hello, the answer is 4' }, + assert: [ + Contains('hello'), + ({ output, expectedOutput }) => ({ + name: 'has-expected', + score: output.includes(expectedOutput ?? '') ? 1.0 : 0.0, + }), + ], +}); From aba42b3f6422160acb2aad786150a8753cfa1317 Mon Sep 17 00:00:00 2001 From: Christopher Tso Date: Fri, 13 Mar 2026 09:52:31 +0000 Subject: [PATCH 10/10] refactor(core): address Eval() API code review findings MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Remove dead code: legacy setInlineAssertFns/getInlineAssertFns and the fallback path in builtin-evaluators registry - Remove evaluate() from public exports, migrate example to Eval() - Add InlineAssertEvaluatorConfig to type system, remove unsafe casts - Replace fake __eval_api__.yaml path with virtual marker - Deduplicate computeSummary() — export from evaluate.ts, import in eval-api.ts - Fix mixed TS+YAML output: write TS results to outputWriter immediately - Add ExactMatch usage to sdk-eval-api example - Add borderline score test case (0.5 <= score < 0.8) --- apps/cli/src/commands/eval/run-eval.ts | 10 +-- .../features/sdk-eval-api/evals/basic.eval.ts | 14 ++++- .../features/sdk-programmatic-api/evaluate.ts | 27 +++----- packages/core/src/evaluation/eval-api.ts | 61 ++++--------------- packages/core/src/evaluation/evaluate.ts | 6 +- .../evaluation/registry/builtin-evaluators.ts | 18 ++---- packages/core/src/evaluation/types.ts | 18 +++++- packages/core/src/index.ts | 13 ++-- .../core/test/evaluation/eval-api.test.ts | 19 ++++++ 9 files changed, 92 insertions(+), 94 deletions(-) diff --git a/apps/cli/src/commands/eval/run-eval.ts b/apps/cli/src/commands/eval/run-eval.ts index 247cf806..3017bf81 100644 --- a/apps/cli/src/commands/eval/run-eval.ts +++ b/apps/cli/src/commands/eval/run-eval.ts @@ -836,7 +836,10 @@ export async function runEvalCommand(input: RunEvalCommandInput): Promise ` ${evalName}: ${passCount}/${totalCount} passed (mean score: ${meanScore})`, ); - // Collect results for overall summary + // Write results to output and collect for overall summary + for (const result of evalResult.results) { + await outputWriter.append(result); + } allResults.push(...evalResult.results); } catch (err) { console.error( @@ -848,11 +851,8 @@ export async function runEvalCommand(input: RunEvalCommandInput): Promise clearEvalRegistry(); } - // If there are no YAML files, write results and return + // If there are no YAML files, finalize output and return if (yamlEvalFiles.length === 0) { - for (const result of allResults) { - await outputWriter.append(result); - } await outputWriter.close().catch(() => undefined); const summary = calculateEvaluationSummary(allResults); console.log(formatEvaluationSummary(summary)); diff --git a/examples/features/sdk-eval-api/evals/basic.eval.ts b/examples/features/sdk-eval-api/evals/basic.eval.ts index fd5082a1..56ffff8e 100644 --- a/examples/features/sdk-eval-api/evals/basic.eval.ts +++ b/examples/features/sdk-eval-api/evals/basic.eval.ts @@ -1,4 +1,4 @@ -import { Contains, Eval } from '@agentv/core'; +import { Contains, Eval, ExactMatch } from '@agentv/core'; Eval('sdk-example/basic', { data: [ @@ -24,3 +24,15 @@ Eval('sdk-example/basic', { }), ], }); + +Eval('sdk-example/exact', { + data: [ + { + id: 'exact-check', + input: 'Echo back: hello world', + expectedOutput: 'hello world', + }, + ], + target: { name: 'default', provider: 'mock', response: 'hello world' }, + assert: [ExactMatch], +}); diff --git a/examples/features/sdk-programmatic-api/evaluate.ts b/examples/features/sdk-programmatic-api/evaluate.ts index 43e63088..0995b9a8 100644 --- a/examples/features/sdk-programmatic-api/evaluate.ts +++ b/examples/features/sdk-programmatic-api/evaluate.ts @@ -1,38 +1,29 @@ /** * Programmatic API Example * - * Uses evaluate() from @agentv/core to run evaluations as a library. + * Uses Eval() from @agentv/core to run evaluations as a library. * No YAML needed — tests defined inline with full type safety. * * Run: bun run evaluate.ts * (Uses 'default' target from .agentv/targets.yaml and .env credentials) */ -import { evaluate } from '@agentv/core'; +import { Contains, Eval } from '@agentv/core'; -const { results, summary } = await evaluate({ - tests: [ +const { results, summary } = await Eval('programmatic-api-example', { + data: [ { id: 'greeting', input: 'Say hello and introduce yourself briefly.', - expected_output: "Hello! I'm an AI assistant here to help you.", - assert: [{ type: 'contains', value: 'Hello' }], + expectedOutput: "Hello! I'm an AI assistant here to help you.", }, { id: 'json-output', - input: [ - { role: 'system', content: 'Respond only with valid JSON. No markdown.' }, - { role: 'user', content: 'Return a JSON object with a "status" field set to "ok".' }, - ], - expected_output: '{"status": "ok"}', - assert: [ - { type: 'is-json', required: true }, - { type: 'contains', value: 'ok' }, - ], + input: 'Return a JSON object with a "status" field set to "ok".', + expectedOutput: '{"status": "ok"}', }, ], - onResult: (result) => { - console.log(` ${result.testId}: score=${result.score.toFixed(2)}`); - }, + target: { provider: 'mock', response: 'Hello! I am an AI assistant. {"status": "ok"}' }, + assert: [Contains('Hello'), { type: 'contains', value: 'ok' }], }); console.log('\n--- Summary ---'); diff --git a/packages/core/src/evaluation/eval-api.ts b/packages/core/src/evaluation/eval-api.ts index 2dc4560b..2e7dafbf 100644 --- a/packages/core/src/evaluation/eval-api.ts +++ b/packages/core/src/evaluation/eval-api.ts @@ -19,12 +19,17 @@ * @module */ -import path from 'node:path'; import type { AssertContext, AssertFn, AssertResult } from './assertions.js'; +import { computeSummary } from './evaluate.js'; import type { EvalAssertionInput, EvalRunResult, EvalSummary } from './evaluate.js'; import type { ResolvedTarget } from './providers/targets.js'; import type { Provider, TargetDefinition } from './providers/types.js'; -import type { EvalTest, EvaluationResult, EvaluatorConfig } from './types.js'; +import type { + EvalTest, + EvaluationResult, + EvaluatorConfig, + InlineAssertEvaluatorConfig, +} from './types.js'; // ─── Types ─────────────────────────────────────────────────────────── @@ -127,11 +132,11 @@ async function runEval(name: string, options: EvalOptions): Promise= 0.8) { - passed++; - } else if (r.score < 0.5) { - failed++; - } else { - borderline++; - } - } - - return { - total, - passed, - failed, - borderline, - durationMs, - meanScore: total > 0 ? scoreSum / total : 0, - }; -} - -// ─── Legacy inline assert function storage (for backward compat) ───── - -let _inlineAssertFns: AssertFn[] = []; - -export function setInlineAssertFns(fns: AssertFn[]): void { - _inlineAssertFns = fns; -} - -export function getInlineAssertFns(): AssertFn[] { - return _inlineAssertFns; -} - // Re-export types export type { AssertContext, AssertResult, AssertFn } from './assertions.js'; export type { EvalRunResult, EvalSummary } from './evaluate.js'; diff --git a/packages/core/src/evaluation/evaluate.ts b/packages/core/src/evaluation/evaluate.ts index f6043ba8..ad581611 100644 --- a/packages/core/src/evaluation/evaluate.ts +++ b/packages/core/src/evaluation/evaluate.ts @@ -305,8 +305,12 @@ function mapAssertionType(type: string): string { /** * Compute summary statistics from evaluation results. + * Shared by both evaluate() and Eval() APIs. */ -function computeSummary(results: readonly EvaluationResult[], durationMs: number): EvalSummary { +export function computeSummary( + results: readonly EvaluationResult[], + durationMs: number, +): EvalSummary { const total = results.length; let passed = 0; let failed = 0; diff --git a/packages/core/src/evaluation/registry/builtin-evaluators.ts b/packages/core/src/evaluation/registry/builtin-evaluators.ts index de5586f0..20ec6f02 100644 --- a/packages/core/src/evaluation/registry/builtin-evaluators.ts +++ b/packages/core/src/evaluation/registry/builtin-evaluators.ts @@ -7,7 +7,7 @@ */ import { readFileSync } from 'node:fs'; -import { INLINE_ASSERT_FN, getInlineAssertFns } from '../eval-api.js'; +import { INLINE_ASSERT_FN } from '../eval-api.js'; import { AgentJudgeEvaluator, CodeEvaluator, @@ -425,22 +425,16 @@ export function createBuiltinRegistry(): EvaluatorRegistry { .register('is-json', isJsonFactory) .register('equals', equalsFactory) .register('inline-assert', (config, _context) => { - // Prefer the function attached directly to the config via symbol (concurrent-safe) // biome-ignore lint/suspicious/noExplicitAny: symbol key access requires dynamic cast - const symbolFn = (config as any)[INLINE_ASSERT_FN] as + const fn = (config as any)[INLINE_ASSERT_FN] as | import('../assertions.js').AssertFn | undefined; - if (symbolFn) { - return new InlineAssertEvaluator(symbolFn, config.name ?? 'inline-assert'); - } - // Fallback: legacy global storage - const fns = getInlineAssertFns(); - const index = Number.parseInt(config.name?.replace('inline-assert-', '') ?? '0', 10); - const fn = fns[index]; if (!fn) { - throw new Error(`No inline assert function found at index ${index}`); + throw new Error( + `No inline assert function found on config for "${config.name}". Inline assert functions must be attached via INLINE_ASSERT_FN symbol.`, + ); } - return new InlineAssertEvaluator(fn, config.name ?? `inline-assert-${index}`); + return new InlineAssertEvaluator(fn, config.name ?? 'inline-assert'); }); return registry; diff --git a/packages/core/src/evaluation/types.ts b/packages/core/src/evaluation/types.ts index 4ad65239..3fa7a4c6 100644 --- a/packages/core/src/evaluation/types.ts +++ b/packages/core/src/evaluation/types.ts @@ -171,6 +171,7 @@ const EVALUATOR_KIND_VALUES = [ 'is-json', 'equals', 'rubrics', + 'inline-assert', ] as const; export type EvaluatorKind = (typeof EVALUATOR_KIND_VALUES)[number]; @@ -723,6 +724,20 @@ export type RubricsEvaluatorConfig = { readonly negate?: boolean; }; +/** + * Configuration for the inline-assert evaluator. + * Used by the Eval() API to wrap inline assertion functions. + * The actual function is attached via the INLINE_ASSERT_FN symbol at runtime. + */ +export type InlineAssertEvaluatorConfig = { + readonly name: string; + readonly type: 'inline-assert'; + readonly weight?: number; + readonly required?: boolean | number; + /** When true, inverts the evaluator score (1 - score) and swaps pass/fail verdict */ + readonly negate?: boolean; +}; + export type EvaluatorConfig = | CodeEvaluatorConfig | LlmJudgeEvaluatorConfig @@ -745,7 +760,8 @@ export type EvaluatorConfig = | RegexEvaluatorConfig | IsJsonEvaluatorConfig | EqualsEvaluatorConfig - | RubricsEvaluatorConfig; + | RubricsEvaluatorConfig + | InlineAssertEvaluatorConfig; /** * Eval test definition sourced from AgentV specs. diff --git a/packages/core/src/index.ts b/packages/core/src/index.ts index 469877a2..9aff4890 100644 --- a/packages/core/src/index.ts +++ b/packages/core/src/index.ts @@ -5,13 +5,12 @@ export * from './evaluation/file-utils.js'; export * from './evaluation/providers/index.js'; export * from './evaluation/evaluators.js'; export * from './evaluation/orchestrator.js'; -export { - evaluate, - type EvalConfig, - type EvalTestInput, - type EvalAssertionInput, - type EvalRunResult, - type EvalSummary, +export type { + EvalConfig, + EvalTestInput, + EvalAssertionInput, + EvalRunResult, + EvalSummary, } from './evaluation/evaluate.js'; // Eval() API — single-file TypeScript evaluations diff --git a/packages/core/test/evaluation/eval-api.test.ts b/packages/core/test/evaluation/eval-api.test.ts index fc15ddde..7108a9ab 100644 --- a/packages/core/test/evaluation/eval-api.test.ts +++ b/packages/core/test/evaluation/eval-api.test.ts @@ -143,4 +143,23 @@ describe('Eval() execution with mock target', () => { expect(result.summary.passed).toBe(1); }); + + it('classifies borderline scores (0.5 <= score < 0.8) correctly', async () => { + const result = await Eval('borderline', { + data: [{ input: 'hello world' }], + target: { name: 'default', provider: 'mock', response: 'hello world' }, + assert: [ + // One passes (score 1.0), one fails (score 0.0) — average score is 0.5, which is borderline + Contains('hello'), + Contains('missing-text'), + ], + }); + + // With two assertions where one passes and one fails, the average score should be 0.5 + // Score 0.5 is borderline (>= 0.5 and < 0.8) + expect(result.summary.total).toBe(1); + expect(result.summary.borderline).toBe(1); + expect(result.summary.passed).toBe(0); + expect(result.summary.failed).toBe(0); + }); });