diff --git a/apps/cli/src/commands/eval/run-eval.ts b/apps/cli/src/commands/eval/run-eval.ts
index aee3a5ff..3017bf81 100644
--- a/apps/cli/src/commands/eval/run-eval.ts
+++ b/apps/cli/src/commands/eval/run-eval.ts
@@ -800,19 +800,79 @@ export async function runEvalCommand(input: RunEvalCommandInput): Promise
     console.log(`Trace file: ${path.resolve(options.traceFile)}`);
   }
 
+  const allResults: EvaluationResult[] = [];
+
+  // ── TypeScript eval file support ──────────────────────────────────
+  // Detect .ts/.js files and handle them via the Eval() API instead of the YAML pipeline
+  const tsEvalFiles = resolvedTestFiles.filter(
+    (f) => f.endsWith('.ts') || f.endsWith('.js') || f.endsWith('.mts') || f.endsWith('.mjs'),
+  );
+  const yamlEvalFiles = resolvedTestFiles.filter((f) => !tsEvalFiles.includes(f));
+
+  if (tsEvalFiles.length > 0) {
+    const { getRegisteredEvals, clearEvalRegistry } = await import('@agentv/core');
+
+    for (const tsFile of tsEvalFiles) {
+      await ensureFileExists(tsFile, 'TypeScript eval file');
+      clearEvalRegistry();
+
+      // Import the TS file — Eval() calls register during import
+      await import(pathToFileURL(tsFile).href);
+
+      const registered = getRegisteredEvals();
+      if (registered.size === 0) {
+        throw new Error(`No Eval() calls found in ${tsFile}`);
+      }
+
+      // Wait for all registered eval promises and collect results
+      for (const [evalName, entry] of registered) {
+        try {
+          const evalResult = await entry.promise;
+          // Print summary per eval
+          const passCount = evalResult.summary.passed;
+          const totalCount = evalResult.summary.total;
+          const meanScore = evalResult.summary.meanScore.toFixed(2);
+          console.log(
+            `  ${evalName}: ${passCount}/${totalCount} passed (mean score: ${meanScore})`,
+          );
+
+          // Write results to output and collect for overall summary
+          for (const result of evalResult.results) {
+            await outputWriter.append(result);
+          }
+          allResults.push(...evalResult.results);
+        } catch (err) {
+          console.error(
+            `  ${evalName}: ERROR — ${err instanceof Error ? err.message : String(err)}`,
+          );
+        }
+      }
+
+      clearEvalRegistry();
+    }
+
+    // If there are no YAML files, finalize output and return
+    if (yamlEvalFiles.length === 0) {
+      await outputWriter.close().catch(() => undefined);
+      const summary = calculateEvaluationSummary(allResults);
+      console.log(formatEvaluationSummary(summary));
+      if (allResults.length > 0) {
+        console.log(`\nResults written to: ${outputPath}`);
+      }
+      return;
+    }
+  }
+
+  // ── YAML eval file pipeline (existing) ─────────────────────────────
   // Determine cache state after loading file metadata (need YAML config)
   // We defer cache creation until after file metadata is loaded
   const evaluationRunner = await resolveEvaluationRunner();
-  const allResults: EvaluationResult[] = [];
   const seenEvalCases = new Set();
   const displayIdTracker = createDisplayIdTracker();
 
   // Derive file-level concurrency from worker count (global) when provided
   const totalWorkers = options.workers ?? DEFAULT_WORKERS;
-  const fileConcurrency = Math.min(
-    Math.max(1, totalWorkers),
-    Math.max(1, resolvedTestFiles.length),
-  );
+  const fileConcurrency = Math.min(Math.max(1, totalWorkers), Math.max(1, yamlEvalFiles.length));
   const perFileWorkers = options.workers
     ? Math.max(1, Math.floor(totalWorkers / fileConcurrency))
     : undefined;
@@ -833,7 +893,7 @@ export async function runEvalCommand(input: RunEvalCommandInput): Promise
       readonly failOnError?: FailOnError;
     }
   >();
-  for (const testFilePath of resolvedTestFiles) {
+  for (const testFilePath of yamlEvalFiles) {
     const meta = await prepareFileMetadata({
       testFilePath,
       repoRoot,
@@ -936,7 +996,7 @@ export async function runEvalCommand(input: RunEvalCommandInput): Promise
   }
 
   try {
-    await runWithLimit(resolvedTestFiles, fileConcurrency, async (testFilePath) => {
+    await runWithLimit(yamlEvalFiles, fileConcurrency, async (testFilePath) => {
      const targetPrep = fileMetadata.get(testFilePath);
      if (!targetPrep) {
        throw new Error(`Missing metadata for ${testFilePath}`);
diff --git a/apps/cli/src/commands/eval/shared.ts b/apps/cli/src/commands/eval/shared.ts
index 586d2f6f..653ada24 100644
--- a/apps/cli/src/commands/eval/shared.ts
+++ b/apps/cli/src/commands/eval/shared.ts
@@ -3,6 +3,9 @@ import { access, stat } from 'node:fs/promises';
 import path from 'node:path';
 import fg from 'fast-glob';
 
+/** Supported eval file extensions: YAML, JSONL, and TypeScript/JavaScript */
+const EVAL_FILE_RE = /\.(ya?ml|jsonl|ts|js|mts|mjs)$/i;
+
 export async function resolveEvalPaths(evalPaths: string[], cwd: string): Promise<string[]> {
   const normalizedInputs = evalPaths.map((value) => value?.trim()).filter((value) => value);
   if (normalizedInputs.length === 0) {
@@ -19,7 +22,7 @@ export async function resolveEvalPaths(evalPaths: string[], cwd: string): Promise
       : path.resolve(cwd, pattern);
     try {
       const stats = await stat(candidatePath);
-      if (stats.isFile() && /\.(ya?ml|jsonl)$/i.test(candidatePath)) {
+      if (stats.isFile() && EVAL_FILE_RE.test(candidatePath)) {
         results.add(candidatePath);
         continue;
       }
@@ -37,13 +40,13 @@ export async function resolveEvalPaths(evalPaths: string[], cwd: string): Promise
       followSymbolicLinks: true,
     });
 
-    const yamlMatches = matches.filter((filePath) => /\.(ya?ml|jsonl)$/i.test(filePath));
-    if (yamlMatches.length === 0) {
+    const evalMatches = matches.filter((filePath) => EVAL_FILE_RE.test(filePath));
+    if (evalMatches.length === 0) {
       unmatched.push(pattern);
       continue;
     }
 
-    for (const filePath of yamlMatches) {
+    for (const filePath of evalMatches) {
       results.add(path.normalize(filePath));
     }
   }
@@ -52,7 +55,7 @@ export async function resolveEvalPaths(evalPaths: string[], cwd: string): Promise
     throw new Error(
       `No eval files matched: ${unmatched.join(
         ', ',
-      )}. Provide YAML or JSONL paths or globs (e.g., "evals/**/*.yaml", "evals/**/*.jsonl").`,
+      )}. Provide YAML, JSONL, or TypeScript paths or globs (e.g., "evals/**/*.yaml", "evals/**/*.eval.ts").`,
     );
   }
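Note: with `EVAL_FILE_RE` widened, `resolveEvalPaths` now resolves TypeScript entry points alongside YAML/JSONL in a single call. A minimal sketch of the resulting behavior; the paths and the relative import site are hypothetical:

```typescript
import { resolveEvalPaths } from './shared.js'; // assumed import site within apps/cli

// Mixed inputs: YAML suites and Eval() files can be resolved together.
// resolveEvalPaths throws if none of the patterns match an eval file.
const files = await resolveEvalPaths(['evals/**/*.yaml', 'evals/**/*.eval.ts'], process.cwd());
// e.g. ['/repo/evals/smoke.yaml', '/repo/evals/basic.eval.ts'] (illustrative)
```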
diff --git a/examples/features/sdk-eval-api/evals/basic.eval.ts b/examples/features/sdk-eval-api/evals/basic.eval.ts
new file mode 100644
index 00000000..56ffff8e
--- /dev/null
+++ b/examples/features/sdk-eval-api/evals/basic.eval.ts
@@ -0,0 +1,38 @@
+import { Contains, Eval, ExactMatch } from '@agentv/core';
+
+Eval('sdk-example/basic', {
+  data: [
+    {
+      id: 'greeting',
+      input: 'Say hello',
+      expectedOutput: 'hello',
+      criteria: 'Response should contain a greeting',
+    },
+    {
+      id: 'math',
+      input: 'What is 2+2?',
+      expectedOutput: '4',
+      criteria: 'Response should contain the correct answer',
+    },
+  ],
+  target: { name: 'default', provider: 'mock', response: 'hello, the answer is 4' },
+  assert: [
+    Contains('hello'),
+    ({ output, expectedOutput }) => ({
+      name: 'has-expected',
+      score: output.includes(expectedOutput ?? '') ? 1.0 : 0.0,
+    }),
+  ],
+});
+
+Eval('sdk-example/exact', {
+  data: [
+    {
+      id: 'exact-check',
+      input: 'Echo back: hello world',
+      expectedOutput: 'hello world',
+    },
+  ],
+  target: { name: 'default', provider: 'mock', response: 'hello world' },
+  assert: [ExactMatch],
+});
diff --git a/examples/features/sdk-programmatic-api/evaluate.ts b/examples/features/sdk-programmatic-api/evaluate.ts
index 43e63088..0995b9a8 100644
--- a/examples/features/sdk-programmatic-api/evaluate.ts
+++ b/examples/features/sdk-programmatic-api/evaluate.ts
@@ -1,38 +1,29 @@
 /**
  * Programmatic API Example
  *
- * Uses evaluate() from @agentv/core to run evaluations as a library.
+ * Uses Eval() from @agentv/core to run evaluations as a library.
  * No YAML needed — tests defined inline with full type safety.
  *
  * Run: bun run evaluate.ts
- * (Uses 'default' target from .agentv/targets.yaml and .env credentials)
+ * (Uses an inline mock target, so no credentials are required)
  */
 
-import { evaluate } from '@agentv/core';
+import { Contains, Eval } from '@agentv/core';
 
-const { results, summary } = await evaluate({
-  tests: [
+const { results, summary } = await Eval('programmatic-api-example', {
+  data: [
     {
       id: 'greeting',
       input: 'Say hello and introduce yourself briefly.',
-      expected_output: "Hello! I'm an AI assistant here to help you.",
-      assert: [{ type: 'contains', value: 'Hello' }],
+      expectedOutput: "Hello! I'm an AI assistant here to help you.",
     },
     {
       id: 'json-output',
-      input: [
-        { role: 'system', content: 'Respond only with valid JSON. No markdown.' },
-        { role: 'user', content: 'Return a JSON object with a "status" field set to "ok".' },
-      ],
-      expected_output: '{"status": "ok"}',
-      assert: [
-        { type: 'is-json', required: true },
-        { type: 'contains', value: 'ok' },
-      ],
+      input: 'Return a JSON object with a "status" field set to "ok".',
+      expectedOutput: '{"status": "ok"}',
     },
   ],
-  onResult: (result) => {
-    console.log(`  ${result.testId}: score=${result.score.toFixed(2)}`);
-  },
+  target: { provider: 'mock', response: 'Hello! I am an AI assistant. {"status": "ok"}' },
+  assert: [Contains('Hello'), { type: 'contains', value: 'ok' }],
 });
 
 console.log('\n--- Summary ---');
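Note: the hunk ends at the unchanged `console.log('\n--- Summary ---')` context line. For orientation, a sketch of what the tail of evaluate.ts can do with the returned values; `testId`, `score`, `passed`, `total`, and `meanScore` are the fields this diff itself reads elsewhere, and the rest of the result shape is not assumed here:

```typescript
// Sketch: continuing evaluate.ts after the summary banner.
for (const result of results) {
  console.log(`  ${result.testId}: score=${result.score.toFixed(2)}`);
}
console.log(`${summary.passed}/${summary.total} passed (mean ${summary.meanScore.toFixed(2)})`);
```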
diff --git a/packages/core/src/evaluation/assertions.ts b/packages/core/src/evaluation/assertions.ts
new file mode 100644
index 00000000..d7f3803b
--- /dev/null
+++ b/packages/core/src/evaluation/assertions.ts
@@ -0,0 +1,101 @@
+/**
+ * Built-in assertion factories for the Eval() API.
+ *
+ * Each factory returns an AssertFn — a plain function that takes
+ * { input, output, expectedOutput, criteria, metadata } and returns
+ * { name, score }. These wrap the same logic as the built-in evaluator
+ * types but are usable as inline functions in the assert array.
+ */
+
+/** Context passed to inline assertion functions */
+export interface AssertContext {
+  readonly input: string;
+  readonly output: string;
+  readonly expectedOutput?: string;
+  readonly criteria?: string;
+  readonly metadata?: Record<string, unknown>;
+}
+
+/** Result from an inline assertion function */
+export interface AssertResult {
+  readonly name: string;
+  readonly score: number;
+  readonly metadata?: Record<string, unknown>;
+}
+
+/** Inline assertion function signature */
+export type AssertFn = (ctx: AssertContext) => AssertResult | Promise<AssertResult>;
+
+/** Checks if output contains the given substring. */
+export function Contains(value: string): AssertFn {
+  return ({ output }) => ({
+    name: 'contains',
+    score: output.includes(value) ? 1.0 : 0.0,
+  });
+}
+
+/** Case-insensitive contains check. */
+export function IContains(value: string): AssertFn {
+  const lower = value.toLowerCase();
+  return ({ output }) => ({
+    name: 'icontains',
+    score: output.toLowerCase().includes(lower) ? 1.0 : 0.0,
+  });
+}
+
+/** Checks if output contains ALL of the given substrings. */
+export function ContainsAll(values: readonly string[]): AssertFn {
+  return ({ output }) => ({
+    name: 'contains-all',
+    score: values.every((v) => output.includes(v)) ? 1.0 : 0.0,
+  });
+}
+
+/** Checks if output contains ANY of the given substrings. */
+export function ContainsAny(values: readonly string[]): AssertFn {
+  return ({ output }) => ({
+    name: 'contains-any',
+    score: values.some((v) => output.includes(v)) ? 1.0 : 0.0,
+  });
+}
+
+/** Checks if trimmed output exactly equals trimmed expectedOutput. */
+export const ExactMatch: AssertFn = ({ output, expectedOutput }) => ({
+  name: 'exact-match',
+  score: expectedOutput !== undefined && output.trim() === expectedOutput.trim() ? 1.0 : 0.0,
+});
+
+/** Checks if trimmed output starts with the given value. */
+export function StartsWith(value: string): AssertFn {
+  return ({ output }) => ({
+    name: 'starts-with',
+    score: output.trim().startsWith(value.trim()) ? 1.0 : 0.0,
+  });
+}
+
+/** Checks if trimmed output ends with the given value. */
+export function EndsWith(value: string): AssertFn {
+  return ({ output }) => ({
+    name: 'ends-with',
+    score: output.trim().endsWith(value.trim()) ? 1.0 : 0.0,
+  });
+}
+
+/** Checks if output matches the given regex pattern. */
+export function Regex(pattern: string, flags?: string): AssertFn {
+  const re = new RegExp(pattern, flags);
+  return ({ output }) => ({
+    name: 'regex',
+    score: re.test(output) ? 1.0 : 0.0,
+  });
+}
+
+/** Checks if output is valid JSON. */
+export const IsJson: AssertFn = ({ output }) => {
+  try {
+    JSON.parse(output);
+    return { name: 'is-json', score: 1.0 };
+  } catch {
+    return { name: 'is-json', score: 0.0 };
+  }
+};
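Note: because every factory returns a plain `AssertFn`, the built-ins compose with hand-written checks without any registration step. A sketch; the custom assertion below is hypothetical:

```typescript
import { Contains, Regex, type AssertContext, type AssertFn } from '@agentv/core';

// A custom AssertFn is just a function over the assertion context.
const endsWithPunctuation: AssertFn = ({ output }: AssertContext) => ({
  name: 'ends-with-punctuation',
  score: /[.!?]$/.test(output.trim()) ? 1.0 : 0.0,
});

// Both kinds are valid entries for an Eval() assert array.
const assert = [Contains('hello'), Regex('\\bworld\\b'), endsWithPunctuation];
```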
diff --git a/packages/core/src/evaluation/eval-api.ts b/packages/core/src/evaluation/eval-api.ts
new file mode 100644
index 00000000..2e7dafbf
--- /dev/null
+++ b/packages/core/src/evaluation/eval-api.ts
@@ -0,0 +1,235 @@
+/**
+ * Declarative Eval() API for single-file TypeScript evaluations.
+ *
+ * Eval() registers an eval in a global registry and returns a Promise
+ * with results. Serves both CLI discovery (module-level, no await) and
+ * programmatic use (await for results).
+ *
+ * @example Single-file eval with built-in target
+ * ```typescript
+ * import { Eval, Contains } from '@agentv/core';
+ *
+ * Eval('my-eval', {
+ *   data: [{ input: 'What is 2+2?', expectedOutput: '4' }],
+ *   target: { provider: 'claude_agent' },
+ *   assert: [Contains('4')],
+ * });
+ * ```
+ *
+ * @module
+ */
+
+import type { AssertContext, AssertFn, AssertResult } from './assertions.js';
+import { computeSummary } from './evaluate.js';
+import type { EvalAssertionInput, EvalRunResult, EvalSummary } from './evaluate.js';
+import type { ResolvedTarget } from './providers/targets.js';
+import type { Provider, TargetDefinition } from './providers/types.js';
+import type {
+  EvalTest,
+  EvaluationResult,
+  EvaluatorConfig,
+  InlineAssertEvaluatorConfig,
+} from './types.js';
+
+// ─── Types ───────────────────────────────────────────────────────────
+
+/** Data item — aligns with YAML test fields (camelCase) */
+export interface EvalDataItem {
+  readonly id?: string;
+  readonly input: string | readonly { role: string; content: string }[];
+  readonly expectedOutput?: string;
+  readonly criteria?: string;
+  readonly metadata?: Record<string, unknown>;
+}
+
+/** Assert entry: inline function, built-in factory result, or assertion config */
+export type AssertEntry = AssertFn | EvalAssertionInput;
+
+/** Configuration for Eval() */
+export interface EvalOptions {
+  readonly data:
+    | readonly EvalDataItem[]
+    | (() => readonly EvalDataItem[] | Promise<readonly EvalDataItem[]>);
+  readonly target?: TargetDefinition;
+  readonly task?: (input: string) => string | Promise<string>;
+  readonly assert: readonly AssertEntry[];
+  readonly metadata?: Record<string, unknown>;
+  readonly workers?: number;
+  readonly agentTimeoutMs?: number;
+}
+
+// ─── Registry ────────────────────────────────────────────────────────
+
+interface RegisteredEval {
+  readonly name: string;
+  readonly options: EvalOptions;
+  readonly promise: Promise<EvalRunResult>;
+}
+
+const evalRegistry = new Map<string, RegisteredEval>();
+
+/** Get all registered evals (used by CLI to discover evals in .ts files). */
+export function getRegisteredEvals(): ReadonlyMap<string, RegisteredEval> {
+  return evalRegistry;
+}
+
+/** Clear the registry (used in tests). */
+export function clearEvalRegistry(): void {
+  evalRegistry.clear();
+}
+
+// ─── Eval() ──────────────────────────────────────────────────────────
+
+/**
+ * Declare and run an evaluation.
+ *
+ * Registers the eval in a global registry (for CLI discovery) and
+ * returns a Promise with results (for programmatic use).
+ */
+export function Eval(name: string, options: EvalOptions): Promise<EvalRunResult> {
+  // Validate
+  if (evalRegistry.has(name)) {
+    throw new Error(`Eval "${name}" already registered`);
+  }
+  if (options.task && options.target) {
+    throw new Error('Cannot specify both "task" and "target" — use one or the other.');
+  }
+  if (!options.task && !options.target) {
+    throw new Error('Must specify either "task" or "target".');
+  }
+
+  // Create the execution promise
+  const promise = runEval(name, options);
+
+  evalRegistry.set(name, { name, options, promise });
+  return promise;
+}
+
+// ─── Inline function storage via symbol ──────────────────────────────
+
+/**
+ * Symbol used to attach inline AssertFn references to EvaluatorConfig objects.
+ * This avoids global state and race conditions between concurrent Eval() calls.
+ */
+export const INLINE_ASSERT_FN = Symbol.for('agentv.inline-assert-fn');
+
+// ─── Internal execution ──────────────────────────────────────────────
+
+async function runEval(name: string, options: EvalOptions): Promise<EvalRunResult> {
+  // Lazy imports to avoid circular dependencies and keep module lightweight
+  const { runEvaluation } = await import('./orchestrator.js');
+  const { resolveTargetDefinition } = await import('./providers/targets.js');
+  const { findGitRoot } = await import('./file-utils.js');
+  const { createFunctionProvider } = await import('./providers/function-provider.js');
+
+  // Resolve data
+  const data = typeof options.data === 'function' ? await options.data() : options.data;
+
+  // Partition assert entries into inline functions and config objects
+  const evaluatorConfigs: EvaluatorConfig[] = [];
+
+  for (let i = 0; i < options.assert.length; i++) {
+    const entry = options.assert[i];
+    if (typeof entry === 'function') {
+      // Inline function: create an evaluator config with the function attached via symbol
+      const base: InlineAssertEvaluatorConfig = {
+        type: 'inline-assert',
+        name: `inline-assert-${i}`,
+      };
+      const config = Object.assign(base, { [INLINE_ASSERT_FN]: entry as AssertFn });
+      evaluatorConfigs.push(config);
+    } else {
+      // Config object: normalize type and pass through
+      const a = entry as EvalAssertionInput;
+      const { type: rawType, ...rest } = a;
+      evaluatorConfigs.push({
+        ...rest,
+        name: a.name ?? `${rawType}_${i}`,
+        type: rawType.replace(/_/g, '-'),
+      } as EvaluatorConfig);
+    }
+  }
+
+  // Resolve target
+  const targetDef: TargetDefinition = options.target ?? { name: 'custom-task', provider: 'mock' };
+  const resolvedTarget = resolveTargetDefinition(targetDef);
+
+  // Build providerFactory for task functions
+  let providerFactory: ((target: ResolvedTarget) => Provider) | undefined;
+  if (options.task) {
+    const taskProvider = createFunctionProvider(options.task);
+    providerFactory = () => taskProvider;
+  }
+
+  // Convert data items to EvalTest[]
+  const evalCases: EvalTest[] = data.map((item, i) => {
+    const input =
+      typeof item.input === 'string'
+        ? ([{ role: 'user' as const, content: item.input }] as EvalTest['input'])
+        : (item.input as unknown as EvalTest['input']);
+
+    const question =
+      typeof item.input === 'string'
+        ? item.input
+        : ((item.input.find((m) => m.role === 'user')?.content as string) ?? '');
+
+    const expectedOutput = item.expectedOutput
+      ? ([
+          { role: 'assistant' as const, content: item.expectedOutput },
+        ] as EvalTest['expected_output'])
+      : [];
+
+    // Build input_segments so buildPromptInputs can extract the question
+    const inputSegments =
+      typeof item.input === 'string'
+        ? [{ type: 'text' as const, value: item.input }]
+        : (item.input as readonly { role: string; content: string }[])
+            .filter((m) => m.role === 'user' && typeof m.content === 'string')
+            .map((m) => ({ type: 'text' as const, value: m.content }));
+
+    return {
+      id: item.id ? `${name}/${item.id}` : `${name}/${i}`,
+      criteria: item.criteria ?? '',
+      question: String(question),
+      input,
+      input_segments: inputSegments,
+      expected_output: expectedOutput,
+      reference_answer: item.expectedOutput,
+      guideline_paths: [],
+      guideline_patterns: [],
+      file_paths: [],
+      evaluators: evaluatorConfigs.length > 0 ? evaluatorConfigs : undefined,
+      metadata: item.metadata,
+    };
+  });
+
+  const startTime = Date.now();
+  const repoRoot = (await findGitRoot(process.cwd())) ?? process.cwd();
+  const testFilePath = '';
+
+  const collectedResults: EvaluationResult[] = [];
+
+  await runEvaluation({
+    testFilePath,
+    repoRoot,
+    target: resolvedTarget,
+    maxRetries: 2,
+    agentTimeoutMs: options.agentTimeoutMs,
+    maxConcurrency: options.workers ?? 3,
+    evalCases,
+    ...(providerFactory ? { providerFactory } : {}),
+    onResult: async (result) => {
+      collectedResults.push(result);
+    },
+  });
+
+  const durationMs = Date.now() - startTime;
+  return {
+    results: collectedResults,
+    summary: computeSummary(collectedResults, durationMs),
+  };
+}
+
+// Re-export types
+export type { AssertContext, AssertResult, AssertFn } from './assertions.js';
+export type { EvalRunResult, EvalSummary } from './evaluate.js';
diff --git a/packages/core/src/evaluation/evaluate.ts b/packages/core/src/evaluation/evaluate.ts
index f6043ba8..ad581611 100644
--- a/packages/core/src/evaluation/evaluate.ts
+++ b/packages/core/src/evaluation/evaluate.ts
@@ -305,8 +305,12 @@ function mapAssertionType(type: string): string {
 
 /**
  * Compute summary statistics from evaluation results.
+ * Shared by both evaluate() and Eval() APIs.
  */
-function computeSummary(results: readonly EvaluationResult[], durationMs: number): EvalSummary {
+export function computeSummary(
+  results: readonly EvaluationResult[],
+  durationMs: number,
+): EvalSummary {
   const total = results.length;
   let passed = 0;
   let failed = 0;
diff --git a/packages/core/src/evaluation/evaluators/inline-assert.ts b/packages/core/src/evaluation/evaluators/inline-assert.ts
new file mode 100644
index 00000000..8963e74a
--- /dev/null
+++ b/packages/core/src/evaluation/evaluators/inline-assert.ts
@@ -0,0 +1,39 @@
+import type { AssertFn } from '../assertions.js';
+import type { JsonObject } from '../types.js';
+import { scoreToVerdict } from './scoring.js';
+import type { EvaluationContext, EvaluationScore, Evaluator } from './types.js';
+
+/**
+ * Evaluator that wraps an inline AssertFn and runs it in-process.
+ * No subprocess, no stdin/stdout — just calls the function directly.
+ */
+export class InlineAssertEvaluator implements Evaluator {
+  readonly kind = 'inline-assert';
+
+  constructor(
+    private readonly fn: AssertFn,
+    private readonly name: string,
+  ) {}
+
+  async evaluate(context: EvaluationContext): Promise<EvaluationScore> {
+    const result = await this.fn({
+      input: context.evalCase.question,
+      output: context.candidate,
+      expectedOutput: context.evalCase.reference_answer,
+      criteria: context.evalCase.criteria,
+      metadata: context.evalCase.metadata,
+    });
+
+    const score = Math.max(0, Math.min(1, result.score));
+
+    return {
+      score,
+      verdict: scoreToVerdict(score),
+      hits: score >= 0.8 ? [result.name] : [],
+      misses: score < 0.5 ? [result.name] : [],
+      expectedAspectCount: 1,
+      reasoning: undefined,
+      details: result.metadata ? (result.metadata as JsonObject) : undefined,
+    };
+  }
+}
+ */ +export function createFunctionProvider( + taskFn: (input: string) => string | Promise, +): Provider { + return { + id: 'function-provider', + kind: 'mock' as ProviderKind, + targetName: 'custom-task', + async invoke(request: ProviderRequest): Promise { + const startTime = new Date().toISOString(); + const start = Date.now(); + const result = await taskFn(request.question); + const endTime = new Date().toISOString(); + return { + output: [{ role: 'assistant', content: result }], + durationMs: Date.now() - start, + startTime, + endTime, + }; + }, + }; +} diff --git a/packages/core/src/evaluation/registry/builtin-evaluators.ts b/packages/core/src/evaluation/registry/builtin-evaluators.ts index e992acce..20ec6f02 100644 --- a/packages/core/src/evaluation/registry/builtin-evaluators.ts +++ b/packages/core/src/evaluation/registry/builtin-evaluators.ts @@ -7,6 +7,7 @@ */ import { readFileSync } from 'node:fs'; +import { INLINE_ASSERT_FN } from '../eval-api.js'; import { AgentJudgeEvaluator, CodeEvaluator, @@ -30,6 +31,7 @@ import { runRegexAssertion, runStartsWithAssertion, } from '../evaluators.js'; +import { InlineAssertEvaluator } from '../evaluators/inline-assert.js'; import { resolveCustomPrompt } from '../evaluators/prompt-resolution.js'; import type { Provider } from '../providers/types.js'; import type { ToolTrajectoryEvaluatorConfig } from '../trace.js'; @@ -421,7 +423,19 @@ export function createBuiltinRegistry(): EvaluatorRegistry { .register('ends-with', endsWithFactory) .register('regex', regexFactory) .register('is-json', isJsonFactory) - .register('equals', equalsFactory); + .register('equals', equalsFactory) + .register('inline-assert', (config, _context) => { + // biome-ignore lint/suspicious/noExplicitAny: symbol key access requires dynamic cast + const fn = (config as any)[INLINE_ASSERT_FN] as + | import('../assertions.js').AssertFn + | undefined; + if (!fn) { + throw new Error( + `No inline assert function found on config for "${config.name}". Inline assert functions must be attached via INLINE_ASSERT_FN symbol.`, + ); + } + return new InlineAssertEvaluator(fn, config.name ?? 'inline-assert'); + }); return registry; } diff --git a/packages/core/src/evaluation/types.ts b/packages/core/src/evaluation/types.ts index 4ad65239..3fa7a4c6 100644 --- a/packages/core/src/evaluation/types.ts +++ b/packages/core/src/evaluation/types.ts @@ -171,6 +171,7 @@ const EVALUATOR_KIND_VALUES = [ 'is-json', 'equals', 'rubrics', + 'inline-assert', ] as const; export type EvaluatorKind = (typeof EVALUATOR_KIND_VALUES)[number]; @@ -723,6 +724,20 @@ export type RubricsEvaluatorConfig = { readonly negate?: boolean; }; +/** + * Configuration for the inline-assert evaluator. + * Used by the Eval() API to wrap inline assertion functions. + * The actual function is attached via the INLINE_ASSERT_FN symbol at runtime. + */ +export type InlineAssertEvaluatorConfig = { + readonly name: string; + readonly type: 'inline-assert'; + readonly weight?: number; + readonly required?: boolean | number; + /** When true, inverts the evaluator score (1 - score) and swaps pass/fail verdict */ + readonly negate?: boolean; +}; + export type EvaluatorConfig = | CodeEvaluatorConfig | LlmJudgeEvaluatorConfig @@ -745,7 +760,8 @@ export type EvaluatorConfig = | RegexEvaluatorConfig | IsJsonEvaluatorConfig | EqualsEvaluatorConfig - | RubricsEvaluatorConfig; + | RubricsEvaluatorConfig + | InlineAssertEvaluatorConfig; /** * Eval test definition sourced from AgentV specs. 
diff --git a/packages/core/src/index.ts b/packages/core/src/index.ts
index 0fdb7496..9aff4890 100644
--- a/packages/core/src/index.ts
+++ b/packages/core/src/index.ts
@@ -5,14 +5,37 @@ export * from './evaluation/file-utils.js';
 export * from './evaluation/providers/index.js';
 export * from './evaluation/evaluators.js';
 export * from './evaluation/orchestrator.js';
-export {
-  evaluate,
-  type EvalConfig,
-  type EvalTestInput,
-  type EvalAssertionInput,
-  type EvalRunResult,
-  type EvalSummary,
+export type {
+  EvalConfig,
+  EvalTestInput,
+  EvalAssertionInput,
+  EvalRunResult,
+  EvalSummary,
 } from './evaluation/evaluate.js';
+
+// Eval() API — single-file TypeScript evaluations
+export {
+  Eval,
+  getRegisteredEvals,
+  clearEvalRegistry,
+  type EvalDataItem,
+  type EvalOptions,
+  type AssertEntry,
+} from './evaluation/eval-api.js';
+export {
+  Contains,
+  IContains,
+  ContainsAll,
+  ContainsAny,
+  ExactMatch,
+  StartsWith,
+  EndsWith,
+  Regex,
+  IsJson,
+  type AssertFn,
+  type AssertContext,
+  type AssertResult,
+} from './evaluation/assertions.js';
 export {
   defineConfig,
   loadTsConfig,
diff --git a/packages/core/test/evaluation/assertions.test.ts b/packages/core/test/evaluation/assertions.test.ts
new file mode 100644
index 00000000..42bb1bd4
--- /dev/null
+++ b/packages/core/test/evaluation/assertions.test.ts
@@ -0,0 +1,120 @@
+import { describe, expect, it } from 'vitest';
+import {
+  Contains,
+  ContainsAll,
+  ContainsAny,
+  EndsWith,
+  ExactMatch,
+  IContains,
+  IsJson,
+  Regex,
+  StartsWith,
+} from '../../src/evaluation/assertions.js';
+
+const ctx = (output: string, expectedOutput?: string) => ({
+  input: 'test-input',
+  output,
+  expectedOutput,
+});
+
+describe('Contains', () => {
+  it('scores 1.0 when output contains value', () => {
+    const fn = Contains('hello');
+    const result = fn(ctx('hello world'));
+    expect(result.score).toBe(1.0);
+    expect(result.name).toBe('contains');
+  });
+
+  it('scores 0.0 when output does not contain value', () => {
+    const fn = Contains('goodbye');
+    const result = fn(ctx('hello world'));
+    expect(result.score).toBe(0.0);
+  });
+});
+
+describe('IContains', () => {
+  it('scores 1.0 case-insensitively', () => {
+    const fn = IContains('HELLO');
+    const result = fn(ctx('hello world'));
+    expect(result.score).toBe(1.0);
+  });
+});
+
+describe('ContainsAll', () => {
+  it('scores 1.0 when all values present', () => {
+    const fn = ContainsAll(['hello', 'world']);
+    expect(fn(ctx('hello world')).score).toBe(1.0);
+  });
+
+  it('scores 0.0 when any value missing', () => {
+    const fn = ContainsAll(['hello', 'goodbye']);
+    expect(fn(ctx('hello world')).score).toBe(0.0);
+  });
+});
+
+describe('ContainsAny', () => {
+  it('scores 1.0 when any value present', () => {
+    const fn = ContainsAny(['goodbye', 'world']);
+    expect(fn(ctx('hello world')).score).toBe(1.0);
+  });
+
+  it('scores 0.0 when no values present', () => {
+    const fn = ContainsAny(['goodbye', 'farewell']);
+    expect(fn(ctx('hello world')).score).toBe(0.0);
+  });
+});
+
+describe('ExactMatch', () => {
+  it('scores 1.0 on exact match (trimmed)', () => {
+    expect(ExactMatch(ctx('hello', 'hello')).score).toBe(1.0);
+    expect(ExactMatch(ctx('  hello  ', 'hello')).score).toBe(1.0);
+  });
+
+  it('scores 0.0 on mismatch', () => {
+    expect(ExactMatch(ctx('hello', 'world')).score).toBe(0.0);
+  });
+
+  it('scores 0.0 when no expectedOutput', () => {
+    expect(ExactMatch(ctx('hello')).score).toBe(0.0);
+  });
+});
+
+describe('StartsWith', () => {
+  it('scores 1.0 when output starts with value', () => {
+    expect(StartsWith('hello')(ctx('hello world')).score).toBe(1.0);
+  });
+
+  it('scores 0.0 when it does not', () => {
+    expect(StartsWith('world')(ctx('hello world')).score).toBe(0.0);
+  });
+});
+
+describe('EndsWith', () => {
+  it('scores 1.0 when output ends with value', () => {
+    expect(EndsWith('world')(ctx('hello world')).score).toBe(1.0);
+  });
+});
+
+describe('Regex', () => {
+  it('scores 1.0 on match', () => {
+    expect(Regex('\\d+')(ctx('abc 123 def')).score).toBe(1.0);
+  });
+
+  it('scores 0.0 on no match', () => {
+    expect(Regex('^\\d+$')(ctx('abc')).score).toBe(0.0);
+  });
+
+  it('supports flags', () => {
+    expect(Regex('HELLO', 'i')(ctx('hello')).score).toBe(1.0);
+  });
+});
+
+describe('IsJson', () => {
+  it('scores 1.0 for valid JSON', () => {
+    expect(IsJson(ctx('{"a": 1}')).score).toBe(1.0);
+  });
+
+  it('scores 0.0 for invalid JSON', () => {
+    expect(IsJson(ctx('not json')).score).toBe(0.0);
+  });
+});
diff --git a/packages/core/test/evaluation/eval-api.test.ts b/packages/core/test/evaluation/eval-api.test.ts
new file mode 100644
index 00000000..7108a9ab
--- /dev/null
+++ b/packages/core/test/evaluation/eval-api.test.ts
@@ -0,0 +1,165 @@
+import { afterEach, describe, expect, it } from 'vitest';
+import { Contains } from '../../src/evaluation/assertions.js';
+import {
+  Eval,
+  type EvalDataItem,
+  clearEvalRegistry,
+  getRegisteredEvals,
+} from '../../src/evaluation/eval-api.js';
+
+afterEach(() => {
+  clearEvalRegistry();
+});
+
+describe('Eval() registration', () => {
+  it('registers an eval in the global registry', () => {
+    Eval('test-eval', {
+      data: [{ input: 'hello', expectedOutput: 'world' }],
+      target: { name: 'default', provider: 'mock', response: 'world' },
+      assert: [Contains('world')],
+    });
+
+    const registry = getRegisteredEvals();
+    expect(registry.size).toBe(1);
+    expect(registry.has('test-eval')).toBe(true);
+  });
+
+  it('throws on duplicate eval names', () => {
+    Eval('dup', {
+      data: [{ input: 'a' }],
+      target: { name: 'default', provider: 'mock' },
+      assert: [Contains('a')],
+    });
+
+    expect(() => {
+      Eval('dup', {
+        data: [{ input: 'b' }],
+        target: { name: 'default', provider: 'mock' },
+        assert: [Contains('b')],
+      });
+    }).toThrow('Eval "dup" already registered');
+  });
+
+  it('throws when both task and target are provided', () => {
+    expect(() => {
+      Eval('bad', {
+        data: [{ input: 'a' }],
+        target: { name: 'default', provider: 'mock' },
+        task: async (input) => input,
+        assert: [Contains('a')],
+      });
+    }).toThrow('Cannot specify both "task" and "target"');
+  });
+
+  it('throws when neither task nor target is provided', () => {
+    expect(() => {
+      Eval('bad', {
+        data: [{ input: 'a' }],
+        assert: [Contains('a')],
+      });
+    }).toThrow('Must specify either "task" or "target"');
+  });
+});
+
+describe('Eval() execution with mock target', () => {
+  it('returns results when awaited', async () => {
+    const result = await Eval('exec-test', {
+      data: [{ id: 'case-1', input: 'hello', expectedOutput: 'world' }],
+      target: { name: 'default', provider: 'mock', response: 'world' },
+      assert: [Contains('world')],
+    });
+
+    expect(result.results).toHaveLength(1);
+    expect(result.summary.total).toBe(1);
+    expect(result.summary.passed).toBe(1);
+  });
+
+  it('auto-generates test IDs from eval name and index', async () => {
+    const result = await Eval('auto-id', {
+      data: [{ input: 'first' }, { input: 'second' }],
+      target: { name: 'default', provider: 'mock', response: 'first second' },
+      assert: [Contains('first')],
+    });
+
+    expect(result.results[0].testId).toBe('auto-id/0');
+    expect(result.results[1].testId).toBe('auto-id/1');
+  });
+
+  it('uses explicit data item IDs in test IDs', async () => {
+    const result = await Eval('explicit-id', {
+      data: [{ id: 'my-case', input: 'hello' }],
+      target: { name: 'default', provider: 'mock', response: 'hello' },
+      assert: [Contains('hello')],
+    });
+
+    expect(result.results[0].testId).toBe('explicit-id/my-case');
+  });
+
+  it('supports inline assertion functions', async () => {
+    const result = await Eval('inline-fn', {
+      data: [{ input: 'test', expectedOutput: 'test' }],
+      target: { name: 'default', provider: 'mock', response: 'test' },
+      assert: [
+        ({ output, expectedOutput }) => ({
+          name: 'custom',
+          score: output === expectedOutput ? 1.0 : 0.0,
+        }),
+      ],
+    });
+
+    expect(result.summary.passed).toBe(1);
+  });
+
+  it('supports async data factory', async () => {
+    const result = await Eval('async-data', {
+      data: async () => [{ input: 'hello' }],
+      target: { name: 'default', provider: 'mock', response: 'hello' },
+      assert: [Contains('hello')],
+    });
+
+    expect(result.results).toHaveLength(1);
+  });
+
+  it('supports task function instead of target', async () => {
+    const result = await Eval('task-fn', {
+      data: [{ input: 'hello' }],
+      task: async (input) => `Echo: ${input}`,
+      assert: [Contains('Echo: hello')],
+    });
+
+    expect(result.summary.passed).toBe(1);
+  });
+
+  it('supports mixing inline functions and assertion configs', async () => {
+    const result = await Eval('mixed', {
+      data: [{ input: 'hello world' }],
+      target: { name: 'default', provider: 'mock', response: 'hello world' },
+      assert: [
+        Contains('hello'),
+        ({ output }) => ({ name: 'has-world', score: output.includes('world') ? 1.0 : 0.0 }),
+        { type: 'contains', value: 'world' },
+      ],
+    });
+
+    expect(result.summary.passed).toBe(1);
+  });
+
+  it('classifies borderline scores (0.5 <= score < 0.8) correctly', async () => {
+    const result = await Eval('borderline', {
+      data: [{ input: 'hello world' }],
+      target: { name: 'default', provider: 'mock', response: 'hello world' },
+      assert: [
+        // One passes (score 1.0), one fails (score 0.0) — average score is 0.5, which is borderline
+        Contains('hello'),
+        Contains('missing-text'),
+      ],
+    });
+
+    // With two assertions where one passes and one fails, the average score should be 0.5
+    // Score 0.5 is borderline (>= 0.5 and < 0.8)
+    expect(result.summary.total).toBe(1);
+    expect(result.summary.borderline).toBe(1);
+    expect(result.summary.passed).toBe(0);
+    expect(result.summary.failed).toBe(0);
+  });
+});
diff --git a/packages/core/test/evaluation/evaluators/inline-assert.test.ts b/packages/core/test/evaluation/evaluators/inline-assert.test.ts
new file mode 100644
index 00000000..504e1b00
--- /dev/null
+++ b/packages/core/test/evaluation/evaluators/inline-assert.test.ts
@@ -0,0 +1,87 @@
+import { describe, expect, it } from 'vitest';
+import type { AssertFn } from '../../../src/evaluation/assertions.js';
+import { InlineAssertEvaluator } from '../../../src/evaluation/evaluators/inline-assert.js';
+
+describe('InlineAssertEvaluator', () => {
+  it('runs an inline assert function and returns EvaluationScore', async () => {
+    const fn: AssertFn = ({ output }) => ({
+      name: 'test-assert',
+      score: output.includes('hello') ? 1.0 : 0.0,
+    });
+
+    const evaluator = new InlineAssertEvaluator(fn, 'test-assert');
+
+    const score = await evaluator.evaluate({
+      evalCase: {
+        id: 'test',
+        question: 'greet me',
+        criteria: '',
+        input: [{ role: 'user', content: 'greet me' }],
+        input_segments: [],
+        expected_output: [],
+        guideline_paths: [],
+        file_paths: [],
+        reference_answer: 'hello world',
+      },
+      candidate: 'hello world',
+      // biome-ignore lint/suspicious/noExplicitAny: partial context for unit testing
+    } as any);
+
+    expect(score.score).toBe(1.0);
+    expect(score.verdict).toBe('pass');
+  });
+
+  it('handles async assert functions', async () => {
+    const fn: AssertFn = async ({ output }) => ({
+      name: 'async-assert',
+      score: output.length > 0 ? 1.0 : 0.0,
+    });
+
+    const evaluator = new InlineAssertEvaluator(fn, 'async-assert');
+
+    const score = await evaluator.evaluate({
+      evalCase: {
+        id: 'test',
+        question: 'test',
+        criteria: '',
+        input: [{ role: 'user', content: 'test' }],
+        input_segments: [],
+        expected_output: [],
+        guideline_paths: [],
+        file_paths: [],
+      },
+      candidate: 'some output',
+      // biome-ignore lint/suspicious/noExplicitAny: partial context for unit testing
+    } as any);
+
+    expect(score.score).toBe(1.0);
+  });
+
+  it('returns fail verdict for score 0', async () => {
+    const fn: AssertFn = () => ({
+      name: 'always-fail',
+      score: 0.0,
+    });
+
+    const evaluator = new InlineAssertEvaluator(fn, 'always-fail');
+
+    const score = await evaluator.evaluate({
+      evalCase: {
+        id: 'test',
+        question: 'test',
+        criteria: '',
+        input: [{ role: 'user', content: 'test' }],
+        input_segments: [],
+        expected_output: [],
+        guideline_paths: [],
+        file_paths: [],
+      },
+      candidate: 'output',
+      // biome-ignore lint/suspicious/noExplicitAny: partial context for unit testing
+    } as any);
+
+    expect(score.score).toBe(0.0);
+    expect(score.verdict).toBe('fail');
+    expect(score.misses).toContain('always-fail');
+  });
+});
diff --git a/packages/core/test/evaluation/providers/function-provider.test.ts b/packages/core/test/evaluation/providers/function-provider.test.ts
new file mode 100644
index 00000000..f4116e38
--- /dev/null
+++ b/packages/core/test/evaluation/providers/function-provider.test.ts
@@ -0,0 +1,43 @@
+import { describe, expect, it } from 'vitest';
+import { createFunctionProvider } from '../../../src/evaluation/providers/function-provider.js';
+
+describe('createFunctionProvider', () => {
+  it('wraps a sync task function as a Provider', async () => {
+    const provider = createFunctionProvider((input) => `Echo: ${input}`);
+
+    expect(provider.id).toBe('function-provider');
+    expect(provider.targetName).toBe('custom-task');
+
+    const response = await provider.invoke({ question: 'hello' });
+    expect(response.output).toHaveLength(1);
+    expect(response.output?.[0].role).toBe('assistant');
+    expect(response.output?.[0].content).toBe('Echo: hello');
+  });
+
+  it('wraps an async task function', async () => {
+    const provider = createFunctionProvider(async (input) => {
+      return `Async: ${input}`;
+    });
+
+    const response = await provider.invoke({ question: 'world' });
+    expect(response.output?.[0].content).toBe('Async: world');
+  });
+
+  it('measures duration', async () => {
+    const provider = createFunctionProvider(async () => {
+      await new Promise((r) => setTimeout(r, 50));
+      return 'done';
+    });
+
+    const response = await provider.invoke({ question: 'test' });
+    expect(response.durationMs).toBeGreaterThanOrEqual(40);
+  });
+
+  it('propagates errors from the task function', async () => {
+    const provider = createFunctionProvider(() => {
+      throw new Error('task failed');
+    });
+
+    await expect(provider.invoke({ question: 'test' })).rejects.toThrow('task failed');
+  });
+});
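Note: the CLI discovery handshake exercised above can be reproduced standalone. A sketch; the file path is hypothetical and the registry entry's `promise` field is read exactly as run-eval.ts does:

```typescript
import { pathToFileURL } from 'node:url';
import { clearEvalRegistry, getRegisteredEvals } from '@agentv/core';

clearEvalRegistry();
await import(pathToFileURL('evals/basic.eval.ts').href); // Eval() calls register during import

for (const [name, entry] of getRegisteredEvals()) {
  const { summary } = await entry.promise; // evals are already running; await the stored promise
  console.log(`${name}: ${summary.passed}/${summary.total} passed`);
}
clearEvalRegistry();
```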