Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
74 changes: 67 additions & 7 deletions apps/cli/src/commands/eval/run-eval.ts
Original file line number Diff line number Diff line change
Expand Up @@ -800,19 +800,79 @@ export async function runEvalCommand(input: RunEvalCommandInput): Promise<void>
console.log(`Trace file: ${path.resolve(options.traceFile)}`);
}

const allResults: EvaluationResult[] = [];

// ── TypeScript eval file support ──────────────────────────────────
// Detect .ts/.js files and handle them via the Eval() API instead of the YAML pipeline
const tsEvalFiles = resolvedTestFiles.filter(
(f) => f.endsWith('.ts') || f.endsWith('.js') || f.endsWith('.mts') || f.endsWith('.mjs'),
);
const yamlEvalFiles = resolvedTestFiles.filter((f) => !tsEvalFiles.includes(f));

if (tsEvalFiles.length > 0) {
const { getRegisteredEvals, clearEvalRegistry } = await import('@agentv/core');

for (const tsFile of tsEvalFiles) {
await ensureFileExists(tsFile, 'TypeScript eval file');
clearEvalRegistry();

// Import the TS file — Eval() calls register during import
await import(pathToFileURL(tsFile).href);

const registered = getRegisteredEvals();
if (registered.size === 0) {
throw new Error(`No Eval() calls found in ${tsFile}`);
}

// Wait for all registered eval promises and collect results
for (const [evalName, entry] of registered) {
try {
const evalResult = await entry.promise;
// Print summary per eval
const passCount = evalResult.summary.passed;
const totalCount = evalResult.summary.total;
const meanScore = evalResult.summary.meanScore.toFixed(2);
console.log(
` ${evalName}: ${passCount}/${totalCount} passed (mean score: ${meanScore})`,
);

// Write results to output and collect for overall summary
for (const result of evalResult.results) {
await outputWriter.append(result);
}
allResults.push(...evalResult.results);
} catch (err) {
console.error(
` ${evalName}: ERROR — ${err instanceof Error ? err.message : String(err)}`,
);
}
}

clearEvalRegistry();
}

// If there are no YAML files, finalize output and return
if (yamlEvalFiles.length === 0) {
await outputWriter.close().catch(() => undefined);
const summary = calculateEvaluationSummary(allResults);
console.log(formatEvaluationSummary(summary));
if (allResults.length > 0) {
console.log(`\nResults written to: ${outputPath}`);
}
return;
}
}

// ── YAML eval file pipeline (existing) ─────────────────────────────
// Determine cache state after loading file metadata (need YAML config)
// We defer cache creation until after file metadata is loaded
const evaluationRunner = await resolveEvaluationRunner();
const allResults: EvaluationResult[] = [];
const seenEvalCases = new Set<string>();
const displayIdTracker = createDisplayIdTracker();

// Derive file-level concurrency from worker count (global) when provided
const totalWorkers = options.workers ?? DEFAULT_WORKERS;
const fileConcurrency = Math.min(
Math.max(1, totalWorkers),
Math.max(1, resolvedTestFiles.length),
);
const fileConcurrency = Math.min(Math.max(1, totalWorkers), Math.max(1, yamlEvalFiles.length));
const perFileWorkers = options.workers
? Math.max(1, Math.floor(totalWorkers / fileConcurrency))
: undefined;
Expand All @@ -833,7 +893,7 @@ export async function runEvalCommand(input: RunEvalCommandInput): Promise<void>
readonly failOnError?: FailOnError;
}
>();
for (const testFilePath of resolvedTestFiles) {
for (const testFilePath of yamlEvalFiles) {
const meta = await prepareFileMetadata({
testFilePath,
repoRoot,
Expand Down Expand Up @@ -936,7 +996,7 @@ export async function runEvalCommand(input: RunEvalCommandInput): Promise<void>
}

try {
await runWithLimit(resolvedTestFiles, fileConcurrency, async (testFilePath) => {
await runWithLimit(yamlEvalFiles, fileConcurrency, async (testFilePath) => {
const targetPrep = fileMetadata.get(testFilePath);
if (!targetPrep) {
throw new Error(`Missing metadata for ${testFilePath}`);
Expand Down
13 changes: 8 additions & 5 deletions apps/cli/src/commands/eval/shared.ts
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,9 @@ import { access, stat } from 'node:fs/promises';
import path from 'node:path';
import fg from 'fast-glob';

/** Supported eval file extensions: YAML, JSONL, and TypeScript/JavaScript */
const EVAL_FILE_RE = /\.(ya?ml|jsonl|ts|js|mts|mjs)$/i;

export async function resolveEvalPaths(evalPaths: string[], cwd: string): Promise<string[]> {
const normalizedInputs = evalPaths.map((value) => value?.trim()).filter((value) => value);
if (normalizedInputs.length === 0) {
Expand All @@ -19,7 +22,7 @@ export async function resolveEvalPaths(evalPaths: string[], cwd: string): Promis
: path.resolve(cwd, pattern);
try {
const stats = await stat(candidatePath);
if (stats.isFile() && /\.(ya?ml|jsonl)$/i.test(candidatePath)) {
if (stats.isFile() && EVAL_FILE_RE.test(candidatePath)) {
results.add(candidatePath);
continue;
}
Expand All @@ -37,13 +40,13 @@ export async function resolveEvalPaths(evalPaths: string[], cwd: string): Promis
followSymbolicLinks: true,
});

const yamlMatches = matches.filter((filePath) => /\.(ya?ml|jsonl)$/i.test(filePath));
if (yamlMatches.length === 0) {
const evalMatches = matches.filter((filePath) => EVAL_FILE_RE.test(filePath));
if (evalMatches.length === 0) {
unmatched.push(pattern);
continue;
}

for (const filePath of yamlMatches) {
for (const filePath of evalMatches) {
results.add(path.normalize(filePath));
}
}
Expand All @@ -52,7 +55,7 @@ export async function resolveEvalPaths(evalPaths: string[], cwd: string): Promis
throw new Error(
`No eval files matched: ${unmatched.join(
', ',
)}. Provide YAML or JSONL paths or globs (e.g., "evals/**/*.yaml", "evals/**/*.jsonl").`,
)}. Provide YAML, JSONL, or TypeScript paths or globs (e.g., "evals/**/*.yaml", "evals/**/*.eval.ts").`,
);
}

Expand Down
38 changes: 38 additions & 0 deletions examples/features/sdk-eval-api/evals/basic.eval.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,38 @@
import { Contains, Eval, ExactMatch } from '@agentv/core';

// Example eval suite registered via the Eval() API (imported from '@agentv/core').
// NOTE(review): Eval() appears to register the suite on import for the CLI runner
// to pick up — confirm against the Eval() registry implementation.
Eval('sdk-example/basic', {
  // Each data entry is one test case: the input sent to the target and the
  // expectations used by the assertions below.
  data: [
    {
      id: 'greeting',
      input: 'Say hello',
      expectedOutput: 'hello',
      criteria: 'Response should contain a greeting',
    },
    {
      id: 'math',
      input: 'What is 2+2?',
      expectedOutput: '4',
      criteria: 'Response should contain the correct answer',
    },
  ],
  // Mock target: always returns the fixed response string (no model call).
  target: { name: 'default', provider: 'mock', response: 'hello, the answer is 4' },
  assert: [
    // Built-in factory: passes when the output contains 'hello'.
    Contains('hello'),
    // Inline assertion: binary score based on the case's expectedOutput.
    ({ output, expectedOutput }) => ({
      name: 'has-expected',
      score: output.includes(expectedOutput ?? '') ? 1.0 : 0.0,
    }),
  ],
});

// Second suite: demonstrates the ExactMatch built-in (trimmed equality with
// expectedOutput). The mock response matches exactly, so this should pass.
Eval('sdk-example/exact', {
  data: [
    {
      id: 'exact-check',
      input: 'Echo back: hello world',
      expectedOutput: 'hello world',
    },
  ],
  target: { name: 'default', provider: 'mock', response: 'hello world' },
  assert: [ExactMatch],
});
27 changes: 9 additions & 18 deletions examples/features/sdk-programmatic-api/evaluate.ts
Original file line number Diff line number Diff line change
@@ -1,38 +1,29 @@
/**
* Programmatic API Example
*
* Uses evaluate() from @agentv/core to run evaluations as a library.
* Uses Eval() from @agentv/core to run evaluations as a library.
* No YAML needed — tests defined inline with full type safety.
*
* Run: bun run evaluate.ts
* (Uses 'default' target from .agentv/targets.yaml and .env credentials)
*/
import { evaluate } from '@agentv/core';
import { Contains, Eval } from '@agentv/core';

const { results, summary } = await evaluate({
tests: [
const { results, summary } = await Eval('programmatic-api-example', {
data: [
{
id: 'greeting',
input: 'Say hello and introduce yourself briefly.',
expected_output: "Hello! I'm an AI assistant here to help you.",
assert: [{ type: 'contains', value: 'Hello' }],
expectedOutput: "Hello! I'm an AI assistant here to help you.",
},
{
id: 'json-output',
input: [
{ role: 'system', content: 'Respond only with valid JSON. No markdown.' },
{ role: 'user', content: 'Return a JSON object with a "status" field set to "ok".' },
],
expected_output: '{"status": "ok"}',
assert: [
{ type: 'is-json', required: true },
{ type: 'contains', value: 'ok' },
],
input: 'Return a JSON object with a "status" field set to "ok".',
expectedOutput: '{"status": "ok"}',
},
],
onResult: (result) => {
console.log(` ${result.testId}: score=${result.score.toFixed(2)}`);
},
target: { provider: 'mock', response: 'Hello! I am an AI assistant. {"status": "ok"}' },
assert: [Contains('Hello'), { type: 'contains', value: 'ok' }],
});

console.log('\n--- Summary ---');
Expand Down
101 changes: 101 additions & 0 deletions packages/core/src/evaluation/assertions.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,101 @@
/**
* Built-in assertion factories for the Eval() API.
*
* Each factory returns an AssertFn — a plain function that takes
* { input, output, expectedOutput, criteria, metadata } and returns
* { name, score }. These wrap the same logic as the built-in evaluator
* types but are usable as inline functions in the assert array.
*/

/**
 * Context passed to inline assertion functions.
 * One context is built per test case; `output` is the target's response text.
 */
export interface AssertContext {
  // The test case's input prompt (as text).
  readonly input: string;
  // The target's output being judged.
  readonly output: string;
  // Optional expected output from the test case, if one was provided.
  readonly expectedOutput?: string;
  // Optional human-readable grading criteria from the test case.
  readonly criteria?: string;
  // Optional free-form test-case metadata.
  readonly metadata?: Record<string, unknown>;
}

/**
 * Result from an inline assertion function.
 * `score` is conventionally in [0, 1] — the built-ins below emit 0.0 or 1.0.
 */
export interface AssertResult {
  // Identifies the assertion in reports (e.g. 'contains', 'regex').
  readonly name: string;
  // Numeric grade for this assertion on this case.
  readonly score: number;
  // Optional extra detail to surface alongside the score.
  readonly metadata?: Record<string, unknown>;
}

/** Inline assertion function signature: sync or async, one result per call. */
export type AssertFn = (ctx: AssertContext) => AssertResult | Promise<AssertResult>;

/**
 * Checks if output contains the given substring.
 * Case-sensitive; scores 1.0 on a hit, 0.0 otherwise.
 */
export function Contains(value: string): AssertFn {
  return (ctx) => {
    const found = ctx.output.includes(value);
    return { name: 'contains', score: found ? 1.0 : 0.0 };
  };
}

/**
 * Case-insensitive contains check.
 * The needle is lower-cased once at factory time; each call lower-cases
 * the output before searching. Scores 1.0 on a hit, 0.0 otherwise.
 */
export function IContains(value: string): AssertFn {
  const needle = value.toLowerCase();
  return (ctx) => {
    const haystack = ctx.output.toLowerCase();
    return { name: 'icontains', score: haystack.includes(needle) ? 1.0 : 0.0 };
  };
}

/**
 * Checks if output contains ALL of the given substrings.
 * An empty list vacuously passes (score 1.0), matching Array.every semantics.
 */
export function ContainsAll(values: readonly string[]): AssertFn {
  return (ctx) => {
    let allPresent = true;
    for (const candidate of values) {
      if (!ctx.output.includes(candidate)) {
        allPresent = false;
        break;
      }
    }
    return { name: 'contains-all', score: allPresent ? 1.0 : 0.0 };
  };
}

/**
 * Checks if output contains ANY of the given substrings.
 * An empty list always fails (score 0.0), matching Array.some semantics.
 */
export function ContainsAny(values: readonly string[]): AssertFn {
  return (ctx) => {
    let hit = false;
    for (const candidate of values) {
      if (ctx.output.includes(candidate)) {
        hit = true;
        break;
      }
    }
    return { name: 'contains-any', score: hit ? 1.0 : 0.0 };
  };
}

/**
 * Checks if trimmed output exactly equals trimmed expectedOutput.
 * Scores 0.0 when the test case provides no expectedOutput.
 */
export const ExactMatch: AssertFn = (ctx) => {
  const expected = ctx.expectedOutput;
  const matched = expected !== undefined && ctx.output.trim() === expected.trim();
  return { name: 'exact-match', score: matched ? 1.0 : 0.0 };
};

/**
 * Checks if trimmed output starts with the given value.
 * Note: the expected prefix is trimmed too, so surrounding whitespace
 * on either side is ignored.
 */
export function StartsWith(value: string): AssertFn {
  const prefix = value.trim();
  return (ctx) => {
    const passed = ctx.output.trim().startsWith(prefix);
    return { name: 'starts-with', score: passed ? 1.0 : 0.0 };
  };
}

/**
 * Checks if trimmed output ends with the given value.
 * Note: the expected suffix is trimmed too, so surrounding whitespace
 * on either side is ignored.
 */
export function EndsWith(value: string): AssertFn {
  const suffix = value.trim();
  return (ctx) => {
    const passed = ctx.output.trim().endsWith(suffix);
    return { name: 'ends-with', score: passed ? 1.0 : 0.0 };
  };
}

/**
 * Checks if output matches the given regex pattern.
 *
 * The RegExp is compiled once at factory time (an invalid pattern throws
 * here, at registration, rather than per test case). Because the compiled
 * instance is shared across calls, a pattern created with the `g` or `y`
 * flag would otherwise carry `lastIndex` state between invocations, making
 * `test()` alternate between true and false on identical outputs — so we
 * reset `lastIndex` before every match.
 */
export function Regex(pattern: string, flags?: string): AssertFn {
  const re = new RegExp(pattern, flags);
  return ({ output }) => {
    // Guard against stateful matching with sticky/global flags.
    re.lastIndex = 0;
    return { name: 'regex', score: re.test(output) ? 1.0 : 0.0 };
  };
}

/**
 * Checks if output is valid JSON.
 * Any JSON.parse-accepted text scores 1.0 (including bare scalars like "4");
 * a parse failure scores 0.0.
 */
export const IsJson: AssertFn = (ctx) => {
  let score = 0.0;
  try {
    JSON.parse(ctx.output);
    score = 1.0;
  } catch {
    // Not parseable as JSON — leave score at 0.0.
  }
  return { name: 'is-json', score };
};
Loading