diff --git a/.gitignore b/.gitignore index 60bb874..6bdbc50 100644 --- a/.gitignore +++ b/.gitignore @@ -12,6 +12,9 @@ data/subprocess-ledger.jsonl .verify-tmp/ fixtures/demo-app/.verify-tmp/ +# Verify state directories (created by test runs) +fixtures/demo-app/.verify/ + # Real-world harvest cache (fetched data, 24h TTL, never committed) .verify-cache/ diff --git a/improve-directive.md b/improve-directive.md new file mode 100644 index 0000000..56f31bb --- /dev/null +++ b/improve-directive.md @@ -0,0 +1,33 @@ +# Improve Directive + + + + + + + + + +## Custom Instructions + + diff --git a/scripts/harness/improve-directive.ts b/scripts/harness/improve-directive.ts new file mode 100644 index 0000000..66f81c4 --- /dev/null +++ b/scripts/harness/improve-directive.ts @@ -0,0 +1,211 @@ +/** + * Directive-Driven Improvement — `improve-directive.md` + * ===================================================== + * + * Inspired by AutoAgent's `program.md` pattern: instead of hardcoding + * improvement strategy in TypeScript, operators write a Markdown directive + * that the LLM diagnosis + fix prompts consume. + * + * This lets users say "prioritize security gate accuracy" or "focus on + * reducing false positives" without changing source code. + * + * The directive is optional. When absent, the improve loop runs with its + * default strategy (fix all dirty scenarios, prefer minimal edits). 
import { readFileSync, existsSync } from 'fs';
import { isAbsolute, join } from 'path';

// =============================================================================
// DIRECTIVE LOADING
// =============================================================================

/** Parsed form of an operator-written improve directive. */
export interface ImproveDirective {
  /** Raw markdown content */
  raw: string;
  /** Extracted priority gates (if any) */
  priorityGates: string[];
  /** Extracted focus mode: 'false_positives' | 'false_negatives' | 'all' */
  focus: 'false_positives' | 'false_negatives' | 'all';
  /** Max edit complexity preference: 'minimal' | 'moderate' | 'aggressive' */
  editStyle: 'minimal' | 'moderate' | 'aggressive';
  /** Custom instructions injected verbatim into LLM prompts */
  customInstructions: string;
}

const DEFAULT_DIRECTIVE: ImproveDirective = {
  raw: '',
  priorityGates: [],
  focus: 'all',
  editStyle: 'minimal',
  customInstructions: '',
};

/** Matches `<!-- ... -->` so commented-out text in the Markdown is not parsed. */
const HTML_COMMENT = /<!--[\s\S]*?-->/g;
/** Header line that opens the free-form instructions section. */
const CUSTOM_HEADER = /^##\s*Custom\s*Instructions/i;
/** Any level-2 header; closes the free-form instructions section. */
const SECTION_HEADER = /^##\s/;
const GATES_SETTING = /^priority[-_ ]gates?:\s*(.+)/;
const FOCUS_SETTING = /^focus:\s*(.+)/;
const STYLE_SETTING = /^edit[-_ ]style:\s*(.+)/;

/** Fresh default directive, so callers can never mutate the shared constant. */
function defaultDirective(): ImproveDirective {
  return { ...DEFAULT_DIRECTIVE, priorityGates: [] };
}

/**
 * Load and parse the improve directive from `improve-directive.md`.
 *
 * The directive supports structured YAML-like headers and free-form
 * instructions:
 *
 * ```markdown
 * # Improve Directive
 *
 * priority-gates: security, grounding, http
 * focus: false_positives
 * edit-style: minimal
 *
 * ## Custom Instructions
 *
 * When fixing security gate false positives, prefer tightening the
 * detection regex over adding new special cases. The security gate
 * should never whitelist known-dangerous patterns.
 * ```
 *
 * @param packageRoot   Directory that relative directive paths are resolved against.
 * @param directivePath Optional path to the directive. An absolute path is used
 *                      as-is; a relative one is joined onto `packageRoot`.
 *                      Defaults to `improve-directive.md` in `packageRoot`.
 * @returns The parsed directive, or a fresh default directive when the file is
 *          missing, unreadable, or blank.
 */
export function loadDirective(packageRoot: string, directivePath?: string): ImproveDirective {
  let path: string;
  if (!directivePath) {
    path = join(packageRoot, 'improve-directive.md');
  } else if (isAbsolute(directivePath)) {
    // join() would bury an absolute path under packageRoot and never find it.
    path = directivePath;
  } else {
    path = join(packageRoot, directivePath);
  }

  if (!existsSync(path)) {
    // The default file is optional, but an explicitly requested one that is
    // missing is almost certainly a typo worth surfacing.
    if (directivePath) console.warn(` Directive not found: ${path} — using default strategy.`);
    return defaultDirective();
  }

  let raw: string;
  try {
    raw = readFileSync(path, 'utf-8');
  } catch {
    if (directivePath) console.warn(` Directive unreadable: ${path} — using default strategy.`);
    return defaultDirective();
  }

  if (!raw.trim()) return defaultDirective();

  return parseDirective(raw);
}

/**
 * Parse directive markdown into structured settings plus the free-form
 * "Custom Instructions" section.
 *
 * Settings are read from every line outside the custom-instructions section;
 * text inside HTML comments is ignored. `raw` keeps the unmodified input.
 */
function parseDirective(raw: string): ImproveDirective {
  const directive: ImproveDirective = { ...defaultDirective(), raw };

  // Commented-out text must stay inactive.
  const lines = raw.replace(HTML_COMMENT, '').split(/\r?\n/);

  // Locate the custom-instructions section: header line up to the next `## ` header.
  const customStart = lines.findIndex(line => CUSTOM_HEADER.test(line));
  let customEnd = lines.length;
  if (customStart >= 0) {
    for (const [idx, line] of lines.entries()) {
      if (idx > customStart && SECTION_HEADER.test(line)) {
        customEnd = idx;
        break;
      }
    }
    directive.customInstructions = lines.slice(customStart + 1, customEnd).join('\n').trim();
  }

  // Extract structured key-value pairs (YAML-like). Free-form instruction text
  // is skipped so a sentence starting with "focus:" cannot change settings.
  for (const [idx, line] of lines.entries()) {
    const insideCustom = customStart >= 0 && idx >= customStart && idx < customEnd;
    if (!insideCustom) applySetting(directive, line);
  }

  return directive;
}

/** Apply one `key: value` line to the directive; unrecognised lines are ignored. */
function applySetting(directive: ImproveDirective, line: string): void {
  const text = line.trim().toLowerCase();

  const gates = GATES_SETTING.exec(text);
  if (gates) {
    directive.priorityGates = (gates[1] ?? '')
      .split(/[,;]/)
      .map(gate => gate.trim())
      .filter(Boolean);
    return;
  }

  const focus = FOCUS_SETTING.exec(text);
  if (focus) {
    // Accept "false positives", "false-positives" and "false_positives" alike.
    const value = (focus[1] ?? '').trim().replace(/[-\s]+/g, '_');
    if (value.includes('false_positive')) directive.focus = 'false_positives';
    else if (value.includes('false_negative')) directive.focus = 'false_negatives';
    else if (value === 'all') directive.focus = 'all';
    return;
  }

  const style = STYLE_SETTING.exec(text);
  if (style) {
    const value = (style[1] ?? '').trim();
    if (value.includes('aggressive')) directive.editStyle = 'aggressive';
    else if (value.includes('moderate')) directive.editStyle = 'moderate';
    else if (value.includes('minimal')) directive.editStyle = 'minimal';
  }
}

// =============================================================================
// PROMPT INJECTION — format directive for LLM consumption
// =============================================================================

/**
 * Format the directive as a prompt block for injection into
 * diagnosis and fix generation prompts.
 *
 * Returns empty string when no directive is loaded, or when the loaded
 * directive carries nothing beyond the defaults (so prompts never get a
 * bare "OPERATOR DIRECTIVE" header with no content under it).
 */
export function formatDirectiveForPrompt(directive: ImproveDirective): string {
  if (!directive.raw) return '';

  const details: string[] = [];

  if (directive.priorityGates.length > 0) {
    details.push(` Priority gates: ${directive.priorityGates.join(', ')}`);
  }

  if (directive.focus !== 'all') {
    // A false positive is a gate firing on something that is fine, matching
    // how the term is used in the tuning advice and the directive example.
    const focusLabel = directive.focus === 'false_positives'
      ? 'Reduce false positives (verify fails when it should pass)'
      : 'Reduce false negatives (verify passes when it should fail)';
    details.push(` Focus: ${focusLabel}`);
  }

  if (directive.editStyle !== 'minimal') {
    const styleNote = directive.editStyle === 'aggressive'
      ? 'larger refactors acceptable'
      : 'moderate changes OK when needed';
    details.push(` Edit style: ${directive.editStyle} (${styleNote})`);
  }

  if (directive.customInstructions) {
    details.push(`\n Custom instructions:\n ${directive.customInstructions.replace(/\n/g, '\n ')}`);
  }

  if (details.length === 0) return '';

  return '\n' + ['OPERATOR DIRECTIVE (follow these priorities):', ...details].join('\n') + '\n';
}
/**
 * Sort bundles based on directive priorities.
 *
 * Bundles whose `triage.targetFile` is the source file of a priority gate
 * (`src/gates/<gate>.ts`) come first; all other bundles are kept and follow
 * in their original relative order. Nothing is dropped.
 *
 * Gate names are matched leniently: `security`, `security.ts` and
 * `src/gates/security.ts` all refer to the same gate, and backslashes in
 * either the gate name or the target file are treated as `/`.
 *
 * @param bundles   Evidence bundles to order; never mutated.
 * @param directive Directive whose `priorityGates` drive the ordering.
 * @returns A new array (also when there are no priority gates).
 */
export function applyDirectiveToBundles<T extends { triage: { targetFile?: string | null } }>(
  bundles: T[],
  directive: ImproveDirective,
): T[] {
  // Always hand back a copy so callers get the same aliasing in both branches.
  if (directive.priorityGates.length === 0) return [...bundles];

  const gateFiles = new Set(
    directive.priorityGates.map(gate => {
      const name = gate
        .trim()
        .toLowerCase()
        .replace(/\\/g, '/')
        .replace(/^(?:\.\/)?src\/gates\//, '')
        .replace(/\.ts$/, '');
      return `src/gates/${name}.ts`;
    }),
  );

  const priority: T[] = [];
  const rest: T[] = [];

  for (const bundle of bundles) {
    const target = bundle.triage.targetFile?.replace(/\\/g, '/').replace(/^\.\//, '');
    if (target && gateFiles.has(target)) {
      priority.push(bundle);
    } else {
      rest.push(bundle);
    }
  }

  return [...priority, ...rest];
}
import { readFileSync, existsSync } from 'fs';
import { join } from 'path';

// =============================================================================
// PROMPT SURFACE DEFINITIONS
// =============================================================================

/** A marker-delimited span of a gate file that the improve loop may tune. */
export interface PromptRegion {
  /** File path relative to package root */
  file: string;
  /** Function that contains the prompt */
  functionName: string;
  /** Description of what this prompt does */
  description: string;
  /** Marker strings that delimit the prompt region */
  startMarker: string;
  endMarker: string;
  /** What kind of tuning is appropriate */
  tuningAdvice: string;
}

/**
 * Known prompt regions within gate files.
 * These are LLM prompts or tunable configuration that the improve
 * loop can optimize when --prompt-surface is enabled.
 */
export const PROMPT_REGIONS: ReadonlyArray<PromptRegion> = [
  {
    file: 'src/gates/vision.ts',
    functionName: 'buildVisionPrompt()',
    description: 'System prompt for vision model screenshot verification',
    startMarker: 'You are verifying a web application screenshot',
    endMarker: '${numbered}`;',
    tuningAdvice: 'Tune claim verification instructions. Be precise about what VERIFIED/NOT VERIFIED means. Do not change the response format (CLAIM N: VERIFIED/NOT VERIFIED).',
  },
  {
    file: 'src/gates/triangulation.ts',
    functionName: 'triangulate()',
    description: 'Triangulation thresholds and weighting logic',
    startMarker: '// Weights for each authority',
    endMarker: '// End weights',
    tuningAdvice: 'Tune authority weights and agreement thresholds. The deterministic authority (grounding) should generally have highest weight. Do not change the 3-authority architecture.',
  },
  {
    file: 'src/gates/hallucination.ts',
    functionName: 'runHallucinationGate()',
    description: 'Hallucination detection heuristics and thresholds',
    startMarker: '// Hallucination detection thresholds',
    endMarker: '// End thresholds',
    tuningAdvice: 'Tune similarity thresholds and confidence levels. Lower thresholds catch more hallucinations but increase false positives.',
  },
];

// =============================================================================
// PROMPT SURFACE INTEGRATION
// =============================================================================

/**
 * Check if a file + function is a known prompt region.
 *
 * @param file         Path relative to the package root; must equal a region's `file`.
 * @param functionName Optional function name, with or without a trailing `()`.
 *                     When given, it must occur inside the region's
 *                     `functionName` (substring match); when omitted or empty,
 *                     the first region for the file is returned.
 * @returns The first matching region, or null.
 */
export function isPromptRegion(file: string, functionName?: string): PromptRegion | null {
  const bareName = functionName?.replace(/\(\)$/, '') ?? '';
  const match = PROMPT_REGIONS.find(
    region => region.file === file && (!functionName || region.functionName.includes(bareName)),
  );
  return match ?? null;
}

/**
 * Extract the actual prompt text from a file for a given region.
 *
 * The region runs from the first line containing `startMarker` through the
 * first line at or after it that contains `endMarker` (on the start line the
 * end marker only counts if it appears after the start marker).
 *
 * @returns The region text with 1-based inclusive `startLine`/`endLine`, or
 *          null when the file is missing or unreadable, or when either marker
 *          cannot be found. A missing end marker yields null rather than
 *          "everything to end of file", so an unbounded span is never
 *          presented as a prompt region.
 */
export function extractPromptRegion(
  packageRoot: string,
  region: PromptRegion,
): { text: string; startLine: number; endLine: number } | null {
  const filePath = join(packageRoot, region.file);
  if (!existsSync(filePath)) return null;

  let content: string;
  try {
    content = readFileSync(filePath, 'utf-8');
  } catch {
    // Unreadable (permissions, directory, removed since the check): no region.
    return null;
  }
  const lines = content.split('\n');

  const startIdx = lines.findIndex(line => line.includes(region.startMarker));
  if (startIdx < 0) return null;

  // A one-line prompt carries both markers on the same line.
  const startText = lines[startIdx] ?? '';
  const afterStart = startText.indexOf(region.startMarker) + region.startMarker.length;
  let endIdx = startText.indexOf(region.endMarker, afterStart) >= 0 ? startIdx : -1;

  if (endIdx < 0) {
    for (const [idx, line] of lines.entries()) {
      if (idx > startIdx && line.includes(region.endMarker)) {
        endIdx = idx;
        break;
      }
    }
  }
  if (endIdx < 0) return null;

  return {
    text: lines.slice(startIdx, endIdx + 1).join('\n'),
    startLine: startIdx + 1, // 1-based
    endLine: endIdx + 1, // 1-based, inclusive
  };
}
/**
 * Build the prompt-surface block that is appended to fix generation prompts.
 *
 * For every known prompt region in `targetFile` the block names the region,
 * describes it, repeats its tuning advice and — when the region can be
 * extracted from disk — quotes its current text with line numbers.
 *
 * @param packageRoot Root directory the region file paths are relative to.
 * @param targetFile  File the fix targets, relative to the package root.
 * @returns The formatted block, or an empty string when the file has no
 *          prompt regions.
 */
export function formatPromptSurfaceContext(
  packageRoot: string,
  targetFile: string,
): string {
  const matching = PROMPT_REGIONS.filter(candidate => candidate.file === targetFile);
  if (matching.length === 0) return '';

  const preamble = [
    '\nPROMPT SURFACE — The following regions contain LLM prompts or tunable thresholds.',
    'When the failure is related to prompt interpretation (vision claims, triangulation weighting),',
    'prefer tuning the prompt/threshold over changing surrounding logic.\n',
  ];

  const body = matching.flatMap(region => {
    const block = [
      ` PROMPT REGION: ${region.functionName} in ${region.file}`,
      ` Description: ${region.description}`,
      ` Tuning advice: ${region.tuningAdvice}`,
    ];

    // The quoted text is optional: markers may be absent from the file.
    const snippet = extractPromptRegion(packageRoot, region);
    if (snippet) {
      block.push(` Lines ${snippet.startLine}-${snippet.endLine}:`);
      block.push(` \`\`\`\n${snippet.text}\n \`\`\``);
    }

    block.push('');
    return block;
  });

  return [...preamble, ...body].join('\n');
}
/**
 * Entry point of the improvement engine.
 *
 * Runs the single-iteration pipeline up to `improveConfig.maxIterations`
 * times (default 1). When more than one iteration is allowed ("continuous
 * mode") the loop keeps going only while an iteration accepted at least one
 * improvement, and a cumulative summary is printed at the end.
 *
 * An unusable `maxIterations` (NaN, zero, negative, infinite) falls back to a
 * single pass with a warning; a fractional value is rounded down. Without
 * this a bad `--max-iterations` value made the loop body run zero times and
 * the command did nothing, silently.
 */
export async function runImproveLoop(
  runConfig: RunConfig,
  improveConfig: ImproveConfig,
): Promise<void> {
  const requested = improveConfig.maxIterations ?? 1;
  const maxIterations = Number.isFinite(requested) && requested >= 1 ? Math.floor(requested) : 1;
  if (maxIterations !== requested) {
    console.warn(` Invalid max iterations (${requested}) — using ${maxIterations}.`);
  }
  const continuous = maxIterations > 1;

  if (continuous) {
    console.log('\n ╔══════════════════════════════════════════════════╗');
    console.log(' ║ Verify Improvement Engine — Continuous Mode ║');
    console.log(' ╚══════════════════════════════════════════════════╝\n');
    console.log(` Max iterations: ${maxIterations} LLM: ${improveConfig.llm}\n`);
  }

  const cumulativeUsage: LLMUsage = { inputTokens: 0, outputTokens: 0, calls: 0 };
  let totalAccepted = 0;

  for (let iteration = 1; iteration <= maxIterations; iteration++) {
    if (continuous) {
      console.log(`\n ═══ Iteration ${iteration}/${maxIterations} ═══════════════════════════════\n`);
    }

    const { entries, usage, hadAccepted } = await runSingleIteration(
      runConfig, improveConfig, iteration,
    );

    cumulativeUsage.inputTokens += usage.inputTokens;
    cumulativeUsage.outputTokens += usage.outputTokens;
    cumulativeUsage.calls += usage.calls;

    const accepted = entries.filter(e => e.verdict === 'accepted').length;
    totalAccepted += accepted;

    // Early termination: no improvements this iteration → stop climbing
    if (!hadAccepted) {
      if (continuous && iteration < maxIterations) {
        console.log(` No improvements in iteration ${iteration} — stopping continuous loop.\n`);
      }
      break;
    }

    // If continuous and we accepted something, the next iteration re-baselines
    // against the improved code (edits were applied by the orchestrator)
    if (continuous && iteration < maxIterations) {
      console.log(` Iteration ${iteration}: ${accepted} accepted — re-baselining for next iteration...\n`);
    }
  }

  // Cumulative summary for continuous mode
  if (continuous) {
    console.log(' ┌──────────────────────────────────────────────────┐');
    console.log(' │ CONTINUOUS MODE SUMMARY │');
    console.log(' └──────────────────────────────────────────────────┘\n');
    console.log(` Total accepted: ${totalAccepted}`);
    console.log(` Total LLM cost: ${cumulativeUsage.calls} calls, ${cumulativeUsage.inputTokens} in / ${cumulativeUsage.outputTokens} out tokens\n`);
  }
}
${improveConfig.maxCandidates} Max lines: ${improveConfig.maxLines}`); + if (improveConfig.promptSurface) console.log(' Prompt surface: ENABLED (LLM prompts in gates are tunable)'); + if (improveConfig.dryRun) console.log(' Mode: DRY RUN (no edits applied)\n'); + else console.log(''); + } // ─── Step 1: Baseline run ───────────────────────────────────────────── console.log(' [1/7] Running baseline self-test...'); @@ -107,15 +190,21 @@ export async function runImproveLoop( const clean = baselineLedger.filter(e => e.clean); console.log(` ${baselineLedger.length} scenarios: ${clean.length} clean, ${dirty.length} dirty\n`); + const usage: LLMUsage = { inputTokens: 0, outputTokens: 0, calls: 0 }; + if (dirty.length === 0) { console.log(' ✓ All scenarios clean — nothing to improve.\n'); saveImprovementLedger(dataDir, []); - return; + return { entries: [], usage, hadAccepted: false }; } // ─── Step 2: Evidence bundling ──────────────────────────────────────── console.log(' [2/7] Bundling violations by root cause...'); - const bundles = bundleViolations(baselineLedger); + let bundles = bundleViolations(baselineLedger); + + // Apply directive priorities to bundle ordering + bundles = applyDirectiveToBundles(bundles, directive); + console.log(` ${bundles.length} evidence bundle(s)\n`); for (const b of bundles) { @@ -144,14 +233,13 @@ export async function runImproveLoop( // ─── Step 4-6: Process each bundle ──────────────────────────────────── const entries: ImprovementEntry[] = []; - const usage: LLMUsage = { inputTokens: 0, outputTokens: 0, calls: 0 }; const attemptedHashes = new Set(priorHashes); // includes cross-run hashes const attempts: AttemptRecord[] = []; for (const bundle of bundles) { const entry = await processBundle( bundle, split, packageRoot, runConfig, improveConfig, callLLM, usage, - attemptedHashes, attempts, + attemptedHashes, attempts, directive, ); entries.push(entry); } @@ -175,6 +263,9 @@ export async function runImproveLoop( // ─── Step 7: Report 
─────────────────────────────────────────────────── saveImprovementLedger(dataDir, entries); printImprovementReport(entries, usage); + + const hadAccepted = entries.some(e => e.verdict === 'accepted'); + return { entries, usage, hadAccepted }; } // ============================================================================= @@ -191,6 +282,7 @@ async function processBundle( usage: LLMUsage, attemptedHashes: Set, attempts: AttemptRecord[], + directive: ImproveDirective = { raw: '', priorityGates: [], focus: 'all', editStyle: 'minimal', customInstructions: '' }, ): Promise { const timestamp = new Date().toISOString(); const bundleId = bundle.id; @@ -231,9 +323,27 @@ async function processBundle( // Enrich diagnosis with prior attempt context (Fix 7) const priorContext = formatPriorAttempts(attempts); - const enrichedDiagnosis = diagnosis - ? diagnosis + priorContext - : priorContext || null; + + // Inject directive and prompt surface context + const directiveContext = formatDirectiveForPrompt(directive); + const promptSurfaceContext = improveConfig.promptSurface && bundle.triage.targetFile + ? formatPromptSurfaceContext(packageRoot, bundle.triage.targetFile) + : ''; + + // Check if this bundle targets a prompt region + const promptRegion = bundle.triage.targetFile + ? isPromptRegion(bundle.triage.targetFile, bundle.triage.targetFunction ?? undefined) + : null; + if (promptRegion && improveConfig.promptSurface) { + console.log(` Prompt surface: ${promptRegion.functionName} — ${promptRegion.description}`); + } + + const enrichedDiagnosis = [ + diagnosis, + priorContext, + directiveContext, + promptSurfaceContext, + ].filter(Boolean).join('\n') || null; const isClaude = improveConfig.llm === 'claude' || improveConfig.llm === 'claude-code'; console.log(` [5/7] Generating fix candidates${isClaude ? 
' (Claude — architectural context)' : ''}...`); diff --git a/scripts/harness/types.ts b/scripts/harness/types.ts index 0ce1d6e..d38b364 100644 --- a/scripts/harness/types.ts +++ b/scripts/harness/types.ts @@ -288,6 +288,12 @@ export interface ImproveConfig { maxCandidates: number; maxLines: number; dryRun: boolean; + /** Continuous mode: re-run after each accepted improvement (default: 1 = single pass) */ + maxIterations?: number; + /** Path to directive file that guides improvement priorities (default: improve-directive.md) */ + directivePath?: string; + /** Enable prompt surface optimization — tune LLM prompts inside gates (default: false) */ + promptSurface?: boolean; } export type LLMCallFn = ( diff --git a/scripts/self-test.ts b/scripts/self-test.ts index 4576904..a7dba63 100644 --- a/scripts/self-test.ts +++ b/scripts/self-test.ts @@ -14,6 +14,16 @@ * bun run packages/verify/scripts/self-test.ts --improve --llm=gemini --api-key=AIza... * bun run packages/verify/scripts/self-test.ts --improve --dry-run --llm=gemini --api-key=AIza... * bun run packages/verify/scripts/self-test.ts --improve --llm=ollama --ollama-model=qwen3:4b + * + * Continuous mode (AutoAgent-inspired hill climbing): + * bun run packages/verify/scripts/self-test.ts --improve --continuous --llm=claude + * bun run packages/verify/scripts/self-test.ts --improve --max-iterations=5 --llm=gemini --api-key=AIza... 
+ * + * Directive-driven improvement: + * bun run packages/verify/scripts/self-test.ts --improve --directive=improve-directive.md --llm=claude + * + * Prompt surface optimization: + * bun run packages/verify/scripts/self-test.ts --improve --prompt-surface --llm=claude */ import { resolve, join } from 'path'; @@ -45,6 +55,9 @@ function parseArgs(args: string[]): ParsedArgs { let maxCandidates = 3; let maxLines = 30; let dryRun = false; + let maxIterations = 1; + let directivePath: string | undefined; + let promptSurface = false; for (const arg of args) { if (arg.startsWith('--appDir=') || arg.startsWith('--app-dir=')) { @@ -76,6 +89,14 @@ function parseArgs(args: string[]): ParsedArgs { maxLines = parseInt(arg.slice('--max-lines='.length), 10); } else if (arg === '--dry-run') { dryRun = true; + } else if (arg.startsWith('--max-iterations=')) { + maxIterations = parseInt(arg.slice('--max-iterations='.length), 10); + } else if (arg === '--continuous') { + maxIterations = 10; // sensible default for continuous mode + } else if (arg.startsWith('--directive=')) { + directivePath = arg.slice('--directive='.length); + } else if (arg === '--prompt-surface') { + promptSurface = true; } else if (arg === '--help' || arg === '-h') { console.log(` Verify Self-Test — Autonomous Bug Discovery + Improvement @@ -101,6 +122,17 @@ function parseArgs(args: string[]): ParsedArgs { --max-lines=N Max changed lines per strategy (default: 30) --dry-run Evidence bundling + triage only, no fixes + Continuous Mode (AutoAgent-inspired): + --continuous Re-run after each accepted improvement (default: 10 iterations) + --max-iterations=N Max improvement iterations (default: 1 = single pass) + + Directive-Driven Improvement: + --directive=PATH Path to improve-directive.md (default: improve-directive.md) + Externalize improvement priorities without changing code + + Prompt Surface Optimization: + --prompt-surface Enable tuning of LLM prompts inside gates (vision, etc.) 
import { describe, expect, test } from 'bun:test';
import {
  loadDirective,
  formatDirectiveForPrompt,
  applyDirectiveToBundles,
} from '../../scripts/harness/improve-directive.js';
import { join } from 'path';
import { writeFileSync, mkdtempSync, rmSync } from 'fs';
import { tmpdir } from 'os';

// Unit tests for directive loading, prompt formatting and bundle ordering.
describe('improve-directive', () => {
  test('returns default when no directive file exists', () => {
    const directive = loadDirective('/nonexistent/path');
    expect(directive.raw).toBe('');
    expect(directive.priorityGates).toEqual([]);
    expect(directive.focus).toBe('all');
    expect(directive.editStyle).toBe('minimal');
    expect(directive.customInstructions).toBe('');
  });

  test('parses structured fields from directive file', () => {
    // mkdtempSync gives a collision-free directory; the finally block removes
    // it even when an expectation below throws.
    const testDir = mkdtempSync(join(tmpdir(), 'verify-directive-test-'));
    try {
      writeFileSync(
        join(testDir, 'improve-directive.md'),
        [
          '# Improve Directive',
          '',
          'priority-gates: security, grounding, http',
          'focus: false_positives',
          'edit-style: aggressive',
          '',
          '## Custom Instructions',
          '',
          'Prefer tightening regexes over adding special cases.',
          '',
        ].join('\n'),
      );

      const directive = loadDirective(testDir);
      expect(directive.priorityGates).toEqual(['security', 'grounding', 'http']);
      expect(directive.focus).toBe('false_positives');
      expect(directive.editStyle).toBe('aggressive');
      expect(directive.customInstructions).toBe('Prefer tightening regexes over adding special cases.');
    } finally {
      rmSync(testDir, { recursive: true, force: true });
    }
  });

  test('formats directive for prompt injection', () => {
    const directive = {
      raw: 'something',
      priorityGates: ['security'],
      focus: 'false_positives' as const,
      editStyle: 'minimal' as const,
      customInstructions: 'Be strict about XSS detection.',
    };

    const result = formatDirectiveForPrompt(directive);
    expect(result).toContain('OPERATOR DIRECTIVE');
    expect(result).toContain('Priority gates: security');
    expect(result).toContain('Reduce false positives');
    expect(result).toContain('Be strict about XSS detection');
  });

  test('returns empty string when no directive loaded', () => {
    const directive = {
      raw: '',
      priorityGates: [],
      focus: 'all' as const,
      editStyle: 'minimal' as const,
      customInstructions: '',
    };

    expect(formatDirectiveForPrompt(directive)).toBe('');
  });

  test('applyDirectiveToBundles sorts priority gates first', () => {
    const bundles = [
      { id: 'a', triage: { targetFile: 'src/gates/grounding.ts' } },
      { id: 'b', triage: { targetFile: 'src/gates/security.ts' } },
      { id: 'c', triage: { targetFile: 'src/gates/http.ts' } },
    ];

    const directive = {
      raw: 'x',
      priorityGates: ['security'],
      focus: 'all' as const,
      editStyle: 'minimal' as const,
      customInstructions: '',
    };

    const sorted = applyDirectiveToBundles(bundles, directive);
    // security first, the rest in their original order
    expect(sorted.map(bundle => bundle.id)).toEqual(['b', 'a', 'c']);
  });
});