From 4eaf8132bbd1f5fd5992d9cd40e14df024062ebd Mon Sep 17 00:00:00 2001 From: Claude Date: Sat, 4 Apr 2026 22:50:16 +0000 Subject: [PATCH 1/2] improve: add continuous mode, directive-driven priorities, and prompt surface optimization MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Three AutoAgent-inspired concepts integrated into the evidence-centric improve loop: 1. Continuous mode (--continuous / --max-iterations=N): Re-baselines after each accepted improvement and iterates, compounding small wins. Stops when an iteration produces no accepted candidates. 2. Directive-driven improvement (improve-directive.md): Externalizes improvement strategy into a human-editable Markdown file. Operators can specify priority gates, focus mode (false positives vs negatives), edit style preferences, and custom instructions — all injected into LLM diagnosis/fix prompts. 3. Prompt surface optimization (--prompt-surface): Declares tunable prompt/threshold regions within gates (vision.ts prompt, triangulation weights, hallucination thresholds) and gives the fix generator context about which regions are prompts vs logic, so it prefers prompt edits for prompt-related failures. getPromptSurfaceFiles() is exported for extending the bounded surface, but this patch does not yet call it from improve.ts. 
https://claude.ai/code/session_01SJkfKmU2V83UrCvgyH2JAD --- improve-directive.md | 33 ++++ scripts/harness/improve-directive.ts | 211 ++++++++++++++++++++++ scripts/harness/improve-prompt-surface.ts | 160 ++++++++++++++++ scripts/harness/improve.ts | 136 ++++++++++++-- scripts/harness/types.ts | 6 + scripts/self-test.ts | 34 +++- tests/unit/improve-directive.test.ts | 93 ++++++++++ 7 files changed, 659 insertions(+), 14 deletions(-) create mode 100644 improve-directive.md create mode 100644 scripts/harness/improve-directive.ts create mode 100644 scripts/harness/improve-prompt-surface.ts create mode 100644 tests/unit/improve-directive.test.ts diff --git a/improve-directive.md b/improve-directive.md new file mode 100644 index 0000000..56f31bb --- /dev/null +++ b/improve-directive.md @@ -0,0 +1,33 @@ +# Improve Directive + + + + + + + + + +## Custom Instructions + + diff --git a/scripts/harness/improve-directive.ts b/scripts/harness/improve-directive.ts new file mode 100644 index 0000000..66f81c4 --- /dev/null +++ b/scripts/harness/improve-directive.ts @@ -0,0 +1,211 @@ +/** + * Directive-Driven Improvement — `improve-directive.md` + * ===================================================== + * + * Inspired by AutoAgent's `program.md` pattern: instead of hardcoding + * improvement strategy in TypeScript, operators write a Markdown directive + * that the LLM diagnosis + fix prompts consume. + * + * This lets users say "prioritize security gate accuracy" or "focus on + * reducing false positives" without changing source code. + * + * The directive is optional. When absent, the improve loop runs with its + * default strategy (fix all dirty scenarios, prefer minimal edits). 
+ */ + +import { readFileSync, existsSync } from 'fs'; +import { join } from 'path'; + +// ============================================================================= +// DIRECTIVE LOADING +// ============================================================================= + +export interface ImproveDirective { + /** Raw markdown content */ + raw: string; + /** Extracted priority gates (if any) */ + priorityGates: string[]; + /** Extracted focus mode: 'false_positives' | 'false_negatives' | 'all' */ + focus: 'false_positives' | 'false_negatives' | 'all'; + /** Max edit complexity preference: 'minimal' | 'moderate' | 'aggressive' */ + editStyle: 'minimal' | 'moderate' | 'aggressive'; + /** Custom instructions injected verbatim into LLM prompts */ + customInstructions: string; +} + +const DEFAULT_DIRECTIVE: ImproveDirective = { + raw: '', + priorityGates: [], + focus: 'all', + editStyle: 'minimal', + customInstructions: '', +}; + +/** + * Load and parse the improve directive from `improve-directive.md`. + * + * The directive supports structured YAML-like headers and free-form + * instructions: + * + * ```markdown + * # Improve Directive + * + * priority-gates: security, grounding, http + * focus: false_positives + * edit-style: minimal + * + * ## Custom Instructions + * + * When fixing security gate false positives, prefer tightening the + * detection regex over adding new special cases. The security gate + * should never whitelist known-dangerous patterns. + * ``` + */ +export function loadDirective(packageRoot: string, directivePath?: string): ImproveDirective { + const path = directivePath + ? 
join(packageRoot, directivePath) + : join(packageRoot, 'improve-directive.md'); + + if (!existsSync(path)) return DEFAULT_DIRECTIVE; + + let raw: string; + try { + raw = readFileSync(path, 'utf-8'); + } catch { + return DEFAULT_DIRECTIVE; + } + + if (!raw.trim()) return DEFAULT_DIRECTIVE; + + return parseDirective(raw); +} + +function parseDirective(raw: string): ImproveDirective { + const directive: ImproveDirective = { ...DEFAULT_DIRECTIVE, raw }; + + const lines = raw.split('\n'); + + // Extract structured key-value pairs (YAML-like) + for (const line of lines) { + const trimmed = line.trim().toLowerCase(); + + const gatesMatch = trimmed.match(/^priority[- ]gates?:\s*(.+)/); + if (gatesMatch) { + directive.priorityGates = gatesMatch[1] + .split(/[,;]/) + .map(g => g.trim()) + .filter(Boolean); + } + + const focusMatch = trimmed.match(/^focus:\s*(.+)/); + if (focusMatch) { + const val = focusMatch[1].trim(); + if (val.includes('false_positive') || val.includes('false positive')) { + directive.focus = 'false_positives'; + } else if (val.includes('false_negative') || val.includes('false negative')) { + directive.focus = 'false_negatives'; + } + } + + const styleMatch = trimmed.match(/^edit[- ]style:\s*(.+)/); + if (styleMatch) { + const val = styleMatch[1].trim(); + if (val.includes('aggressive')) directive.editStyle = 'aggressive'; + else if (val.includes('moderate')) directive.editStyle = 'moderate'; + // 'minimal' is default + } + } + + // Extract ## Custom Instructions section + const customIdx = raw.search(/^##\s*Custom\s*Instructions/mi); + if (customIdx >= 0) { + const afterHeader = raw.substring(customIdx); + const lines = afterHeader.split('\n').slice(1); // skip the header line + + // Collect until next ## header or end of file + const instructionLines: string[] = []; + for (const line of lines) { + if (/^##\s/.test(line)) break; + instructionLines.push(line); + } + directive.customInstructions = instructionLines.join('\n').trim(); + } + + return 
directive; +} + +// ============================================================================= +// PROMPT INJECTION — format directive for LLM consumption +// ============================================================================= + +/** + * Format the directive as a prompt block for injection into + * diagnosis and fix generation prompts. + * + * Returns empty string when no directive is loaded. + */ +export function formatDirectiveForPrompt(directive: ImproveDirective): string { + if (!directive.raw) return ''; + + const sections: string[] = []; + + sections.push('OPERATOR DIRECTIVE (follow these priorities):'); + + if (directive.priorityGates.length > 0) { + sections.push(` Priority gates: ${directive.priorityGates.join(', ')}`); + } + + if (directive.focus !== 'all') { + const focusLabel = directive.focus === 'false_positives' + ? 'Reduce false positives (verify passes when it should fail)' + : 'Reduce false negatives (verify fails when it should pass)'; + sections.push(` Focus: ${focusLabel}`); + } + + if (directive.editStyle !== 'minimal') { + sections.push(` Edit style: ${directive.editStyle} (${ + directive.editStyle === 'aggressive' + ? 'larger refactors acceptable' + : 'moderate changes OK when needed' + })`); + } + + if (directive.customInstructions) { + sections.push(`\n Custom instructions:\n ${directive.customInstructions.replace(/\n/g, '\n ')}`); + } + + return '\n' + sections.join('\n') + '\n'; +} + +// ============================================================================= +// BUNDLE FILTERING — apply directive to bundle prioritization +// ============================================================================= + +/** + * Sort and filter bundles based on directive priorities. + * Priority gates are processed first. Non-priority bundles are kept + * but sorted after priority bundles. 
+ */ +export function applyDirectiveToBundles( + bundles: T[], + directive: ImproveDirective, +): T[] { + if (directive.priorityGates.length === 0) return bundles; + + const gateFiles = new Set( + directive.priorityGates.map(g => `src/gates/${g.toLowerCase()}.ts`) + ); + + const priority: T[] = []; + const rest: T[] = []; + + for (const b of bundles) { + if (b.triage.targetFile && gateFiles.has(b.triage.targetFile)) { + priority.push(b); + } else { + rest.push(b); + } + } + + return [...priority, ...rest]; +} diff --git a/scripts/harness/improve-prompt-surface.ts b/scripts/harness/improve-prompt-surface.ts new file mode 100644 index 0000000..d2fcb8e --- /dev/null +++ b/scripts/harness/improve-prompt-surface.ts @@ -0,0 +1,160 @@ +/** + * Prompt Surface — Tunable LLM Prompts Within Gates + * ================================================== + * + * Inspired by AutoAgent's concept of "editable surfaces" that include + * system prompts and configuration, not just logic. + * + * Some verify gates contain LLM prompts (vision.ts) or tunable thresholds + * (triangulation.ts). These are valid optimization targets for the improve + * loop — the prompt wording directly affects gate accuracy. + * + * This module: + * 1. Defines which prompt regions in which files are tunable + * 2. Extends the bounded surface to include these regions + * 3. Provides context to the LLM fix generator about what's a prompt + * vs. 
what's logic (so it prefers prompt edits for prompt-related failures) + */ + +import { readFileSync, existsSync } from 'fs'; +import { join } from 'path'; + +// ============================================================================= +// PROMPT SURFACE DEFINITIONS +// ============================================================================= + +export interface PromptRegion { + /** File path relative to package root */ + file: string; + /** Function that contains the prompt */ + functionName: string; + /** Description of what this prompt does */ + description: string; + /** Marker strings that delimit the prompt region */ + startMarker: string; + endMarker: string; + /** What kind of tuning is appropriate */ + tuningAdvice: string; +} + +/** + * Known prompt regions within gate files. + * These are LLM prompts or tunable configuration that the improve + * loop can optimize when --prompt-surface is enabled. + */ +export const PROMPT_REGIONS: ReadonlyArray = [ + { + file: 'src/gates/vision.ts', + functionName: 'buildVisionPrompt()', + description: 'System prompt for vision model screenshot verification', + startMarker: 'You are verifying a web application screenshot', + endMarker: '${numbered}`;', + tuningAdvice: 'Tune claim verification instructions. Be precise about what VERIFIED/NOT VERIFIED means. Do not change the response format (CLAIM N: VERIFIED/NOT VERIFIED).', + }, + { + file: 'src/gates/triangulation.ts', + functionName: 'triangulate()', + description: 'Triangulation thresholds and weighting logic', + startMarker: '// Weights for each authority', + endMarker: '// End weights', + tuningAdvice: 'Tune authority weights and agreement thresholds. The deterministic authority (grounding) should generally have highest weight. 
Do not change the 3-authority architecture.', + }, + { + file: 'src/gates/hallucination.ts', + functionName: 'runHallucinationGate()', + description: 'Hallucination detection heuristics and thresholds', + startMarker: '// Hallucination detection thresholds', + endMarker: '// End thresholds', + tuningAdvice: 'Tune similarity thresholds and confidence levels. Lower thresholds catch more hallucinations but increase false positives.', + }, +]; + +// ============================================================================= +// PROMPT SURFACE INTEGRATION +// ============================================================================= + +/** + * Check if a file + function is a known prompt region. + */ +export function isPromptRegion(file: string, functionName?: string): PromptRegion | null { + for (const region of PROMPT_REGIONS) { + if (region.file === file) { + if (!functionName || region.functionName.includes(functionName.replace(/\(\)$/, ''))) { + return region; + } + } + } + return null; +} + +/** + * Extract the actual prompt text from a file for a given region. + * Returns the lines between startMarker and endMarker, or null if not found. + */ +export function extractPromptRegion( + packageRoot: string, + region: PromptRegion, +): { text: string; startLine: number; endLine: number } | null { + const filePath = join(packageRoot, region.file); + if (!existsSync(filePath)) return null; + + const content = readFileSync(filePath, 'utf-8'); + const lines = content.split('\n'); + + const startIdx = lines.findIndex(l => l.includes(region.startMarker)); + if (startIdx < 0) return null; + + let endIdx = lines.length; + for (let i = startIdx + 1; i < lines.length; i++) { + if (lines[i].includes(region.endMarker)) { + endIdx = i + 1; + break; + } + } + + return { + text: lines.slice(startIdx, endIdx).join('\n'), + startLine: startIdx + 1, // 1-based + endLine: endIdx, + }; +} + +/** + * Format prompt surface context for injection into fix generation prompts. 
+ * Tells the LLM which regions are prompts and how to tune them. + */ +export function formatPromptSurfaceContext( + packageRoot: string, + targetFile: string, +): string { + const regions = PROMPT_REGIONS.filter(r => r.file === targetFile); + if (regions.length === 0) return ''; + + const sections: string[] = [ + '\nPROMPT SURFACE — The following regions contain LLM prompts or tunable thresholds.', + 'When the failure is related to prompt interpretation (vision claims, triangulation weighting),', + 'prefer tuning the prompt/threshold over changing surrounding logic.\n', + ]; + + for (const region of regions) { + const extracted = extractPromptRegion(packageRoot, region); + sections.push(` PROMPT REGION: ${region.functionName} in ${region.file}`); + sections.push(` Description: ${region.description}`); + sections.push(` Tuning advice: ${region.tuningAdvice}`); + if (extracted) { + sections.push(` Lines ${extracted.startLine}-${extracted.endLine}:`); + sections.push(` \`\`\`\n${extracted.text}\n \`\`\``); + } + sections.push(''); + } + + return sections.join('\n'); +} + +/** + * Get the list of files that have prompt surfaces. + * Used to extend the bounded surface when --prompt-surface is enabled. 
+ */ +export function getPromptSurfaceFiles(): string[] { + return [...new Set(PROMPT_REGIONS.map(r => r.file))]; +} diff --git a/scripts/harness/improve.ts b/scripts/harness/improve.ts index 39f5f37..781ec18 100644 --- a/scripts/harness/improve.ts +++ b/scripts/harness/improve.ts @@ -22,6 +22,8 @@ import { splitScenarios, validateCandidate, runHoldout } from './improve-subproc import { createLLMProvider } from './llm-providers.js'; import { printImprovementReport } from './improve-report.js'; import { hashEdits } from './improve-utils.js'; +import { loadDirective, formatDirectiveForPrompt, applyDirectiveToBundles, type ImproveDirective } from './improve-directive.js'; +import { formatPromptSurfaceContext, isPromptRegion } from './improve-prompt-surface.js'; // ============================================================================= // CROSS-RUN MEMORY — improve-history.json @@ -87,18 +89,99 @@ export async function runImproveLoop( runConfig: RunConfig, improveConfig: ImproveConfig, ): Promise { + const maxIterations = improveConfig.maxIterations ?? 
1; + const continuous = maxIterations > 1; + + if (continuous) { + console.log('\n ╔══════════════════════════════════════════════════╗'); + console.log(' ║ Verify Improvement Engine — Continuous Mode ║'); + console.log(' ╚══════════════════════════════════════════════════╝\n'); + console.log(` Max iterations: ${maxIterations} LLM: ${improveConfig.llm}\n`); + } + + const cumulativeUsage: LLMUsage = { inputTokens: 0, outputTokens: 0, calls: 0 }; + const allEntries: ImprovementEntry[] = []; + let totalAccepted = 0; + + for (let iteration = 1; iteration <= maxIterations; iteration++) { + if (continuous) { + console.log(`\n ═══ Iteration ${iteration}/${maxIterations} ═══════════════════════════════\n`); + } + + const { entries, usage, hadAccepted } = await runSingleIteration( + runConfig, improveConfig, iteration, + ); + + allEntries.push(...entries); + cumulativeUsage.inputTokens += usage.inputTokens; + cumulativeUsage.outputTokens += usage.outputTokens; + cumulativeUsage.calls += usage.calls; + + const accepted = entries.filter(e => e.verdict === 'accepted').length; + totalAccepted += accepted; + + // Early termination: no improvements this iteration → stop climbing + if (!hadAccepted) { + if (continuous && iteration < maxIterations) { + console.log(` No improvements in iteration ${iteration} — stopping continuous loop.\n`); + } + break; + } + + // If continuous and we accepted something, the next iteration re-baselines + // against the improved code (edits were applied by the orchestrator) + if (continuous && iteration < maxIterations) { + console.log(` Iteration ${iteration}: ${accepted} accepted — re-baselining for next iteration...\n`); + } + } + + // Cumulative summary for continuous mode + if (continuous) { + console.log(' ┌──────────────────────────────────────────────────┐'); + console.log(' │ CONTINUOUS MODE SUMMARY │'); + console.log(' └──────────────────────────────────────────────────┘\n'); + console.log(` Total accepted: ${totalAccepted}`); + 
console.log(` Total LLM cost: ${cumulativeUsage.calls} calls, ${cumulativeUsage.inputTokens} in / ${cumulativeUsage.outputTokens} out tokens\n`); + } +} + +/** + * Run a single iteration of the improvement pipeline. + * Extracted from the original runImproveLoop to support continuous mode. + */ +async function runSingleIteration( + runConfig: RunConfig, + improveConfig: ImproveConfig, + iteration: number, +): Promise<{ entries: ImprovementEntry[]; usage: LLMUsage; hadAccepted: boolean }> { const packageRoot = resolve(import.meta.dir, '../..'); const dataDir = join(packageRoot, 'data'); mkdirSync(dataDir, { recursive: true }); const callLLM = createLLMProvider(improveConfig); - console.log('\n ╔══════════════════════════════════════════════════╗'); - console.log(' ║ Verify Improvement Engine — Evidence-Centric ║'); - console.log(' ╚══════════════════════════════════════════════════╝\n'); - console.log(` LLM: ${improveConfig.llm} Candidates: ${improveConfig.maxCandidates} Max lines: ${improveConfig.maxLines}`); - if (improveConfig.dryRun) console.log(' Mode: DRY RUN (no edits applied)\n'); - else console.log(''); + // ─── Load directive ────────────────────────────────────────────────── + const directive = loadDirective(packageRoot, improveConfig.directivePath); + if (directive.raw && iteration === 1) { + console.log(` Directive loaded: focus=${directive.focus}, style=${directive.editStyle}`); + if (directive.priorityGates.length > 0) { + console.log(` Priority gates: ${directive.priorityGates.join(', ')}`); + } + if (directive.customInstructions) { + console.log(` Custom instructions: ${directive.customInstructions.substring(0, 80)}...`); + } + console.log(''); + } + + if (iteration === 1) { + console.log('\n ╔══════════════════════════════════════════════════╗'); + console.log(' ║ Verify Improvement Engine — Evidence-Centric ║'); + console.log(' ╚══════════════════════════════════════════════════╝\n'); + console.log(` LLM: ${improveConfig.llm} Candidates: 
${improveConfig.maxCandidates} Max lines: ${improveConfig.maxLines}`); + if (improveConfig.promptSurface) console.log(' Prompt surface: ENABLED (LLM prompts in gates are tunable)'); + if (improveConfig.dryRun) console.log(' Mode: DRY RUN (no edits applied)\n'); + else console.log(''); + } // ─── Step 1: Baseline run ───────────────────────────────────────────── console.log(' [1/7] Running baseline self-test...'); @@ -107,15 +190,21 @@ export async function runImproveLoop( const clean = baselineLedger.filter(e => e.clean); console.log(` ${baselineLedger.length} scenarios: ${clean.length} clean, ${dirty.length} dirty\n`); + const usage: LLMUsage = { inputTokens: 0, outputTokens: 0, calls: 0 }; + if (dirty.length === 0) { console.log(' ✓ All scenarios clean — nothing to improve.\n'); saveImprovementLedger(dataDir, []); - return; + return { entries: [], usage, hadAccepted: false }; } // ─── Step 2: Evidence bundling ──────────────────────────────────────── console.log(' [2/7] Bundling violations by root cause...'); - const bundles = bundleViolations(baselineLedger); + let bundles = bundleViolations(baselineLedger); + + // Apply directive priorities to bundle ordering + bundles = applyDirectiveToBundles(bundles, directive); + console.log(` ${bundles.length} evidence bundle(s)\n`); for (const b of bundles) { @@ -144,14 +233,13 @@ export async function runImproveLoop( // ─── Step 4-6: Process each bundle ──────────────────────────────────── const entries: ImprovementEntry[] = []; - const usage: LLMUsage = { inputTokens: 0, outputTokens: 0, calls: 0 }; const attemptedHashes = new Set(priorHashes); // includes cross-run hashes const attempts: AttemptRecord[] = []; for (const bundle of bundles) { const entry = await processBundle( bundle, split, packageRoot, runConfig, improveConfig, callLLM, usage, - attemptedHashes, attempts, + attemptedHashes, attempts, directive, ); entries.push(entry); } @@ -175,6 +263,9 @@ export async function runImproveLoop( // ─── Step 7: Report 
─────────────────────────────────────────────────── saveImprovementLedger(dataDir, entries); printImprovementReport(entries, usage); + + const hadAccepted = entries.some(e => e.verdict === 'accepted'); + return { entries, usage, hadAccepted }; } // ============================================================================= @@ -191,6 +282,7 @@ async function processBundle( usage: LLMUsage, attemptedHashes: Set, attempts: AttemptRecord[], + directive: ImproveDirective = { raw: '', priorityGates: [], focus: 'all', editStyle: 'minimal', customInstructions: '' }, ): Promise { const timestamp = new Date().toISOString(); const bundleId = bundle.id; @@ -231,9 +323,27 @@ async function processBundle( // Enrich diagnosis with prior attempt context (Fix 7) const priorContext = formatPriorAttempts(attempts); - const enrichedDiagnosis = diagnosis - ? diagnosis + priorContext - : priorContext || null; + + // Inject directive and prompt surface context + const directiveContext = formatDirectiveForPrompt(directive); + const promptSurfaceContext = improveConfig.promptSurface && bundle.triage.targetFile + ? formatPromptSurfaceContext(packageRoot, bundle.triage.targetFile) + : ''; + + // Check if this bundle targets a prompt region + const promptRegion = bundle.triage.targetFile + ? isPromptRegion(bundle.triage.targetFile, bundle.triage.targetFunction ?? undefined) + : null; + if (promptRegion && improveConfig.promptSurface) { + console.log(` Prompt surface: ${promptRegion.functionName} — ${promptRegion.description}`); + } + + const enrichedDiagnosis = [ + diagnosis, + priorContext, + directiveContext, + promptSurfaceContext, + ].filter(Boolean).join('\n') || null; const isClaude = improveConfig.llm === 'claude' || improveConfig.llm === 'claude-code'; console.log(` [5/7] Generating fix candidates${isClaude ? 
' (Claude — architectural context)' : ''}...`); diff --git a/scripts/harness/types.ts b/scripts/harness/types.ts index 0ce1d6e..d38b364 100644 --- a/scripts/harness/types.ts +++ b/scripts/harness/types.ts @@ -288,6 +288,12 @@ export interface ImproveConfig { maxCandidates: number; maxLines: number; dryRun: boolean; + /** Continuous mode: re-run after each accepted improvement (default: 1 = single pass) */ + maxIterations?: number; + /** Path to directive file that guides improvement priorities (default: improve-directive.md) */ + directivePath?: string; + /** Enable prompt surface optimization — tune LLM prompts inside gates (default: false) */ + promptSurface?: boolean; } export type LLMCallFn = ( diff --git a/scripts/self-test.ts b/scripts/self-test.ts index 4576904..a7dba63 100644 --- a/scripts/self-test.ts +++ b/scripts/self-test.ts @@ -14,6 +14,16 @@ * bun run packages/verify/scripts/self-test.ts --improve --llm=gemini --api-key=AIza... * bun run packages/verify/scripts/self-test.ts --improve --dry-run --llm=gemini --api-key=AIza... * bun run packages/verify/scripts/self-test.ts --improve --llm=ollama --ollama-model=qwen3:4b + * + * Continuous mode (AutoAgent-inspired hill climbing): + * bun run packages/verify/scripts/self-test.ts --improve --continuous --llm=claude + * bun run packages/verify/scripts/self-test.ts --improve --max-iterations=5 --llm=gemini --api-key=AIza... 
+ * + * Directive-driven improvement: + * bun run packages/verify/scripts/self-test.ts --improve --directive=improve-directive.md --llm=claude + * + * Prompt surface optimization: + * bun run packages/verify/scripts/self-test.ts --improve --prompt-surface --llm=claude */ import { resolve, join } from 'path'; @@ -45,6 +55,9 @@ function parseArgs(args: string[]): ParsedArgs { let maxCandidates = 3; let maxLines = 30; let dryRun = false; + let maxIterations = 1; + let directivePath: string | undefined; + let promptSurface = false; for (const arg of args) { if (arg.startsWith('--appDir=') || arg.startsWith('--app-dir=')) { @@ -76,6 +89,14 @@ function parseArgs(args: string[]): ParsedArgs { maxLines = parseInt(arg.slice('--max-lines='.length), 10); } else if (arg === '--dry-run') { dryRun = true; + } else if (arg.startsWith('--max-iterations=')) { + maxIterations = parseInt(arg.slice('--max-iterations='.length), 10); + } else if (arg === '--continuous') { + maxIterations = 10; // sensible default for continuous mode + } else if (arg.startsWith('--directive=')) { + directivePath = arg.slice('--directive='.length); + } else if (arg === '--prompt-surface') { + promptSurface = true; } else if (arg === '--help' || arg === '-h') { console.log(` Verify Self-Test — Autonomous Bug Discovery + Improvement @@ -101,6 +122,17 @@ function parseArgs(args: string[]): ParsedArgs { --max-lines=N Max changed lines per strategy (default: 30) --dry-run Evidence bundling + triage only, no fixes + Continuous Mode (AutoAgent-inspired): + --continuous Re-run after each accepted improvement (default: 10 iterations) + --max-iterations=N Max improvement iterations (default: 1 = single pass) + + Directive-Driven Improvement: + --directive=PATH Path to improve-directive.md (default: improve-directive.md) + Externalize improvement priorities without changing code + + Prompt Surface Optimization: + --prompt-surface Enable tuning of LLM prompts inside gates (vision, etc.) 
+ Families: A Fingerprint collision detection B K5 constraint learning @@ -123,7 +155,7 @@ function parseArgs(args: string[]): ParsedArgs { const runConfig: RunConfig = { appDir, families, dockerEnabled, failOnBug, ledgerPath, scenarioIds }; const improveConfig: ImproveConfig | null = improve - ? { llm, apiKey, ollamaModel, ollamaHost, maxCandidates, maxLines, dryRun } + ? { llm, apiKey, ollamaModel, ollamaHost, maxCandidates, maxLines, dryRun, maxIterations, directivePath, promptSurface } : null; return { runConfig, improveConfig }; diff --git a/tests/unit/improve-directive.test.ts b/tests/unit/improve-directive.test.ts new file mode 100644 index 0000000..e9276b7 --- /dev/null +++ b/tests/unit/improve-directive.test.ts @@ -0,0 +1,93 @@ +import { describe, expect, test } from 'bun:test'; +import { + loadDirective, + formatDirectiveForPrompt, + applyDirectiveToBundles, +} from '../../scripts/harness/improve-directive.js'; +import { join } from 'path'; +import { writeFileSync, mkdirSync, rmSync } from 'fs'; +import { tmpdir } from 'os'; + +describe('improve-directive', () => { + const testDir = join(tmpdir(), `verify-directive-test-${Date.now()}`); + + test('returns default when no directive file exists', () => { + const directive = loadDirective('/nonexistent/path'); + expect(directive.raw).toBe(''); + expect(directive.priorityGates).toEqual([]); + expect(directive.focus).toBe('all'); + expect(directive.editStyle).toBe('minimal'); + expect(directive.customInstructions).toBe(''); + }); + + test('parses structured fields from directive file', () => { + mkdirSync(testDir, { recursive: true }); + writeFileSync(join(testDir, 'improve-directive.md'), `# Improve Directive + +priority-gates: security, grounding, http +focus: false_positives +edit-style: aggressive + +## Custom Instructions + +Prefer tightening regexes over adding special cases. 
+`); + + const directive = loadDirective(testDir); + expect(directive.priorityGates).toEqual(['security', 'grounding', 'http']); + expect(directive.focus).toBe('false_positives'); + expect(directive.editStyle).toBe('aggressive'); + expect(directive.customInstructions).toBe('Prefer tightening regexes over adding special cases.'); + + rmSync(testDir, { recursive: true, force: true }); + }); + + test('formats directive for prompt injection', () => { + const directive = { + raw: 'something', + priorityGates: ['security'], + focus: 'false_positives' as const, + editStyle: 'minimal' as const, + customInstructions: 'Be strict about XSS detection.', + }; + + const result = formatDirectiveForPrompt(directive); + expect(result).toContain('OPERATOR DIRECTIVE'); + expect(result).toContain('Priority gates: security'); + expect(result).toContain('Reduce false positives'); + expect(result).toContain('Be strict about XSS detection'); + }); + + test('returns empty string when no directive loaded', () => { + const directive = { + raw: '', + priorityGates: [], + focus: 'all' as const, + editStyle: 'minimal' as const, + customInstructions: '', + }; + + expect(formatDirectiveForPrompt(directive)).toBe(''); + }); + + test('applyDirectiveToBundles sorts priority gates first', () => { + const bundles = [ + { id: 'a', triage: { targetFile: 'src/gates/grounding.ts' } }, + { id: 'b', triage: { targetFile: 'src/gates/security.ts' } }, + { id: 'c', triage: { targetFile: 'src/gates/http.ts' } }, + ]; + + const directive = { + raw: 'x', + priorityGates: ['security'], + focus: 'all' as const, + editStyle: 'minimal' as const, + customInstructions: '', + }; + + const sorted = applyDirectiveToBundles(bundles, directive); + expect(sorted[0].id).toBe('b'); // security first + expect(sorted[1].id).toBe('a'); + expect(sorted[2].id).toBe('c'); + }); +}); From 39879e8b7be7252d59510239efa1eeca7d79156f Mon Sep 17 00:00:00 2001 From: Claude Date: Sat, 4 Apr 2026 22:51:01 +0000 Subject: [PATCH 2/2] gitignore: 
add fixtures/demo-app/.verify/ state directory https://claude.ai/code/session_01SJkfKmU2V83UrCvgyH2JAD --- .gitignore | 3 +++ 1 file changed, 3 insertions(+) diff --git a/.gitignore b/.gitignore index 60bb874..6bdbc50 100644 --- a/.gitignore +++ b/.gitignore @@ -12,6 +12,9 @@ data/subprocess-ledger.jsonl .verify-tmp/ fixtures/demo-app/.verify-tmp/ +# Verify state directories (created by test runs) +fixtures/demo-app/.verify/ + # Real-world harvest cache (fetched data, 24h TTL, never committed) .verify-cache/