diff --git a/autobot-backend/services/autoresearch/__init__.py b/autobot-backend/services/autoresearch/__init__.py
index deb5d3db4..6d2d3ca14 100644
--- a/autobot-backend/services/autoresearch/__init__.py
+++ b/autobot-backend/services/autoresearch/__init__.py
@@ -11,6 +11,22 @@
 Issue #2599: AutoBot-orchestrated loop + web search (M2).
 """
 
+from .knowledge_synthesizer import ExperimentInsight, KnowledgeSynthesizer
+from .prompt_optimizer import (
+    BenchmarkFn,
+    OptimizationSession,
+    OptimizationStatus,
+    PromptOptimizer,
+    PromptOptTarget,
+    PromptVariant,
+)
+from .scorers import (
+    HumanReviewScorer,
+    LLMJudgeScorer,
+    PromptScorer,
+    ScorerResult,
+    ValBpbScorer,
+)
 from .auto_research_agent import (
     ApprovalGate,
     AutoResearchAgent,
@@ -66,6 +82,20 @@
     "ResearchHypothesis",
     "SearchResult",
     "SessionStatus",
+    # M3: Self-improvement (Issue #2600)
+    "PromptOptimizer",
+    "PromptOptTarget",
+    "PromptVariant",
+    "OptimizationSession",
+    "OptimizationStatus",
+    "BenchmarkFn",
+    "PromptScorer",
+    "ScorerResult",
+    "ValBpbScorer",
+    "LLMJudgeScorer",
+    "HumanReviewScorer",
+    "KnowledgeSynthesizer",
+    "ExperimentInsight",
     # Routes
     "router",
     # OSINT Engine (Issue #1949)
diff --git a/autobot-backend/services/autoresearch/auto_research_agent.py b/autobot-backend/services/autoresearch/auto_research_agent.py
index 0045f968b..12b8ce5b7 100644
--- a/autobot-backend/services/autoresearch/auto_research_agent.py
+++ b/autobot-backend/services/autoresearch/auto_research_agent.py
@@ -809,6 +809,33 @@ async def _handle_approval_gate(
             metrics.improvement_pct,
             status_key,
         )
+
+        # Dispatch notification for approval_needed event
+        try:
+            from services.notification_service import (
+                NotificationEvent,
+                NotificationService,
+            )
+
+            notification_service = NotificationService()
+            await notification_service.send(
+                event=NotificationEvent.APPROVAL_NEEDED,
+                workflow_id=f"autoresearch:{session.id}",
+                payload={
+                    "experiment_id": experiment.id,
+                    "topic": session.topic,
+                    "improvement_pct": metrics.improvement_pct,
+                    "val_bpb": metrics.result_val_bpb,
+                    "baseline_val_bpb": metrics.baseline_val_bpb,
+                },
+                config=self._get_notification_config(session.id),
+            )
+        except Exception:
+            logger.exception(
+                "Failed to send approval notification for experiment %s",
+                experiment.id,
+            )
+
         # Non-blocking poll with short timeout so we don't freeze the loop
         decision = await self.approval_gate.wait_for_approval(
             session_id=session.id,
@@ -822,6 +849,22 @@
             decision,
         )
 
+    def _get_notification_config(self, session_id: str):
+        """Return a default notification config for autoresearch approval events.
+
+        Args:
+            session_id: The autoresearch session ID used as the workflow ID.
+
+        Returns:
+            NotificationConfig scoped to the autoresearch session.
+        """
+        from services.notification_service import NotificationConfig
+
+        return NotificationConfig(
+            workflow_id=f"autoresearch:{session_id}",
+            channels={"approval_needed": ["in_app"]},
+        )
+
     # ------------------------------------------------------------------
     # Private: session persistence
     # ------------------------------------------------------------------
diff --git a/autobot-backend/services/autoresearch/prompt_optimizer.py b/autobot-backend/services/autoresearch/prompt_optimizer.py
new file mode 100644
index 000000000..c92b4ca43
--- /dev/null
+++ b/autobot-backend/services/autoresearch/prompt_optimizer.py
@@ -0,0 +1,329 @@
+# AutoBot - AI-Powered Automation Platform
+# Copyright (c) 2025 mrveiss
+# Author: mrveiss
+"""
+Generic Prompt Optimizer
+
+Issue #2600: Mutates agent prompts, benchmarks via pluggable scorers,
+and keeps/discards based on improvement. AutoResearchAgent is the first
+optimization target; any agent can register a PromptOptTarget.
+
+Loop:
+    1. Mutate current best prompt into N variants (via LLM)
+    2. Run each variant through benchmark_fn to get output
+    3. Score all variants via first scorer in chain (fast filter)
+    4. Pass top-K candidates to next scorer (deeper evaluation)
+    5. If best variant improves over baseline -> KEEP, update baseline
+    6. Persist all results
+"""
+
+from __future__ import annotations
+
+import asyncio
+import json
+import logging
+import time
+import uuid
+from dataclasses import dataclass, field
+from enum import Enum
+from typing import Any, Callable, Coroutine, Dict, List, Optional
+
+from .scorers import PromptScorer, ScorerResult
+
+logger = logging.getLogger(__name__)
+
+
+class OptimizationStatus(str, Enum):
+    PENDING = "pending"
+    RUNNING = "running"
+    COMPLETED = "completed"
+    CANCELLED = "cancelled"
+    FAILED = "failed"
+
+
+@dataclass
+class PromptOptTarget:
+    """Registration for an agent that opts into prompt optimization."""
+
+    agent_name: str
+    current_prompt: str
+    scorer_chain: List[str]  # scorer names in evaluation order
+    mutation_count: int = 5
+    top_k: int = 2
+
+
+@dataclass
+class PromptVariant:
+    """A single mutated prompt and its evaluation results."""
+
+    id: str = field(default_factory=lambda: str(uuid.uuid4()))
+    prompt_text: str = ""
+    output: str = ""
+    scores: Dict[str, float] = field(default_factory=dict)
+    final_score: float = 0.0
+    round_number: int = 0
+    created_at: float = field(default_factory=time.time)
+
+    def to_dict(self) -> Dict[str, Any]:
+        return {
+            "id": self.id,
+            "prompt_text": self.prompt_text,
+            "output": self.output,
+            "scores": self.scores,
+            "final_score": self.final_score,
+            "round_number": self.round_number,
+            "created_at": self.created_at,
+        }
+
+
+@dataclass
+class OptimizationSession:
+    """Top-level record for a prompt optimization run."""
+
+    id: str = field(default_factory=lambda: str(uuid.uuid4()))
+    target: Optional[PromptOptTarget] = None
+    status: OptimizationStatus = OptimizationStatus.PENDING
+    rounds_completed: int = 0
+    max_rounds: int = 3
+    best_variant: Optional[PromptVariant] = None
+    baseline_score: float = 0.0
+    all_variants: List[PromptVariant] = field(default_factory=list)
+    started_at: Optional[float] = None
+    completed_at: Optional[float] = None
+    error_message: Optional[str] = None
+
+    def to_dict(self) -> Dict[str, Any]:
+        return {
+            "id": self.id,
+            "target": {
+                "agent_name": self.target.agent_name,
+                "scorer_chain": self.target.scorer_chain,
+                "mutation_count": self.target.mutation_count,
+                "top_k": self.target.top_k,
+            }
+            if self.target
+            else None,
"status": self.status.value, + "rounds_completed": self.rounds_completed, + "max_rounds": self.max_rounds, + "best_variant": self.best_variant.to_dict() if self.best_variant else None, + "baseline_score": self.baseline_score, + "all_variants": [v.to_dict() for v in self.all_variants], + "started_at": self.started_at, + "completed_at": self.completed_at, + "error_message": self.error_message, + } + + +# Type for benchmark functions: takes a prompt string, returns output string +BenchmarkFn = Callable[[str], Coroutine[Any, Any, str]] + + +class PromptOptimizer: + """Generic prompt optimizer with pluggable scorers. + + Drives a mutation -> benchmark -> score -> keep/discard loop. + """ + + _MUTATION_SYSTEM_PROMPT = ( + "You are a prompt engineering expert. Generate {n} distinct variations " + "of the following prompt template. Each variation should preserve the " + "core intent but try a different approach: rephrasing, expanding detail, " + "adding constraints, or restructuring.\n\n" + "Return a JSON array of {n} strings, each being a complete prompt variant.\n\n" + "Original prompt:\n{prompt}" + ) + + def __init__( + self, + scorers: Dict[str, PromptScorer], + llm_service: Any, + ) -> None: + self._scorers = scorers + self._llm = llm_service + self._cancel_event = asyncio.Event() + self._current_session: Optional[OptimizationSession] = None + self._redis = None + + async def optimize( + self, + target: PromptOptTarget, + benchmark_fn: BenchmarkFn, + max_rounds: int = 3, + ) -> OptimizationSession: + """Run the optimization loop for a target. + + Args: + target: Agent's prompt optimization registration. + benchmark_fn: Async function that runs the prompt and returns output. + max_rounds: Number of mutation rounds. + + Returns: + Completed OptimizationSession. 
+        """
+        # Capture pre-cancel state before starting (caller may have called cancel())
+        pre_cancelled = self._cancel_event.is_set()
+
+        session = OptimizationSession(
+            target=target,
+            status=OptimizationStatus.RUNNING,
+            max_rounds=max_rounds,
+            started_at=time.time(),
+        )
+        self._current_session = session
+
+        if not pre_cancelled:
+            self._cancel_event.clear()
+
+        current_best_prompt = target.current_prompt
+
+        try:
+            for round_num in range(1, max_rounds + 1):
+                if self._cancel_event.is_set():
+                    session.status = OptimizationStatus.CANCELLED
+                    break
+
+                logger.info(
+                    "PromptOptimizer: round %d/%d for %s",
+                    round_num,
+                    max_rounds,
+                    target.agent_name,
+                )
+
+                round_variants = await self._run_round(
+                    current_best_prompt=current_best_prompt,
+                    target=target,
+                    benchmark_fn=benchmark_fn,
+                    round_number=round_num,
+                    session=session,
+                )
+
+                if round_variants:
+                    best_in_round = max(round_variants, key=lambda v: v.final_score)
+                    if best_in_round.final_score > session.baseline_score:
+                        session.best_variant = best_in_round
+                        session.baseline_score = best_in_round.final_score
+                        current_best_prompt = best_in_round.prompt_text
+                        logger.info(
+                            "PromptOptimizer: new best variant %s (score=%.3f)",
+                            best_in_round.id,
+                            best_in_round.final_score,
+                        )
+
+                session.rounds_completed = round_num
+                await self._save_session(session)
+
+            if session.status == OptimizationStatus.RUNNING:
+                session.status = OptimizationStatus.COMPLETED
+        except Exception as exc:
+            session.status = OptimizationStatus.FAILED
+            session.error_message = str(exc)
+            logger.exception("PromptOptimizer: optimization failed")
+        finally:
+            session.completed_at = time.time()
+            await self._save_session(session)
+            self._current_session = None
+
+        return session
+
+    async def _run_round(
+        self,
+        current_best_prompt: str,
+        target: PromptOptTarget,
+        benchmark_fn: BenchmarkFn,
+        round_number: int,
+        session: OptimizationSession,
+    ) -> List[PromptVariant]:
+        """Execute a single mutation -> benchmark -> score round."""
+        # 1. Mutate
+        prompt_texts = await self._mutate_prompt(
+            current_best_prompt, target.mutation_count
+        )
+
+        # 2. Benchmark each variant
+        variants: List[PromptVariant] = []
+        for prompt_text in prompt_texts:
+            output = await benchmark_fn(prompt_text)
+            variant = PromptVariant(
+                prompt_text=prompt_text,
+                output=output,
+                round_number=round_number,
+            )
+            variants.append(variant)
+
+        # 3. Score through the chain
+        candidates = variants
+        for scorer_name in target.scorer_chain:
+            scorer = self._scorers.get(scorer_name)
+            if scorer is None:
+                logger.warning(
+                    "PromptOptimizer: scorer %r not found, skipping", scorer_name
+                )
+                continue
+
+            for variant in candidates:
+                result = await scorer.score(
+                    variant.output,
+                    {
+                        "session_id": session.id,
+                        "variant_id": variant.id,
+                    },
+                )
+                variant.scores[scorer_name] = result.score
+                # Final score = average across all scorers so far
+                variant.final_score = (
+                    sum(variant.scores.values()) / len(variant.scores)
+                )
+
+            # Keep top-K for next scorer
+            candidates = sorted(
+                candidates, key=lambda v: v.final_score, reverse=True
+            )[: target.top_k]
+
+        session.all_variants.extend(variants)
+        return candidates
+
+    async def _mutate_prompt(self, base_prompt: str, n: int) -> List[str]:
+        """Generate N prompt variants using LLM."""
+        system_msg = self._MUTATION_SYSTEM_PROMPT.format(n=n, prompt=base_prompt)
+
+        try:
+            response = await self._llm.chat(
+                messages=[
+                    {"role": "system", "content": system_msg},
+                    {"role": "user", "content": "Generate the variants now."},
+                ],
+                temperature=0.9,
+                max_tokens=2000,
+            )
+            variants = json.loads(response.content)
+            if isinstance(variants, list):
+                return [str(v) for v in variants[:n]]
+        except Exception as exc:
+            logger.warning("PromptOptimizer: mutation failed: %s", exc)
+
+        return [base_prompt]  # fallback: return original
+
+    def cancel(self) -> None:
+        """Signal the running optimization to stop."""
+        self._cancel_event.set()
+
+    @property
+    def current_session(self) -> Optional[OptimizationSession]:
+        return self._current_session
+
+    async def _get_redis(self):
+        if self._redis is None:
+            from autobot_shared.redis_client import get_redis_client
+
+            self._redis = get_redis_client(async_client=True, database="main")
+        return self._redis
+
+    async def _save_session(self, session: OptimizationSession) -> None:
+        """Persist session to Redis."""
+        try:
+            redis = await self._get_redis()
+            key = f"autoresearch:prompt_opt:session:{session.id}"
+            await redis.set(key, json.dumps(session.to_dict()), ex=86400 * 7)
+        except Exception:
+            logger.exception("Failed to save optimization session %s", session.id)
diff --git a/autobot-backend/services/autoresearch/prompt_optimizer_test.py b/autobot-backend/services/autoresearch/prompt_optimizer_test.py
new file mode 100644
index 000000000..fd652e00f
--- /dev/null
+++ b/autobot-backend/services/autoresearch/prompt_optimizer_test.py
@@ -0,0 +1,122 @@
+# AutoBot - AI-Powered Automation Platform
+# Copyright (c) 2025 mrveiss
+# Author: mrveiss
+"""Tests for prompt optimizer — Issue #2600."""
+
+from __future__ import annotations
+
+import json
+
+import pytest
+from unittest.mock import AsyncMock, MagicMock
+
+from services.autoresearch.prompt_optimizer import (
+    OptimizationSession,
+    OptimizationStatus,
+    PromptOptimizer,
+    PromptOptTarget,
+    PromptVariant,
+)
+from services.autoresearch.scorers import ScorerResult
+
+
+class TestPromptVariantModel:
+    def test_to_dict(self):
+        variant = PromptVariant(
+            id="v1",
+            prompt_text="test prompt",
+            output="test output",
+            scores={"llm_judge": 0.8},
+            final_score=0.8,
+        )
+        d = variant.to_dict()
+        assert d["id"] == "v1"
+        assert d["prompt_text"] == "test prompt"
+        assert d["scores"] == {"llm_judge": 0.8}
+        assert d["final_score"] == 0.8
+
+
+class TestOptimizationSession:
+    def test_to_dict(self):
+        target = PromptOptTarget(
+            agent_name="test_agent",
+            current_prompt="base prompt",
scorer_chain=["llm_judge"], + mutation_count=3, + top_k=1, + ) + session = OptimizationSession(target=target) + d = session.to_dict() + assert d["status"] == "pending" + assert d["target"]["agent_name"] == "test_agent" + assert d["rounds_completed"] == 0 + + +class TestPromptOptimizerLoop: + @pytest.fixture + def mock_llm(self): + llm = AsyncMock() + # Return 3 variants as JSON array + mock_response = MagicMock() + mock_response.content = json.dumps(["variant A", "variant B", "variant C"]) + llm.chat.return_value = mock_response + return llm + + @pytest.fixture + def mock_scorer(self): + scorer = AsyncMock() + scorer.name = "test_scorer" + scorer.score.side_effect = [ + ScorerResult(score=0.3, raw_score=3, metadata={}, scorer_name="test_scorer"), + ScorerResult(score=0.8, raw_score=8, metadata={}, scorer_name="test_scorer"), + ScorerResult(score=0.5, raw_score=5, metadata={}, scorer_name="test_scorer"), + ] + return scorer + + @pytest.fixture + def optimizer(self, mock_llm, mock_scorer): + opt = PromptOptimizer( + scorers={"test_scorer": mock_scorer}, + llm_service=mock_llm, + ) + opt._redis = AsyncMock() + return opt + + @pytest.mark.asyncio + async def test_optimize_selects_best_variant(self, optimizer, mock_scorer): + target = PromptOptTarget( + agent_name="test", + current_prompt="base prompt", + scorer_chain=["test_scorer"], + mutation_count=3, + top_k=1, + ) + + async def benchmark_fn(prompt: str) -> str: + return f"output for: {prompt}" + + session = await optimizer.optimize(target, benchmark_fn, max_rounds=1) + + assert session.status.value == "completed" + assert session.rounds_completed == 1 + assert session.best_variant is not None + assert session.best_variant.final_score == 0.8 + assert len(session.all_variants) == 3 + + @pytest.mark.asyncio + async def test_optimize_cancel(self, optimizer): + target = PromptOptTarget( + agent_name="test", + current_prompt="base", + scorer_chain=["test_scorer"], + mutation_count=1, + top_k=1, + ) + optimizer.cancel() + + async def benchmark_fn(prompt: str) -> str: + return "output" + + session = await optimizer.optimize(target, benchmark_fn, max_rounds=5) + assert session.status.value == "cancelled" + assert session.rounds_completed == 0 diff --git a/autobot-backend/services/autoresearch/routes.py b/autobot-backend/services/autoresearch/routes.py index b4f856c9c..f3dd198f3 100644 --- a/autobot-backend/services/autoresearch/routes.py +++ b/autobot-backend/services/autoresearch/routes.py @@ -10,6 +10,7 @@ from __future__ import annotations import logging +import re from typing import Dict, List, Optional from fastapi import APIRouter, BackgroundTasks, Depends, HTTPException, Query, Request @@ -18,7 +19,9 @@ from auth_middleware import check_admin_permission from .config import AutoResearchConfig +from .knowledge_synthesizer import KnowledgeSynthesizer from .models import Experiment, ExperimentState, HyperParams +from .prompt_optimizer import PromptOptimizer, PromptOptTarget from .runner import ExperimentRunner from .store import ExperimentStore @@ -39,9 +42,29 @@ class SetBaselineRequest(BaseModel): val_bpb: float -# Lazy-initialized singleton +class StartOptimizationRequest(BaseModel): + agent_name: str = Field(..., max_length=100) + max_rounds: int = Field(default=3, ge=1, le=10) + + +class SubmitScoreRequest(BaseModel): + score: int = Field(..., ge=0, le=10) + comment: str = Field(default="", max_length=1000) + + +class ApprovalDecisionRequest(BaseModel): + decision: str = Field(..., pattern="^(approved|rejected)$") + + +class 
+    session_id: str = Field(..., max_length=100)
+
+
+# Lazy-initialized singletons
 _runner: Optional[ExperimentRunner] = None
 _store: Optional[ExperimentStore] = None
+_optimizer: Optional[PromptOptimizer] = None
+_synthesizer: Optional[KnowledgeSynthesizer] = None
 
 
 def _get_store(request: Request) -> ExperimentStore:
@@ -202,3 +225,254 @@ async def cancel_experiment(
     )
     await runner.cancel()
     return {"status": "cancelled"}
+
+
+def _get_optimizer(request: Request) -> PromptOptimizer:
+    """Get or create the PromptOptimizer singleton."""
+    global _optimizer
+    app_opt = getattr(request.app.state, "autoresearch_optimizer", None)
+    if app_opt is not None:
+        return app_opt
+    if _optimizer is None:
+        from services.llm_service import get_llm_service
+
+        _optimizer = PromptOptimizer(
+            scorers={},  # scorers registered at runtime
+            llm_service=get_llm_service(),
+        )
+    request.app.state.autoresearch_optimizer = _optimizer
+    return _optimizer
+
+
+def _get_synthesizer(request: Request) -> KnowledgeSynthesizer:
+    """Get or create the KnowledgeSynthesizer singleton."""
+    global _synthesizer
+    app_synth = getattr(request.app.state, "autoresearch_synthesizer", None)
+    if app_synth is not None:
+        return app_synth
+    if _synthesizer is None:
+        from services.llm_service import get_llm_service
+
+        store = _get_store(request)
+        _synthesizer = KnowledgeSynthesizer(
+            store=store,
+            llm_service=get_llm_service(),
+        )
+    request.app.state.autoresearch_synthesizer = _synthesizer
+    return _synthesizer
+
+
+# --- Prompt Optimizer Endpoints ---
+
+
+@router.get("/prompt-optimizer/status")
+async def get_optimizer_status(
+    request: Request,
+    _admin: bool = Depends(check_admin_permission),
+):
+    """Get current prompt optimization session status."""
+    optimizer = _get_optimizer(request)
+    session = optimizer.current_session
+    if session is None:
+        return {"running": False, "session": None}
+    return {"running": True, "session": session.to_dict()}
+
+
+@router.post("/prompt-optimizer/start")
+async def start_optimization(
+    request: Request,
+    body: StartOptimizationRequest,
+    background_tasks: BackgroundTasks,
+    _admin: bool = Depends(check_admin_permission),
+):
+    """Start prompt optimization for a registered target."""
+    optimizer = _get_optimizer(request)
+    if optimizer.current_session is not None:
+        raise HTTPException(status_code=409, detail="Optimization already running")
+
+    # For now, only autoresearch_hypothesis is a valid target
+    if body.agent_name != "autoresearch_hypothesis":
+        raise HTTPException(
+            status_code=400,
+            detail=f"Unknown agent target: {body.agent_name}",
+        )
+
+    target = PromptOptTarget(
+        agent_name=body.agent_name,
+        current_prompt="",  # loaded from agent at runtime
+        scorer_chain=["val_bpb"],
+    )
+
+    async def _benchmark(prompt: str) -> str:
+        return prompt  # placeholder — real benchmark set up by agent
+
+    background_tasks.add_task(optimizer.optimize, target, _benchmark, body.max_rounds)
+    return {"status": "started", "agent_name": body.agent_name}
+
+
+@router.post("/prompt-optimizer/cancel")
+async def cancel_optimization(
+    request: Request,
+    _admin: bool = Depends(check_admin_permission),
+):
+    """Cancel running optimization."""
+    optimizer = _get_optimizer(request)
+    if optimizer.current_session is None:
+        raise HTTPException(status_code=409, detail="No optimization running")
+    optimizer.cancel()
+    return {"status": "cancelling"}
+
+
+@router.get("/prompt-optimizer/variants/{session_id}")
+async def get_variants(
+    request: Request,
+    session_id: str,
+    _admin: bool = Depends(check_admin_permission),
+):
+    """List prompt variants for an optimization session."""
+    import json as _json
+
+    from autobot_shared.redis_client import get_redis_client
+
+    redis = get_redis_client(async_client=True, database="main")
+    key = f"autoresearch:prompt_opt:session:{session_id}"
+    raw = await redis.get(key)
+    if raw is None:
+        raise HTTPException(status_code=404, detail="Session not found")
+    data = _json.loads(raw)
+    return {"variants": data.get("all_variants", [])}
+
+
+_UUID_PATTERN = re.compile(r"^[a-f0-9-]{1,64}$")
+
+
+@router.post("/prompt-optimizer/variants/{variant_id}/score")
+async def submit_variant_score(
+    request: Request,
+    variant_id: str,
+    body: SubmitScoreRequest,
+    session_id: str = Query(..., min_length=1, max_length=64),
+    _admin: bool = Depends(check_admin_permission),
+):
+    """Submit a human score for a prompt variant."""
+    import json as _json
+
+    from autobot_shared.redis_client import get_redis_client
+
+    # Validate key components to prevent Redis key injection
+    if not _UUID_PATTERN.match(session_id) or not _UUID_PATTERN.match(variant_id):
+        raise HTTPException(status_code=400, detail="Invalid session_id or variant_id format")
+
+    redis = get_redis_client(async_client=True, database="main")
+    key = f"autoresearch:prompt_review:{session_id}:{variant_id}"
+    await redis.set(
+        key,
+        _json.dumps({"score": body.score, "comment": body.comment}),
+        ex=86400,
+    )
+    return {"status": "scored", "variant_id": variant_id, "score": body.score}
+
+
+# --- Approval Endpoints ---
+
+
+@router.get("/approvals/pending")
+async def list_pending_approvals(
+    request: Request,
+    _admin: bool = Depends(check_admin_permission),
+):
+    """List pending approval requests."""
+    import json as _json
+
+    from autobot_shared.redis_client import get_redis_client
+
+    redis = get_redis_client(async_client=True, database="main")
+    approvals = []
+    async for key in redis.scan_iter("autoresearch:approval:pending:*"):
+        raw = await redis.get(key)
+        if raw:
+            data = _json.loads(raw if isinstance(raw, str) else raw.decode("utf-8"))
+            key_str = key if isinstance(key, str) else key.decode("utf-8")
+            parts = key_str.split(":")
+            if len(parts) >= 5:
+                status_key = (
+                    f"autoresearch:approval:status:{parts[3]}:{parts[4]}"
+                )
+                status = await redis.get(status_key)
+                status_str = (
+                    status.decode("utf-8") if isinstance(status, bytes) else status
+                ) if status else "unknown"
+                if status_str == "pending":
+                    data["status"] = "pending"
+                    approvals.append(data)
+    return {"approvals": approvals}
+
+
+@router.post("/approvals/{session_id}/{experiment_id}")
+async def submit_approval_decision(
+    request: Request,
+    session_id: str,
+    experiment_id: str,
+    body: ApprovalDecisionRequest,
+    _admin: bool = Depends(check_admin_permission),
+):
+    """Submit approve/reject decision for an experiment."""
+    from autobot_shared.redis_client import get_redis_client
+
+    redis = get_redis_client(async_client=True, database="main")
+    status_key = f"autoresearch:approval:status:{session_id}:{experiment_id}"
+    current = await redis.get(status_key)
+    if current is None:
+        raise HTTPException(status_code=404, detail="Approval request not found")
+    await redis.set(status_key, body.decision, ex=86400)
+    return {
+        "session_id": session_id,
+        "experiment_id": experiment_id,
+        "decision": body.decision,
+    }
+
+
+# --- Knowledge Insights Endpoints ---
+
+
+@router.get("/insights")
+async def list_insights(
+    request: Request,
+    limit: int = Query(default=20, ge=1, le=100),
+    min_confidence: float = Query(default=0.0, ge=0.0, le=1.0),
+    _admin: bool = Depends(check_admin_permission),
+):
+    """List distilled experiment insights."""
+    synthesizer = _get_synthesizer(request)
+    insights = await synthesizer.query_insights("*", limit=limit)
+    filtered = [i for i in insights if i.confidence >= min_confidence]
+    return {"insights": [i.to_dict() for i in filtered], "count": len(filtered)}
+
+
+@router.get("/insights/search")
+async def search_insights(
+    request: Request,
+    q: str = Query(..., min_length=1, max_length=500),
+    limit: int = Query(default=5, ge=1, le=50),
+    _admin: bool = Depends(check_admin_permission),
+):
+    """Semantic search over experiment insights."""
+    synthesizer = _get_synthesizer(request)
+    insights = await synthesizer.query_insights(q, limit=limit)
+    return {"insights": [i.to_dict() for i in insights], "query": q}
+
+
+@router.post("/insights/synthesize")
+async def trigger_synthesis(
+    request: Request,
+    body: SynthesizeRequest,
+    _admin: bool = Depends(check_admin_permission),
+):
+    """Manually trigger insight synthesis for a session."""
+    synthesizer = _get_synthesizer(request)
+    insights = await synthesizer.synthesize_session(body.session_id)
+    return {
+        "session_id": body.session_id,
+        "insights_generated": len(insights),
+        "insights": [i.to_dict() for i in insights],
+    }
diff --git a/autobot-backend/tests/test_autoresearch_m3.py b/autobot-backend/tests/test_autoresearch_m3.py
new file mode 100644
index 000000000..664cd96e9
--- /dev/null
+++ b/autobot-backend/tests/test_autoresearch_m3.py
@@ -0,0 +1,115 @@
+# AutoBot - AI-Powered Automation Platform
+# Copyright (c) 2025 mrveiss
+# Author: mrveiss
+"""Integration test for AutoResearch M3 — Issue #2600."""
+
+from __future__ import annotations
+
+import json
+
+import pytest
+from unittest.mock import AsyncMock, MagicMock
+
+from services.autoresearch.knowledge_synthesizer import KnowledgeSynthesizer
+from services.autoresearch.models import (
+    Experiment,
+    ExperimentResult,
+    ExperimentState,
+    HyperParams,
+)
+from services.autoresearch.prompt_optimizer import PromptOptimizer, PromptOptTarget
+from services.autoresearch.scorers import LLMJudgeScorer, ScorerResult
+
+
+class TestM3Integration:
+    """Test the full M3 pipeline: optimize -> synthesize -> query insights."""
+
+    @pytest.fixture
+    def mock_llm(self):
+        llm = AsyncMock()
+        # For mutation: return variants
+        mutation_response = MagicMock()
+        mutation_response.content = json.dumps(["variant A", "variant B"])
+
+        # For judge: return rating
+        judge_response = MagicMock()
+        judge_response.content = '{"rating": 7, "reasoning": "Good"}'
+
+        # For synthesis: return insights
+        synthesis_response = MagicMock()
+        synthesis_response.content = json.dumps(
+            [
+                {
+                    "statement": "Higher warmup improves convergence",
+                    "confidence": 0.9,
+                    "supporting_experiments": ["e1"],
+                    "related_hyperparams": ["warmup_steps"],
+                }
+            ]
+        )
+
+        # Cycle through responses
+        llm.chat.side_effect = [
+            mutation_response,  # optimizer mutation
+            judge_response,  # scorer: variant A
+            judge_response,  # scorer: variant B
+            synthesis_response,  # knowledge synthesis
+        ]
+        return llm
+
+    @pytest.mark.asyncio
+    async def test_optimize_then_synthesize(self, mock_llm):
+        # Setup scorer
+        scorer = LLMJudgeScorer(
+            llm_service=mock_llm,
+            criteria=["relevance"],
+        )
+
+        # Setup optimizer
+        optimizer = PromptOptimizer(
+            scorers={"llm_judge": scorer},
+            llm_service=mock_llm,
+        )
+        optimizer._redis = AsyncMock()
+
+        target = PromptOptTarget(
+            agent_name="test_agent",
+            current_prompt="base prompt",
prompt", + scorer_chain=["llm_judge"], + mutation_count=2, + top_k=1, + ) + + async def benchmark(prompt: str) -> str: + return f"Output from: {prompt}" + + # Run optimization + session = await optimizer.optimize(target, benchmark, max_rounds=1) + assert session.status.value == "completed" + assert session.best_variant is not None + + # Setup synthesizer with mock store + mock_store = AsyncMock() + mock_store.list_experiments.return_value = [ + Experiment( + id="e1", + hypothesis="test", + state=ExperimentState.KEPT, + hyperparams=HyperParams(warmup_steps=300), + result=ExperimentResult(val_bpb=4.5), + tags=["session:test-session"], + ), + ] + + synthesizer = KnowledgeSynthesizer( + store=mock_store, + llm_service=mock_llm, + ) + mock_collection = AsyncMock() + synthesizer._insights_collection = mock_collection + + # Run synthesis + insights = await synthesizer.synthesize_session("test-session") + assert len(insights) == 1 + assert insights[0].statement == "Higher warmup improves convergence" + mock_collection.upsert.assert_called_once()