diff --git a/autobot-backend/services/autoresearch/scorers.py b/autobot-backend/services/autoresearch/scorers.py new file mode 100644 index 000000000..25f608d64 --- /dev/null +++ b/autobot-backend/services/autoresearch/scorers.py @@ -0,0 +1,319 @@ +# AutoBot - AI-Powered Automation Platform +# Copyright (c) 2025 mrveiss +# Author: mrveiss +""" +Pluggable scorer interface for prompt optimization. + +Issue #2600: Defines the scoring contract and concrete scorers for +evaluating prompt variants — LLM-as-judge for bulk filtering, +human review for top candidates, val_bpb for AutoResearch. +""" + +from __future__ import annotations + +import asyncio +import json +import logging +import re +import time +from abc import ABC, abstractmethod +from dataclasses import dataclass, field +from typing import Any, Dict, Optional + +logger = logging.getLogger(__name__) + +# Validation pattern for Redis key components — alphanumeric, hyphens, underscores +_KEY_COMPONENT_PATTERN = re.compile(r"^[a-zA-Z0-9_-]{1,64}$") + + +@dataclass +class ScorerResult: + """Result from a single scoring evaluation.""" + + score: float # normalized 0.0-1.0 + raw_score: Any # scorer-specific value + metadata: Dict[str, Any] = field(default_factory=dict) + scorer_name: str = "" + + def __post_init__(self) -> None: + self.score = max(0.0, min(1.0, self.score)) + + def to_dict(self) -> Dict[str, Any]: + return { + "score": self.score, + "raw_score": self.raw_score, + "metadata": self.metadata, + "scorer_name": self.scorer_name, + } + + +class PromptScorer(ABC): + """Abstract base for prompt variant scorers.""" + + @property + @abstractmethod + def name(self) -> str: + """Unique scorer identifier.""" + + @abstractmethod + async def score(self, prompt_output: str, context: Dict[str, Any]) -> ScorerResult: + """Score a prompt variant's output. + + Args: + prompt_output: The text produced by running the prompt variant. + context: Scorer-specific context (hyperparams, criteria, etc.). 
# NOTE(review): imported mid-file rather than at the top — presumably to avoid a
# circular import between scorers and runner; confirm before hoisting.
from .models import Experiment, ExperimentResult, HyperParams
from .runner import ExperimentRunner


class ValBpbScorer(PromptScorer):
    """Score prompt variants by running an AutoResearch experiment.

    Uses the prompt output as the hypothesis, runs training via
    ExperimentRunner, and scores by val_bpb improvement over baseline.
    """

    def __init__(
        self,
        runner: ExperimentRunner,
        baseline_val_bpb: float,
    ) -> None:
        """Create a scorer anchored to a baseline val_bpb.

        Args:
            runner: ExperimentRunner that executes the training run.
            baseline_val_bpb: Baseline validation bits-per-byte; improvements
                are measured relative to this.

        Raises:
            ValueError: if baseline_val_bpb is not strictly positive.
        """
        if baseline_val_bpb <= 0:
            raise ValueError(f"baseline_val_bpb must be positive, got {baseline_val_bpb}")
        self._runner = runner
        self._baseline = baseline_val_bpb

    @property
    def name(self) -> str:
        return "val_bpb"

    async def score(self, prompt_output: str, context: Dict[str, Any]) -> ScorerResult:
        """Run an experiment using prompt_output as the hypothesis.

        Args:
            prompt_output: Used verbatim as the experiment hypothesis.
            context: Optional "hyperparams" dict fed to HyperParams.from_dict.

        Returns:
            ScorerResult whose score is the fractional val_bpb improvement
            over baseline, clamped to [0, 1]; 0.0 when the run produced no
            val_bpb (with an "error" entry in metadata).
        """
        hp_data = context.get("hyperparams", {})
        hp = HyperParams.from_dict(hp_data) if hp_data else HyperParams()

        experiment = Experiment(
            hypothesis=prompt_output,
            description="Prompt optimizer variant",
            hyperparams=hp,
        )

        experiment = await self._runner.run_experiment(experiment)

        val_bpb = (
            experiment.result.val_bpb
            if experiment.result and experiment.result.val_bpb is not None
            else None
        )

        if val_bpb is None:
            # Fix: a result with neither val_bpb nor error_message previously
            # produced metadata {"error": None}; fall back to a descriptive string.
            error = (
                (experiment.result.error_message or "missing val_bpb")
                if experiment.result
                else "no result"
            )
            return ScorerResult(
                score=0.0,
                raw_score=None,
                metadata={"error": error},
                scorer_name=self.name,
            )

        # Normalize: improvement as fraction of baseline, floored at 0.
        # __init__ guarantees self._baseline > 0, so no division guard is needed.
        improvement = self._baseline - val_bpb
        normalized = max(0.0, improvement / self._baseline)

        return ScorerResult(
            score=normalized,
            raw_score=val_bpb,
            metadata={
                "baseline": self._baseline,
                "improvement": improvement,
                "state": experiment.state.value,
            },
            scorer_name=self.name,
        )
# Fallback pattern for free-text ratings like "7/10" or "7 out of 10".
_RATING_PATTERN = re.compile(r"(\d+)\s*(?:/\s*10|out of\s*10)")

# Strips markdown code fences (```json ... ```) that LLMs often wrap JSON in.
_CODE_FENCE_PATTERN = re.compile(r"^```(?:json)?\s*|\s*```$")

_JUDGE_SYSTEM_PROMPT = (
    "You are a prompt quality evaluator. Rate the following output on a scale "
    "of 0-10 based on these criteria: {criteria}.\n\n"
    "Respond with JSON: {{\"rating\": <0-10>, \"reasoning\": \"\"}}"
)


class LLMJudgeScorer(PromptScorer):
    """Score prompt variants using an LLM as judge.

    Sends the prompt output to LLMService with evaluation criteria,
    parses a 0-10 rating, normalizes to 0.0-1.0.
    """

    def __init__(
        self,
        llm_service: Any,
        criteria: list[str],
    ) -> None:
        """Args:
            llm_service: object exposing async ``chat(messages=..., ...)``
                returning a response with a ``.content`` str.
            criteria: human-readable criteria interpolated into the judge prompt.
        """
        self._llm = llm_service
        self._criteria = criteria

    @property
    def name(self) -> str:
        return "llm_judge"

    async def score(self, prompt_output: str, context: Dict[str, Any]) -> ScorerResult:
        """Ask the judge LLM to rate prompt_output; 0.0 on any LLM failure."""
        criteria_str = ", ".join(self._criteria)
        system_msg = _JUDGE_SYSTEM_PROMPT.format(criteria=criteria_str)

        try:
            response = await self._llm.chat(
                messages=[
                    {"role": "system", "content": system_msg},
                    {
                        "role": "user",
                        "content": f"Evaluate this output:\n\n{prompt_output}",
                    },
                ],
                temperature=0.1,
                max_tokens=200,
            )
            rating = self._parse_rating(response.content)
        except Exception as exc:
            # Judge failures score 0 rather than aborting the whole sweep.
            logger.warning("LLMJudgeScorer: LLM call failed: %s", exc)
            return ScorerResult(
                score=0.0,
                raw_score=None,
                metadata={"error": str(exc)},
                scorer_name=self.name,
            )

        return ScorerResult(
            score=rating / 10.0,
            raw_score=rating,
            metadata={"criteria": self._criteria},
            scorer_name=self.name,
        )

    @staticmethod
    def _parse_rating(content: str) -> int:
        """Extract a 0-10 rating from the LLM response.

        Tries JSON first (stripping markdown code fences the model may add
        despite instructions), then falls back to a "N/10" regex; returns 0
        when nothing parses.
        """
        # Fix: fenced responses like ```json {"rating": 6} ``` previously
        # failed json.loads and often missed the regex fallback too.
        candidate = _CODE_FENCE_PATTERN.sub("", content.strip())
        try:
            data = json.loads(candidate)
            raw = int(data["rating"])
            return max(0, min(10, raw))
        except (json.JSONDecodeError, KeyError, ValueError, TypeError):
            pass

        match = _RATING_PATTERN.search(content)
        if match:
            return max(0, min(10, int(match.group(1))))

        logger.warning("LLMJudgeScorer: could not parse rating from: %s", content[:100])
        return 0
class HumanReviewScorer(PromptScorer):
    """Queue a prompt variant for human review and poll for a score.

    Stores the variant in Redis; the API endpoint allows humans to
    submit a 0-10 score. Polls until scored or timeout.
    """

    # Key the review endpoint writes the human's {"score", "comment"} JSON to.
    _REVIEW_KEY = "autoresearch:prompt_review:{session_id}:{variant_id}"
    # Key holding the variant payload awaiting review.
    _PENDING_KEY = "autoresearch:prompt_review:pending:{session_id}:{variant_id}"
    _TTL_SECONDS = 86400  # pending entries expire after one day

    def __init__(
        self,
        poll_interval: float = 5.0,
        timeout: float = 300.0,
    ) -> None:
        """Args:
            poll_interval: seconds between Redis polls for a submitted score.
            timeout: total seconds to wait before giving up with score 0.0.
        """
        self._poll_interval = poll_interval
        self._timeout = timeout
        self._redis = None

    async def _get_redis(self):
        # Lazily create the shared async client so construction stays cheap.
        if self._redis is None:
            from autobot_shared.redis_client import get_redis_client

            self._redis = get_redis_client(async_client=True, database="main")
        return self._redis

    @property
    def name(self) -> str:
        return "human_review"

    @staticmethod
    def _validate_key_component(value: str, name: str) -> str:
        """Validate a string is safe for use in Redis key patterns.

        Raises:
            ValueError: if value is not 1-64 alphanumeric/hyphen/underscore chars.
        """
        if not _KEY_COMPONENT_PATTERN.match(value):
            raise ValueError(
                f"{name} must be alphanumeric/hyphens/underscores (1-64 chars), got {value!r}"
            )
        return value

    async def score(self, prompt_output: str, context: Dict[str, Any]) -> ScorerResult:
        """Queue prompt_output for review and wait for a human 0-10 score.

        Args:
            prompt_output: variant output shown to the reviewer (truncated to 5000 chars).
            context: should contain "session_id" and "variant_id"; both are
                validated before being interpolated into Redis keys.

        Returns:
            ScorerResult with the human rating normalized to 0.0-1.0, or
            score 0.0 with metadata {"status": "timeout"} if unreviewed in time.

        Raises:
            ValueError: if session_id / variant_id contain unsafe characters.
        """
        session_id = self._validate_key_component(
            context.get("session_id", "unknown"), "session_id"
        )
        variant_id = self._validate_key_component(
            context.get("variant_id", "unknown"), "variant_id"
        )

        redis = await self._get_redis()

        # Store pending review — only safe fields, not raw context
        pending_key = self._PENDING_KEY.format(
            session_id=session_id, variant_id=variant_id
        )
        await redis.set(
            pending_key,
            json.dumps({
                "prompt_output": prompt_output[:5000],  # cap payload size in Redis
                "session_id": session_id,
                "variant_id": variant_id,
            }),
            ex=self._TTL_SECONDS,
        )

        # Poll for score
        review_key = self._REVIEW_KEY.format(
            session_id=session_id, variant_id=variant_id
        )
        deadline = time.monotonic() + self._timeout

        while time.monotonic() < deadline:
            raw = await redis.get(review_key)
            if raw is not None:
                try:
                    data = json.loads(raw if isinstance(raw, str) else raw.decode("utf-8"))
                    rating = max(0, min(10, int(data.get("score", 0))))
                except (json.JSONDecodeError, UnicodeDecodeError, ValueError, TypeError) as exc:
                    # Fix: a malformed submission previously raised out of score();
                    # log and keep polling so the reviewer can resubmit.
                    logger.warning(
                        "HumanReviewScorer: malformed review payload for %s: %s",
                        review_key,
                        exc,
                    )
                    await asyncio.sleep(self._poll_interval)
                    continue
                # Fix: clear the pending entry once reviewed instead of letting
                # it linger for the full TTL.
                await redis.delete(pending_key)
                return ScorerResult(
                    score=rating / 10.0,
                    raw_score=rating,
                    metadata={
                        "comment": data.get("comment", ""),
                        "status": "reviewed",
                    },
                    scorer_name=self.name,
                )
            await asyncio.sleep(self._poll_interval)

        logger.info(
            "HumanReviewScorer: timed out for session=%s variant=%s",
            session_id,
            variant_id,
        )
        return ScorerResult(
            score=0.0,
            raw_score=None,
            metadata={"status": "timeout"},
            scorer_name=self.name,
        )
json.loads(raw if isinstance(raw, str) else raw.decode("utf-8")) + rating = max(0, min(10, int(data.get("score", 0)))) + return ScorerResult( + score=rating / 10.0, + raw_score=rating, + metadata={ + "comment": data.get("comment", ""), + "status": "reviewed", + }, + scorer_name=self.name, + ) + await asyncio.sleep(self._poll_interval) + + logger.info( + "HumanReviewScorer: timed out for session=%s variant=%s", + session_id, + variant_id, + ) + return ScorerResult( + score=0.0, + raw_score=None, + metadata={"status": "timeout"}, + scorer_name=self.name, + ) diff --git a/autobot-backend/services/autoresearch/scorers_test.py b/autobot-backend/services/autoresearch/scorers_test.py new file mode 100644 index 000000000..2c3203f75 --- /dev/null +++ b/autobot-backend/services/autoresearch/scorers_test.py @@ -0,0 +1,172 @@ +# AutoBot - AI-Powered Automation Platform +# Copyright (c) 2025 mrveiss +# Author: mrveiss +"""Tests for autoresearch scorers — Issue #2600.""" + +from __future__ import annotations + +import json + +import pytest +from unittest.mock import AsyncMock, MagicMock + +from services.autoresearch.models import Experiment, ExperimentResult, ExperimentState +from services.autoresearch.scorers import ( + HumanReviewScorer, + LLMJudgeScorer, + ScorerResult, + ValBpbScorer, +) + + +class TestScorerResult: + def test_to_dict(self): + result = ScorerResult( + score=0.85, + raw_score=4.2, + metadata={"model": "test"}, + scorer_name="test_scorer", + ) + d = result.to_dict() + assert d["score"] == 0.85 + assert d["raw_score"] == 4.2 + assert d["metadata"] == {"model": "test"} + assert d["scorer_name"] == "test_scorer" + + def test_score_clamped_to_range(self): + result = ScorerResult(score=1.5, raw_score=1.5, metadata={}, scorer_name="t") + assert result.score == 1.0 + + def test_score_floor(self): + result = ScorerResult(score=-0.5, raw_score=-0.5, metadata={}, scorer_name="t") + assert result.score == 0.0 + + +class TestValBpbScorer: + @pytest.fixture + def 
class TestLLMJudgeScorer:
    """LLMJudgeScorer rating parsing and failure handling."""

    @pytest.fixture
    def mock_llm(self):
        # Fix: return the mock directly (was bound to a pointless local first).
        return AsyncMock()

    @pytest.fixture
    def scorer(self, mock_llm):
        return LLMJudgeScorer(
            llm_service=mock_llm,
            criteria=["relevance", "specificity", "actionability"],
        )

    @pytest.mark.asyncio
    async def test_score_parses_llm_rating(self, scorer, mock_llm):
        mock_response = MagicMock()
        mock_response.content = '{"rating": 8, "reasoning": "Good hypothesis"}'
        mock_llm.chat.return_value = mock_response

        result = await scorer.score("A detailed hypothesis", {})
        assert result.score == 0.8  # 8/10 normalized
        assert result.raw_score == 8
        assert result.scorer_name == "llm_judge"

    @pytest.mark.asyncio
    async def test_score_handles_non_json_response(self, scorer, mock_llm):
        mock_response = MagicMock()
        mock_response.content = "I rate this 7 out of 10"
        mock_llm.chat.return_value = mock_response

        result = await scorer.score("A hypothesis", {})
        # Falls back to regex extraction
        assert result.score == 0.7
        assert result.raw_score == 7

    @pytest.mark.asyncio
    async def test_score_handles_llm_failure(self, scorer, mock_llm):
        mock_llm.chat.side_effect = Exception("LLM unavailable")

        result = await scorer.score("A hypothesis", {})
        assert result.score == 0.0
        assert "error" in result.metadata
scorer.score("A detailed hypothesis", {}) + assert result.score == 0.8 # 8/10 normalized + assert result.raw_score == 8 + assert result.scorer_name == "llm_judge" + + @pytest.mark.asyncio + async def test_score_handles_non_json_response(self, scorer, mock_llm): + mock_response = MagicMock() + mock_response.content = "I rate this 7 out of 10" + mock_llm.chat.return_value = mock_response + + result = await scorer.score("A hypothesis", {}) + # Falls back to regex extraction + assert result.score == 0.7 + assert result.raw_score == 7 + + @pytest.mark.asyncio + async def test_score_handles_llm_failure(self, scorer, mock_llm): + mock_llm.chat.side_effect = Exception("LLM unavailable") + + result = await scorer.score("A hypothesis", {}) + assert result.score == 0.0 + assert "error" in result.metadata + + +class TestHumanReviewScorer: + @pytest.fixture + def mock_redis(self): + redis = AsyncMock() + return redis + + @pytest.fixture + def scorer(self, mock_redis): + s = HumanReviewScorer(poll_interval=0.01, timeout=0.05) + s._redis = mock_redis + return s + + @pytest.mark.asyncio + async def test_score_approved_with_rating(self, scorer, mock_redis): + # Simulate human submitting a score + mock_redis.get.side_effect = [ + None, # first poll: no score yet + json.dumps({"score": 9, "comment": "excellent"}).encode(), # second poll + ] + result = await scorer.score( + "test output", + {"session_id": "s1", "variant_id": "v1"}, + ) + assert result.score == 0.9 + assert result.raw_score == 9 + assert result.scorer_name == "human_review" + + @pytest.mark.asyncio + async def test_score_timeout_returns_none(self, scorer, mock_redis): + mock_redis.get.return_value = None # never receives a score + + result = await scorer.score( + "test output", + {"session_id": "s1", "variant_id": "v1"}, + ) + assert result.score == 0.0 + assert result.metadata.get("status") == "timeout"