diff --git a/autobot-backend/services/autoresearch/scorers.py b/autobot-backend/services/autoresearch/scorers.py new file mode 100644 index 000000000..25f608d64 --- /dev/null +++ b/autobot-backend/services/autoresearch/scorers.py @@ -0,0 +1,319 @@ +# AutoBot - AI-Powered Automation Platform +# Copyright (c) 2025 mrveiss +# Author: mrveiss +""" +Pluggable scorer interface for prompt optimization. + +Issue #2600: Defines the scoring contract and concrete scorers for +evaluating prompt variants — LLM-as-judge for bulk filtering, +human review for top candidates, val_bpb for AutoResearch. +""" + +from __future__ import annotations + +import asyncio +import json +import logging +import re +import time +from abc import ABC, abstractmethod +from dataclasses import dataclass, field +from typing import Any, Dict, Optional + +logger = logging.getLogger(__name__) + +# Validation pattern for Redis key components — alphanumeric, hyphens, underscores +_KEY_COMPONENT_PATTERN = re.compile(r"^[a-zA-Z0-9_-]{1,64}$") + + +@dataclass +class ScorerResult: + """Result from a single scoring evaluation.""" + + score: float # normalized 0.0-1.0 + raw_score: Any # scorer-specific value + metadata: Dict[str, Any] = field(default_factory=dict) + scorer_name: str = "" + + def __post_init__(self) -> None: + self.score = max(0.0, min(1.0, self.score)) + + def to_dict(self) -> Dict[str, Any]: + return { + "score": self.score, + "raw_score": self.raw_score, + "metadata": self.metadata, + "scorer_name": self.scorer_name, + } + + +class PromptScorer(ABC): + """Abstract base for prompt variant scorers.""" + + @property + @abstractmethod + def name(self) -> str: + """Unique scorer identifier.""" + + @abstractmethod + async def score(self, prompt_output: str, context: Dict[str, Any]) -> ScorerResult: + """Score a prompt variant's output. + + Args: + prompt_output: The text produced by running the prompt variant. + context: Scorer-specific context (hyperparams, criteria, etc.). 
# NOTE(review): imported mid-file rather than at the top — presumably to avoid a
# circular import between scorers and runner; confirm before hoisting.
from .models import Experiment, ExperimentResult, HyperParams
from .runner import ExperimentRunner


class ValBpbScorer(PromptScorer):
    """Score prompt variants by running an AutoResearch experiment.

    Uses the prompt output as the hypothesis, runs training via
    ExperimentRunner, and scores by val_bpb improvement over baseline.
    """

    def __init__(
        self,
        runner: ExperimentRunner,
        baseline_val_bpb: float,
    ) -> None:
        """Create a scorer anchored to a baseline val_bpb.

        Args:
            runner: ExperimentRunner that executes the training run.
            baseline_val_bpb: Baseline validation bits-per-byte; improvements
                are measured relative to this.

        Raises:
            ValueError: if baseline_val_bpb is not strictly positive.
        """
        if baseline_val_bpb <= 0:
            raise ValueError(f"baseline_val_bpb must be positive, got {baseline_val_bpb}")
        self._runner = runner
        self._baseline = baseline_val_bpb

    @property
    def name(self) -> str:
        return "val_bpb"

    async def score(self, prompt_output: str, context: Dict[str, Any]) -> ScorerResult:
        """Run an experiment using prompt_output as the hypothesis.

        Args:
            prompt_output: Used verbatim as the experiment hypothesis.
            context: Optional "hyperparams" dict fed to HyperParams.from_dict.

        Returns:
            ScorerResult whose score is the fractional val_bpb improvement
            over baseline, clamped to [0, 1]; 0.0 when the run produced no
            val_bpb (with an "error" entry in metadata).
        """
        hp_data = context.get("hyperparams", {})
        hp = HyperParams.from_dict(hp_data) if hp_data else HyperParams()

        experiment = Experiment(
            hypothesis=prompt_output,
            description="Prompt optimizer variant",
            hyperparams=hp,
        )

        experiment = await self._runner.run_experiment(experiment)

        val_bpb = (
            experiment.result.val_bpb
            if experiment.result and experiment.result.val_bpb is not None
            else None
        )

        if val_bpb is None:
            # Fix: a result with neither val_bpb nor error_message previously
            # produced metadata {"error": None}; fall back to a descriptive string.
            error = (
                (experiment.result.error_message or "missing val_bpb")
                if experiment.result
                else "no result"
            )
            return ScorerResult(
                score=0.0,
                raw_score=None,
                metadata={"error": error},
                scorer_name=self.name,
            )

        # Normalize: improvement as fraction of baseline, floored at 0.
        # __init__ guarantees self._baseline > 0, so no division guard is needed.
        improvement = self._baseline - val_bpb
        normalized = max(0.0, improvement / self._baseline)

        return ScorerResult(
            score=normalized,
            raw_score=val_bpb,
            metadata={
                "baseline": self._baseline,
                "improvement": improvement,
                "state": experiment.state.value,
            },
            scorer_name=self.name,
        )
# Fallback pattern for free-text ratings like "7/10" or "7 out of 10".
_RATING_PATTERN = re.compile(r"(\d+)\s*(?:/\s*10|out of\s*10)")

# Strips markdown code fences (```json ... ```) that LLMs often wrap JSON in.
_CODE_FENCE_PATTERN = re.compile(r"^```(?:json)?\s*|\s*```$")

_JUDGE_SYSTEM_PROMPT = (
    "You are a prompt quality evaluator. Rate the following output on a scale "
    "of 0-10 based on these criteria: {criteria}.\n\n"
    "Respond with JSON: {{\"rating\": <0-10>, \"reasoning\": \"\"}}"
)


class LLMJudgeScorer(PromptScorer):
    """Score prompt variants using an LLM as judge.

    Sends the prompt output to LLMService with evaluation criteria,
    parses a 0-10 rating, normalizes to 0.0-1.0.
    """

    def __init__(
        self,
        llm_service: Any,
        criteria: list[str],
    ) -> None:
        """Args:
            llm_service: object exposing async ``chat(messages=..., ...)``
                returning a response with a ``.content`` str.
            criteria: human-readable criteria interpolated into the judge prompt.
        """
        self._llm = llm_service
        self._criteria = criteria

    @property
    def name(self) -> str:
        return "llm_judge"

    async def score(self, prompt_output: str, context: Dict[str, Any]) -> ScorerResult:
        """Ask the judge LLM to rate prompt_output; 0.0 on any LLM failure."""
        criteria_str = ", ".join(self._criteria)
        system_msg = _JUDGE_SYSTEM_PROMPT.format(criteria=criteria_str)

        try:
            response = await self._llm.chat(
                messages=[
                    {"role": "system", "content": system_msg},
                    {
                        "role": "user",
                        "content": f"Evaluate this output:\n\n{prompt_output}",
                    },
                ],
                temperature=0.1,
                max_tokens=200,
            )
            rating = self._parse_rating(response.content)
        except Exception as exc:
            # Judge failures score 0 rather than aborting the whole sweep.
            logger.warning("LLMJudgeScorer: LLM call failed: %s", exc)
            return ScorerResult(
                score=0.0,
                raw_score=None,
                metadata={"error": str(exc)},
                scorer_name=self.name,
            )

        return ScorerResult(
            score=rating / 10.0,
            raw_score=rating,
            metadata={"criteria": self._criteria},
            scorer_name=self.name,
        )

    @staticmethod
    def _parse_rating(content: str) -> int:
        """Extract a 0-10 rating from the LLM response.

        Tries JSON first (stripping markdown code fences the model may add
        despite instructions), then falls back to a "N/10" regex; returns 0
        when nothing parses.
        """
        # Fix: fenced responses like ```json {"rating": 6} ``` previously
        # failed json.loads and often missed the regex fallback too.
        candidate = _CODE_FENCE_PATTERN.sub("", content.strip())
        try:
            data = json.loads(candidate)
            raw = int(data["rating"])
            return max(0, min(10, raw))
        except (json.JSONDecodeError, KeyError, ValueError, TypeError):
            pass

        match = _RATING_PATTERN.search(content)
        if match:
            return max(0, min(10, int(match.group(1))))

        logger.warning("LLMJudgeScorer: could not parse rating from: %s", content[:100])
        return 0
class HumanReviewScorer(PromptScorer):
    """Queue a prompt variant for human review and poll for a score.

    Stores the variant in Redis; the API endpoint allows humans to
    submit a 0-10 score. Polls until scored or timeout.
    """

    # Key the review endpoint writes the human's {"score", "comment"} JSON to.
    _REVIEW_KEY = "autoresearch:prompt_review:{session_id}:{variant_id}"
    # Key holding the variant payload awaiting review.
    _PENDING_KEY = "autoresearch:prompt_review:pending:{session_id}:{variant_id}"
    _TTL_SECONDS = 86400  # pending entries expire after one day

    def __init__(
        self,
        poll_interval: float = 5.0,
        timeout: float = 300.0,
    ) -> None:
        """Args:
            poll_interval: seconds between Redis polls for a submitted score.
            timeout: total seconds to wait before giving up with score 0.0.
        """
        self._poll_interval = poll_interval
        self._timeout = timeout
        self._redis = None

    async def _get_redis(self):
        # Lazily create the shared async client so construction stays cheap.
        if self._redis is None:
            from autobot_shared.redis_client import get_redis_client

            self._redis = get_redis_client(async_client=True, database="main")
        return self._redis

    @property
    def name(self) -> str:
        return "human_review"

    @staticmethod
    def _validate_key_component(value: str, name: str) -> str:
        """Validate a string is safe for use in Redis key patterns.

        Raises:
            ValueError: if value is not 1-64 alphanumeric/hyphen/underscore chars.
        """
        if not _KEY_COMPONENT_PATTERN.match(value):
            raise ValueError(
                f"{name} must be alphanumeric/hyphens/underscores (1-64 chars), got {value!r}"
            )
        return value

    async def score(self, prompt_output: str, context: Dict[str, Any]) -> ScorerResult:
        """Queue prompt_output for review and wait for a human 0-10 score.

        Args:
            prompt_output: variant output shown to the reviewer (truncated to 5000 chars).
            context: should contain "session_id" and "variant_id"; both are
                validated before being interpolated into Redis keys.

        Returns:
            ScorerResult with the human rating normalized to 0.0-1.0, or
            score 0.0 with metadata {"status": "timeout"} if unreviewed in time.

        Raises:
            ValueError: if session_id / variant_id contain unsafe characters.
        """
        session_id = self._validate_key_component(
            context.get("session_id", "unknown"), "session_id"
        )
        variant_id = self._validate_key_component(
            context.get("variant_id", "unknown"), "variant_id"
        )

        redis = await self._get_redis()

        # Store pending review — only safe fields, not raw context
        pending_key = self._PENDING_KEY.format(
            session_id=session_id, variant_id=variant_id
        )
        await redis.set(
            pending_key,
            json.dumps({
                "prompt_output": prompt_output[:5000],  # cap payload size in Redis
                "session_id": session_id,
                "variant_id": variant_id,
            }),
            ex=self._TTL_SECONDS,
        )

        # Poll for score
        review_key = self._REVIEW_KEY.format(
            session_id=session_id, variant_id=variant_id
        )
        deadline = time.monotonic() + self._timeout

        while time.monotonic() < deadline:
            raw = await redis.get(review_key)
            if raw is not None:
                try:
                    data = json.loads(raw if isinstance(raw, str) else raw.decode("utf-8"))
                    rating = max(0, min(10, int(data.get("score", 0))))
                except (json.JSONDecodeError, UnicodeDecodeError, ValueError, TypeError) as exc:
                    # Fix: a malformed submission previously raised out of score();
                    # log and keep polling so the reviewer can resubmit.
                    logger.warning(
                        "HumanReviewScorer: malformed review payload for %s: %s",
                        review_key,
                        exc,
                    )
                    await asyncio.sleep(self._poll_interval)
                    continue
                # Fix: clear the pending entry once reviewed instead of letting
                # it linger for the full TTL.
                await redis.delete(pending_key)
                return ScorerResult(
                    score=rating / 10.0,
                    raw_score=rating,
                    metadata={
                        "comment": data.get("comment", ""),
                        "status": "reviewed",
                    },
                    scorer_name=self.name,
                )
            await asyncio.sleep(self._poll_interval)

        logger.info(
            "HumanReviewScorer: timed out for session=%s variant=%s",
            session_id,
            variant_id,
        )
        return ScorerResult(
            score=0.0,
            raw_score=None,
            metadata={"status": "timeout"},
            scorer_name=self.name,
        )
json.loads(raw if isinstance(raw, str) else raw.decode("utf-8")) + rating = max(0, min(10, int(data.get("score", 0)))) + return ScorerResult( + score=rating / 10.0, + raw_score=rating, + metadata={ + "comment": data.get("comment", ""), + "status": "reviewed", + }, + scorer_name=self.name, + ) + await asyncio.sleep(self._poll_interval) + + logger.info( + "HumanReviewScorer: timed out for session=%s variant=%s", + session_id, + variant_id, + ) + return ScorerResult( + score=0.0, + raw_score=None, + metadata={"status": "timeout"}, + scorer_name=self.name, + ) diff --git a/autobot-backend/services/autoresearch/scorers_test.py b/autobot-backend/services/autoresearch/scorers_test.py new file mode 100644 index 000000000..2c3203f75 --- /dev/null +++ b/autobot-backend/services/autoresearch/scorers_test.py @@ -0,0 +1,172 @@ +# AutoBot - AI-Powered Automation Platform +# Copyright (c) 2025 mrveiss +# Author: mrveiss +"""Tests for autoresearch scorers — Issue #2600.""" + +from __future__ import annotations + +import json + +import pytest +from unittest.mock import AsyncMock, MagicMock + +from services.autoresearch.models import Experiment, ExperimentResult, ExperimentState +from services.autoresearch.scorers import ( + HumanReviewScorer, + LLMJudgeScorer, + ScorerResult, + ValBpbScorer, +) + + +class TestScorerResult: + def test_to_dict(self): + result = ScorerResult( + score=0.85, + raw_score=4.2, + metadata={"model": "test"}, + scorer_name="test_scorer", + ) + d = result.to_dict() + assert d["score"] == 0.85 + assert d["raw_score"] == 4.2 + assert d["metadata"] == {"model": "test"} + assert d["scorer_name"] == "test_scorer" + + def test_score_clamped_to_range(self): + result = ScorerResult(score=1.5, raw_score=1.5, metadata={}, scorer_name="t") + assert result.score == 1.0 + + def test_score_floor(self): + result = ScorerResult(score=-0.5, raw_score=-0.5, metadata={}, scorer_name="t") + assert result.score == 0.0 + + +class TestValBpbScorer: + @pytest.fixture + def 
class TestLLMJudgeScorer:
    """LLMJudgeScorer rating parsing and failure handling."""

    @pytest.fixture
    def mock_llm(self):
        # Fix: return the mock directly (was bound to a pointless local first).
        return AsyncMock()

    @pytest.fixture
    def scorer(self, mock_llm):
        return LLMJudgeScorer(
            llm_service=mock_llm,
            criteria=["relevance", "specificity", "actionability"],
        )

    @pytest.mark.asyncio
    async def test_score_parses_llm_rating(self, scorer, mock_llm):
        mock_response = MagicMock()
        mock_response.content = '{"rating": 8, "reasoning": "Good hypothesis"}'
        mock_llm.chat.return_value = mock_response

        result = await scorer.score("A detailed hypothesis", {})
        assert result.score == 0.8  # 8/10 normalized
        assert result.raw_score == 8
        assert result.scorer_name == "llm_judge"

    @pytest.mark.asyncio
    async def test_score_handles_non_json_response(self, scorer, mock_llm):
        mock_response = MagicMock()
        mock_response.content = "I rate this 7 out of 10"
        mock_llm.chat.return_value = mock_response

        result = await scorer.score("A hypothesis", {})
        # Falls back to regex extraction
        assert result.score == 0.7
        assert result.raw_score == 7

    @pytest.mark.asyncio
    async def test_score_handles_llm_failure(self, scorer, mock_llm):
        mock_llm.chat.side_effect = Exception("LLM unavailable")

        result = await scorer.score("A hypothesis", {})
        assert result.score == 0.0
        assert "error" in result.metadata
scorer.score("A detailed hypothesis", {}) + assert result.score == 0.8 # 8/10 normalized + assert result.raw_score == 8 + assert result.scorer_name == "llm_judge" + + @pytest.mark.asyncio + async def test_score_handles_non_json_response(self, scorer, mock_llm): + mock_response = MagicMock() + mock_response.content = "I rate this 7 out of 10" + mock_llm.chat.return_value = mock_response + + result = await scorer.score("A hypothesis", {}) + # Falls back to regex extraction + assert result.score == 0.7 + assert result.raw_score == 7 + + @pytest.mark.asyncio + async def test_score_handles_llm_failure(self, scorer, mock_llm): + mock_llm.chat.side_effect = Exception("LLM unavailable") + + result = await scorer.score("A hypothesis", {}) + assert result.score == 0.0 + assert "error" in result.metadata + + +class TestHumanReviewScorer: + @pytest.fixture + def mock_redis(self): + redis = AsyncMock() + return redis + + @pytest.fixture + def scorer(self, mock_redis): + s = HumanReviewScorer(poll_interval=0.01, timeout=0.05) + s._redis = mock_redis + return s + + @pytest.mark.asyncio + async def test_score_approved_with_rating(self, scorer, mock_redis): + # Simulate human submitting a score + mock_redis.get.side_effect = [ + None, # first poll: no score yet + json.dumps({"score": 9, "comment": "excellent"}).encode(), # second poll + ] + result = await scorer.score( + "test output", + {"session_id": "s1", "variant_id": "v1"}, + ) + assert result.score == 0.9 + assert result.raw_score == 9 + assert result.scorer_name == "human_review" + + @pytest.mark.asyncio + async def test_score_timeout_returns_none(self, scorer, mock_redis): + mock_redis.get.return_value = None # never receives a score + + result = await scorer.score( + "test output", + {"session_id": "s1", "variant_id": "v1"}, + ) + assert result.score == 0.0 + assert result.metadata.get("status") == "timeout"