29 commits
1b44818
Migrate site config to pydantic
miguelbranco80 Nov 24, 2025
4eae340
Migrate UserConfig to pydantic
miguelbranco80 Nov 24, 2025
6a0866b
Migration complete
miguelbranco80 Nov 25, 2025
2d0b795
Reorganize code
miguelbranco80 Nov 26, 2025
acad52b
Fix formatting
miguelbranco80 Nov 26, 2025
3d55eb5
Ensure mxcp.runtime remains backward compat
miguelbranco80 Nov 26, 2025
9335a33
Fix
miguelbranco80 Nov 26, 2025
8cf2609
initial implementation
alexzerntev Nov 28, 2025
f706acc
Merge remote-tracking branch 'origin/main' into bug/evals-revisiting-…
alexzerntev Nov 28, 2025
ce05585
fixed a bug
alexzerntev Nov 28, 2025
86490b9
fixes
alexzerntev Nov 28, 2025
8d642fa
fixes
alexzerntev Nov 28, 2025
784e0eb
added pydantic
alexzerntev Nov 28, 2025
709dc6a
fixed the bug
alexzerntev Nov 29, 2025
48ec3f9
added expected_anser model
alexzerntev Nov 29, 2025
4e6bd42
improvements
alexzerntev Nov 29, 2025
7dbbdfd
fix
alexzerntev Nov 29, 2025
712eea7
clean up
alexzerntev Nov 29, 2025
68c6f83
fixes
alexzerntev Nov 29, 2025
204dd89
improvements
alexzerntev Nov 29, 2025
0414579
Merge remote-tracking branch 'origin/main' into bug/evals-revisiting-…
alexzerntev Nov 29, 2025
495e290
fix
alexzerntev Dec 1, 2025
13ef743
fixes
alexzerntev Dec 2, 2025
a86f872
fixes
alexzerntev Dec 2, 2025
0c00468
Merge remote-tracking branch 'origin/main' into bug/evals-revisiting-…
alexzerntev Dec 2, 2025
e964dc2
fixes
alexzerntev Dec 2, 2025
6fe9341
fix
alexzerntev Dec 3, 2025
72c2bfa
fixes
alexzerntev Dec 3, 2025
3aec2be
fix
alexzerntev Dec 3, 2025
58 changes: 45 additions & 13 deletions docs/guides/configuration.md
@@ -609,19 +609,14 @@ Add model configuration to your user config file (`~/.mxcp/config.yml`):

```yaml
models:
  default: "claude-3-5-sonnet-20240620" # Default model to use for evals (update to a valid ID)
  models:
    claude-3-5-sonnet-20240620: # Anthropic Messages model ID; ensure your account has access
      type: "anthropic"
      api_key: "${ANTHROPIC_API_KEY}" # Environment variable containing API key
      timeout: 30 # Request timeout in seconds
      max_retries: 3 # Number of retries on failure

    gpt-4o:
      type: "openai"
      api_key: "${OPENAI_API_KEY}"
```
@@ -638,11 +633,48 @@ models:

- **default**: The model to use when not specified in eval suite or CLI
- **models**: Dictionary of model configurations
- **type**: Either "anthropic" or "openai"
- **api_key**: API key (you can use environment variable references)
- **base_url**: Custom API endpoint (optional, for OpenAI-compatible services)
- **timeout**: Request timeout in seconds
- **max_retries**: Number of retries on failure
- **options**: Extra provider-specific options forwarded to the model (e.g. `thinking: false`)

Example with mixed providers and options:

```yaml
models:
  default: "gpt-4o"
  models:
    gpt-4o:
      type: "openai"
      api_key: "${OPENAI_API_KEY}"
      timeout: 45
      options:
        reasoning: "fast"
    claude-3-5-sonnet-20240620:
      type: "anthropic"
      api_key: "${ANTHROPIC_API_KEY}"
      timeout: 30
      options:
        thinking: false

```

To use the OpenAI Responses API (e.g., for reasoning models), set `api: responses` to route requests through the Responses endpoint:

```yaml
models:
  default: "gpt-5"
  models:
    gpt-5:
      type: "openai"
      api_key: "${OPENAI_API_KEY}"
      options:
        api: "responses" # Choices: responses (for OpenAI Responses API), chat (default)
        # Provider-specific fields must be prefixed:
        # - body:<key> goes into the request body
        # - header:<key> goes into request headers
        body:reasoning:
          effort: "medium" # Passed via extra_body to the provider
```
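The `api_key` fields above use `${VAR}`-style environment variable references. As a rough sketch of how such references resolve (MXCP's actual resolution logic may differ; the function name here is an assumption for illustration):

```python
# Minimal sketch of ${VAR}-style environment variable expansion, as used by
# the api_key fields above. Hypothetical helper, not MXCP's implementation.
import os
import re

_VAR = re.compile(r"\$\{([A-Za-z_][A-Za-z0-9_]*)\}")

def expand_env(value: str) -> str:
    """Replace every ${NAME} in value with os.environ[NAME]."""
    def repl(m: re.Match) -> str:
        name = m.group(1)
        try:
            return os.environ[name]
        except KeyError:
            raise ValueError(f"environment variable {name} is not set") from None
    return _VAR.sub(repl, value)

os.environ["ANTHROPIC_API_KEY"] = "sk-test"  # demo value only
print(expand_env("${ANTHROPIC_API_KEY}"))  # sk-test
```

Failing loudly on an unset variable is usually preferable to silently sending an empty API key to the provider.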

For more information on using evals, see the [LLM Evaluation section](quality.md#llm-evaluation-evals) in the Quality & Testing Guide.

116 changes: 113 additions & 3 deletions docs/guides/quality.md
@@ -805,7 +805,7 @@ Create eval files with the suffix `-evals.yml` or `.evals.yml`:
mxcp: 1
suite: customer_analysis
description: "Test LLM's ability to analyze customer data"
model: claude-3-5-sonnet-20240620 # Optional: specify model for this suite (ensure valid ID)

tests:
  - name: churn_risk_assessment
@@ -819,6 +819,7 @@ tests:
        - tool: get_churn_score
          args:
            customer_id: "ABC"
      expected_answer: "The customer is at high risk of churn"
      answer_contains:
        - "risk"
        - "churn"
Expand Down Expand Up @@ -880,6 +881,115 @@ answer_not_contains:
  - "unauthorized"
```
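Substring assertions like `answer_contains` and `answer_not_contains` can be sketched as follows (case-insensitive matching and the function name are assumptions, not necessarily MXCP's exact behavior):

```python
# Sketch of answer_contains / answer_not_contains assertion checking.
# Case-insensitive matching is an assumption for illustration.

def check_answer(answer: str, contains=(), not_contains=()) -> list[str]:
    """Return a list of failure messages; empty means all assertions passed."""
    failures = []
    lowered = answer.lower()
    for needle in contains:
        if needle.lower() not in lowered:
            failures.append(f"missing expected substring: {needle!r}")
    for needle in not_contains:
        if needle.lower() in lowered:
            failures.append(f"found forbidden substring: {needle!r}")
    return failures

print(check_answer("Customer ABC is at high churn risk", contains=["risk", "churn"]))  # []
```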

#### `expected_answer`
Checks the model's final answer against an expected answer, using an LLM as the grader. The grader
returns `correct`, `wrong`, or `partially correct`, plus a short comment.

```yaml
expected_answer: "The customer is at high risk of churn"
```
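The grading flow can be sketched like this (the prompt wording, helper names, and verdict parsing are illustrative assumptions, not MXCP's actual implementation; only the three verdict labels come from the description above):

```python
# Hypothetical sketch of the LLM-as-grader flow behind expected_answer.
# Verdict labels mirror those described above: correct / wrong / partially correct.

VERDICTS = {"correct", "wrong", "partially correct"}

def build_grading_prompt(question: str, expected: str, actual: str) -> str:
    # Ask the grading model for a verdict plus a short comment.
    return (
        "You are grading an assistant's answer.\n"
        f"Question: {question}\n"
        f"Expected answer: {expected}\n"
        f"Actual answer: {actual}\n"
        "Reply with one of: correct, wrong, partially correct - then a short comment."
    )

def parse_verdict(model_reply: str) -> str:
    # Match the longest label that prefixes the reply.
    reply = model_reply.strip().lower()
    for label in sorted(VERDICTS, key=len, reverse=True):
        if reply.startswith(label):
            return label
    return "wrong"  # conservative default when the reply is unparseable

print(parse_verdict("Partially correct - hours match but timezone differs"))
```

Defaulting an unparseable grader reply to `wrong` keeps evals conservative rather than silently passing.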

### Complete Eval Example

```yaml
# faq-evals.yml
mxcp: 1
suite: faq_checks
description: "Make sure the assistant answers FAQs accurately and uses tools when needed"
model: gpt-4o

tests:
  - name: tool_usage_for_price_lookup
    prompt: "What's the current price for SKU-1234?"
    assertions:
      must_call:
        - tool: get_product_price
          args:
            sku: "SKU-1234"
      answer_contains:
        - "price"

  - name: expected_answer_grading
    prompt: "What are your support hours?"
    assertions:
      expected_answer: "Our support team is available Monday to Friday, 9am-5pm local time."
      answer_contains:
        - "Monday"
        - "Friday"
```

### Customizing the System Prompt

Each eval suite can override the default LLM instructions to better match your domain or desired behavior. Add a `system_prompt` field at the suite level; if it is omitted, MXCP falls back to the built-in prompt that encourages concise, tool-aware answers.

```yaml
mxcp: 1
suite: relationship_navigation
description: "Ensure the assistant navigates relationships carefully"
model: gpt-4o
system_prompt: |
  You are a Vertec specialist. Always explain which tool you used.
  If a tool fails, read the error carefully before trying again.

tests:
  - name: compare_owners
    prompt: "Are the owners of Project A and Project B the same?"
    assertions:
      must_call:
        - tool: sql_search_objects
          args:
            object_type: "Project"
```

### Model Configuration Example

Add models to your user config (`~/.mxcp/config.yml`) so evals know which providers to call:

```yaml
models:
  default: "claude-3-5-sonnet-20240620"
  models:
    claude-3-5-sonnet-20240620:
      type: "anthropic"
      api_key: "${ANTHROPIC_API_KEY}"
      timeout: 30
    gpt-4o:
      type: "openai"
      api_key: "${OPENAI_API_KEY}"
      base_url: "https://api.openai.com/v1"
      timeout: 45
      options:
        reasoning: "fast" # forwarded to the provider as-is

```

Example: use a faster model just for grading expected answers:

```yaml
mxcp: 1
suite: faq_checks
model: gpt-4o # primary model used to answer
expected_answer_model: gpt-4o-mini # model used only for grading expected answers
tests:
  - name: expected_answer_grading
    prompt: "What are your support hours?"
    assertions:
      expected_answer: "Our support team is available Monday to Friday, 9am-5pm local time."
# expected_answer_model is useful when:
# - Your main model is slow/expensive, but grading can use a lighter model
# - You want deterministic, faster grading for many evals

```

OpenAI Responses API example (reasoning):

```yaml
models:
  default: "gpt-5"
  models:
    gpt-5:
      type: "openai"
      api_key: "${OPENAI_API_KEY}"
      options:
        api: "responses" # Choices: responses (Responses API) or chat (default)
        # Provider-specific fields must use prefixes:
        # body:<key> for request body, header:<key> for headers
        body:reasoning:
          effort: "medium"
```

### Running Evals

@@ -910,7 +1020,7 @@ models:
  default: claude-3-opus
  models:
    claude-3-opus:
      type: anthropic
      api_key: ${ANTHROPIC_API_KEY}
      timeout: 60
      max_retries: 3
@@ -1037,4 +1147,4 @@ Well-tested endpoints with rich metadata provide:
- Faster debugging
- Safe AI interactions

Remember: LLMs perform best when they clearly understand what your endpoints do, how to use them, and what to expect in return!
2 changes: 1 addition & 1 deletion docs/reference/cli.md
@@ -624,4 +624,4 @@ The following environment variables can be used to configure MXCP:
- `MXCP_TELEMETRY_TRACING_CONSOLE`: Enable console trace export for debugging (`true`/`false`)
- `MXCP_TELEMETRY_METRICS_INTERVAL`: Metrics export interval in seconds (default: `60`)

For more details on environment variables and their usage, see the [Configuration Guide](../guides/configuration.md) and [Observability Guide](../guides/observability.md).
1 change: 1 addition & 0 deletions pyproject.toml
@@ -41,6 +41,7 @@ dependencies = [
    "fastapi>=0.110.0", # FastAPI for admin API
    "uvicorn[standard]>=0.27.0", # ASGI server for admin API
    "psutil>=5.9.0", # System metrics for admin API
    "pydantic-ai-slim[anthropic,openai]>=1.25.0",
]
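The new pydantic dependency suggests config shapes like those documented above are now validated as typed models. A rough, hypothetical sketch of what that buys (class and field names here are assumptions, not taken from MXCP's source):

```python
# Hypothetical pydantic sketch of the model-config shape shown in the docs above.
# Class and field names are illustrative; MXCP's real models may differ.
from typing import Literal, Optional
from pydantic import BaseModel, ValidationError

class ModelEntry(BaseModel):
    type: Literal["anthropic", "openai"]  # "claude" is rejected, per the docs change
    api_key: str
    base_url: Optional[str] = None
    timeout: int = 30
    max_retries: int = 3
    options: dict = {}

class ModelsConfig(BaseModel):
    default: str
    models: dict[str, ModelEntry]

cfg = ModelsConfig.model_validate({
    "default": "gpt-4o",
    "models": {"gpt-4o": {"type": "openai", "api_key": "sk-demo"}},
})
print(cfg.models["gpt-4o"].timeout)  # 30 (default applied)

try:
    ModelsConfig.model_validate(
        {"default": "x", "models": {"x": {"type": "claude", "api_key": "k"}}}
    )
except ValidationError:
    print("rejected: 'claude' is not a valid type")
```

Validation at load time surfaces typos like `type: claude` immediately instead of failing at the first API call.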

[project.scripts]
8 changes: 3 additions & 5 deletions src/mxcp/sdk/evals/__init__.py
@@ -9,15 +9,13 @@
- Tool definition types for describing available tools to the LLM
"""

from ._types import ParameterDefinition, ToolDefinition
from .executor import LLMExecutor, ProviderConfig, ToolExecutor

__all__ = [
    "LLMExecutor",
    "ToolExecutor",
    "ToolDefinition",
    "ParameterDefinition",
    "ProviderConfig",
]
48 changes: 6 additions & 42 deletions src/mxcp/sdk/evals/_types.py
@@ -1,54 +1,16 @@
"""Types for MXCP SDK Evals module.

This module contains type definitions for tool definitions and
other data structures used in the evaluation framework.
"""

from dataclasses import dataclass, field
from typing import Any

from mxcp.sdk.validator import TypeSchemaModel


# Type alias for JSON Schema representation
JsonSchema = dict[str, Any]


@dataclass
@@ -60,6 +22,8 @@ class ParameterDefinition:
    description: str = ""
    default: Any | None = None
    required: bool = True
    schema: JsonSchema | None = None
    """Optional JSON Schema for complex parameter validation."""


@dataclass
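To illustrate the new `schema` field on `ParameterDefinition` (typed as `JsonSchema = dict[str, Any]`), here is a hypothetical example of the kind of standard JSON Schema dict it could hold, with a toy checker; the schema content and helper function are assumptions for illustration, not MXCP code:

```python
# Hypothetical example of a JSON Schema dict as could be stored in
# ParameterDefinition.schema (JsonSchema = dict[str, Any]).
from typing import Any

JsonSchema = dict[str, Any]

# A schema describing a bounded list of customer-ID strings.
customer_ids_schema: JsonSchema = {
    "type": "array",
    "items": {"type": "string", "pattern": "^CUST-[0-9]+$"},
    "minItems": 1,
    "maxItems": 100,
}

def matches_basic_type(value: Any, schema: JsonSchema) -> bool:
    """Toy type-only check; real validation would use a JSON Schema library."""
    kind = schema.get("type")
    if kind == "array":
        return isinstance(value, list) and all(
            matches_basic_type(v, schema.get("items", {})) for v in value
        )
    if kind == "string":
        return isinstance(value, str)
    return True  # unknown/absent type: accept

print(matches_basic_type(["CUST-1", "CUST-2"], customer_ids_schema))  # True
print(matches_basic_type("CUST-1", customer_ids_schema))              # False
```

A plain dict keeps the SDK decoupled from any particular validation library while still letting callers attach arbitrary JSON Schema constraints.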