diff --git a/.env.example b/.env.example index 31e2bea5..52b891df 100644 --- a/.env.example +++ b/.env.example @@ -1,8 +1,17 @@ # Murphy Configuration # Copy this file to .env and fill in your values -# === Required === +# === LLM Provider (at least one required) === OPENAI_API_KEY=your_openai_api_key_here +# GOOGLE_API_KEY=your_google_api_key_here +# ANTHROPIC_API_KEY=your_anthropic_api_key_here +# AZURE_OPENAI_KEY=your_azure_openai_key_here +# AZURE_OPENAI_ENDPOINT=https://your-resource.openai.azure.com/ +# MISTRAL_API_KEY=your_mistral_api_key_here +# GROQ_API_KEY=your_groq_api_key_here +# CEREBRAS_API_KEY=your_cerebras_api_key_here +# OPENROUTER_API_KEY=your_openrouter_api_key_here +# BROWSER_USE_API_KEY=your_browser_use_api_key_here # === Murphy REST API (optional, for murphy-api server) === # MURPHY_API_KEY=your_murphy_api_key_here diff --git a/CHANGELOG.md b/CHANGELOG.md index d44a83ea..785c3241 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -4,12 +4,27 @@ All notable changes to Murphy will be documented in this file. The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.1.0/). 
-## [Unreleased] +## [1.1.0] - 2026-04-07 ### Added +- Multi-provider LLM support: `--provider` and `--model` flags for OpenAI, Google Gemini, Anthropic Claude, Azure OpenAI, Mistral, Groq, DeepSeek, Cerebras, Ollama, OpenRouter, and Browser Use +- Separate `--judge-provider` and `--judge-model` flags for using a different model for verdicts +- `provider` field in REST API request models (`/analyze`, `/generate-plan`, `/execute`, `/evaluate`) - `--open` flag to re-open the interactive web UI for a previously completed run without re-running any tests (no browser or LLM required) -- Step-by-step execution trace view in the UI (`View trace →`) showing each agent step with goal, evaluation, actions, screenshots, memory, and reasoning -- Interactive agent path graph in the UI (`View graph →`) visualising the full decision tree with colour-coded nodes (success/failure evaluations) and labelled action edges; click any node to inspect step details +- Step-by-step execution trace view in the UI (`View trace ->`) showing each agent step with goal, evaluation, actions, screenshots, memory, and reasoning +- Interactive agent path graph in the UI (`View graph ->`) visualising the full decision tree with colour-coded nodes (success/failure evaluations) and labelled action edges; click any node to inspect step details +- Missing signals reporting in judge verdicts (UX observations that don't affect the verdict) +- Agent history saved as JSON per test in `output/agent_history/` + +### Fixed +- UTF-8 surrogate encoding error (`ModelProviderError: 'utf-8' codec can't encode character`) +- Occasional infinite verify loop during test execution +- Pages URL incorrect in reporting and trace visualization +- Angry user persona double-click test failing due to unidentified element ID +- Agent not reporting missing validation indicators + +### Changed +- Removed actions column from results main page in the UI ## [1.0.0] - 2026-03-05 diff --git a/README.md b/README.md index 
2a7bf5b3..02d83a6a 100644 --- a/README.md +++ b/README.md @@ -7,7 +7,7 @@ Built on top of [browser-use](https://github.com/browser-use/browser-use) (AI br ## Prerequisites - Python >= 3.11 -- An OpenAI API key (`OPENAI_API_KEY`) — default model is `gpt-5-mini` +- An LLM API key — default model is `gpt-5-mini` (OpenAI), but Murphy supports multiple providers (see [Model Providers](#model-providers)) ## Which setup should I use? @@ -43,10 +43,11 @@ uv run playwright install chromium ```bash cp .env.example .env ``` -Then set your key: +Then set your key (at minimum one provider): ``` OPENAI_API_KEY=sk-... ``` +See [Model Providers](#model-providers) for other providers. ## Setup (Docker) @@ -179,8 +180,10 @@ The full JSON report (`evaluation_report.json`) contains structured results, act | `--features` | | Path to existing features markdown (skips feature discovery) | | `--plan` | | Path to existing YAML test plan (skips planning, goes straight to execution) | | `--max-tests` | `8` | Maximum number of test scenarios to generate | -| `--model` | `gpt-5-mini` | LLM model for agent tasks | -| `--judge-model` | `gpt-5-mini` | LLM model for judging verdicts | +| `--provider` | `openai` | LLM provider (see [Model Providers](#model-providers)) | +| `--model` | `gpt-5-mini` | LLM model name as it appears in the provider's docs | +| `--judge-provider` | *(same as `--provider`)* | LLM provider for judging verdicts | +| `--judge-model` | *(same as `--model`)* | LLM model for judging verdicts | | `--output-dir` | `./murphy/output` | Output directory for all generated files | | `--category` | | Site category hint (`ecommerce`, `saas`, `content`, `social`) | | `--open` | `false` | Open the interactive UI for a previously completed run (no browser or LLM required); `--url` is not needed | @@ -220,15 +223,69 @@ See [docs/ARCHITECTURE.md](docs/ARCHITECTURE.md#rest-api-murphy-api) for full en --- +## Model Providers + +Murphy supports multiple LLM providers via 
[browser-use](https://github.com/browser-use/browser-use). Use `--provider` and `--model` to select any supported provider. Model names are passed exactly as they appear in the provider's documentation — no renaming needed. + +```bash +# Default: OpenAI +uv run murphy --url https://example.com --model gpt-5-mini + +# Google Gemini +uv run murphy --url https://example.com --provider google --model gemini-2.5-pro + +# Anthropic Claude +uv run murphy --url https://example.com --provider anthropic --model claude-sonnet-4-20250514 + +# Azure OpenAI +uv run murphy --url https://example.com --provider azure --model gpt-4o + +# Mistral +uv run murphy --url https://example.com --provider mistral --model mistral-large-latest + +# Mix providers: cheap model for agent tasks, stronger model for judging +uv run murphy --url https://example.com \ + --provider google --model gemini-2.5-flash \ + --judge-provider openai --judge-model gpt-5-mini +``` + +Set the corresponding API key as an environment variable (see [Environment Variables](#environment-variables)). + +| Provider | `--provider` value | Example `--model` | +|----------|-------------------|-------------------| +| OpenAI | `openai` (default) | `gpt-5-mini`, `gpt-4o`, `o3` | +| Google Gemini | `google` | `gemini-2.5-pro`, `gemini-2.5-flash` | +| Anthropic | `anthropic` | `claude-sonnet-4-20250514`, `claude-haiku-4-5-20251001` | +| Azure OpenAI | `azure` | `gpt-4o`, `gpt-4o-mini` | +| Mistral | `mistral` | `mistral-large-latest`, `mistral-small-latest` | +| Groq | `groq` | `llama3-70b-8192` | +| DeepSeek | `deepseek` | `deepseek-chat` | +| Cerebras | `cerebras` | `llama-3.3-70b` | +| Ollama | `ollama` | `llama3`, `mistral` | +| OpenRouter | `openrouter` | `meta-llama/llama-3-70b` | +| Browser Use | `bu` | `bu-latest` | + +--- + ## Environment Variables All variables are optional unless noted. See `.env.example` for a template. 
-### LLM Provider - -| Variable | Description | -|----------|-------------| -| `OPENAI_API_KEY` | OpenAI API key (required) | +### LLM Providers + +Set the API key for whichever provider you use (at least one is required): + +| Variable | Provider | +|----------|----------| +| `OPENAI_API_KEY` | OpenAI (default) | +| `GOOGLE_API_KEY` | Google Gemini | +| `ANTHROPIC_API_KEY` | Anthropic Claude | +| `AZURE_OPENAI_KEY` | Azure OpenAI (also needs `AZURE_OPENAI_ENDPOINT`) | +| `MISTRAL_API_KEY` | Mistral | +| `GROQ_API_KEY` | Groq | +| `CEREBRAS_API_KEY` | Cerebras | +| `OPENROUTER_API_KEY` | OpenRouter | +| `BROWSER_USE_API_KEY` | Browser Use | ### REST API diff --git a/docs/ARCHITECTURE.md b/docs/ARCHITECTURE.md index 4259bb78..42d9758b 100644 --- a/docs/ARCHITECTURE.md +++ b/docs/ARCHITECTURE.md @@ -7,6 +7,7 @@ murphy/ ├── __init__.py # Package exports and version ├── __main__.py # python -m murphy entry point ├── config.py # Shared configuration constants +├── llm.py # Multi-provider LLM factory (OpenAI, Google, Anthropic, etc.) ├── models.py # Pydantic models (TestPlan, TestResult, JudgeVerdict, etc.) 
├── prompts.py # All LLM prompt text ├── evaluate.py # Backward-compatible re-exports diff --git a/murphy/__init__.py b/murphy/__init__.py index 3b80f9ab..f8841457 100644 --- a/murphy/__init__.py +++ b/murphy/__init__.py @@ -1,6 +1,6 @@ """Murphy — AI-driven website evaluation powered by browser-use.""" -__version__ = '1.0.0' +__version__ = '1.1.0' from murphy.core.analysis import analyze_website as analyze_website from murphy.core.execution import execute_tests as execute_tests diff --git a/murphy/api/auth.py b/murphy/api/auth.py index 59ae6756..e8cea687 100644 --- a/murphy/api/auth.py +++ b/murphy/api/auth.py @@ -8,12 +8,12 @@ if TYPE_CHECKING: from browser_use.browser.session import BrowserSession - from browser_use.llm import ChatOpenAI + from browser_use.llm import BaseChatModel logger = logging.getLogger(__name__) -async def detect_auth_required(browser_session: BrowserSession, llm: ChatOpenAI, url: str) -> bool: +async def detect_auth_required(browser_session: BrowserSession, llm: BaseChatModel, url: str) -> bool: """Navigate to URL and use a passive LLM call to detect if login is required.""" logger.info('\n%s', '=' * 60) logger.info('Checking if %s requires login...', url) @@ -59,7 +59,7 @@ async def _get_page_text(browser_session: BrowserSession) -> tuple[str, str, str return current_url, title, body -async def _llm_classify_page(llm: ChatOpenAI, url: str, title: str, body: str, *, mode: str = 'auth_detect') -> bool: +async def _llm_classify_page(llm: BaseChatModel, url: str, title: str, body: str, *, mode: str = 'auth_detect') -> bool: """Use a single LLM call (no agent) to classify page content. Returns True if the page looks like authenticated/usable content. 
@@ -97,7 +97,7 @@ async def _llm_classify_page(llm: ChatOpenAI, url: str, title: str, body: str, * async def wait_for_manual_login( browser_session: BrowserSession, - llm: ChatOpenAI, + llm: BaseChatModel, url: str, *, already_navigated: bool = False, diff --git a/murphy/api/cli.py b/murphy/api/cli.py index dfc4d786..f61df849 100644 --- a/murphy/api/cli.py +++ b/murphy/api/cli.py @@ -51,8 +51,14 @@ def main() -> int: parser.add_argument('--features', help='Path to existing features markdown (skips analysis, goes to test generation)') parser.add_argument('--plan', help='Path to existing YAML test plan (skips analysis + test generation)') parser.add_argument('--max-tests', type=int, default=8, help='Max test scenarios (default: 8)') - parser.add_argument('--model', default='gpt-5-mini', help='OpenAI model for agent tasks (default: gpt-5-mini)') - parser.add_argument('--judge-model', default='gpt-5-mini', help='OpenAI model for judging verdicts (default: gpt-5-mini)') + parser.add_argument( + '--provider', default='openai', help='LLM provider (default: openai). e.g. 
google, anthropic, azure, mistral' + ) + parser.add_argument( + '--model', default='gpt-5-mini', help='LLM model name as it appears in the provider docs (default: gpt-5-mini)' + ) + parser.add_argument('--judge-provider', default=None, help='LLM provider for judging (defaults to --provider)') + parser.add_argument('--judge-model', default=None, help='LLM model for judging (defaults to --model)') parser.add_argument('--output-dir', default='./murphy/output', help='Output directory for reports') parser.add_argument( '--open', @@ -94,7 +100,6 @@ async def _async_main(args: argparse.Namespace) -> None: from browser_use.browser.profile import BrowserProfile from browser_use.browser.session import BrowserSession - from browser_use.llm import ChatOpenAI from murphy.api.auth import detect_auth_required, wait_for_manual_login from murphy.browser.patches import apply as apply_patches from murphy.core.analysis import analyze_website @@ -104,6 +109,7 @@ async def _async_main(args: argparse.Namespace) -> None: from murphy.io.features_io import read_features_markdown, write_features_markdown from murphy.io.fixtures import ensure_dummy_fixture_files from murphy.io.test_plan_io import load_test_plan, save_test_plan + from murphy.llm import create_llm from murphy.models import WebsiteAnalysis # Apply patches early (idempotent) @@ -112,8 +118,14 @@ async def _async_main(args: argparse.Namespace) -> None: # Ensure dummy fixture files exist for upload testing fixture_paths = ensure_dummy_fixture_files() - llm = ChatOpenAI(model=args.model) - judge_llm = ChatOpenAI(model=args.judge_model) if args.judge_model != args.model else None + llm = create_llm(args.model, provider=args.provider) + judge_provider = args.judge_provider or args.provider + judge_model = args.judge_model or args.model + judge_llm = ( + create_llm(judge_model, provider=judge_provider) + if (judge_model != args.model or judge_provider != args.provider) + else None + ) output_dir = Path(args.output_dir) 
output_dir.mkdir(parents=True, exist_ok=True) diff --git a/murphy/api/request_models.py b/murphy/api/request_models.py index 62590d1b..497511a9 100644 --- a/murphy/api/request_models.py +++ b/murphy/api/request_models.py @@ -23,6 +23,7 @@ class AnalyzeRequest(BaseModel): url: str category: str | None = None goal: str | None = None + provider: str = 'openai' model: str = 'gpt-5-mini' webhook_url: str | None = None async_mode: bool = Field(False, alias='async') @@ -35,6 +36,7 @@ class GeneratePlanRequest(BaseModel): analysis: Annotated[WebsiteAnalysis, BeforeValidator(_parse_json_string)] max_tests: int = 8 goal: str | None = None + provider: str = 'openai' model: str = 'gpt-5-mini' webhook_url: str | None = None async_mode: bool = Field(False, alias='async') @@ -47,8 +49,10 @@ class ExecuteRequest(BaseModel): test_plan: Annotated[TestPlan, BeforeValidator(_parse_json_string)] | None = None evaluate_job_id: str | None = None goal: str | None = None + provider: str = 'openai' model: str = 'gpt-5-mini' - judge_model: str = 'gpt-5-mini' + judge_provider: str | None = None + judge_model: str | None = None max_steps: int = 15 max_concurrent: int = 3 webhook_url: str | None = None @@ -61,8 +65,10 @@ class EvaluateRequest(BaseModel): url: str goal: str | None = None max_tests: int = 8 + provider: str = 'openai' model: str = 'gpt-5-mini' - judge_model: str = 'gpt-5-mini' + judge_provider: str | None = None + judge_model: str | None = None async_mode: bool = Field(False, alias='async') webhook_url: str | None = None diff --git a/murphy/api/rest.py b/murphy/api/rest.py index 775163d8..3d2c6d8a 100644 --- a/murphy/api/rest.py +++ b/murphy/api/rest.py @@ -61,7 +61,7 @@ async def _core_analyze(req: AnalyzeRequest) -> dict[str, Any]: """Run website analysis. 
Returns serialized WebsiteAnalysis dict.""" from murphy.core.pipeline import run_analyze - analysis = await run_analyze(req.url, req.model, goal=req.goal) + analysis = await run_analyze(req.url, req.model, provider=req.provider, goal=req.goal) return analysis.model_dump() @@ -69,7 +69,9 @@ async def _core_generate_plan(req: GeneratePlanRequest) -> dict[str, Any]: """Generate test plan from analysis. Returns serialized TestPlan dict.""" from murphy.core.pipeline import run_generate_plan - test_plan = await run_generate_plan(req.url, req.analysis, req.model, req.max_tests, goal=req.goal) + test_plan = await run_generate_plan( + req.url, req.analysis, req.model, provider=req.provider, max_tests=req.max_tests, goal=req.goal + ) return test_plan.model_dump() @@ -93,7 +95,9 @@ async def _core_execute(req: ExecuteRequest) -> dict[str, Any]: req.url, test_plan, req.model, + provider=req.provider, judge_model=req.judge_model, + judge_provider=req.judge_provider, goal=req.goal, max_steps=req.max_steps, max_concurrent=req.max_concurrent, @@ -105,7 +109,7 @@ async def _core_evaluate(req: EvaluateRequest) -> dict[str, Any]: """Run exploration-first evaluation: explore site → generate test plan.""" from murphy.core.pipeline import run_evaluate - test_plan = await run_evaluate(req.url, req.model, req.max_tests, goal=req.goal) + test_plan = await run_evaluate(req.url, req.model, provider=req.provider, max_tests=req.max_tests, goal=req.goal) return test_plan.model_dump() diff --git a/murphy/core/analysis.py b/murphy/core/analysis.py index c7da2c43..ce2f0907 100644 --- a/murphy/core/analysis.py +++ b/murphy/core/analysis.py @@ -4,7 +4,7 @@ from browser_use import Agent from browser_use.browser.session import BrowserSession -from browser_use.llm import ChatOpenAI +from browser_use.llm import BaseChatModel from murphy.models import WebsiteAnalysis from murphy.prompts import build_analysis_prompt @@ -13,7 +13,7 @@ async def analyze_website( url: str, - llm: ChatOpenAI, + llm: 
BaseChatModel, category: str | None = None, goal: str | None = None, browser_session: BrowserSession | None = None, diff --git a/murphy/core/execution.py b/murphy/core/execution.py index 15c0d2b4..40a5b588 100644 --- a/murphy/core/execution.py +++ b/murphy/core/execution.py @@ -12,7 +12,7 @@ from browser_use import Agent from browser_use.agent.views import AgentHistoryList from browser_use.browser.session import BrowserSession -from browser_use.llm import ChatOpenAI +from browser_use.llm import BaseChatModel from murphy.core.judge import murphy_judge from murphy.core.summary import classify_failure from murphy.io.report_helpers import _slugify @@ -108,14 +108,14 @@ async def _collect_session_urls(browser_session: BrowserSession) -> list[str]: async def _execute_single_test( url: str, scenario: TestScenario, - llm: ChatOpenAI, + llm: BaseChatModel, browser_session: BrowserSession, goal: str | None, fixture_paths: list[Path] | None, max_steps: int, index: int, total: int, - judge_llm: ChatOpenAI | None = None, + judge_llm: BaseChatModel | None = None, output_dir: Path | None = None, ) -> TestResult: """Execute one test scenario and return its TestResult. 
@@ -362,10 +362,10 @@ async def _cleanup_session_pool(sessions: list[BrowserSession], original_session async def execute_tests( url: str, test_plan: TestPlan, - llm: ChatOpenAI, + llm: BaseChatModel, progress_state: Any = None, save_callback: Callable[[list[TestResult]], None] | None = None, - judge_llm: ChatOpenAI | None = None, + judge_llm: BaseChatModel | None = None, output_dir: Path | None = None, ) -> list[TestResult]: """Execute tests without a pre-existing session (creates its own).""" @@ -393,7 +393,7 @@ async def execute_tests( async def execute_tests_with_session( url: str, test_plan: TestPlan, - llm: ChatOpenAI, + llm: BaseChatModel, browser_session: BrowserSession, progress_state: Any = None, goal: str | None = None, @@ -401,7 +401,7 @@ async def execute_tests_with_session( max_steps: int = 15, save_callback: Callable[[list[TestResult]], None] | None = None, max_concurrent: int = 3, - judge_llm: ChatOpenAI | None = None, + judge_llm: BaseChatModel | None = None, output_dir: Path | None = None, ) -> list[TestResult]: """Phase 3 execution reusing an existing browser session. 
diff --git a/murphy/core/generation.py b/murphy/core/generation.py index f665ddb8..c94ce1c7 100644 --- a/murphy/core/generation.py +++ b/murphy/core/generation.py @@ -5,7 +5,7 @@ from browser_use import Agent from browser_use.browser.session import BrowserSession -from browser_use.llm import ChatOpenAI, SystemMessage, UserMessage +from browser_use.llm import BaseChatModel, SystemMessage, UserMessage from murphy.config import EXPLORE_MAX_STEPS, QUALITY_MAX_RETRIES from murphy.core.quality import plan_quality_issues from murphy.models import TestPlan @@ -17,7 +17,7 @@ async def generate_tests( url: str, analysis: 'Any', - llm: ChatOpenAI, + llm: BaseChatModel, max_tests: int, goal: str | None = None, ) -> TestPlan: @@ -78,7 +78,7 @@ async def generate_tests( async def explore_and_generate_plan( task: str, url: str, - llm: ChatOpenAI, + llm: BaseChatModel, session: BrowserSession, max_scenarios: int = 8, max_steps: int = 30, diff --git a/murphy/core/judge.py b/murphy/core/judge.py index b1ab0f01..99914ee7 100644 --- a/murphy/core/judge.py +++ b/murphy/core/judge.py @@ -10,7 +10,7 @@ """ from browser_use.agent.views import AgentHistoryList -from browser_use.llm import ChatOpenAI, SystemMessage, UserMessage +from browser_use.llm import BaseChatModel, SystemMessage, UserMessage from browser_use.llm.messages import ContentPartImageParam, ContentPartTextParam, ImageURL from browser_use.utils import sanitize_surrogates from murphy.models import ( @@ -151,6 +151,7 @@ def build_judge_trait_context(persona: str, traits: TraitVector, test_type: Test + ## Failure classification If verdict is FALSE, you MUST also classify the failure: @@ -326,10 +327,10 @@ def _format_pages_reached(history: AgentHistoryList) -> str: async def murphy_judge( history: AgentHistoryList, scenario: TestScenario, - llm: ChatOpenAI, + llm: BaseChatModel, start_url: str = '', *, - judge_llm: ChatOpenAI | None = None, + judge_llm: BaseChatModel | None = None, ) -> JudgeVerdict: """Evaluate agent success 
based on action trace, not self-report. diff --git a/murphy/core/pipeline.py b/murphy/core/pipeline.py index c4472c83..111cadaa 100644 --- a/murphy/core/pipeline.py +++ b/murphy/core/pipeline.py @@ -14,7 +14,6 @@ from browser_use.browser.profile import BrowserProfile from browser_use.browser.session import BrowserSession -from browser_use.llm import ChatOpenAI from murphy.browser.patches import apply as apply_patches from murphy.evaluate import ( analyze_website, @@ -24,18 +23,20 @@ generate_tests, ) from murphy.io.fixtures import ensure_dummy_fixture_files +from murphy.llm import create_llm from murphy.models import ReportSummary, TestPlan, TestResult, WebsiteAnalysis async def run_analyze( url: str, model: str, + provider: str = 'openai', goal: str | None = None, browser_session: BrowserSession | None = None, ) -> WebsiteAnalysis: """Run website analysis (feature discovery).""" apply_patches() - llm = ChatOpenAI(model=model) + llm = create_llm(model, provider=provider) own_session = browser_session is None if own_session: browser_session = BrowserSession(browser_profile=BrowserProfile(headless=True, keep_alive=False)) @@ -51,12 +52,13 @@ async def run_generate_plan( url: str, analysis: WebsiteAnalysis, model: str, + provider: str = 'openai', max_tests: int = 8, goal: str | None = None, ) -> TestPlan: """Generate test plan from analysis.""" apply_patches() - llm = ChatOpenAI(model=model) + llm = create_llm(model, provider=provider) return await generate_tests(url, analysis, llm, max_tests, goal=goal) @@ -64,7 +66,9 @@ async def run_execute( url: str, test_plan: TestPlan, model: str, + provider: str = 'openai', judge_model: str | None = None, + judge_provider: str | None = None, goal: str | None = None, max_steps: int = 15, max_concurrent: int = 3, @@ -78,8 +82,10 @@ async def run_execute( apply_patches() if fixture_paths is None: fixture_paths = ensure_dummy_fixture_files() - llm = ChatOpenAI(model=model) - judge_llm = ChatOpenAI(model=judge_model) if judge_model 
and judge_model != model else None + llm = create_llm(model, provider=provider) + jp = judge_provider or provider + jm = judge_model or model + judge_llm = create_llm(jm, provider=jp) if (jm != model or jp != provider) else None own_session = browser_session is None if own_session: browser_session = BrowserSession(browser_profile=BrowserProfile(headless=True, keep_alive=False)) @@ -107,6 +113,7 @@ async def run_execute( async def run_evaluate( url: str, model: str, + provider: str = 'openai', max_tests: int = 8, goal: str | None = None, browser_session: BrowserSession | None = None, @@ -114,7 +121,7 @@ async def run_evaluate( """Exploration-first: explore site then generate test plan.""" apply_patches() task = goal or f'Evaluate the website at {url}' - llm = ChatOpenAI(model=model) + llm = create_llm(model, provider=provider) own_session = browser_session is None if own_session: browser_session = BrowserSession(browser_profile=BrowserProfile(headless=True, keep_alive=False)) diff --git a/murphy/core/summary.py b/murphy/core/summary.py index 1177a2ce..5a805938 100644 --- a/murphy/core/summary.py +++ b/murphy/core/summary.py @@ -4,7 +4,7 @@ from pathlib import Path from typing import Literal -from browser_use.llm import ChatOpenAI, SystemMessage, UserMessage +from browser_use.llm import BaseChatModel, SystemMessage, UserMessage from browser_use.utils import sanitize_surrogates from murphy.io.report import write_full_report from murphy.models import ( @@ -71,7 +71,7 @@ async def generate_executive_summary( analysis: WebsiteAnalysis, results: list[TestResult], summary: ReportSummary, - llm: ChatOpenAI, + llm: BaseChatModel, ) -> ExecutiveSummary: """Generate an LLM-powered executive summary of the evaluation results.""" results_summary_parts: list[str] = [] diff --git a/murphy/llm.py b/murphy/llm.py new file mode 100644 index 00000000..32d7697f --- /dev/null +++ b/murphy/llm.py @@ -0,0 +1,51 @@ +"""Murphy — LLM factory supporting multiple providers via browser_use.""" 
+ +import os + +from browser_use.llm import BaseChatModel + +_PROVIDER_MAP = { + 'openai': ('browser_use.llm.openai.chat', 'ChatOpenAI', 'OPENAI_API_KEY'), + 'anthropic': ('browser_use.llm.anthropic.chat', 'ChatAnthropic', 'ANTHROPIC_API_KEY'), + 'google': ('browser_use.llm.google.chat', 'ChatGoogle', 'GOOGLE_API_KEY'), + 'azure': ('browser_use.llm.azure.chat', 'ChatAzureOpenAI', 'AZURE_OPENAI_KEY'), + 'mistral': ('browser_use.llm.mistral.chat', 'ChatMistral', 'MISTRAL_API_KEY'), + 'groq': ('browser_use.llm.groq.chat', 'ChatGroq', 'GROQ_API_KEY'), + 'deepseek': ('browser_use.llm.deepseek.chat', 'ChatDeepSeek', 'DEEPSEEK_API_KEY'), + 'cerebras': ('browser_use.llm.cerebras.chat', 'ChatCerebras', 'CEREBRAS_API_KEY'), + 'ollama': ('browser_use.llm.ollama.chat', 'ChatOllama', None), + 'openrouter': ('browser_use.llm.openrouter.chat', 'ChatOpenRouter', 'OPENROUTER_API_KEY'), + 'bu': ('browser_use.llm.browser_use.chat', 'ChatBrowserUse', 'BROWSER_USE_API_KEY'), +} + +SUPPORTED_PROVIDERS = sorted(_PROVIDER_MAP.keys()) + + +def create_llm(model: str, provider: str = 'openai') -> BaseChatModel: + """Create an LLM instance from a provider name and model string. + + The model name is passed directly to the provider — use exact names as + they appear in the provider's docs (e.g. 'gemini-2.5-pro', 'claude-sonnet-4-20250514'). + """ + if provider not in _PROVIDER_MAP: + raise ValueError(f"Unknown provider: '{provider}'. 
Supported: {', '.join(SUPPORTED_PROVIDERS)}") + + module_path, class_name, api_key_env = _PROVIDER_MAP[provider] + + from importlib import import_module + + cls = getattr(import_module(module_path), class_name) + + kwargs: dict = {'model': model} + if api_key_env: + api_key = os.getenv(api_key_env) + if api_key: + kwargs['api_key'] = api_key + + # Azure needs extra env vars + if provider == 'azure': + azure_endpoint = os.getenv('AZURE_OPENAI_ENDPOINT') + if azure_endpoint: + kwargs['azure_endpoint'] = azure_endpoint + + return cls(**kwargs) diff --git a/pyproject.toml b/pyproject.toml index cdf2f558..3ee53cb2 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -2,7 +2,7 @@ name = "murphy" description = "AI-driven website evaluation powered by browser-use" authors = [{ name = "MIH AI B.V." }] -version = "1.0.0" +version = "1.1.0" readme = "README.md" requires-python = ">=3.11,<4.0" classifiers = [ diff --git a/tests/murphy/test_llm.py b/tests/murphy/test_llm.py new file mode 100644 index 00000000..aa377476 --- /dev/null +++ b/tests/murphy/test_llm.py @@ -0,0 +1,192 @@ +"""Tests for murphy.llm — multi-provider LLM factory and CLI/API integration.""" + +from unittest.mock import MagicMock, patch + +import pytest + +from murphy.llm import SUPPORTED_PROVIDERS, create_llm + +# ─── Provider resolution ──────────────────────────────────────────────────── + + +def test_unknown_provider_raises_with_suggestions(): + with pytest.raises(ValueError, match='Unknown provider.*Supported'): + create_llm('some-model', provider='nonexistent') + + +def test_supported_providers_list_is_complete(): + """SUPPORTED_PROVIDERS should include all major providers.""" + expected = {'openai', 'google', 'anthropic', 'azure', 'mistral', 'groq', 'deepseek', 'cerebras', 'ollama', 'openrouter', 'bu'} + assert expected == set(SUPPORTED_PROVIDERS) + + +# ─── API key from env ─────────────────────────────────────────────────────── + + +def test_api_key_passed_from_env(): + """create_llm should read the 
def test_api_key_omitted_when_not_set():
    """When env var is unset, api_key should not be passed (let SDK use its own default)."""
    mock_cls = MagicMock()
    # clear=True empties os.environ inside the block, so OPENAI_API_KEY is
    # guaranteed to be unset regardless of the developer's shell.
    with patch.dict('os.environ', {}, clear=True), patch('importlib.import_module') as mock_import:
        mock_import.return_value = MagicMock(**{'ChatOpenAI': mock_cls})
        create_llm('gpt-5-mini', provider='openai')
    mock_cls.assert_called_once_with(model='gpt-5-mini')
def test_cli_judge_provider_independent():
    """Agent and judge provider/model flags are parsed independently of each other."""
    # Insertion order is preserved, so the flattened argv matches flag/value pairs.
    flag_values = {
        '--url': 'https://example.com',
        '--provider': 'google',
        '--model': 'gemini-2.5-flash',
        '--judge-provider': 'openai',
        '--judge-model': 'gpt-5-mini',
    }
    argv = [token for pair in flag_values.items() for token in pair]

    parsed = _build_parser().parse_args(argv)

    assert (parsed.provider, parsed.model) == ('google', 'gemini-2.5-flash')
    assert (parsed.judge_provider, parsed.judge_model) == ('openai', 'gpt-5-mini')
test_evaluate_request_provider_field(): + from murphy.api.request_models import EvaluateRequest + + req = EvaluateRequest.model_validate({'url': 'https://example.com', 'provider': 'mistral', 'model': 'mistral-large-latest'}) + assert req.provider == 'mistral' + + +# ─── Judge LLM creation logic ─────────────────────────────────────────────── + + +def test_judge_llm_none_when_same_as_main(): + """When judge provider+model match main, no separate judge LLM should be created.""" + # Replicate the CLI logic + provider, model = 'openai', 'gpt-5-mini' + judge_provider = provider # defaults to main + judge_model = model # defaults to main + should_create = judge_model != model or judge_provider != provider + assert should_create is False + + +def test_judge_llm_created_when_different_provider(): + provider, model = 'openai', 'gpt-5-mini' + judge_provider, judge_model = 'google', 'gemini-2.5-pro' + should_create = judge_model != model or judge_provider != provider + assert should_create is True + + +def test_judge_llm_created_when_different_model_same_provider(): + provider, model = 'openai', 'gpt-5-mini' + judge_provider, judge_model = 'openai', 'gpt-5' + should_create = judge_model != model or judge_provider != provider + assert should_create is True + + +# ─── Helpers ───────────────────────────────────────────────────────────────── + + +def _build_parser(): + """Build the Murphy CLI parser for testing (mirrors cli.py's arg definitions).""" + import argparse + + parser = argparse.ArgumentParser() + parser.add_argument('--url') + parser.add_argument('--provider', default='openai') + parser.add_argument('--model', default='gpt-5-mini') + parser.add_argument('--judge-provider', default=None) + parser.add_argument('--judge-model', default=None) + return parser diff --git a/uv.lock b/uv.lock index 73119b79..db394a95 100644 --- a/uv.lock +++ b/uv.lock @@ -1906,7 +1906,7 @@ wheels = [ [[package]] name = "murphy" -version = "1.0.0" +version = "1.1.0" source = { editable = "." 
} dependencies = [ { name = "aiohttp" },